In [1]:
# Set the model name
model_name='all-MiniLM-L12-v2'

# Preprocessing

In [2]:
# Functions and classes for preprocessing the data
from itertools import chain
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
import re
import os
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
class Document:
  def __init__(self, doc_no, doc_text, tokens):
    self.doc_no = doc_no
    self.doc_text = doc_text
    self.tokens = tokens

  def __str__(self):
    return 'Document Number: ' + self.doc_no + '\nDocument Text: ' + self.doc_text + '\nTokens: ' + str(self.tokens) + '\n'

  def to_dict(self):
    return {'docno': self.doc_no, 'doctext': self.doc_text, 'tokens': self.tokens, 'text': ' '.join(self.tokens)}

In [4]:
# Get the stop words
def get_stop_words():
  stopwords = set()
  # Open the stop words and add them to the set
  with open('StopWords.txt', 'r') as file:
    for line in file:
      stopwords.add(line.strip())
  return stopwords


# load the stopwords
stop_words = get_stop_words()

# function to perform preprocessing on the text
def preprocess(file):
  with open(file, "r") as f:
    content = f.read()
  documents = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
  preprocessed_documents = []
  for document in documents:
    # Get the document number and text
    raw_no = re.search(r'<DOCNO>(.*?)</DOCNO>', document, re.DOTALL)
    doc_no = raw_no.group(1) if raw_no else ''
    raw_text = re.search(r'<TEXT>(.*?)</TEXT>', document, re.DOTALL)
    doc_text = raw_text.string if raw_text else ''

    # create a document object
    doc = Document(doc_no, doc_text, [])
    preprocessed_documents.append(doc)
  return preprocessed_documents

# function to preprocess a single text string
def preprocess_text(text: str, stem=True, stopwords=True):
    # lowercase the text
  text = text.lower()

  # tokenize the text
  tokens = word_tokenize(text)
  # remove stopwords
  if stopwords:
    tokens = [token for token in tokens if token not in stop_words]
  # stem the tokens
  if stem:
    # apply the porter stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
  # remove punctuation
  table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
  stripped = [w.translate(table) for w in tokens]
  stripped = list(chain(*[w.split() for w in stripped]))

  # remove empty tokens, stopwords (if applicable) and non-alphabetic tokens
  stripped = [
      token for token in stripped if token and (token not in stop_words if stopwords else True) and token.isalpha()]
  return stripped

# main function to preprocess a directory of text files
def preprocess_directory(directory, num_files=-1):
  preprocessed_documents = []
  ctr = 0
  for filename in os.listdir(directory):
    print('Preprocessing file: ', filename)
    file = os.path.join(directory, filename)
    preprocessed_documents.extend(preprocess(file))
    ctr += 1
    if ctr == num_files and num_files != -1:
      break
    
  print('preprocessed ', ctr, ' files')
  return preprocessed_documents

In [5]:
# Preprocess the collection
preprocessed_documents = preprocess_directory('AP_collection/coll')

Preprocessing file:  AP880212
Preprocessing file:  AP880213
Preprocessing file:  AP880214
Preprocessing file:  AP880215
Preprocessing file:  AP880216
Preprocessing file:  AP880217
Preprocessing file:  AP880218
Preprocessing file:  AP880219
Preprocessing file:  AP880220
Preprocessing file:  AP880221
Preprocessing file:  AP880222
Preprocessing file:  AP880223
Preprocessing file:  AP880224
Preprocessing file:  AP880225
Preprocessing file:  AP880226
Preprocessing file:  AP880227
Preprocessing file:  AP880228
Preprocessing file:  AP880229
Preprocessing file:  AP880301
Preprocessing file:  AP880302
Preprocessing file:  AP880303
Preprocessing file:  AP880304
Preprocessing file:  AP880307
Preprocessing file:  AP880308
Preprocessing file:  AP880309
Preprocessing file:  AP880310
Preprocessing file:  AP880311
Preprocessing file:  AP880312
Preprocessing file:  AP880313
Preprocessing file:  AP880314
Preprocessing file:  AP880315
Preprocessing file:  AP880316
Preprocessing file:  AP880317
Preprocess

In [6]:
len(preprocessed_documents)

79923

In [7]:
# function to extract the topics from the topics file
def extract_topics(file, descriptions=False):
  with open(file, "r") as f:
    topic_content = f.read()
  all_topics = []
  topics = re.findall(r'<top>(.*?)</top>', topic_content, re.DOTALL)
  for topic in topics:
    raw_title = re.search(r'<title>(.*?)\n\n', topic, re.DOTALL)
    title = raw_title.group(1) if raw_title else ''
    if descriptions:
      raw_desc = re.search(r'<desc>(.*?)\n\n', topic, re.DOTALL)
      desc = raw_desc.group(1) if raw_desc else ''
      all_topics.append({'title': title, 'description': desc})
    else:
      all_topics.append({'title': title})
  return all_topics

# Sentence transformer

In [8]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

True
0
NVIDIA GeForce GTX 1660 Ti


In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(f'sentence-transformers/{model_name}', device='cuda:0')

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)5dded/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 291kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 47.5kB/s]
Downloading (…)4d81d5dded/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 2.11MB/s]
Downloading (…)81d5dded/config.json: 100%|██████████| 573/573 [00:00<00:00, 126kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 30.9kB/s]
Downloading (…)ded/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 3.92MB/s]
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:02<00:00, 51.3MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 14.7kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 32.6kB/s]
Downloading (…)5dded/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 12.3MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 352/352 [00:00<00:00, 72.

In [10]:
import scipy

def search(query, model, preprocessed_documents, doc_embeddings, top_k=20):
  query_embeddings = model.encode([query])
  # compute distances
  distances = scipy.spatial.distance.cdist(query_embeddings, doc_embeddings, "cosine")[0]
  # get the top k results
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])
  # Create a list of tuples with the document number and the distance
  results = [(preprocessed_documents[idx].doc_no, distance) for idx, distance in results[0:top_k]]
  return results

In [20]:

import pickle
import gzip
import numpy as np

# Compute the embeddings
doc_embeddings = []
for x, doc in enumerate(preprocessed_documents):
  # Clear the cache
  torch.cuda.empty_cache()
  # Calculate embedding for each document
  print(f'Embedding {doc.doc_no.strip()} {x}/{len(preprocessed_documents)}...')
  doc_embed = model.encode(doc.doc_text, show_progress_bar=False)
  # write the document embedding to a file
  os.makedirs(f'embedding_saves/{model_name}', exist_ok=True)
  with open(f'embedding_saves/{model_name}/{doc.doc_no.strip()}.pickle', 'wb') as f:
    pickle.dump(doc_embed, f)
  doc_embeddings.append(doc_embed)

# Save the embeddings
doc_embeddings = np.array(doc_embeddings)

# store the embeddings in a pickle file
with open(f"embedding_saves/{model_name}.pickle", 'wb') as f:
  pickle.dump(doc_embeddings, f)
# gzip the pickle file
with open(f"embedding_saves/{model_name}.pickle", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.pickle.gz", 'wb') as f_out:
  f_out.writelines(f_in)

Embedding AP880212-0001 0/79923...
Embedding AP880212-0002 1/79923...
Embedding AP880212-0003 2/79923...
Embedding AP880212-0004 3/79923...
Embedding AP880212-0005 4/79923...
Embedding AP880212-0006 5/79923...
Embedding AP880212-0007 6/79923...
Embedding AP880212-0008 7/79923...
Embedding AP880212-0009 8/79923...
Embedding AP880212-0010 9/79923...
Embedding AP880212-0011 10/79923...
Embedding AP880212-0012 11/79923...
Embedding AP880212-0013 12/79923...
Embedding AP880212-0014 13/79923...
Embedding AP880212-0015 14/79923...
Embedding AP880212-0016 15/79923...
Embedding AP880212-0017 16/79923...
Embedding AP880212-0018 17/79923...
Embedding AP880212-0019 18/79923...
Embedding AP880212-0020 19/79923...
Embedding AP880212-0021 20/79923...
Embedding AP880212-0022 21/79923...
Embedding AP880212-0023 22/79923...
Embedding AP880212-0024 23/79923...
Embedding AP880212-0025 24/79923...
Embedding AP880212-0026 25/79923...
Embedding AP880212-0027 26/79923...
Embedding AP880212-0028 27/79923...
Em

KeyboardInterrupt: 

# Compressed CSV

In [12]:
# import csv
# import gzip
# import os

# # assuming you have a list of Document objects called documents
# # and assuming you have already populated the vector attribute of each Document object

# # define the headers for your CSV file
# headers = ['doc_no', 'vector']

# # open the CSV file in 'w' mode and write the headers
# with open(f"embedding_saves/{model_name}.csv", mode='w', newline='') as file:
#   writer = csv.writer(file)
#   writer.writerow(headers)

#   # loop through each Document object and write its attributes to the CSV file
#   for x, document in enumerate(preprocessed_documents):
#     writer.writerow([document.doc_no, doc_embeddings[x]])

# # gzip the CSV file
# with open(f"embedding_saves/{model_name}.csv", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.csv.gz", 'wb') as f_out:
#     f_out.writelines(f_in)

# os.remove(f"embedding_saves/{model_name}.csv")

# Retrieval

In [13]:
# Go through all the documents and search for the top 1000 results
def query_retrieve(model, preprocessed_documents, doc_embeddings, descriptions=False, runid='runid', filename='Results.txt', top_k=1000):
  # Extract the topics
  topics = extract_topics('topics1-50.txt', descriptions)

  file_out = open(filename, 'w')

  for i, topic in enumerate(topics):
    # Search for the documents
    results = search(topic['title'], model, preprocessed_documents, doc_embeddings, top_k)
    for j, (doc_id, distance) in enumerate(results):
      file_out.write(f'{i+1} Q0 {doc_id.strip()} {j+1} {1-distance} {runid}\n')
  file_out.close()
  print('Written results to file ', filename)

In [14]:
query_retrieve(model, preprocessed_documents, doc_embeddings, descriptions=False, runid='runid', filename=f'Results-{model_name}.txt', top_k=1000)

Written results to file  Results-all-MiniLM-L12-v2.txt


In [15]:
!Powershell.exe -Command ".\trec_eval qrels1-50ap.txt Results-{model_name}.txt"

runid                 	all	runid
num_q                 	all	50
num_ret               	all	50000
num_rel               	all	2099
num_rel_ret           	all	1144
map                   	all	0.1705
gm_map                	all	0.0644
Rprec                 	all	0.2044
bpref                 	all	0.2731
recip_rank            	all	0.5510
iprec_at_recall_0.00  	all	0.5854
iprec_at_recall_0.10  	all	0.3964
iprec_at_recall_0.20  	all	0.3209
iprec_at_recall_0.30  	all	0.2502
iprec_at_recall_0.40  	all	0.1879
iprec_at_recall_0.50  	all	0.1266
iprec_at_recall_0.60  	all	0.0794
iprec_at_recall_0.70  	all	0.0611
iprec_at_recall_0.80  	all	0.0429
iprec_at_recall_0.90  	all	0.0341
iprec_at_recall_1.00  	all	0.0202
P_5                   	all	0.3360
P_10                  	all	0.3040
P_15                  	all	0.2840
P_20                  	all	0.2520
P_30                  	all	0.2293
P_100                 	all	0.1248
P_200                 	all	0.0778
P_500                 	all	0.0391
P_1000                	a

In [16]:
from transformers import file_utils
print(file_utils.default_cache_path)

C:\Users\Howard/.cache\huggingface\hub
