# Preprocessing

In [2]:
# Functions and classes for preprocessing the data
import re
import os

In [3]:
class Document:
  def __init__(self, doc_no, doc_text, tokens):
    self.doc_no = doc_no
    self.doc_text = doc_text
    self.tokens = tokens

  def __str__(self):
    return 'Document Number: ' + self.doc_no + '\nDocument Text: ' + self.doc_text + '\nTokens: ' + str(self.tokens) + '\n'

  def to_dict(self):
    return {'docno': self.doc_no, 'doctext': self.doc_text, 'tokens': self.tokens, 'text': ' '.join(self.tokens)}

In [4]:
# function to perform preprocessing on the text
def preprocess(file):
  with open(file, "r") as f:
    content = f.read()
  documents = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
  preprocessed_documents = []
  for document in documents:
    # Get the document number and text
    raw_no = re.search(r'<DOCNO>(.*?)</DOCNO>', document, re.DOTALL)
    doc_no = raw_no.group(1) if raw_no else ''
    raw_text = re.search(r'<TEXT>(.*?)</TEXT>', document, re.DOTALL)
    doc_text = raw_text.string if raw_text else ''

    # create a document object
    doc = Document(doc_no, doc_text, [])
    preprocessed_documents.append(doc)
  return preprocessed_documents

# main function to preprocess a directory of text files
def preprocess_directory(directory, num_files=-1):
  preprocessed_documents = []
  ctr = 0
  for filename in os.listdir(directory):
    print('Preprocessing file: ', filename)
    file = os.path.join(directory, filename)
    preprocessed_documents.extend(preprocess(file))
    ctr += 1
    if ctr == num_files and num_files != -1:
      break
    
  print('preprocessed ', ctr, ' files')
  return preprocessed_documents

In [5]:
# Preprocess the collection
preprocessed_documents = preprocess_directory('AP_collection/coll')
preprocessed_documents.sort(key=lambda x: x.doc_no)

Preprocessing file:  AP880212
Preprocessing file:  AP880213
Preprocessing file:  AP880214
Preprocessing file:  AP880215
Preprocessing file:  AP880216
Preprocessing file:  AP880217
Preprocessing file:  AP880218
Preprocessing file:  AP880219
Preprocessing file:  AP880220
Preprocessing file:  AP880221
Preprocessing file:  AP880222
Preprocessing file:  AP880223
Preprocessing file:  AP880224
Preprocessing file:  AP880225
Preprocessing file:  AP880226
Preprocessing file:  AP880227
Preprocessing file:  AP880228
Preprocessing file:  AP880229
Preprocessing file:  AP880301
Preprocessing file:  AP880302
Preprocessing file:  AP880303
Preprocessing file:  AP880304
Preprocessing file:  AP880307
Preprocessing file:  AP880308
Preprocessing file:  AP880309
Preprocessing file:  AP880310
Preprocessing file:  AP880311
Preprocessing file:  AP880312
Preprocessing file:  AP880313
Preprocessing file:  AP880314
Preprocessing file:  AP880315
Preprocessing file:  AP880316
Preprocessing file:  AP880317
Preprocess

In [6]:
len(preprocessed_documents)

79923

In [7]:
# function to extract the topics from the topics file
def extract_topics(file, descriptions=False):
  with open(file, "r") as f:
    topic_content = f.read()
  all_topics = []
  topics = re.findall(r'<top>(.*?)</top>', topic_content, re.DOTALL)
  for topic in topics:
    raw_title = re.search(r'<title>(.*?)\n\n', topic, re.DOTALL)
    title = raw_title.group(1) if raw_title else ''
    if descriptions:
      raw_desc = re.search(r'<desc>(.*?)\n\n', topic, re.DOTALL)
      desc = raw_desc.group(1) if raw_desc else ''
      all_topics.append({'title': title, 'description': desc})
    else:
      all_topics.append({'title': title})
  return all_topics

# Sentence transformer

In [8]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

True
0
NVIDIA GeForce GTX 1660 Ti


In [10]:
from sentence_transformers import SentenceTransformer
# Set the model name
model_name='SGPT-125M-weightedmean-nli-bitfit'
model = SentenceTransformer(f'Muennighoff/{model_name}', device='cpu', cache_folder='./.cache')

In [11]:
import scipy

def search(n, query, model, preprocessed_documents, doc_embeddings, top_k=20):
  # Fetch the embeddings for the query if it exists, otherwise compute it
  if os.path.exists(f'embedding_saves/{model_name}/query-{n}.pickle'):
    query_embeddings = torch.load(f'embedding_saves/{model_name}/query-{n}.pickle')
  else:
    query_embeddings = model.encode([query])
    os.makedirs(f'embedding_saves/{model_name}', exist_ok=True)
    torch.save(query_embeddings, f'embedding_saves/{model_name}/query-{n}.pickle')
  # compute distances
  distances = scipy.spatial.distance.cdist(query_embeddings, doc_embeddings, "cosine")[0]
  # get the top k results
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])
  # Create a list of tuples with the document number and the distance
  results = [(preprocessed_documents[idx].doc_no, distance) for idx, distance in results[0:top_k]]
  return results

In [12]:
import pickle

# Compute the embeddings
for x, doc in enumerate(reversed(preprocessed_documents)):
  # Clear the cache
  torch.cuda.empty_cache()
  # skip the document if it exists, otherwise compute it
  if os.path.exists(f'embedding_saves/{model_name}/{doc.doc_no.strip()}.pickle'):
    continue
  else:
    os.makedirs(f'embedding_saves/{model_name}', exist_ok=True)
    # Calculate embedding for each document
    print(f'Embedding {doc.doc_no.strip()} {x}/{len(preprocessed_documents)}...')
    doc_embed = model.encode(doc.doc_text, show_progress_bar=False)
    # write the document embedding to a file
    with open(f'embedding_saves/{model_name}/{doc.doc_no.strip()}.pickle', 'wb') as f:
      pickle.dump(doc_embed, f)
print(f'Finished computing embeddings for {model_name}')

Embedding AP881231-0146 0/79923...
Embedding AP881231-0145 1/79923...
Embedding AP881231-0144 2/79923...
Embedding AP881231-0143 3/79923...
Embedding AP881231-0142 4/79923...
Embedding AP881231-0141 5/79923...
Embedding AP881231-0140 6/79923...
Embedding AP881231-0139 7/79923...
Embedding AP881231-0138 8/79923...
Embedding AP881231-0137 9/79923...
Embedding AP881231-0136 10/79923...
Embedding AP881231-0135 11/79923...
Embedding AP881231-0134 12/79923...
Embedding AP881231-0133 13/79923...
Embedding AP881231-0132 14/79923...
Embedding AP881231-0131 15/79923...
Embedding AP881231-0130 16/79923...
Embedding AP881231-0129 17/79923...
Embedding AP881231-0128 18/79923...
Embedding AP881231-0127 19/79923...
Embedding AP881231-0126 20/79923...
Embedding AP881231-0125 21/79923...
Embedding AP881231-0124 22/79923...
Embedding AP881231-0123 23/79923...
Embedding AP881231-0122 24/79923...
Embedding AP881231-0121 25/79923...
Embedding AP881231-0120 26/79923...
Embedding AP881231-0119 27/79923...
Em

KeyboardInterrupt: 

# Compressed CSV

In [None]:
# import csv
# import gzip
# import os

# # assuming you have a list of Document objects called documents
# # and assuming you have already populated the vector attribute of each Document object

# # define the headers for your CSV file
# headers = ['doc_no', 'vector']

# # open the CSV file in 'w' mode and write the headers
# with open(f"embedding_saves/{model_name}.csv", mode='w', newline='') as file:
#   writer = csv.writer(file)
#   writer.writerow(headers)

#   # loop through each Document object and write its attributes to the CSV file
#   for x, document in enumerate(preprocessed_documents):
#     writer.writerow([document.doc_no, doc_embeddings[x]])

# # gzip the CSV file
# with open(f"embedding_saves/{model_name}.csv", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.csv.gz", 'wb') as f_out:
#     f_out.writelines(f_in)

# os.remove(f"embedding_saves/{model_name}.csv")

# Retrieval

In [19]:
# Read all the embeddings from the files in the directory
doc_embeddings = []
for x, doc in enumerate(preprocessed_documents):
  if x % 1000 == 0:
    print(f'{x}/{len(preprocessed_documents)}')
  filename = doc.doc_no.strip()
  if os.path.exists(f'embedding_saves/{model_name}/{filename}.pickle'):
    # print(f'Loading embedding for {model_name}/{filename} {x}/{len(preprocessed_documents)}')
    with open(f'embedding_saves/{model_name}/{filename}.pickle', 'rb') as f:
      doc_embeddings.append(pickle.load(f))
  else:
    print(f'Embedding for {model_name}/{filename} doesn\'t exist {x}/{len(preprocessed_documents)}')

0/79923
1000/79923
2000/79923
3000/79923
4000/79923
5000/79923
6000/79923
7000/79923
8000/79923
9000/79923
10000/79923
11000/79923
12000/79923
13000/79923
14000/79923
15000/79923
16000/79923
17000/79923
18000/79923
19000/79923
20000/79923
21000/79923
22000/79923
23000/79923
24000/79923
25000/79923
26000/79923
27000/79923
28000/79923
29000/79923
30000/79923
31000/79923
32000/79923
33000/79923
34000/79923
35000/79923
36000/79923
37000/79923
38000/79923
39000/79923
40000/79923
41000/79923
42000/79923
43000/79923
44000/79923
45000/79923
46000/79923
47000/79923
48000/79923
49000/79923
50000/79923
51000/79923
52000/79923
53000/79923
54000/79923
55000/79923
56000/79923
57000/79923
58000/79923
59000/79923
60000/79923
61000/79923
62000/79923
63000/79923
64000/79923
65000/79923
66000/79923
67000/79923
68000/79923
69000/79923
70000/79923
71000/79923
72000/79923
73000/79923
74000/79923
75000/79923
76000/79923
77000/79923
78000/79923
79000/79923


In [20]:
import numpy as np
# Save the embeddings
doc_embeddings = np.array(doc_embeddings)

In [14]:
import gzip

print(f'Embeddings computed, saving to file: embedding_saves/{model_name}.pickle.gz')
# store the embeddings in a pickle file
with open(f"embedding_saves/{model_name}.pickle", 'wb') as f:
  pickle.dump(np.array(doc_embeddings), f)
# gzip the pickle file
with open(f"embedding_saves/{model_name}.pickle", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.pickle.gz", 'wb') as f_out:
  f_out.writelines(f_in)

Embeddings computed, saving to file: embedding_saves/gtr-t5-xxl.pickle.gz


In [21]:
# Go through all the documents and search for the top 1000 results
def query_retrieve(model, preprocessed_documents, doc_embeddings, descriptions=False, runid='runid', filename='Results.txt', top_k=1000):
  # Extract the topics
  topics = extract_topics('topics1-50.txt', descriptions)

  file_out = open(filename, 'w')

  for i, topic in enumerate(topics):
    print(f'Querying for topic {i+1}...')
    # Search for the documents
    results = search(i, topic['title'], model, preprocessed_documents, doc_embeddings, top_k)
    for j, (doc_id, distance) in enumerate(results):
      file_out.write(f'{i+1} Q0 {doc_id.strip()} {j+1} {1-distance} {runid}\n')
  file_out.close()
  print('Written results to file ', filename)

In [22]:
query_retrieve(model, preprocessed_documents, doc_embeddings, descriptions=False, runid='runid', filename=f'Results-{model_name}.txt', top_k=1000)

Querying for topic 1...
Querying for topic 2...
Querying for topic 3...
Querying for topic 4...
Querying for topic 5...
Querying for topic 6...
Querying for topic 7...
Querying for topic 8...
Querying for topic 9...
Querying for topic 10...
Querying for topic 11...
Querying for topic 12...
Querying for topic 13...
Querying for topic 14...
Querying for topic 15...
Querying for topic 16...
Querying for topic 17...
Querying for topic 18...
Querying for topic 19...
Querying for topic 20...
Querying for topic 21...
Querying for topic 22...
Querying for topic 23...
Querying for topic 24...
Querying for topic 25...
Querying for topic 26...
Querying for topic 27...
Querying for topic 28...
Querying for topic 29...
Querying for topic 30...
Querying for topic 31...
Querying for topic 32...
Querying for topic 33...
Querying for topic 34...
Querying for topic 35...
Querying for topic 36...
Querying for topic 37...
Querying for topic 38...
Querying for topic 39...
Querying for topic 40...
Querying 

In [23]:
!Powershell.exe -Command ".\trec_eval qrels1-50ap.txt Results-{model_name}.txt"

runid                 	all	runid
num_q                 	all	50
num_ret               	all	50000
num_rel               	all	2099
num_rel_ret           	all	1448
map                   	all	0.3039
gm_map                	all	0.1876
Rprec                 	all	0.3178
bpref                 	all	0.3828
recip_rank            	all	0.7410
iprec_at_recall_0.00  	all	0.7757
iprec_at_recall_0.10  	all	0.6300
iprec_at_recall_0.20  	all	0.5332
iprec_at_recall_0.30  	all	0.4262
iprec_at_recall_0.40  	all	0.3662
iprec_at_recall_0.50  	all	0.3129
iprec_at_recall_0.60  	all	0.2194
iprec_at_recall_0.70  	all	0.1435
iprec_at_recall_0.80  	all	0.0845
iprec_at_recall_0.90  	all	0.0438
iprec_at_recall_1.00  	all	0.0366
P_5                   	all	0.4920
P_10                  	all	0.4420
P_15                  	all	0.4107
P_20                  	all	0.3860
P_30                  	all	0.3320
P_100                 	all	0.1698
P_200                 	all	0.1019
P_500                 	all	0.0506
P_1000                	a

In [None]:
from transformers import file_utils
print(file_utils.default_cache_path)

C:\Users\Howard/.cache\huggingface\hub
