In [1]:
# Set the model name
model_name='gtr-t5-xxl'

# Preprocessing

In [2]:
# Functions and classes for preprocessing the data
import re
import os

In [3]:
class Document:
  def __init__(self, doc_no, doc_text, tokens):
    self.doc_no = doc_no
    self.doc_text = doc_text
    self.tokens = tokens

  def __str__(self):
    return 'Document Number: ' + self.doc_no + '\nDocument Text: ' + self.doc_text + '\nTokens: ' + str(self.tokens) + '\n'

  def to_dict(self):
    return {'docno': self.doc_no, 'doctext': self.doc_text, 'tokens': self.tokens, 'text': ' '.join(self.tokens)}

In [4]:
# function to perform preprocessing on the text
def preprocess(file):
  with open(file, "r") as f:
    content = f.read()
  documents = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
  preprocessed_documents = []
  for document in documents:
    # Get the document number and text
    raw_no = re.search(r'<DOCNO>(.*?)</DOCNO>', document, re.DOTALL)
    doc_no = raw_no.group(1) if raw_no else ''
    raw_text = re.search(r'<TEXT>(.*?)</TEXT>', document, re.DOTALL)
    doc_text = raw_text.string if raw_text else ''

    # create a document object
    doc = Document(doc_no, doc_text, [])
    preprocessed_documents.append(doc)
  return preprocessed_documents

# main function to preprocess a directory of text files
def preprocess_directory(directory, num_files=-1):
  preprocessed_documents = []
  ctr = 0
  for filename in os.listdir(directory):
    print('Preprocessing file: ', filename)
    file = os.path.join(directory, filename)
    preprocessed_documents.extend(preprocess(file))
    ctr += 1
    if ctr == num_files and num_files != -1:
      break
    
  print('preprocessed ', ctr, ' files')
  return preprocessed_documents

In [5]:
# Preprocess the collection
preprocessed_documents = preprocess_directory('AP_collection/coll')

Preprocessing file:  AP880212
Preprocessing file:  AP880213
Preprocessing file:  AP880214
Preprocessing file:  AP880215
Preprocessing file:  AP880216
Preprocessing file:  AP880217
Preprocessing file:  AP880218
Preprocessing file:  AP880219
Preprocessing file:  AP880220
Preprocessing file:  AP880221
Preprocessing file:  AP880222
Preprocessing file:  AP880223
Preprocessing file:  AP880224
Preprocessing file:  AP880225
Preprocessing file:  AP880226
Preprocessing file:  AP880227
Preprocessing file:  AP880228
Preprocessing file:  AP880229
Preprocessing file:  AP880301
Preprocessing file:  AP880302
Preprocessing file:  AP880303
Preprocessing file:  AP880304
Preprocessing file:  AP880307
Preprocessing file:  AP880308
Preprocessing file:  AP880309
Preprocessing file:  AP880310
Preprocessing file:  AP880311
Preprocessing file:  AP880312
Preprocessing file:  AP880313
Preprocessing file:  AP880314
Preprocessing file:  AP880315
Preprocessing file:  AP880316
Preprocessing file:  AP880317
Preprocess

In [6]:
len(preprocessed_documents)

79923

In [7]:
# function to extract the topics from the topics file
def extract_topics(file, descriptions=False):
  with open(file, "r") as f:
    topic_content = f.read()
  all_topics = []
  topics = re.findall(r'<top>(.*?)</top>', topic_content, re.DOTALL)
  for topic in topics:
    raw_title = re.search(r'<title>(.*?)\n\n', topic, re.DOTALL)
    title = raw_title.group(1) if raw_title else ''
    if descriptions:
      raw_desc = re.search(r'<desc>(.*?)\n\n', topic, re.DOTALL)
      desc = raw_desc.group(1) if raw_desc else ''
      all_topics.append({'title': title, 'description': desc})
    else:
      all_topics.append({'title': title})
  return all_topics

# Sentence transformer

In [8]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

True
0
NVIDIA GeForce GTX 1660 Ti


In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(f'sentence-transformers/{model_name}', device='cpu', cache_folder='./.cache')

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import scipy

def search(n, query, model, preprocessed_documents, doc_embeddings, top_k=20):
  # Fetch the embeddings for the query if it exists, otherwise compute it
  if os.path.exists(f'embedding_saves/{model_name}/query-{n}.pickle'):
    query_embeddings = torch.load(f'embedding_saves/{model_name}/query-{n}.pickle')
  else:
    query_embeddings = model.encode([query])
    os.makedirs(f'embedding_saves/{model_name}', exist_ok=True)
    torch.save(query_embeddings, f'embedding_saves/{model_name}/query-{n}.pickle')
  # compute distances
  distances = scipy.spatial.distance.cdist(query_embeddings, doc_embeddings, "cosine")[0]
  # get the top k results
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])
  # Create a list of tuples with the document number and the distance
  results = [(preprocessed_documents[idx].doc_no, distance) for idx, distance in results[0:top_k]]
  return results

In [12]:
import pickle

# Compute the embeddings
for x, doc in enumerate(reversed(preprocessed_documents)):
  # Clear the cache
  torch.cuda.empty_cache()
  # skip the document if it exists, otherwise compute it
  if os.path.exists(f'embedding_saves/{model_name}/{doc.doc_no.strip()}.pickle'):
    continue
  else:
    os.makedirs(f'embedding_saves/{model_name}', exist_ok=True)
    # Calculate embedding for each document
    print(f'Embedding {doc.doc_no.strip()} {x}/{len(preprocessed_documents)}...')
    doc_embed = model.encode(doc.doc_text, show_progress_bar=False)
    # write the document embedding to a file
    with open(f'embedding_saves/{model_name}/{doc.doc_no.strip()}.pickle', 'wb') as f:
      pickle.dump(doc_embed, f)

Embedding AP881231-0067 79/79923...
Embedding AP881231-0066 80/79923...
Embedding AP881231-0065 81/79923...
Embedding AP881231-0064 82/79923...
Embedding AP881231-0063 83/79923...
Embedding AP881231-0062 84/79923...
Embedding AP881231-0061 85/79923...
Embedding AP881231-0060 86/79923...
Embedding AP881231-0059 87/79923...
Embedding AP881231-0058 88/79923...
Embedding AP881231-0057 89/79923...
Embedding AP881231-0056 90/79923...
Embedding AP881231-0055 91/79923...
Embedding AP881231-0054 92/79923...
Embedding AP881231-0053 93/79923...
Embedding AP881231-0052 94/79923...
Embedding AP881231-0051 95/79923...
Embedding AP881231-0050 96/79923...
Embedding AP881231-0049 97/79923...
Embedding AP881231-0048 98/79923...
Embedding AP881231-0047 99/79923...
Embedding AP881231-0046 100/79923...
Embedding AP881231-0045 101/79923...
Embedding AP881231-0044 102/79923...
Embedding AP881231-0043 103/79923...
Embedding AP881231-0042 104/79923...
Embedding AP881231-0041 105/79923...
Embedding AP881231-004

KeyboardInterrupt: 

# Compressed CSV

In [None]:
# import csv
# import gzip
# import os

# # assuming you have a list of Document objects called documents
# # and assuming you have already populated the vector attribute of each Document object

# # define the headers for your CSV file
# headers = ['doc_no', 'vector']

# # open the CSV file in 'w' mode and write the headers
# with open(f"embedding_saves/{model_name}.csv", mode='w', newline='') as file:
#   writer = csv.writer(file)
#   writer.writerow(headers)

#   # loop through each Document object and write its attributes to the CSV file
#   for x, document in enumerate(preprocessed_documents):
#     writer.writerow([document.doc_no, doc_embeddings[x]])

# # gzip the CSV file
# with open(f"embedding_saves/{model_name}.csv", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.csv.gz", 'wb') as f_out:
#     f_out.writelines(f_in)

# os.remove(f"embedding_saves/{model_name}.csv")

# Retrieval

In [None]:
import numpy as np
# Read all the embeddings from the files in the directory
doc_embeddings = []
for filename in os.listdir(f'embedding_saves/{model_name}'):
  with open(f'embedding_saves/{model_name}/{filename}', 'rb') as f:
    doc_embeddings.append(pickle.load(f))

# Save the embeddings
doc_embeddings = np.array(doc_embeddings)

In [None]:
# Go through all the documents and search for the top 1000 results
def query_retrieve(model, preprocessed_documents, doc_embeddings, descriptions=False, runid='runid', filename='Results.txt', top_k=1000):
  # Extract the topics
  topics = extract_topics('topics1-50.txt', descriptions)

  file_out = open(filename, 'w')

  for i, topic in enumerate(topics):
    # Search for the documents
    results = search(i, topic['title'], model, preprocessed_documents, doc_embeddings, top_k)
    for j, (doc_id, distance) in enumerate(results):
      file_out.write(f'{i+1} Q0 {doc_id.strip()} {j+1} {1-distance} {runid}\n')
  file_out.close()
  print('Written results to file ', filename)

In [None]:
query_retrieve(model, preprocessed_documents, doc_embeddings, descriptions=False, runid='runid', filename=f'Results-{model_name}.txt', top_k=1000)

Written results to file  Results-paraphrase-albert-small-v2.txt


In [None]:
!Powershell.exe -Command ".\trec_eval qrels1-50ap.txt Results-{model_name}.txt"

runid                 	all	runid
num_q                 	all	50
num_ret               	all	50000
num_rel               	all	2099
num_rel_ret           	all	566
map                   	all	0.0698
gm_map                	all	0.0110
Rprec                 	all	0.0884
bpref                 	all	0.1611
recip_rank            	all	0.4364
iprec_at_recall_0.00  	all	0.4567
iprec_at_recall_0.10  	all	0.1817
iprec_at_recall_0.20  	all	0.0991
iprec_at_recall_0.30  	all	0.0659
iprec_at_recall_0.40  	all	0.0512
iprec_at_recall_0.50  	all	0.0402
iprec_at_recall_0.60  	all	0.0276
iprec_at_recall_0.70  	all	0.0207
iprec_at_recall_0.80  	all	0.0164
iprec_at_recall_0.90  	all	0.0144
iprec_at_recall_1.00  	all	0.0124
P_5                   	all	0.2160
P_10                  	all	0.1700
P_15                  	all	0.1453
P_20                  	all	0.1270
P_30                  	all	0.1007
P_100                 	all	0.0508
P_200                 	all	0.0340
P_500                 	all	0.0179
P_1000                	al

In [None]:
from transformers import file_utils
print(file_utils.default_cache_path)

C:\Users\Howard/.cache\huggingface\hub
