# Preprocessing

In [25]:
# Functions and classes for preprocessing the data
from itertools import chain
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
import re
import os
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
class Document:
  def __init__(self, doc_no, doc_text, tokens):
    self.doc_no = doc_no
    self.doc_text = doc_text
    self.tokens = tokens

  def __str__(self):
    return 'Document Number: ' + self.doc_no + '\nDocument Text: ' + self.doc_text + '\nTokens: ' + str(self.tokens) + '\n'

  def to_dict(self):
    return {'docno': self.doc_no, 'doctext': self.doc_text, 'tokens': self.tokens, 'text': ' '.join(self.tokens)}

In [27]:
# Get the stop words
def get_stop_words():
  stopwords = set()
  # Open the stop words and add them to the set
  with open('StopWords.txt', 'r') as file:
    for line in file:
      stopwords.add(line.strip())
  return stopwords


# load the stopwords
stop_words = get_stop_words()

# function to perform preprocessing on the text
def preprocess(file):
  with open(file, "r") as f:
    content = f.read()
  documents = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
  preprocessed_documents = []
  for document in documents:
    # Get the document number and text
    raw_no = re.search(r'<DOCNO>(.*?)</DOCNO>', document, re.DOTALL)
    doc_no = raw_no.group(1) if raw_no else ''
    raw_text = re.search(r'<TEXT>(.*?)</TEXT>', document, re.DOTALL)
    doc_text = raw_text.string if raw_text else ''

    # create a document object
    doc = Document(doc_no, doc_text, [])
    preprocessed_documents.append(doc)
  return preprocessed_documents

# function to preprocess a single text string
def preprocess_text(text: str, stem=True, stopwords=True):
    # lowercase the text
  text = text.lower()

  # tokenize the text
  tokens = word_tokenize(text)
  # remove stopwords
  if stopwords:
    tokens = [token for token in tokens if token not in stop_words]
  # stem the tokens
  if stem:
    # apply the porter stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
  # remove punctuation
  table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
  stripped = [w.translate(table) for w in tokens]
  stripped = list(chain(*[w.split() for w in stripped]))

  # remove empty tokens, stopwords (if applicable) and non-alphabetic tokens
  stripped = [
      token for token in stripped if token and (token not in stop_words if stopwords else True) and token.isalpha()]
  return stripped

# main function to preprocess a directory of text files
def preprocess_directory(directory, num_files=-1):
  preprocessed_documents = []
  ctr = 0
  for filename in os.listdir(directory):
    print('Preprocessing file: ', filename)
    file = os.path.join(directory, filename)
    preprocessed_documents.extend(preprocess(file))
    ctr += 1
    if ctr == num_files and num_files != -1:
      break
    
  print('preprocessed ', ctr, ' files')
  return preprocessed_documents

In [64]:
# Preprocess the collection
preprocessed_documents = preprocess_directory('AP_collection/coll')

Preprocessing file:  AP880212
Preprocessing file:  AP880213
Preprocessing file:  AP880214
Preprocessing file:  AP880215
Preprocessing file:  AP880216
Preprocessing file:  AP880217
Preprocessing file:  AP880218
Preprocessing file:  AP880219
Preprocessing file:  AP880220
Preprocessing file:  AP880221
Preprocessing file:  AP880222
Preprocessing file:  AP880223
Preprocessing file:  AP880224
Preprocessing file:  AP880225
Preprocessing file:  AP880226
Preprocessing file:  AP880227
Preprocessing file:  AP880228
Preprocessing file:  AP880229
Preprocessing file:  AP880301
Preprocessing file:  AP880302
Preprocessing file:  AP880303
Preprocessing file:  AP880304
Preprocessing file:  AP880307
Preprocessing file:  AP880308
Preprocessing file:  AP880309
Preprocessing file:  AP880310
Preprocessing file:  AP880311
Preprocessing file:  AP880312
Preprocessing file:  AP880313
Preprocessing file:  AP880314
Preprocessing file:  AP880315
Preprocessing file:  AP880316
Preprocessing file:  AP880317
Preprocess

In [29]:
len(preprocessed_documents)

79923

In [30]:
# function to extract the topics from the topics file
def extract_topics(file, descriptions=False):
  with open(file, "r") as f:
    topic_content = f.read()
  all_topics = []
  topics = re.findall(r'<top>(.*?)</top>', topic_content, re.DOTALL)
  for topic in topics:
    raw_title = re.search(r'<title>(.*?)\n\n', topic, re.DOTALL)
    title = raw_title.group(1) if raw_title else ''
    if descriptions:
      raw_desc = re.search(r'<desc>(.*?)\n\n', topic, re.DOTALL)
      desc = raw_desc.group(1) if raw_desc else ''
      all_topics.append({'title': title, 'description': desc})
    else:
      all_topics.append({'title': title})
  return all_topics

In [31]:
# Extract the topics
topics = extract_topics('topics1-50.txt', descriptions=True)

# Sentence transformer

In [45]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

True
0
NVIDIA GeForce GTX 1660 Ti


In [61]:
from sentence_transformers import SentenceTransformer
model_name='all-MiniLM-L6-v2'
model = SentenceTransformer(f'sentence-transformers/{model_name}', device='cuda:0')

In [68]:
import scipy

def search(query, model, preprocessed_documents, doc_embeddings, top_k=20):
  query_embeddings = model.encode([query])
  # compute distances
  distances = scipy.spatial.distance.cdist(query_embeddings, doc_embeddings, "cosine")[0]
  # get the top k results
  results = zip(range(len(distances)), distances)
  results = sorted(results, key=lambda x: x[1])
  # Create a list of tuples with the document number and the distance
  results = [(preprocessed_documents[idx].doc_no, distance) for idx, distance in results[0:top_k]]
  return results

In [76]:
# generate doc embeddings for each document in preprocessed_documents
doc_embeddings = model.encode([doc.doc_text for doc in preprocessed_documents])

# Compressed CSV

In [62]:
import csv
import gzip
import os

# assuming you have a list of Document objects called documents
# and assuming you have already populated the vector attribute of each Document object

# define the headers for your CSV file
headers = ['doc_no', 'vector']

# open the CSV file in 'w' mode and write the headers
with open(f"embedding_saves/{model_name}.csv", mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(headers)

  # loop through each Document object and write its attributes to the CSV file
  for x, document in enumerate(preprocessed_documents):
    writer.writerow([document.doc_no, doc_embeddings[x]])

# gzip the CSV file
with open(f"embedding_saves/{model_name}.csv", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.csv.gz", 'wb') as f_out:
    f_out.writelines(f_in)

os.remove(f"embedding_saves/{model_name}.csv")

In [80]:
# gzip the CSV file
with open(f"embedding_saves/{model_name}.csv", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.csv.gz", 'wb') as f_out:
    f_out.writelines(f_in)

os.remove(f"embedding_saves/{model_name}.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'embedding_saves/all-MiniLM-L6-v2.csv'

# Compressed Pickle

In [63]:
import pickle

# store the embeddings in a pickle file
with open(f"embedding_saves/{model_name}.pickle", 'wb') as f:
  pickle.dump(doc_embeddings, f)

# gzip the pickle file
with open(f"embedding_saves/{model_name}.pickle", 'rb') as f_in, gzip.open(f"embedding_saves/{model_name}.pickle.gz", 'wb') as f_out:
    f_out.writelines(f_in)

In [74]:
import numpy as np

# unzip the pickle file 
with gzip.open(f"embedding_saves/{model_name}.pickle.gz", 'rb') as f_in:
    doc_embeddings = pickle.load(f_in)

# Retrieval

In [75]:
# Go through all the documents and search for the top 1000 results
topic = topics[0]
print(search(topic['title'], model, preprocessed_documents, doc_embeddings, top_k=10))

[(' AP880414-0081 ', 0.4201424020644551), (' AP880827-0092 ', 0.4220602943873162), (' AP880704-0017 ', 0.431396290661078), (' AP880926-0180 ', 0.46072499905200504), (' AP880607-0210 ', 0.4607841600896331), (' AP881021-0218 ', 0.4746619157266405), (' AP880608-0082 ', 0.47973523180529054), (' AP881128-0234 ', 0.48361827396065116), (' AP881006-0202 ', 0.49856607139589115), (' AP881004-0180 ', 0.5088096331984095)]
