In [None]:
import pyterrier as pt
import pandas as pd
import os
import nltk
nltk.download('punkt')
from preprocessing import preprocess_directory

In [None]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell in the same kernel does not call init() a second time.
if not pt.started():
  pt.init()

In [None]:
def generate_index(collection_dir='AP_collection/coll', index_dir='./pd_index'):
  """Preprocess a document collection and build a Terrier index from it.

  Args:
    collection_dir: Directory handed to `preprocess_directory`. Defaults to
      the collection path this notebook has always used.
    index_dir: Directory the index is written to; it is converted to an
      absolute path before use. An index already present there is
      overwritten (`overwrite=True`).

  Returns:
    The value returned by `IterDictIndexer.index`, which the notebook passes
    on to `pt.BatchRetrieve`.
  """
  preprocessed_documents = preprocess_directory(collection_dir)

  # Feed the indexer one dict per document. The indexer consumes an iterable
  # of dicts, so there is no need to build an intermediate DataFrame and then
  # convert it back to records.
  records = (doc.to_dict() for doc in preprocessed_documents)

  pd_indexer = pt.IterDictIndexer(os.path.abspath(index_dir), overwrite=True)
  return pd_indexer.index(records)

In [None]:
# Load the index from disk when it is there, otherwise build it.
# The check targets data.properties (the file the load below actually opens)
# rather than the directory alone: a ./pd_index directory left behind by an
# interrupted indexing run would otherwise send us down the load path and fail.
index_properties = os.path.abspath('./pd_index/data.properties')
if os.path.exists(index_properties):
  indexref = pt.IndexFactory.of(index_properties)
else:
  indexref = generate_index()

In [None]:
# Retriever over the index, scoring with Terrier's "BM25" weighting model.
bm25 = pt.BatchRetrieve(
  indexref,
  wmodel="BM25",
)

In [None]:
# Run the query through the BM25 retriever (this searches the index; nothing
# is indexed here) and print a label followed by what the search returned.
result = bm25.search("Coping with overcrowded prisons")
print('BM25', result, sep='\n')

In [None]:
# Write the current `result` table to Results.txt, replacing any previous
# contents. The `with` block closes the file even if rendering or writing
# raises, which the manual open()/close() pair did not guarantee.
with open('Results.txt', 'w') as bm_file_out:
  bm_file_out.write(result.to_string())

In [None]:
# Build a second retriever over the same index using the "TF_IDF" weighting
# model, run the same query through it, and print a label plus the output.
# Note: `result` is rebound here, so after this cell it no longer holds the
# BM25 output.
tfidf = pt.BatchRetrieve(
  indexref,
  wmodel="TF_IDF",
)
result = tfidf.search("Coping with overcrowded prisons")
print('\nTF-IDF', result, sep='\n')