In [1]:
import importlib


In [2]:
import pyterrier as pt
import pandas as pd
import os
import nltk
nltk.download('punkt')

from preprocessing import preprocess_directory, extract_topics, preprocess_text
pd.set_option('display.max_rows', None)
from trectools import TrecTopics, TrecTerrier, TrecIndri




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Start the PyTerrier backend only if it has not been started yet,
# so that re-running this cell in the same kernel does not call pt.init() twice.
if not pt.started():
  pt.init()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [4]:
# Function to generate the index
def generate_index(collection_dir='AP_collection/coll', index_dir='./pd_index'):
  """Preprocess a document collection and build a Terrier index from it.

  Args:
    collection_dir: Directory handed to ``preprocess_directory``. Defaults to
      the AP collection used by this notebook.
    index_dir: Directory the index is written to. It is resolved to an
      absolute path, and the indexer is created with ``overwrite=True``.

  Returns:
    The value returned by ``IterDictIndexer.index`` for the new index.
  """
  # Materialise the records once so they can be counted and reused; printing
  # the full collection would flood the notebook output.
  records = [doc.to_dict() for doc in preprocess_directory(collection_dir)]
  print(f"Preprocessed {len(records)} documents from {collection_dir}")

  # Create a dataframe from the preprocessed documents
  df = pd.DataFrame.from_records(records)

  # Create a Terrier index from the dataframe
  pd_indexer = pt.IterDictIndexer(os.path.abspath(index_dir), overwrite=True)
  indexref = pd_indexer.index(df.to_dict(orient='records'))

  return indexref

In [5]:
# Load the index if it exists on disk, otherwise build it.
# The check targets data.properties (the file that is actually loaded) rather
# than the directory: a directory left behind by an interrupted build would
# otherwise make the load fail instead of triggering a rebuild.
index_properties = os.path.abspath('./pd_index/data.properties')
if os.path.exists(index_properties):
  indexref = pt.IndexFactory.of(index_properties)
else:
  indexref = generate_index()

In [6]:
# Create a BM25 retrieval model over the index referenced by `indexref`
# (wmodel selects the weighting model by name).
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")


In [7]:
# result

## Testing - Inverted Index


In [8]:
# inverted_index = indexref.getInvertedIndex()
# meta = indexref.getMetaIndex()
# inv = indexref.getInvertedIndex()
# lex = indexref.getLexicon() # http://terrier.org/docs/current/javadoc/org/terrier/structures/Lexicon.html
# doi = indexref.getDocumentIndex()

# print(inverted_index.toString())
# # FIND FREQUENCY OF TERM IN INVERTED INDEX
# # le = lex.getLexiconEntry( "nation" ) # http://terrier.org/docs/current/javadoc/org/terrier/structures/LexiconEntry.html
# # # the lexicon entry is also our pointer to access the inverted index posting list
# # for posting in inv.getPostings( le ):
# #     docno = meta.getItem("docno", posting.getId())
# #     print("%s with frequency %d " % (docno, posting.getFrequency()))

# # Get all the terms (aka lexicon entries)
# print(lex.numberOfEntries()) # 148709
# for i in range(0, 148708):
#     currLexiconEntryKey = lex.getIthLexiconEntry(i).getKey()
#     print(currLexiconEntryKey)
#     print(lex.getLexiconEntry(currLexiconEntryKey).toString())
#     print(lex.getLexiconEntry(currLexiconEntryKey).getNumberOfEntries())


In [9]:
# pointer = indexref.getLexicon()["document"]
# for posting in inverted_index.getPostings(pointer):
#     print(posting.toString() + " doclen=%d" % posting.getDocumentLength())

In [10]:
# Use the BM25 model to retrieve documents for a single sample query
# (this searches the existing index; nothing is indexed here).
result = bm25.search("Coping with overcrowded prisons")
print('BM25')
print(result)

BM25
    qid  docid          docno  rank      score  \
0     1   6513  AP880310-0051     0  22.712942   
1     1  25291  AP880519-0231     1  22.505621   
2     1  58415  AP881002-0014     2  21.529917   
3     1  78680  AP881225-0044     3  21.182791   
4     1  59135  AP881005-0001     4  20.994120   
5     1  63552  AP881021-0218     5  20.908434   
6     1  67730  AP881108-0076     6  20.707569   
7     1  78305  AP881223-0053     7  20.458164   
8     1  49748  AP880825-0054     8  20.425937   
9     1  56027  AP880921-0032     9  19.860570   
10    1  47053  AP880815-0061    10  19.835075   
11    1  56929  AP880926-0180    11  19.721177   
12    1  73364  AP881202-0169    12  18.618829   
13    1  72481  AP881129-0144    13  18.580205   
14    1  42690  AP880726-0173    14  18.295868   
15    1  41097  AP880720-0133    15  17.920571   
16    1  50813  AP880830-0013    16  17.542677   
17    1  47481  AP880816-0174    17  17.365506   
18    1  26872  AP880525-0245    18  17.17885

In [11]:
#print file out to Results 


In [12]:
# # Use the TF-IDF retrieval model to search
# tfidf = pt.BatchRetrieve(indexref, wmodel="TF_IDF")
# result = tfidf.search("Coping with overcrowded prisons")
# print('\nTF-IDF')
# print(result)


In [13]:
# Sanity check: see how preprocessing handles a topic title that contains a
# slash and mixed case before using it to build queries.
preprocess_text("Cost of Garbage/Trash Removal")

'cost garbage trash remov'

In [14]:
# result_row = result.iloc[0]
# display(result_row)
# display(result_row['docno'])
# print(result_row['qid'] + " " + "Q0 " + result_row['docno'] + " " + str(result_row['rank']+1) + " " + str(result_row['score']) + " " + "runid")

topics = extract_topics("topics1-50.txt")

# Run every topic through BM25 and write the results in TREC run format:
#   <qid> Q0 <docno> <rank> <score> <runid>
# The query id is the 1-based position of the topic in `topics`, and the
# retrieved rank is shifted by one so that ranks in the file start at 1.
# The `with` block guarantees the file is flushed and closed even if
# preprocessing or retrieval raises part-way through the topics.
with open('Results.txt', 'w') as bm_file_out:
    for i, topic in enumerate(topics):
        print("topic " + str(i) +": " + topic)
        curr_result = bm25.search(preprocess_text(str(topic)))
        # A topic with no results (e.g. an empty query) simply writes no lines.
        for row in curr_result.itertuples(index=False):
            bm_file_out.write(f"{i + 1} Q0 {row.docno} {row.rank + 1} {row.score} runid\n")


topic 0: Coping with overcrowded prisons
topic 1: Accusations of Cheating by Contractors on U.S. Defense Projects
topic 2: Insurance Coverage which pays for Long Term Care
topic 3: Oil Spills
topic 4: Right Wing Christian Fundamentalism in U.S. 
topic 5: Efforts to enact Gun Control Legislation 
topic 6: Causes and treatments of multiple sclerosis (MS) 
topic 7: Term limitations for members of the U.S. Congress 
topic 8: Electric Car Development 
topic 9: Vitamins - The Cure for or Cause of
topic 10: Acid Rain  
topic 11: Automobile Recalls 
topic 12: Vietnam Veterans and Agent Orange 
topic 13: Generic Drugs - Illegal Activities by Manufacturers 
topic 14: Tobacco company advertising and the young 
topic 15: Standardized testing and cultural bias 
topic 16:  Topic: Regulation of the showing of violence and explicit
topic 17: Financing AMTRAK 
topic 18: Cost of Garbage/Trash Removal 
topic 19: The Consequences of Implantation of Silicone Gel 
topic 20: Use of Mutual Funds in an Individ

In [15]:
# Probe how retrieval reacts to an empty query string; the recorded output
# below shows a "Skipping empty query" warning and a frame with no rows.
bm25.search("")

  warn("Skipping empty query for qid %s" % qid)


Unnamed: 0,docid,docno,rank,score,qid,query
