In [None]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install sentence_transformers

In [2]:
import wget
wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/government.zip", "government.zip")

'government.zip'

In [None]:
!unzip government.zip

In [51]:
# imports
# Put all your imports here
import whoosh
from whoosh import qparser, scoring, query, index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
from whoosh.analysis import Filter

import numpy as np
import nltk
from nltk.stem import *
nltk.download("wordnet")
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
wnLemm = WordNetLemmatizer()
from sentence_transformers import SentenceTransformer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import json
import pandas as pd

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract class which is inherited by other IR system
    """

    def __init__(self, data_dir):
        # DON'T change the following definitions for topic_file, qrels_file, document_dir, file_list
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        pass

    @abstractmethod
    def add_files(self):
        pass

    @abstractmethod
    def create_parser_searcher(self):
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        pass

    @staticmethod
    def post_process_score(score):
        return score

    @staticmethod
    def map(results):
      return pytrec_eval.compute_aggregated_measure('map', [query_measures['map'] for query_measures in results.values()])

    @staticmethod
    def print_trec_eval_result(results):

        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()]))


    def score(self,docnum,topic_results, topic_phrase):
        return topic_results.score(docnum)


    def print_rel_name(self, q_id):
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                 # get search result
                topic_results = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                for (docnum, result) in enumerate(topic_results):
                    score = self.score(docnum, topic_results, topic_phrase)
                    score = self.post_process_score(score)
                    print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    qrels = f_qrel.readlines()
                    for i in qrels:
                        qid, _, doc, rel = i.rstrip().split(" ")
                        if qid == q_id and rel == "1":
                            print(i.rstrip())

    def py_trec_eval(self):

        self.create_parser_searcher()
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

            # create an output file to which we'll write our results
        temp_output_file = tempfile.mkstemp()[1]
        with open(temp_output_file, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                # get search result
                topic_results = self.perform_search(topic_phrase)
                # format the result
                for (docnum, result) in enumerate(topic_results):
                    score = self.score(docnum, topic_results, topic_phrase)
                    outputTRECFile.write(
                        "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                    topic_with_result = topic_id


        with open(self.qrels_file, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(temp_output_file, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        #fill results dictionary with queries that were returned 0 documents
        topic_ids = {t.split()[0] for t in topics}
        for emptyresult_topicid in topic_ids.difference(set(results.keys())):
            num_rel = float(sum(qrel[emptyresult_topicid].values()))
            if num_rel>0:
              topic_stats={measure:0.0 for measure in results[topic_with_result]}
            else:
              topic_stats={measure:1.0 for measure in results[topic_with_result]}
            topic_stats["num_rel"]=num_rel
            topic_stats["num_ret"] = 0.0
            topic_stats["num_rel_ret"] = 0.0
            topic_stats["num_q"]=1.0

            results[emptyresult_topicid] = topic_stats

        self.print_trec_eval_result(results)

In [53]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

# Dont change this! Use it as-is in your code if you rerank your results using a non-Whoosh scoring function
class NeuralResults():
  '''
  This class is used to rerank documents returned by whoosh in an interface that
  imitates whoosh.searching.Results (the datatype of topicResults in pyTrecEval)
  '''
  def __init__(self, booleansearchdocs,scores,rankings, file_list):
    self.results=[]
    if rankings.shape:
      for idx in rankings:
        self.results.append({'file_path':file_list[booleansearchdocs[idx]],'score':scores[idx] })

  def score(self,docnum):
    return self.results[docnum]['score']

  def __iter__(self):
    return self.results.__iter__()

## Question 4

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

**6. If you are doing neural IR you should precompute your corpus embeddings and save them in the corpus_embeddings.json file. If you do this, please keep the code used to generate the embeddings somewhere in this notebook**

In [58]:
# Without Neural IR
# class IRQ4(IRSystem):
#     def create_index(self):
#         """
#         INPUT:
#             None
#         OUTPUT:
#             None

#         NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
#         """
#         # DON't change the name of 'index_sys'
#         self.index_sys = index.create_in(tempfile.mkdtemp(), Schema(file_path = ID(stored=True),
#                   file_content = TEXT(analyzer = RegexTokenizer() |
#                                       LowercaseFilter() |
#                                       IntraWordFilter() |
#                                       StopFilter() |
#                                       CustomFilter(LancasterStemmer().stem)
#                                       )))

#     def add_files(self):
#         """
#         INPUT:
#             None
#         OUTPUT:
#             None

#         NOTE: Add buffer to self.index_sys
#         """
#         writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
#         try:
#           # write each file to index
#           for docNum, filePath in enumerate(self.file_list):
#             with open(filePath, "r", encoding="utf-8") as f:
#               fileContent = f.read()

#               writer.add_document(file_path = filePath,
#                                   file_content = fileContent)

#               # print status every 1000 documents
#               if (docNum+1) % 1000 == 0:
#                 print("already indexed:", docNum+1)
#           print("done indexing.")

#         finally:
#           # close the index
#           writer.close()
#         self.create_parser_searcher()

#     def create_parser_searcher(self):
#         """
#         INPUT:
#             None
#         OUTPUT:
#             None

#         NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
#         """
#          # DON't change the names of 'query_parser' and 'searcher'
#         self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
#         self.searcher = self.index_sys.searcher(weighting=scoring.PL2(c=1.9))

#     def perform_search(self, topic_phrase):
#         """
#         INPUT:
#             topic_phrase: string
#         OUTPUT:
#             topicResults: whoosh.searching.Results

#         NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
#         """
#         topic_results = self.searcher.search(self.query_parser.parse(topic_phrase), limit=None)
#         return topic_results

In [24]:
#  RegexTokenizer() | LowercaseFilter()
"""
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.2365 -> 0.2547868766118194

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP: 0.2547868766118194


In [29]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter()
"""
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.2547868766118194 -> 0.2547868766118194
# Doesn't help, but doesn't hurt either...

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP: 0.2547868766118194


In [31]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter
"""
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.2547868766118194 -> 0.2743914120832279

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP: 0.2743914120832279


In [33]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter | CustomFilter(WordNetLemmatizer().lemmatize)
"""
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.2743914120832279 -> 0.3469207683432995

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP: 0.3469207683432995


In [35]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter | CustomFilter(CustomFilter(LancasterStemmer().stem))
"""
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.3469207683432995 -> 0.3587706588011549

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP: 0.3587706588011549


In [37]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter | CustomFilter(CustomFilter(LancasterStemmer().stem))
# Use OrGroup instead of default AndGroup
"""
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.3587706588011549 -> 0.3760774365417277

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP: 0.3760774365417277


In [60]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter | CustomFilter(CustomFilter(LancasterStemmer().stem))
# Use OrGroup instead of default AndGroup
# Grid search over B and K1 in BM25F
"""
q4 = IRQ4("government")
q4.add_files()
maxm = 0
for B in np.arange(0.1,1,0.1):
  for K1 in np.arange(0.1,3,0.1):
    q4.create_parser_searcher(B=B, K1=K1)
    results = q4.py_trec_eval(B=B, K1=K1)
    maxm = max(maxm, q4.map(results))
    print(f"MAP (B = {B}, K1 = {K1}): {q4.map(results)}")
print(maxm)
"""
# 0.3760774365417277 -> 0.38310537270905193

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP (B = 0.1, K1 = 0.1): 0.25932156153353814
MAP (B = 0.1, K1 = 0.2): 0.25880023835662674
MAP (B = 0.1, K1 = 0.30000000000000004): 0.2577014515306522
MAP (B = 0.1, K1 = 0.4): 0.2571598169047828
MAP (B = 0.1, K1 = 0.5): 0.25722719412699485
MAP (B = 0.1, K1 = 0.6): 0.2571869634287099
MAP (B = 0.1, K1 = 0.7000000000000001): 0.2574766127351131
MAP (B = 0.1, K1 = 0.8): 0.25748496882716787
MAP (B = 0.1, K1 = 0.9): 0.25765108609690296
MAP (B = 0.1, K1 = 1.0): 0.25774477931020245
MAP (B = 0.1, K1 = 1.1): 0.2578392656122613
MAP (B = 0.1, K1 = 1.2000000000000002): 0.25786447386501704
MAP (B = 0.1, K1 = 1.3000000000000003): 0.2579736737731006
MAP (B = 0.1, K1 = 1.4000000000000001): 0.25814587031696395
MAP (B = 0.1, K1 = 1.5000000000000002): 0.25820110699411397
MAP (B = 0.1, K1 = 1.6): 0.25743735243810223
MAP (B = 0.1, K1 = 1.7000000000000002): 0.2580636761485793
MAP (B = 0.1, K1 = 1.800000000000

In [49]:
#  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter | CustomFilter(CustomFilter(LancasterStemmer().stem))
# Use OrGroup instead of default AndGroup
# Use PL2 instead of deafult BM25F, grid search over c
"""
q4 = IRQ4("government")
q4.add_files()
for c in [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]:
  q4.create_parser_searcher(c=c)
  results = q4.py_trec_eval(c=c)
  print(f"MAP (c = {c}): {q4.map(results)}")
"""
# 0.38310537270905193 -> 0.3864733792647442

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
MAP (c = 1): 0.3521419993244009
MAP (c = 1.1): 0.35242763728779003
MAP (c = 1.2): 0.35310584146099494
MAP (c = 1.3): 0.35319962147766165
MAP (c = 1.4): 0.35327375422003726
MAP (c = 1.5): 0.3532027150743758
MAP (c = 1.6): 0.3861560076019423
MAP (c = 1.7): 0.3858800010203761
MAP (c = 1.8): 0.38575225309408706
MAP (c = 1.9): 0.3864733792647442
MAP (c = 2.0): 0.38645441227438065
MAP (c = 2.1): 0.3864578316266612
MAP (c = 2.2): 0.38633652208888325
MAP (c = 2.3): 0.38636719585709856
MAP (c = 2.4): 0.3864511014986677
MAP (c = 2.5): 0.38641212460913205
MAP (c = 2.6): 0.38608822814209676
MAP (c = 2.7): 0.3860351771770743
MAP (c = 2.8): 0.3857811994702276
MAP (c = 2.9): 0.38574605172488957
MAP (c = 3.0): 0.3857903696646098


In [59]:
class IRQ4(IRSystem):
    def __init__(self, data_dir):
        super(IRQ4, self).__init__(data_dir)
        self.query_modelname = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"
        self.doc_modelname = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.doc_model = SentenceTransformer(self.doc_modelname).to(self.device)
        self.query_model = SentenceTransformer(self.query_modelname).to(self.device)
        self.expand_query_bool = False
        self.expand_doc_bool = False
        self.neuralrerank=True

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        substitutions = (SubstitutionFilter(r"(?:[\w]*).(jpg|gif)", "") | SubstitutionFilter(r"therapy", "therap") | SubstitutionFilter(r"children", "child") |
                    SubstitutionFilter(r"nepo", "nuclear energy plant optimization") | SubstitutionFilter(r"osmre", "office surface mining reclamation enforcement") |
                    SubstitutionFilter(r"msha", "mine safety health administration") | SubstitutionFilter(r"cya", "california youth juvenile authority") |
                    SubstitutionFilter(r"ptb", "physical therapy board") | SubstitutionFilter(r"pt", "physical therapist") |
                    SubstitutionFilter(r"pcpfs", "president council physical fitness sports") | SubstitutionFilter(r"insc", "international nuclear safety center") |
                    SubstitutionFilter(r"ccfc", "california children family commission") | SubstitutionFilter(r"pswn", "public safety wireless network") |
                    SubstitutionFilter(r"swcgrl", "southwestern cotton ginning research laboratory") | SubstitutionFilter(r"ifccfbi", "internet fraud complaint center fbi") |
                    SubstitutionFilter(r"ndpo", "national domestic preparedness office") | SubstitutionFilter(r"fmshrc", "federal mine safety health review commission") |
                    SubstitutionFilter(r"php", "page emergency preparedness") | SubstitutionFilter(r"arp", "accident reduction program") |
                    SubstitutionFilter(r"usgs", "geological survey") | SubstitutionFilter(r"usdoj", "department of justice") |
                    SubstitutionFilter(r"ojjdp", "office juvenile justice delinquency prevention") | SubstitutionFilter(r"ptbc", "physical therapy board") |
                    SubstitutionFilter(r"ptb", "physical therapy board") | SubstitutionFilter(r"insc", "international nuclear safety center") |
                    SubstitutionFilter(r"doe", "department energy") |  SubstitutionFilter(r"ccfc", "canadian child care foundation") |
                    SubstitutionFilter(r"acyf", "administration children youth families") | SubstitutionFilter(r"acf", "administration children families") |
                    SubstitutionFilter(r"dhhs", "department health human services") | SubstitutionFilter(r"chssco", "california head start state collaboration office") |
                    SubstitutionFilter(r"nhsa", "national head start association") | SubstitutionFilter(r"naeyc", "national association education of young children") |
                    SubstitutionFilter(r"nccic", "national child care information center") | SubstitutionFilter(r"npwrc", "northern prairie wildlife research center") |
                    SubstitutionFilter(r"nas", "nonindigenous aquatic species") | SubstitutionFilter(r"insc", "international nuclear safety center") |
                    SubstitutionFilter(r"nisc", "national invasive species council") |
                    SubstitutionFilter(r"esa", "endangered species act") | SubstitutionFilter(r"noaa", "national oceanic atmospheric administration") |
                    SubstitutionFilter(r"usfws", "fish wildlife service") | SubstitutionFilter(r"nbii", "national biological information infrastructure") |
                    SubstitutionFilter(r"nasa", "national aeronautics space administration") | SubstitutionFilter(r"gcmd", "global change master directory") |
                    SubstitutionFilter(r"gcmd", "global change master directory") | SubstitutionFilter(r"afv", "alternative fuel vehicle") |
                    SubstitutionFilter(r"cdf", "california department forestry") | SubstitutionFilter("yes", "") | SubstitutionFilter("no", "")
        )
        self.index_sys = index.create_in(tempfile.mkdtemp(), Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer() |
                                      LowercaseFilter() |
                                      IntraWordFilter() |
                                      StopFilter() |
                                      substitutions |
                                      CustomFilter(LancasterStemmer().stem)
                                      )))

    def set_neural_rerank(self, val):
        self.neuralrerank=val

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        file_contents = []
        try:
          # write each file to index
          for docNum, filePath in enumerate(self.file_list):
            with open(filePath, "r", encoding="utf-8") as f:
              fileContent = f.read()
              if self.expand_doc_bool:
                  fileContent = self.expand_doc(fileContent)
              file_contents.append(fileContent)

              writer.add_document(file_path = filePath,
                                  file_content = fileContent)

              # print status every 1000 documents
              if (docNum+1) % 1000 == 0:
                print(f"already indexed: {docNum+1}")
          print("done indexing.")

        finally:
          # close the index
          writer.close()

        print("Computing Embeddings")
        if not os.path.exists(f'./corpus_embeddings.json'):
            class CustomDataset(Dataset):
                def __init__(self, data_dir):
                    self.document_dir = os.path.join(data_dir, "documents")
                    self.data = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

                def __len__(self):
                    return len(self.data)

                def __getitem__(self, idx):
                    # def reverse_document(document):
                    #     # Split the document into words
                    #     words = document.split()
                    #     # Reverse the list of words
                    #     reversed_words = words[::-1]
                    #     # Join the reversed words to form the reversed document
                    #     reversed_document = ' '.join(reversed_words)
                    #     return reversed_document
                    return open(self.data[idx], "r", encoding="utf-8").read()

            data_loader = DataLoader(CustomDataset("government"), batch_size=128, shuffle=False)
            embeddings=[]

            for data in data_loader:
                e=self.doc_model.encode(
                    data,
                    convert_to_tensor=True,
                    show_progress_bar=True,
                    normalize_embeddings=True
                )
                embeddings.append(e)
            self.corpus_embeddings = torch.cat(embeddings, axis=0)

            with open('corpus_embeddings.json', 'w') as json_file:
                dict_corpus_embeddings=dict(zip(self.file_list, self.corpus_embeddings.tolist()))
                json.dump(dict_corpus_embeddings, json_file)
        else:
            self.corpus_embeddings = torch.Tensor(pd.read_json('corpus_embeddings.json')[self.file_list].T.values)

        self.create_parser_searcher()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
        self.searcher = self.index_sys.searcher(weighting=scoring.PL2(c=1))

    # def expand_query1(self, query):
    #     nlp = spacy.load("en_core_web_sm")

    #     def get_synonyms(word):
    #         synonyms = set()
    #         for syn in wordnet.synsets(word):
    #             for lemma in syn.lemmas():
    #                 synonyms.add(lemma.name())
    #         return list(synonyms)

    #     doc = nlp(query)

    #     expanded_query = []

    #     # Loop through tokens in the query
    #     for token in doc:
    #         # Get synonyms for nouns, verbs, adjectives, and adverbs
    #         if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]:
    #             synonyms = get_synonyms(token.text)
    #             if synonyms:
    #                 expanded_query.extend(synonyms)

    #         # Add the original token to the expanded query
    #         expanded_query.append(token.text)

        # Join the expanded query back into a string
        # return " ".join(expanded_query)

    # def expand_query2(self, query, topn):
    #   word2vec_model = Word2Vec.load("path/to/word2vec/model")
    #   nlp = spacy.load("en_core_web_sm")
    #   query_tokens = [token.text for token in nlp(query.lower()) if not token.is_stop]

    #   expanded_query = query_tokens.copy()
    #   for token in query_tokens:
    #       similar_words = word2vec_model.wv.most_similar(token, topn=topn)
    #       for word, _ in similar_words:
    #           if word not in expanded_query:
    #               expanded_query.append(word)
    #   return expanded_query

    # def expand_query2(self, query, num_synonyms=1):
    #     # Tokenize the query
    #     tokens = query.split()

    #     # Initialize an empty list to store the expanded query terms
    #     expanded_terms = []

    #     # Iterate over each token in the query
    #     for token in tokens:
    #         # Add the original token to the expanded query
    #         expanded_terms.append(f"{token}")

    #         # Retrieve synonyms for the token from WordNet
    #         synonyms = set()
    #         for syn in wordnet.synsets(token):
    #             for lemma in syn.lemmas():
    #                 synonyms.add(lemma.name())

    #         # Add a limited number of synonyms to the expanded query with a weighted boost
    #         for synonym in list(synonyms)[:num_synonyms]:
    #             expanded_terms.append(f"{synonym}")

    #     # Combine the expanded terms into a single query string
    #     expanded_query = " ".join(expanded_terms)

    #     return expanded_query

    # def expand_doc(self, doc_text):
    #     tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
    #     model = T5ForConditionalGeneration.from_pretrained('castorini/doc2query-t5-base-msmarco')
    #     model.to(self.device)
    #     input_ids = tokenizer.encode(doc_text, truncation=True, return_tensors='pt').to(self.device)
    #     outputs = model.generate(
    #         input_ids=input_ids,
    #         max_length=64,
    #         do_sample=True,
    #         top_k=10,
    #         num_return_sequences=3)
    #     for i in range(3):
    #         doc_text += " " + tokenizer.decode(outputs[i], skip_special_tokens=True)

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        topic_results = self.searcher.search(self.query_parser.parse(topic_phrase), limit=None)
        if self.neuralrerank: #if we want to rerank the retrieved documents
            booleansearchdocs = list(topic_results.docs()) #get the retrieved docs
            if len(booleansearchdocs)<=1: #if 0 or 1 retrieved docs then reranking is not necessary
              return topic_results
            if self.expand_query_bool:
                topic_phrase = self.expand_query2(topic_phrase, 1)
            query_embedding = self.query_model.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True) #embed querry
            scores = torch.mm(query_embedding.unsqueeze(0).to(self.device), self.corpus_embeddings[booleansearchdocs].T.to(self.device)).squeeze() #compute dot product between querry embedding and document embeddings of the returned docs
            rankings = torch.argsort(scores,descending=True) #get order of scores by index
            return NeuralResults(booleansearchdocs,scores, rankings, self.file_list)
        else:
            return topic_results

In [8]:
"""
model_name = "sentence-transformers/all-mpnet-base-v2"
q4 = IRQ4("government", model_name, model_name, False, False)
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.5134497774735717

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

already indexed: 100
already indexed: 200
already indexed: 300
already indexed: 400
already indexed: 500
already indexed: 600
already indexed: 700
already indexed: 800
already indexed: 900
already indexed: 1000
already indexed: 1100
already indexed: 1200
already indexed: 1300
already indexed: 1400
already indexed: 1500
already indexed: 1600
already indexed: 1700
already indexed: 1800
already indexed: 1900
already indexed: 2000
already indexed: 2100
already indexed: 2200
already indexed: 2300
already indexed: 2400
already indexed: 2500
already indexed: 2600
already indexed: 2700
already indexed: 2800
already indexed: 2900
already indexed: 3000
already indexed: 3100
already indexed: 3200
already indexed: 3300
already indexed: 3400
already indexed: 3500
already indexed: 3600
already indexed: 3700
already indexed: 3800
already indexed: 3900
already indexed: 4000
done indexing.
Computing Embeddings


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MAP: 0.5134497774735717


In [17]:
"""
model_name = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"
q4 = IRQ4("government", model_name, model_name, False, False)
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# 0.5354344000219047

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
Computing Embeddings


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MAP: 0.5354344000219047


In [43]:
"""
model_name = "sentence-transformers/paraphrase-albert-small-v2"
q4 = IRQ4("government", model_name, model_name, False, False)
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# sad life

Downloading (…)f333f/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d423f333f/README.md:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading (…)423f333f/config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)f333f/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)23f333f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
Computing Embeddings


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MAP: 0.3734810971323267


In [21]:
# what if we reverse the document?
"""
model_name = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"
q4 = IRQ4("government", model_name, model_name, False, False)
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# ...Terrible idea

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
Computing Embeddings


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MAP: 0.3007016726193314


In [60]:
# add substitutions
model_name = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"
q4 = IRQ4("government")
q4.add_files()
results = q4.py_trec_eval()
# went up very slightly

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.
Computing Embeddings
num_q                    1       1.0000
num_ret                  1       405.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0778
gm_map                   1       -2.5538
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.1667
iprec_at_recall_0.00     1       0.1667
iprec_at_recall_0.10     1       0.1667
iprec_at_recall_0.20     1       0.1667
iprec_at_recall_0.30     1       0.0968
iprec_at_recall_0.40     1       0.0968
iprec_at_recall_0.50     1       0.0968
iprec_at_recall_0.60     1       0.0968
iprec_at_recall_0.70     1       0.0197
iprec_at_recall_0.80     1       0.0197
iprec_at_recall_0.90     1       0.0149
iprec_at_recall_1.00     1       0.0149
P_5                      1       0.0000
P_10                     1       

In [40]:
# class IRGPT(IRSystem):
#     def __init__(self, data_dir, modelname, expand_query, expand_doc):
#         from transformers import AutoTokenizer, AutoModel
#         super(IRGPT, self).__init__(data_dir)
#         self.modelname = modelname
#         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.tokenizer.pad_token = self.tokenizer.eos_token
#         self.model = AutoModel.from_pretrained(model_name).to(self.device)
#         self.expand_query_bool = expand_query
#         self.expand_doc_bool = expand_doc
#         self.neuralrerank=True

#     def create_index(self):
#         """
#         INPUT:
#             None
#         OUTPUT:
#             None

#         NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
#         """
#         # DON't change the name of 'index_sys'
#         substitutions = (SubstitutionFilter(r"(?:[\w]*).(jpg|gif)", "") | SubstitutionFilter(r"therapy", "therap") | SubstitutionFilter(r"children", "child") |
#                     SubstitutionFilter(r"nepo", "nuclear energy plant optimization") | SubstitutionFilter(r"osmre", "office surface mining reclamation enforcement") |
#                     SubstitutionFilter(r"msha", "mine safety health administration") | SubstitutionFilter(r"cya", "california youth juvenile authority") |
#                     SubstitutionFilter(r"ptb", "physical therapy board") | SubstitutionFilter(r"pt", "physical therapist") |
#                     SubstitutionFilter(r"pcpfs", "president council physical fitness sports") | SubstitutionFilter(r"insc", "international nuclear safety center") |
#                     SubstitutionFilter(r"ccfc", "california children family commission") | SubstitutionFilter(r"pswn", "public safety wireless network") |
#                     SubstitutionFilter(r"swcgrl", "southwestern cotton ginning research laboratory") | SubstitutionFilter(r"ifccfbi", "internet fraud complaint center fbi") |
#                     SubstitutionFilter(r"ndpo", "national domestic preparedness office") | SubstitutionFilter(r"fmshrc", "federal mine safety health review commission") |
#                     SubstitutionFilter(r"php", "page emergency preparedness") | SubstitutionFilter(r"arp", "accident reduction program") |
#                     SubstitutionFilter(r"usgs", "geological survey") | SubstitutionFilter(r"usdoj", "department of justice") |
#                     SubstitutionFilter(r"ojjdp", "office juvenile justice delinquency prevention") | SubstitutionFilter(r"ptbc", "physical therapy board") |
#                     SubstitutionFilter(r"ptb", "physical therapy board") | SubstitutionFilter(r"insc", "international nuclear safety center") |
#                     SubstitutionFilter(r"doe", "department energy") |  SubstitutionFilter(r"ccfc", "canadian child care foundation") |
#                     SubstitutionFilter(r"acyf", "administration children youth families") | SubstitutionFilter(r"acf", "administration children families") |
#                     SubstitutionFilter(r"dhhs", "department health human services") | SubstitutionFilter(r"chssco", "california head start state collaboration office") |
#                     SubstitutionFilter(r"nhsa", "national head start association") | SubstitutionFilter(r"naeyc", "national association education of young children") |
#                     SubstitutionFilter(r"nccic", "national child care information center") | SubstitutionFilter(r"npwrc", "northern prairie wildlife research center") |
#                     SubstitutionFilter(r"nas", "nonindigenous aquatic species") | SubstitutionFilter(r"insc", "international nuclear safety center") |
#                     SubstitutionFilter(r"nisc", "national invasive species council") |
#                     SubstitutionFilter(r"esa", "endangered species act") | SubstitutionFilter(r"noaa", "national oceanic atmospheric administration") |
#                     SubstitutionFilter(r"usfws", "fish wildlife service") | SubstitutionFilter(r"nbii", "national biological information infrastructure") |
#                     SubstitutionFilter(r"nasa", "national aeronautics space administration") | SubstitutionFilter(r"gcmd", "global change master directory") |
#                     SubstitutionFilter(r"gcmd", "global change master directory") | SubstitutionFilter(r"afv", "alternative fuel vehicle") |
#                     SubstitutionFilter(r"cdf", "california department forestry") | SubstitutionFilter("yes", "") | SubstitutionFilter("no", "")
#         )
#         self.index_sys = index.create_in(tempfile.mkdtemp(), Schema(file_path = ID(stored=True),
#                   file_content = TEXT(analyzer = RegexTokenizer() |
#                                       LowercaseFilter() |
#                                       IntraWordFilter() |
#                                       StopFilter() |
#                                       substitutions |
#                                       CustomFilter(LancasterStemmer().stem)
#                                       )))

#     def set_neural_rerank(self, val):
#         self.neuralrerank=val

#     def add_files(self):
#         """
#         INPUT:
#             None
#         OUTPUT:
#             None

#         NOTE: Add buffer to self.index_sys
#         """
#         writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
#         file_contents = []
#         try:
#           # write each file to index
#           for docNum, filePath in enumerate(self.file_list[:10]):
#             with open(filePath, "r", encoding="utf-8") as f:
#               fileContent = f.read()
#               if self.expand_doc_bool:
#                   fileContent = self.expand_doc(fileContent)
#               file_contents.append(fileContent)

#               writer.add_document(file_path = filePath,
#                                   file_content = fileContent)

#               # print status every 1000 documents
#               if (docNum+1) % 1000 == 0:
#                 print(f"already indexed: {docNum+1}")
#           print("done indexing.")

#         finally:
#           # close the index
#           writer.close()

#         print("Computing Embeddings")
#         if not os.path.exists(f'./corpus_embeddings.json') or True:
#             class CustomDataset(Dataset):
#                 def __init__(self, data_dir):
#                     self.document_dir = os.path.join(data_dir, "documents")
#                     self.data = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

#                 def __len__(self):
#                     return len(self.data)

#                 def __getitem__(self, idx):
#                     return open(self.data[idx], "r", encoding="utf-8").read()

#             data_loader = DataLoader(CustomDataset("government"), batch_size=128, shuffle=False)
#             embeddings=[]

#             for data in data_loader:
#                 tokens = self.tokenizer(data, return_tensors="pt", padding=True, truncation=True).to(self.device)

#                 e=self.model(
#                     **tokens,
#                 ).last_hidden_state
#                 embeddings.append(e)
#             self.corpus_embeddings = torch.cat(embeddings, axis=0)

#             # with open('corpus_embeddings.json', 'w') as json_file:
#             #     dict_corpus_embeddings=dict(zip(self.file_list, self.corpus_embeddings.tolist()))
#             #     json.dump(dict_corpus_embeddings, json_file)
#         else:
#             self.corpus_embeddings = torch.Tensor(pd.read_json('corpus_embeddings.json')[self.file_list].T.values)

#         self.create_parser_searcher()

#     def create_parser_searcher(self):
#         """
#         INPUT:
#             None
#         OUTPUT:
#             None

#         NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
#         """
#          # DON't change the names of 'query_parser' and 'searcher'
#         self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
#         self.searcher = self.index_sys.searcher(weighting=scoring.PL2(c=1))

#     # def expand_query1(self, query):
#     #     nlp = spacy.load("en_core_web_sm")

#     #     def get_synonyms(word):
#     #         synonyms = set()
#     #         for syn in wordnet.synsets(word):
#     #             for lemma in syn.lemmas():
#     #                 synonyms.add(lemma.name())
#     #         return list(synonyms)

#     #     doc = nlp(query)

#     #     expanded_query = []

#     #     # Loop through tokens in the query
#     #     for token in doc:
#     #         # Get synonyms for nouns, verbs, adjectives, and adverbs
#     #         if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]:
#     #             synonyms = get_synonyms(token.text)
#     #             if synonyms:
#     #                 expanded_query.extend(synonyms)

#     #         # Add the original token to the expanded query
#     #         expanded_query.append(token.text)

#         # Join the expanded query back into a string
#         # return " ".join(expanded_query)

#     # def expand_query2(self, query, topn):
#     #   word2vec_model = Word2Vec.load("path/to/word2vec/model")
#     #   nlp = spacy.load("en_core_web_sm")
#     #   query_tokens = [token.text for token in nlp(query.lower()) if not token.is_stop]

#     #   expanded_query = query_tokens.copy()
#     #   for token in query_tokens:
#     #       similar_words = word2vec_model.wv.most_similar(token, topn=topn)
#     #       for word, _ in similar_words:
#     #           if word not in expanded_query:
#     #               expanded_query.append(word)
#     #   return expanded_query

#     # def expand_query2(self, query, num_synonyms=1):
#     #     # Tokenize the query
#     #     tokens = query.split()

#     #     # Initialize an empty list to store the expanded query terms
#     #     expanded_terms = []

#     #     # Iterate over each token in the query
#     #     for token in tokens:
#     #         # Add the original token to the expanded query
#     #         expanded_terms.append(f"{token}")

#     #         # Retrieve synonyms for the token from WordNet
#     #         synonyms = set()
#     #         for syn in wordnet.synsets(token):
#     #             for lemma in syn.lemmas():
#     #                 synonyms.add(lemma.name())

#     #         # Add a limited number of synonyms to the expanded query with a weighted boost
#     #         for synonym in list(synonyms)[:num_synonyms]:
#     #             expanded_terms.append(f"{synonym}")

#     #     # Combine the expanded terms into a single query string
#     #     expanded_query = " ".join(expanded_terms)

#     #     return expanded_query

#     # def expand_doc(self, doc_text):
#     #     tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
#     #     model = T5ForConditionalGeneration.from_pretrained('castorini/doc2query-t5-base-msmarco')
#     #     model.to(self.device)
#     #     input_ids = tokenizer.encode(doc_text, truncation=True, return_tensors='pt').to(self.device)
#     #     outputs = model.generate(
#     #         input_ids=input_ids,
#     #         max_length=64,
#     #         do_sample=True,
#     #         top_k=10,
#     #         num_return_sequences=3)
#     #     for i in range(3):
#     #         doc_text += " " + tokenizer.decode(outputs[i], skip_special_tokens=True)

#     def perform_search(self, topic_phrase):
#         """
#         INPUT:
#             topic_phrase: string
#         OUTPUT:
#             topicResults: whoosh.searching.Results

#         NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
#         """
#         topic_results = self.searcher.search(self.query_parser.parse(topic_phrase), limit=None)
#         if self.neuralrerank: #if we want to rerank the retrieved documents
#             booleansearchdocs = list(topic_results.docs()) #get the retrieved docs
#             if len(booleansearchdocs)<=1: #if 0 or 1 retrieved docs then reranking is not necessary
#               return topic_results
#             if self.expand_query_bool:
#                 topic_phrase = self.expand_query2(topic_phrase, 1)
#             query_embedding = self.query_model.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True) #embed querry
#             scores = torch.mm(query_embedding.unsqueeze(0).to(self.device), self.corpus_embeddings[booleansearchdocs].T.to(self.device)).squeeze() #compute dot product between querry embedding and document embeddings of the returned docs
#             rankings = torch.argsort(scores,descending=True) #get order of scores by index
#             return NeuralResults(booleansearchdocs,scores, rankings, self.file_list)
#         else:
#             return topic_results

In [41]:
# try gpt2 embeddings
"""
model_name = "gpt2"
q4 = IRGPT("government", model_name, False, False)
q4.add_files()
results = q4.py_trec_eval()
print(f"MAP: {q4.map(results)}")
"""
# CUDA OOM

done indexing.
Computing Embeddings


OutOfMemoryError: ignored

### Please answer the following questions here
(a) A clear list of all final modifications made.  

Filters:
I used the following filters: \
1) LowercaseFilter: convert all words to lowercase \
2) StopFilter: Remove redundant stop words like and, but, etc \
3) IntrawordFilter: break words into subwords \
4) LancasterStemmer.stem: from nltk, will stem words to root form \
5) query parser grouping: qparser.AndGroup -> qparser.OrGroup \
6) scoring function: BM25F -> PL2 (c=1.9) \
7) substitutions to deal with acronyms - surprisingly, not that useful \
8) NeuralIR -> Use neural reranking using msmarco-MiniLM-L12-cos-v5 \

(b)  Why each modification was made – how did it help? \
LowercaseFilter: convert to lowercase to ignore case when matching terms \
StopFilter: words like and, but, etc don't add meaning to document \
IntrawordFilter: handle complex searches by breaking terms into sub-words that  are more common in the document \
LancastarStemmer.stem: terms that differ only by tense or plurality will be matched \
queryparser: If a document contains all but one of the terms in a query, AndGroup will ignore it. OrGroup won't, so it will have higher recall \
Scoring Function: BM25F, TFIDF, and PL2 are three options. I simply did an empirical grid search to determine the best scoring. Results dont vary by much \
Substitutions: I replaced acronyms with full words so that they are included in matching. to my surprise, this did not help that much. \
NeuralIR: neural method outperformed sparse matrix. I used msmarco-MiniLM-L12-cos-v5

(c)  The  final  MAP  performance  that  these  modifications  attained. \
0.5355413833015759

### Q4 Validation

In [58]:
q4 = IRQ4("government")
assert(isinstance(q4.index_sys, FileIndex)), "Index Type"
assert(isinstance(q4.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q4.searcher, Searcher)), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
