In [6]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB

In [2]:
import wget
wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/government.zip", "government.zip")

'government.zip'

In [3]:
!unzip government.zip

Archive:  government.zip
   creating: government/
  inflating: government/topics-with-full-descriptions.txt  
  inflating: government/gov.topics   
  inflating: government/gov.qrels    
   creating: government/documents/
   creating: government/documents/61/
  inflating: government/documents/61/G00-61-2800209  
  inflating: government/documents/61/G00-61-1192048  
  inflating: government/documents/61/G00-61-1118212  
  inflating: government/documents/61/G00-61-0749882  
  inflating: government/documents/61/G00-61-2230501  
  inflating: government/documents/61/G00-61-0680698  
  inflating: government/documents/61/G00-61-0551387  
  inflating: government/documents/61/G00-61-2575433  
  inflating: government/documents/61/G00-61-0469713  
  inflating: government/documents/61/G00-61-0280746  
  inflating: government/documents/61/G00-61-2574316  
  inflating: government/documents/61/G00-61-3933997  
  inflating: government/documents/61/G00-61-3290635  
  inflating: government/documents/61/G0

In [28]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
import glob
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.stem import *
import nltk
from sentence_transformers import SentenceTransformer
import torch
import json
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher

In [8]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract class which is inherited by other IR system
    """

    def __init__(self, data_dir):
        # DON'T change the following definitions for topic_file, qrels_file, document_dir, file_list
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        pass

    @abstractmethod
    def add_files(self):
        pass

    @abstractmethod
    def create_parser_searcher(self):
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        pass

    @staticmethod
    def post_process_score(score):
        return score

    @staticmethod
    def print_trec_eval_result(results):

        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()]))


    def score(self,docnum,topic_results, topic_phrase):
        return topic_results.score(docnum)


    def print_rel_name(self, q_id):
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                 # get search result
                topic_results = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                for (docnum, result) in enumerate(topic_results):
                    score = self.score(docnum, topic_results, topic_phrase)
                    score = self.post_process_score(score)
                    print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    qrels = f_qrel.readlines()
                    for i in qrels:
                        qid, _, doc, rel = i.rstrip().split(" ")
                        if qid == q_id and rel == "1":
                            print(i.rstrip())

    def py_trec_eval(self):

        self.create_parser_searcher()
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

            # create an output file to which we'll write our results
        temp_output_file = tempfile.mkstemp()[1]
        with open(temp_output_file, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                # get search result
                topic_results = self.perform_search(topic_phrase)
                # format the result
                for (docnum, result) in enumerate(topic_results):
                    score = self.score(docnum, topic_results, topic_phrase)
                    outputTRECFile.write(
                        "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                    topic_with_result = topic_id


        with open(self.qrels_file, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(temp_output_file, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        #fill results dictionary with queries that were returned 0 documents
        topic_ids = {t.split()[0] for t in topics}
        for emptyresult_topicid in topic_ids.difference(set(results.keys())):
            num_rel = float(sum(qrel[emptyresult_topicid].values()))
            if num_rel>0:
              topic_stats={measure:0.0 for measure in results[topic_with_result]}
            else:
              topic_stats={measure:1.0 for measure in results[topic_with_result]}
            topic_stats["num_rel"]=num_rel
            topic_stats["num_ret"] = 0.0
            topic_stats["num_rel_ret"] = 0.0
            topic_stats["num_q"]=1.0

            results[emptyresult_topicid] = topic_stats


        self.print_trec_eval_result(results)

In [9]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

# Dont change this! Use it as-is in your code if you rerank your results using a non-Whoosh scoring function
class NeuralResults():
  '''
  This class is used to rerank documents returned by whoosh in an interface that
  imitates whoosh.searching.Results (the datatype of topicResults in pyTrecEval)
  '''
  def __init__(self, booleansearchdocs,scores,rankings, file_list):
    self.results=[]
    if rankings.shape:
      for idx in rankings:
        self.results.append({'file_path':file_list[booleansearchdocs[idx]],'score':scores[idx] })

  def score(self,docnum):
    return self.results[docnum]['score']

  def __iter__(self):
    return self.results.__iter__()

## Question 4

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

**6. If you are doing neural IR you should precompute your corpus embeddings and save them in the corpus_embeddings.json file. If you do this, please keep the code used to generate the embeddings somewhere in this notebook**

Attemp 1: Query Expansion

In [15]:
def expand_query(query):
    new_query = []
    for word_pair in query:
        root, word, path = word_pair

        synonyms = set()
        antonyms = set()
        hypernyms = set()
        hyponyms = set()
        for syns in wn.synsets(word):
            hypernyms.update([i.name().split('.')[0] for i in syns.hypernyms()])
            hyponyms.update([i.name().split('.')[0] for i in syns.hyponyms()])
            for l in syns.lemmas():
                synonyms.add(l.name())
                for a in l.antonyms():
                    antonyms.add(a.name())

        for new_path, wordset in zip(('s','a','e','o'),(synonyms, antonyms, hypernyms, hyponyms)):
            for new_word in wordset:
                if new_word != word:
                    new_query.append((root, new_word, path + new_path))
    return new_query

In [16]:
def string_query(query, weights, depth):
    queries = [[(i,i,'') for i in query.split()]]
    for i in range(depth):
        queries.append(expand_query(queries[-1]))

    new_query = dict()
    ops_per_root = dict()
    for q in queries[0]:
        new_query[q[0]] = []
        ops_per_root[q[0]] = dict.fromkeys(weights.keys(),0)

    for level in queries[:3]:
        for q in level:
            for op in q[2]:
                ops_per_root[q[0]][op] += 1

    for level in queries[:3]:
        for q in level:
            weight = 1
            for op in q[2]:
                weight *= weights[op]/ops_per_root[q[0]][op]
            new_query[q[0]].append(q[1]+'^'+str(weight))
    return 'AND'.join([' OR '.join(new_query[i]) for i in new_query.keys()])

In [17]:
class IRQ4_QE(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        nltk.download('wordnet')
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter())) #| IntraWordFilter() | StopFilter() | StemFilter() | CustomFilter(WordNetLemmatizer().lemmatize)))
        self.index_sys = index.create_in(tempfile.mkdtemp(), schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        try:
            for f_path in self.file_list:
                if os.path.isfile(f_path):
                    with open(f_path, "r", encoding="utf-8") as f:
                        writer.add_document(file_path = f_path, file_content = f.read())
        finally:
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        params = {'s':0.2,'a':0,'e':0,'o':0.2}
        depth = 1
        query = self.query_parser.parse(string_query(topic_phrase, params, depth))
        topic_results = self.searcher.search(query, limit=None)
        return topic_results

In [18]:
q4_qe = IRQ4_QE("government")
q4_qe.add_files()
q4_qe.py_trec_eval()

[nltk_data] Downloading package wordnet to /root/nltk_data...


num_q                    1       1.0000
num_ret                  1       802.0000
num_rel                  1       5.0000
num_rel_ret              1       3.0000
map                      1       0.0564
gm_map                   1       -2.8756
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0667
iprec_at_recall_0.00     1       0.1200
iprec_at_recall_0.10     1       0.1200
iprec_at_recall_0.20     1       0.1200
iprec_at_recall_0.30     1       0.1200
iprec_at_recall_0.40     1       0.1200
iprec_at_recall_0.50     1       0.1200
iprec_at_recall_0.60     1       0.1200
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0667
P_20                     1       0.0500
P_30                     1       0.10

Attempt 2: Neural IR

In [19]:
#Code to get embeddings
'''model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)
files_content = []
for f_path in glob.glob("government/documents/**/*"):
    if os.path.isfile(f_path):
        with open(f_path, "r", encoding="utf-8") as f:
            file_content = f.read()
            files_content.append(file_content)

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
corpus_embeddings=model.encode(
                                    files_content,
                                    convert_to_tensor=True,
                                    show_progress_bar=True,
                                    normalize_embeddings=True
                                )
doc_ids = [str(os.path.basename(filePath)) for filePath in Path(os.path.join('government', "documents")).glob("**/*") if filePath.is_file()]
dict_corpus_embeddings=dict(zip(doc_ids,corpus_embeddings.tolist()))
json.dump(dict_corpus_embeddings,open('corpus_embeddings.json','w'))'''

'model_name = "sentence-transformers/all-mpnet-base-v2"\nmodel = SentenceTransformer(model_name)\nfiles_content = []\nfor f_path in glob.glob("government/documents/**/*"):\n    if os.path.isfile(f_path):\n        with open(f_path, "r", encoding="utf-8") as f:\n            file_content = f.read()\n            files_content.append(file_content)\n    \nmodel = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")\ncorpus_embeddings=model.encode(\n                                    files_content,\n                                    convert_to_tensor=True,\n                                    show_progress_bar=True,\n                                    normalize_embeddings=True\n                                )\ndoc_ids = [str(os.path.basename(filePath)) for filePath in Path(os.path.join(\'government\', "documents")).glob("**/*") if filePath.is_file()]\ndict_corpus_embeddings=dict(zip(doc_ids,corpus_embeddings.tolist()))\njson.dump(dict_corpus_embeddings,open(\'corpus_embeddings

In [20]:
class IRQ4_Neural(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = RegexTokenizer()))
        self.index_sys = index.create_in(tempfile.mkdtemp(), schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        doc_ids = [str(os.path.basename(filePath)) for filePath in self.file_list]
        self.corpus_embeddings = torch.Tensor(pd.read_json('corpus_embeddings.json')[doc_ids].T.values)

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query_embedding = self.model.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True)
        scores = torch.mm(query_embedding.unsqueeze(0), self.corpus_embeddings.T).squeeze()
        all_docs = [i for i in range(len(self.corpus_embeddings))]
        rankings = torch.argsort(scores,descending=True)
        return NeuralResults(all_docs, scores, rankings, self.file_list)

In [21]:
q4_neural = IRQ4_Neural("government")
q4_neural.add_files()
q4_neural.py_trec_eval()

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

num_q                    1       1.0000
num_ret                  1       4078.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.1215
gm_map                   1       -2.1080
Rprec                    1       0.2000
bpref                    1       0.0800
recip_rank               1       0.2500
iprec_at_recall_0.00     1       0.2500
iprec_at_recall_0.10     1       0.2500
iprec_at_recall_0.20     1       0.2500
iprec_at_recall_0.30     1       0.1176
iprec_at_recall_0.40     1       0.1176
iprec_at_recall_0.50     1       0.1071
iprec_at_recall_0.60     1       0.1071
iprec_at_recall_0.70     1       0.0800
iprec_at_recall_0.80     1       0.0800
iprec_at_recall_0.90     1       0.0526
iprec_at_recall_1.00     1       0.0526
P_5                      1       0.2000
P_10                     1       0.1000
P_15                     1       0.0667
P_20                     1       0.1000
P_30                     1       0.1

Attempt 3: Weighted Average of Neural IR and Whoosh scores

In [22]:
class IRQ4_Neural_Mix(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter() | IntraWordFilter() | CustomFilter(WordNetLemmatizer().lemmatize))) #| IntraWordFilter() | StopFilter() | StemFilter() | CustomFilter(WordNetLemmatizer().lemmatize)))
        self.index_sys = index.create_in(tempfile.mkdtemp(), schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        try:
            for f_path in self.file_list:
                if os.path.isfile(f_path):
                    with open(f_path, "r", encoding="utf-8") as f:
                        file_content = f.read()
                        writer.add_document(file_path = f_path, file_content = file_content)
        finally:
            writer.close()
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        doc_ids = [str(os.path.basename(filePath)) for filePath in self.file_list]
        self.corpus_embeddings = torch.Tensor(pd.read_json('corpus_embeddings.json')[doc_ids].T.values)

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        alpha = 0
        beta = 1
        cutoff = 0.2
        query = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(query, limit=None)

        query_embedding = self.model.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True)
        scores = torch.mm(query_embedding.unsqueeze(0), self.corpus_embeddings.T).squeeze()

        all_docs = [i for i in range(len(self.corpus_embeddings))]

        not_hit = set(all_docs)
        for doc, score in topic_results.items():
            scores[doc] = scores[doc] * (1-alpha) + score * alpha
            not_hit.remove(doc)
        for doc in not_hit:
            scores[doc] *= beta

        rankings = torch.argsort(scores,descending=True)
        cutoff = min(cutoff, max(scores).item())
        score_filter = scores >= cutoff
        filtered_scores = scores[score_filter]
        filtered_rankings = torch.argsort(filtered_scores,descending=True)
        filtered_docs = torch.argwhere(score_filter).reshape(-1).tolist()
        return NeuralResults(filtered_docs, filtered_scores, filtered_rankings, self.file_list)

In [23]:
q4_mix = IRQ4_Neural_Mix("government")
q4_mix.add_files()
q4_mix.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       462.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.1215
gm_map                   1       -2.1080
Rprec                    1       0.2000
bpref                    1       0.0800
recip_rank               1       0.2500
iprec_at_recall_0.00     1       0.2500
iprec_at_recall_0.10     1       0.2500
iprec_at_recall_0.20     1       0.2500
iprec_at_recall_0.30     1       0.1176
iprec_at_recall_0.40     1       0.1176
iprec_at_recall_0.50     1       0.1071
iprec_at_recall_0.60     1       0.1071
iprec_at_recall_0.70     1       0.0800
iprec_at_recall_0.80     1       0.0800
iprec_at_recall_0.90     1       0.0526
iprec_at_recall_1.00     1       0.0526
P_5                      1       0.2000
P_10                     1       0.1000
P_15                     1       0.0667
P_20                     1       0.1000
P_30                     1       0.10

Attempt 4: BERT-QE: Contextualized Query Expansion for Document Re-ranking (https://arxiv.org/pdf/2009.07258.pdf)

In [24]:
class IRQ4_Neural_Chunk(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = RegexTokenizer()))
        self.index_sys = index.create_in(tempfile.mkdtemp(), schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        self.files_content = []
        for f_path in self.file_list:
            if os.path.isfile(f_path):
                with open(f_path, "r", encoding="utf-8") as f:
                    file_content = f.read()
                    self.files_content.append(file_content)

        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        doc_ids = [str(os.path.basename(filePath)) for filePath in self.file_list]
        self.corpus_embeddings = torch.Tensor(pd.read_json('corpus_embeddings.json')[doc_ids].T.values)

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        n_docs = 10
        words_per_chunk = 100
        n_chunks = 10
        alpha = 0.6
        query_embedding = self.model.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True)
        scores = torch.mm(query_embedding.unsqueeze(0), self.corpus_embeddings.T).squeeze()
        all_docs = [i for i in range(len(self.corpus_embeddings))]
        rankings = torch.argsort(scores,descending=True)

        top_docs = rankings[:n_docs]
        chunks = []
        for doc in top_docs:
            file_content = self.files_content[doc]
            words = file_content.split(' ')
            for i in range(0, len(words), 50):
                chunk = ' '.join(words[i:i+words_per_chunk])
                chunk_embedding = self.model.encode(
                                        chunk,
                                        convert_to_tensor=True,
                                        normalize_embeddings=True
                                    )
                chunk_score = torch.dot(query_embedding, chunk_embedding)
                chunks.append([chunk_score, chunk_embedding])
        top_chunks = sorted(chunks, key=lambda l:l[0], reverse=True)[:n_chunks]

        chunk_sm = torch.nn.Softmax(torch.tensor([i[0] for i in top_chunks]))

        scores = (1-alpha) * scores + alpha * torch.sum(torch.vstack([chunk_sm.dim[i] * torch.mm(top_chunks[i][1].unsqueeze(0), self.corpus_embeddings.T).squeeze() for i in range(len(top_chunks))]), 0)
        rankings = torch.argsort(scores,descending=True)

        return NeuralResults(all_docs, scores, rankings, self.file_list)

In [25]:
nc = IRQ4_Neural_Chunk('government')
nc.add_files()
nc.py_trec_eval()

KeyboardInterrupt: ignored

Final Use: Attempt 2, Neural IR

In [26]:
class IRQ4(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        schema = Schema(file_path = ID(stored=True), file_content = TEXT(analyzer = RegexTokenizer()))
        self.index_sys = index.create_in(tempfile.mkdtemp(), schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys
        """
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        doc_ids = [str(os.path.basename(filePath)) for filePath in self.file_list]
        self.corpus_embeddings = torch.Tensor(pd.read_json('corpus_embeddings.json')[doc_ids].T.values)

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.self.searcherwhich should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query_embedding = self.model.encode(topic_phrase,convert_to_tensor=True, normalize_embeddings=True)
        scores = torch.mm(query_embedding.unsqueeze(0), self.corpus_embeddings.T).squeeze()
        all_docs = [i for i in range(len(self.corpus_embeddings))]
        rankings = torch.argsort(scores,descending=True)
        return NeuralResults(all_docs, scores, rankings, self.file_list)

### Please answer the following questions here
(a) A clear list of all final modifications made.

First, I tried doing query expansion using wordnet synonyms, antonyms, hypernyms and hyponyms. Then, I used Neural IR using all-mpnet-base-v2. Then, I attempted to mix Neural IR with regular Whoosh scores as a mixed-bag type model. Finally, I implemented the QE-BERT model, which uses embeddings to perform a form of query expansion.

(b)  Why each modification was made – how did it help?

The query expansion made sense on paper - by adding new words to the query that were related, one can capture more documents with similar meaning that might've missed the exact word. However, without further grammatical analysis of the query, there are too many expansions for each word's meaning, and thus adding the synonyms, etc. didn't improve performance over baseline Whoosh. My methodology might also have been wrong.

The Neural IR also tackles this challenge, it encodes the meaning of the query rather than just the words. This improved performance for the same reasons I thought query expansion would; it captured documents that might've missed the exact query wording.

Sometimes, mixing the scores of two models helps reduce error from either one, and improve overall accuracy. I tried this, however hyperparameter tuning showed that the best option was to use the original Neural IR.

Finally, QE-BERT uses relevant sections from the documents themselves to query the documents, which is a form of query expansion. This improves performance, but it takes too long on Colab and therefore I can't risk it.

(c)  The  final  MAP  performance  that  these  modifications  attained.

0.5043

### Q4 Validation

In [29]:
q4 = IRQ4("government")
assert(isinstance(q4.index_sys, FileIndex)), "Index Type"
assert(isinstance(q4.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q4.searcher, Searcher)), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
