# Neural IR

In [None]:
!pip install transformers
!pip install sentence_transformers
!pip install whoosh
!pip install pytrec_eval
!pip install wget

In [None]:
import os.path
import wget

In [None]:
# Download the lab dataset only once: re-running the cell should neither hit the
# network again nor leave a second, renamed copy of the archive next to the first.
filename = "lab-data.zip"
if not os.path.exists(filename):
    filename = wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/lab-data.zip", "lab-data.zip")

In [None]:
!unzip lab-data.zip

## Creating Embeddings

We will use TAS-B, a pre-trained DistilBERT-based encoder trained on the MS MARCO dataset. For more information, see the following links:



*   TAS-B paper: https://arxiv.org/abs/2104.06967
*   Pre-trained models on MSMARCO: https://www.sbert.net/docs/pretrained_models.html#msmarco-passage-models



In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# create TAS-B (fine tuned BERT) encoder
# The encoder is instantiated by its sentence-transformers model name; the same
# TasB object is used below to embed both the documents and the query.
model_name = "sentence-transformers/msmarco-distilbert-base-tas-b"
TasB = SentenceTransformer(model_name)

In [None]:
# Embed a single phrase: passing one string to encode() yields one embedding.
doc_a = 'I love IR'

doc_a_embedding = TasB.encode(
    doc_a,
    convert_to_tensor=True,      # return a torch tensor
    normalize_embeddings=False,  # keep the raw (un-normalised) vector
)
doc_a_embedding.shape

In [None]:
# Embed several documents at once: encode() accepts a list of strings and, with
# convert_to_tensor=True, returns the embeddings as a single tensor.
doc_b = "I hate IR"
doc_c = "IR is alright"
corpus = [doc_a, doc_b, doc_c]

corpus_embeddings = TasB.encode(
    corpus,
    convert_to_tensor=True,
    show_progress_bar=True,
    normalize_embeddings=False,  # raw vectors, same setting as the single-phrase example
)

In [None]:
# Inspect the shape of the corpus embedding tensor (presumably one row per document).
corpus_embeddings.shape

## Computing similarity with embeddings

In [None]:
# Embed the query with the same encoder and settings used for the corpus, so the
# query and document vectors can be compared directly.
q = "I'm really excited about this IR assignment!"
q_embedding = TasB.encode(
    q,
    convert_to_tensor=True,
    normalize_embeddings=False,
)
q_embedding.shape

In [None]:
# make q_embedding 2-dimensional, shape (1, dim), for matrix multiplication.
# reshape(1, -1) is idempotent: re-running this cell leaves the shape at (1, dim),
# whereas a repeated unsqueeze(0) would keep adding leading dimensions and break
# the torch.mm call in the next cell.
q_embedding = q_embedding.reshape(1, -1)
q_embedding.shape

In [None]:
# Transpose the corpus matrix so its rows line up with the query vector's columns.
corpus_embeddings_transposed = corpus_embeddings.t()

# One matrix product gives the dot product of the query with every document.
dot_products = q_embedding.mm(corpus_embeddings_transposed)

print(dot_products)
print(dot_products.shape)

In [None]:
# remove the singleton dimension
# The matrix product above has a leading dimension of size 1 (the single query);
# squeeze() drops it so that each entry is the score of one document.
dot_products = dot_products.squeeze()
print(dot_products.shape)

In [None]:
# Sort document indices by dot-product score, highest score first.
ranked_doc_indices = torch.argsort(dot_products, descending=True)

# Show the query followed by the documents in ranked order.
print("Query:", q)
print("\nTop documents:")

for i in ranked_doc_indices.tolist():
    print(corpus[i], "(Score:", dot_products[i].item(), ")")

## Storing and reading document embeddings

In [None]:
import json

In [None]:
# Identifiers for the three example documents, in the same order as `corpus`
# (doc_a, doc_b, doc_c); they become the keys of the saved embedding dictionary.
doc_ids = ['A','B','C']

In [None]:
# Map each doc id to its embedding. The tensor is converted to nested Python
# lists first because JSON cannot serialise tensors.
dict_corpus_embeddings = {
    doc_id: embedding
    for doc_id, embedding in zip(doc_ids, corpus_embeddings.tolist())
}

In [None]:
# save embeddings to json
# The context manager flushes and closes the file before the next cell reads it,
# instead of relying on garbage collection of an anonymous file object.
with open('corpus_embeddings.json', 'w') as embeddings_file:
    json.dump(dict_corpus_embeddings, embeddings_file)

In [None]:
# read embeddings from json (the file handle is closed as soon as parsing finishes)
with open('corpus_embeddings.json', 'r') as embeddings_file:
    dict_corpus_embeddings_2 = json.load(embeddings_file)
type(dict_corpus_embeddings_2)

In [None]:
# Rebuild the embedding matrix from the loaded dictionary. Rows follow doc_ids,
# so the row order matches the tensor that was saved.
corpus_embeddings_2 = torch.Tensor(
    [dict_corpus_embeddings_2[doc] for doc in doc_ids]
)

In [None]:
# check to make sure saving and reading embeddings using json did not change them
# The reloaded tensor lives on the CPU, while corpus_embeddings may be on the GPU
# when the encoder runs there; comparing tensors on different devices raises a
# RuntimeError, so bring the original to the CPU first (a no-op on CPU runtimes).
(corpus_embeddings_2 == corpus_embeddings.cpu()).all()

In [None]:
# leverage pandas vectorization to read from json
import pandas as pd
# read_json yields one column per doc id; select the ids in order and transpose so
# that each row is one document's embedding.
corpus_embeddings_3 = torch.Tensor(pd.read_json('corpus_embeddings.json')[doc_ids].T.values)
# Compare on the CPU: corpus_embeddings may be a GPU tensor, and comparing tensors
# on different devices raises a RuntimeError.
(corpus_embeddings_3 == corpus_embeddings.cpu()).all()

## Using TREC_EVAL with neural IR

In [None]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
from whoosh import qparser

In [None]:
class NeuralResults():
    '''
    Reranked hit list that mimics the part of whoosh.searching.Results used by
    the evaluation code (the datatype of topicResults in pyTrecEval): iterating
    over hits and looking up a hit's score by its position.
    '''
    def __init__(self, booleansearchdocs, scores, rankings, file_list):
        # `rankings` lists positions into `booleansearchdocs` / `scores` in the
        # order the hits should be returned; `booleansearchdocs` maps each
        # position to an index into `file_list`.
        self.results = [
            {'file_path': file_list[booleansearchdocs[position]], 'score': scores[position]}
            for position in rankings
        ]

    def score(self, docnum):
        '''Return the score of the hit at position `docnum` of the reranked list.'''
        hit = self.results[docnum]
        return hit['score']

    def __iter__(self):
        '''Iterate over the hits (dicts with 'file_path' and 'score') in ranked order.'''
        return iter(self.results)

class IRSystem():
    '''
    Small retrieval pipeline: a whoosh index over the files under
    ``<data_dir>/documents``, optional TAS-B reranking of the whoosh hits, and
    evaluation against ``<data_dir>/air.topics`` / ``<data_dir>/air.qrels``
    with pytrec_eval.

    Typical use: ``IRSystem(data_dir)``, then ``add_files()``, then
    ``py_trec_eval()`` or ``print_rel_name(topic_id)``.
    '''

    def __init__(self, data_dir="lab-data"):
        '''Record the dataset paths, list the document files and create an empty index.'''
        self.topic_file = os.path.join(data_dir, "air.topics")
        self.qrels_file = os.path.join(data_dir, "air.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        # Every regular file below the documents directory, in glob order.
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.create_parser_searcher()

        # Reranking is on by default; it needs the embeddings built by add_files().
        self.neuralrerank = True

    def set_neural_rerank(self, val):
        '''Enable (True) or disable (False) TAS-B reranking in perform_search().'''
        self.neuralrerank = val

    def create_index(self):
        '''Create an empty whoosh index in a fresh temporary directory.'''
        # Tokenize, lowercase, drop stop words, then stem.
        analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
        schema = Schema(file_path=ID(stored=True), file_content=TEXT(analyzer=analyzer))
        indexDir = tempfile.mkdtemp()
        self.index_sys = index.create_in(indexDir, schema)

    def add_files(self):
        '''
        Index every file in ``self.file_list`` and compute its TAS-B embedding.

        Sets ``self.TasB`` (the encoder) and ``self.corpus_embeddings`` (one
        normalised embedding per file, in ``self.file_list`` order).
        '''
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        file_contents = []
        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                writer.add_document(file_path=filePath, file_content=fileContent)
                file_contents.append(fileContent)
                # print status every 1000 documents
                if (docNum + 1) % 1000 == 0:
                    print("already indexed:", docNum + 1)
            print("done indexing.")
        finally:
            # close the writer even if reading or indexing a file failed
            writer.close()

        # The searcher opened in __init__ was created on the empty index; reopen
        # it so perform_search() and print_rel_name() see the new documents
        # without requiring a prior py_trec_eval() call.
        self.create_parser_searcher()

        print("Computing Embeddings")
        self.TasB = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-tas-b")
        self.corpus_embeddings = self.TasB.encode(
            file_contents,
            convert_to_tensor=True,
            show_progress_bar=True,
            normalize_embeddings=True
        )

    def create_parser_searcher(self):
        '''(Re)create the query parser and a searcher over the index's current contents.'''
        # OrGroup: a document matches if it contains any of the query terms.
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        '''
        Search the index for ``topic_phrase``.

        Returns the whoosh results as-is when reranking is disabled or when at
        most one document matched; otherwise returns a NeuralResults with the
        matched documents reordered by embedding similarity to the query.
        '''
        topicResults = self.searcher.search(self.query_parser.parse(topic_phrase), limit=None)  # regular whoosh search
        if not self.neuralrerank:
            return topicResults

        booleansearchdocs = list(topicResults.docs())  # document numbers of the retrieved docs
        if len(booleansearchdocs) <= 1:  # with 0 or 1 retrieved docs there is nothing to rerank
            return topicResults

        # Query and documents are both encoded with normalize_embeddings=True, so
        # the dot product below is their cosine similarity.
        query_embedding = self.TasB.encode(topic_phrase, convert_to_tensor=True, normalize_embeddings=True)
        # NOTE(review): indexing corpus_embeddings / file_list by whoosh document
        # number assumes document numbers follow insertion order - confirm.
        scores = torch.mm(query_embedding.unsqueeze(0), self.corpus_embeddings[booleansearchdocs].T).squeeze()
        rankings = torch.argsort(scores, descending=True)  # positions ordered by decreasing score
        return NeuralResults(booleansearchdocs, scores, rankings, self.file_list)

    @staticmethod
    def post_process_score(score):
        '''Hook applied to each score printed by print_rel_name(); identity here.'''
        return score

    @staticmethod
    def print_trec_eval_result(results):
        '''
        Print every per-query measure, then each measure aggregated over all queries.

        ``results`` maps query id -> {measure name: value}; an empty mapping
        prints 'empty results'.
        '''
        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        # Aggregate over the measure names of the last query, stated explicitly
        # rather than through a variable left over from the loop above.
        measure_names = list(results.values())[-1].keys()
        for measure in measure_names:
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [per_query[measure] for per_query in results.values()]))

    def score(self, docnum, topic_results):
        '''Return the score of the hit at position ``docnum`` in ``topic_results``.'''
        return topic_results.score(docnum)

    def print_rel_name(self, q_id):
        '''
        Print, for the topic whose id equals ``q_id``, the topic phrase, the
        returned documents in TREC run format, and the qrels lines judged
        relevant (relevance "1") for that topic.
        '''
        with open(self.topic_file, "r") as tf:
            # skip blank lines, which cannot be split into id and phrase
            topics = [line for line in tf.read().splitlines() if line.strip()]
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                # get search result
                topic_results = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                for (docnum, result) in enumerate(topic_results):
                    # score() takes only (docnum, topic_results); passing the topic
                    # phrase as a third argument raised a TypeError.
                    score = self.score(docnum, topic_results)
                    score = self.post_process_score(score)
                    print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    for line in f_qrel:
                        fields = line.split()
                        if not fields:
                            continue  # tolerate blank lines in the qrels file
                        qid, _, doc, rel = fields
                        if qid == q_id and rel == "1":
                            print(line.rstrip())

    def py_trec_eval(self):
        '''
        Run every topic through perform_search(), write the hits as a TREC run
        file, evaluate the run against the qrels with pytrec_eval and print
        the per-query and aggregated measures.
        '''
        self.create_parser_searcher()
        # Load topic file - a list of topics (search phrases) used for evaluation
        with open(self.topic_file, "r") as tf:
            topics = [line for line in tf.read().splitlines() if line.strip()]

        # mkstemp returns an already-open descriptor plus the path; wrap the
        # descriptor instead of leaking it, and delete the file when done.
        run_fd, temp_output_file = tempfile.mkstemp()
        try:
            with os.fdopen(run_fd, "w") as outputTRECFile:
                # for each evaluated topic:
                # build a query and record the results in the file in TREC_EVAL format
                for topic in topics:
                    topic_id, topic_phrase = tuple(topic.split(" ", 1))
                    print(topic_id)
                    # get search result
                    topic_results = self.perform_search(topic_phrase)
                    # format the result
                    for (docnum, result) in enumerate(topic_results):
                        score = self.score(docnum, topic_results)
                        outputTRECFile.write(
                            "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

            with open(self.qrels_file, 'r') as f_qrel:
                qrel = pytrec_eval.parse_qrel(f_qrel)

            with open(temp_output_file, 'r') as f_run:
                run = pytrec_eval.parse_run(f_run)
        finally:
            os.remove(temp_output_file)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        # Fill the results dictionary with the topics the evaluator did not report
        # (e.g. queries that returned 0 documents). Measure names are copied from
        # any evaluated topic; when no topic was evaluated at all, only the num_*
        # entries below are present instead of failing on an undefined name.
        measure_template = next(iter(results.values()), {})
        topic_ids = {t.split()[0] for t in topics}
        for emptyresult_topicid in topic_ids.difference(set(results.keys())):
            # a topic without any qrels counts as having no relevant documents
            num_rel = float(sum(qrel.get(emptyresult_topicid, {}).values()))
            # nothing relevant was missed when there was nothing relevant to find
            fill_value = 0.0 if num_rel > 0 else 1.0
            topic_stats = {measure: fill_value for measure in measure_template}
            topic_stats["num_rel"] = num_rel
            topic_stats["num_ret"] = float(len(run.get(emptyresult_topicid, {})))
            topic_stats["num_rel_ret"] = 0.0
            topic_stats["num_q"] = 1.0

            results[emptyresult_topicid] = topic_stats

        self.print_trec_eval_result(results)


In [None]:
# Build the IR system over the unzipped dataset, then index the documents and
# compute their TAS-B embeddings (add_files does both).
irsystem = IRSystem("lab-data")
irsystem.add_files()

In [None]:

# Evaluate with neural reranking enabled: whoosh hits are reordered by TAS-B similarity.
irsystem.set_neural_rerank(True)
irsystem.py_trec_eval()

In [None]:
# Evaluate again with reranking disabled, i.e. the plain whoosh ranking, for comparison.
irsystem.set_neural_rerank(False)
irsystem.py_trec_eval()

On this dataset, reranking using TAS-B decreased our MAP from 0.75 to 0.68.

Here are some resources on how to achieve better performance with Neural IR:


*   [Pretrained Transformers for Text Ranking: BERT and Beyond](https://arxiv.org/abs/2010.06467)
*   [SPLADE methodology for contextual query reformulation/expansion](https://www.pinecone.io/learn/splade/)



In [None]:
#example code to create embeddings in batches
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    """Dataset whose items are the text contents of the files under ``<data_dir>/documents``."""

    def __init__(self, data_dir):
        """Collect the path of every regular file below ``<data_dir>/documents``."""
        self.document_dir = os.path.join(data_dir, "documents")
        self.data = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

    def __len__(self):
        """Number of document files found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the full UTF-8 text of the ``idx``-th file."""
        # Close the file right after reading; a bare open(...).read() would leave
        # one unclosed handle per item for the garbage collector to clean up.
        with open(self.data[idx], "r", encoding="utf-8") as f:
            return f.read()

from torch.utils.data import DataLoader
# Feed the documents to the encoder in mini-batches of 128 files.
data_loader = DataLoader(
    CustomDataset("lab-data"),
    batch_size=128,
    shuffle=False,  # keep the dataset order so row i corresponds to file i
)
embeddings = []
for data in data_loader:
    e = TasB.encode(data, convert_to_tensor=True)
    embeddings.append(e)
# Concatenate the per-batch tensors along the first (document) dimension.
embeddings = torch.cat(embeddings, dim=0)