In [1]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import pandas as pd

# Bi-encoder used for the first-stage (dense) retrieval.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder retrieves `top_k` candidate passages. A cross-encoder then
# re-ranks that candidate list to improve the quality of the final ordering.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Load the evaluation questions and the passage corpus from JSON.
# Raw strings keep the backslashes literal: '\d' and '\e' are invalid escape
# sequences in a normal string literal and trigger a warning on newer Pythons.
# NOTE(review): the first path starts with a backslash (drive root on Windows)
# while the second one is relative to the working directory - confirm that
# this difference is intended.
eval_df = pd.read_json(r'\dataset\eval.json')
eval_passage = pd.read_json(r'dataset\eval_passages.json')

train_df = eval_passage
# Keep the passages as a plain list: later cells look passages up by the
# positional ids returned by BM25 / semantic search, which must not depend on
# the DataFrame's index labels.
passages = eval_passage['context'].tolist()

print("Passages:", len(passages))

# Encode every passage into the bi-encoder's vector space (run time depends on
# the corpus size and on the available hardware).
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

Passages: 2067


Batches:   0%|          | 0/65 [00:00<?, ?it/s]

In [21]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Turn ``text`` into the list of terms used for BM25 indexing.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece. Pieces that end up empty or
    that appear in ``_stop_words.ENGLISH_STOP_WORDS`` are discarded.

    Returns the surviving terms in their original order.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if len(word) > 0 and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every passage (with a progress bar) and build the BM25 index on top.
tokenized_corpus = [bm25_tokenizer(text) for text in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)

  0%|          | 0/2067 [00:00<?, ?it/s]

In [33]:
def search(query, k):
    """Retrieve the best ``k`` passages for ``query`` with two pipelines.

    Relies on the notebook-level objects ``bm25``, ``bi_encoder``,
    ``cross_encoder``, ``corpus_embeddings``, ``passages`` and ``top_k``.

    Args:
        query: The question text to search for.
        k: Maximum number of passages to return per pipeline. Values larger
            than the corpus are clamped; values <= 0 yield empty results.

    Returns:
        A tuple ``(ans1, ans2)`` of passage lists:
        ``ans1`` - passages ranked by BM25 score (lexical search);
        ``ans2`` - the ``top_k`` bi-encoder candidates re-ranked by the
        cross-encoder, truncated to ``k`` (so at most ``top_k`` entries).
    """
    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises when asked for more elements than exist, so never
    # request more hits than there are scored passages.
    n_lexical = max(0, min(k, len(bm25_scores)))
    if n_lexical > 0:
        top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:]
        ranked_ids = sorted(top_n, key=lambda idx: bm25_scores[idx], reverse=True)
    else:
        ranked_ids = []
    ans1 = [passages[idx] for idx in ranked_ids]

    ##### Semantic Search #####
    # Encode the query with the bi-encoder and find potentially relevant passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of hard-coding CUDA, so the
    # search also works on CPU-only machines.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query
    if not hits:
        return ans1, []

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Order the candidates by cross-encoder score and keep the best k.
    hits = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
    ans2 = [passages[hit['corpus_id']] for hit in hits[:max(0, k)]]
    return ans1, ans2
    

# arr = ["Which NFL team represented the AFC at Super Bowl 50?"]
# search(arr[0], 7)

In [36]:
# Evaluation inputs: each question paired with its gold context passage.
question = eval_df['question'].to_list()
golden_text = eval_df['context'].to_list()

# Rebuild the BM25 index from the tokenized corpus before evaluating.
bm25 = BM25Okapi(tokenized_corpus)

# Count the questions whose gold passage shows up among the 5 passages
# returned by each pipeline (BM25 vs. cross-encoder re-ranking).
correct_bm25 = 0
correct_cross = 0
for query_text, gold_passage in zip(question, golden_text):
    lexical_top, reranked_top = search(query=query_text, k=5)
    correct_bm25 += int(gold_passage in lexical_top)
    correct_cross += int(gold_passage in reranked_top)

In [27]:
# Report, in order: the cross-encoder hit rate, the raw BM25 hit count and the
# BM25 hit rate (hits divided by the number of evaluated questions).
n_questions = len(question)
for metric in (correct_cross / n_questions, correct_bm25, correct_bm25 / n_questions):
    print(metric)

0.9310312204351939
9095
0.8604541154210028
