In [1]:
import os
import random
import heapq
import tqdm
import torch
import faiss
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, AutoTokenizer, AutoModel
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Indexer, Searcher

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = 'mapped_summaries_l3'


In [2]:
df = pd.read_csv(f"Data/{dataset}.csv")[:100]
print(f"Size of dataset: {len(df)}, running depth 3 retrieval")
all_text = pd.read_csv("Data/all_text.csv")
index_mapping = pd.read_csv('depth_3_index_mapping.csv').set_index('df_index')['all_text_index'].to_dict()

# Define retriever classes and methods
def bm25_retriever(df):
    tokenized_corpus = [doc.split(" ") for doc in all_text["text_chunk"]]
    bm25 = BM25Okapi(tokenized_corpus)
    print("Tokenized corpus of size " + str(len(tokenized_corpus)))
    print("Starting recall@k calculation for k=1 to 10...")

    n_correct_at_k = {k: 0 for k in range(1, 11)}
    n_same_book = 0
    all_doc_scores = []

    for row in tqdm.tqdm(df["summary_sentence"], desc='Precomputing scores', unit='summary'):
        tokenized_query = row.split(" ")
        doc_scores = bm25.get_scores(tokenized_query)
        all_doc_scores.append(doc_scores)

    for index, doc_scores in tqdm.tqdm(enumerate(all_doc_scores), total=len(df), desc='Recall@k', unit='row'):
        top_10_indexes = heapq.nlargest(10, range(len(doc_scores)), key=lambda i: doc_scores[i])
        book_index = index_mapping.get(index)
        if book_index is not None:
            target_book_num = all_text.iloc[book_index]["book_num"]
            matches = sum(all_text.iloc[i]["book_num"] == target_book_num for i in top_10_indexes)
            n_same_book += matches
            
            for k in range(1, 11):
                if book_index in top_10_indexes[:k]:
                    n_correct_at_k[k] += 1
        else:
            print("Not supposed to happen")
    
    average_top_10_in_same_book = n_same_book / len(df)
    print(f"On average, {average_top_10_in_same_book} of the top 10 text chunks come from the same book")
    recall_at_k = {k: n_correct_at_k[k] / len(df) for k in range(1, 11)}
    for k in range(1, 11):
        print(f"Recall at k = {k}: {recall_at_k[k]:.4f}")

Size of dataset: 100, running depth 3 retrieval


In [28]:
bm25_retriever(df)

Tokenized corpus of size 41944
Starting recall@k calculation for k=1 to 10...


Precomputing scores: 100%|██████████| 500/500 [03:08<00:00,  2.66summary/s]
Recall@k: 100%|██████████| 500/500 [00:04<00:00, 115.49row/s]


On average, 5.438 of the top 10 text chunks come from the same book
Recall at k = 1: 0.2920
Recall at k = 2: 0.3720
Recall at k = 3: 0.4120
Recall at k = 4: 0.4440
Recall at k = 5: 0.4680
Recall at k = 6: 0.4840
Recall at k = 7: 0.5000
Recall at k = 8: 0.5140
Recall at k = 9: 0.5260
Recall at k = 10: 0.5400


In [43]:
class DPRRetriever:
    def __init__(self, book_texts):
        self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        self.context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
        self.question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        self.question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(device)

        self.book_texts = book_texts
        self.context_embeddings = self.encode_contexts(book_texts)

    def encode_contexts(self, texts):
        context_embeddings = []
        for text in tqdm.tqdm(texts, desc="Encoding Contexts"):
            context_input = self.context_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            context_embedding = self.context_encoder(**context_input).pooler_output
            context_embeddings.append(context_embedding.cpu().detach().numpy())
        context_embeddings = torch.tensor(context_embeddings).squeeze(1)
        return context_embeddings

    def retrieve_passages_index(self, claim, top_k=10):
        claim_input = self.question_tokenizer(claim, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        claim_embedding = self.question_encoder(**claim_input).pooler_output

        index = faiss.IndexFlatIP(claim_embedding.size(1))
        index.add(self.context_embeddings.cpu().detach().numpy())
        _, indices = index.search(claim_embedding.cpu().detach().numpy(), top_k)

        return indices[0].tolist()

def dpr_retriever(df):
    retriever = DPRRetriever(all_text[:500]["text_chunk"].tolist())

    n_correct_at_k = {k: 0 for k in range(1, 11)}
    n_same_book = 0

    for index, row in tqdm.tqdm(enumerate(df["summary_sentence"]), total=len(df), desc='Recall@k'):
        results = retriever.retrieve_passages_index(row, top_k=10)
        if index == 1:
            print(results)
        book_index = index_mapping.get(index)
        if book_index is not None:
            target_book_num = all_text.iloc[book_index]["book_num"]
            matches = sum(all_text.iloc[i]["book_num"] == target_book_num for i in results)
            n_same_book += matches
            
            for k in range(1, 11):
                if book_index in results[:k]:
                    n_correct_at_k[k] += 1
        else:
            print(f"Index {index} not found in mapping.")

    average_top_10_in_same_book = n_same_book / len(df)
    print(n_same_book)
    print(f"On average, {average_top_10_in_same_book} of the top 10 text chunks come from the same book")

    recall_at_k = {k: n_correct_at_k[k] / len(df) for k in range(1, 11)}
    for k in range(1, 11):
        print(f"Recall at k = {k}: {recall_at_k[k]:.4f}")

dpr_retriever(df)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the 

[397, 498, 156, 76, 128, 288, 496, 344, 431, 121]


Recall@k: 100%|██████████| 100/100 [00:02<00:00, 33.39it/s]

872
On average, 8.72 of the top 10 text chunks come from the same book
Recall at k = 1: 0.2000
Recall at k = 2: 0.2500
Recall at k = 3: 0.3200
Recall at k = 4: 0.3600
Recall at k = 5: 0.3900
Recall at k = 6: 0.4300
Recall at k = 7: 0.4700
Recall at k = 8: 0.4900
Recall at k = 9: 0.4900
Recall at k = 10: 0.5100





In [4]:
import torch
import tqdm
from transformers import AutoModel, AutoTokenizer
import faiss

df = pd.read_csv('test_data.csv')
all_text = pd.read_csv('Data/all_text.csv')
print(f"Size of dataset: {len(df)}, running depth 3 retrieval")

index_df = pd.read_csv('depth_3_index_mapping.csv')
index_test = index_df.tail(len(df))
index_test.reset_index(drop=True, inplace=True)
index_test['df_index'] = index_test.index
index_mapping = index_test.set_index('df_index')['all_text_index'].to_dict()

def contriever_retriever(df, all_text, index_mapping, device='cuda'):
    # Load the fine-tuned model and tokenizer
    model = AutoModel.from_pretrained('fine_tuned_model_batch_size=32').to(device)
    tokenizer = AutoTokenizer.from_pretrained('fine_tuned_model_batch_size=32')

    def encode_texts(texts):
        embeddings = []
        for text in tqdm.tqdm(texts, desc="Encoding Texts"):
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy())
        return torch.tensor(embeddings).squeeze(1)

    context_embeddings = encode_texts(all_text['text_chunk'].tolist()).to(device)
    query_embeddings = encode_texts(df["summary_sentence"].tolist()).to(device)

    index = faiss.IndexFlatIP(context_embeddings.size(1))
    faiss.normalize_L2(context_embeddings.cpu().numpy())
    index.add(context_embeddings.cpu().numpy())

    recall_at_k = {k: 0 for k in range(1, 11)}

    faiss.normalize_L2(query_embeddings.cpu().numpy())
    _, indices = index.search(query_embeddings.cpu().numpy(), 10)

    n_same_book = 0
    for query_idx, retrieved_indices in enumerate(indices):
        book_index = index_mapping.get(query_idx)
        if book_index is not None:
            target_book_num = all_text.iloc[book_index]["book_num"]
            matches = sum(all_text.iloc[i]["book_num"] == target_book_num for i in retrieved_indices[:10])
            n_same_book += matches
            for k in range(1, 11):
                if book_index in retrieved_indices[:k]:
                    recall_at_k[k] += 1

    average_top_10_in_same_book = n_same_book / len(df)
    print(f"On average, {average_top_10_in_same_book} of the top 10 text chunks come from the same book")
    
    recall_at_k = {k: recall_at_k[k] / len(df) for k in recall_at_k}
    for k in range(1, 11):
        print(f"Recall at k = {k}: {recall_at_k[k]:.4f}")

# Example usage:
contriever_retriever(df, all_text, index_mapping, device='cuda')



Size of dataset: 21791, running depth 3 retrieval


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  index_test['df_index'] = index_test.index
Encoding Texts:   1%|█▊                                                                                                                                       | 555/41944 [00:29<36:52, 18.71it/s]


KeyboardInterrupt: 