In [None]:
!pip install faiss-cpu
!pip install sentence-transformers
!pip install blingfire

# Imports

In [None]:
import os
import gc
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange
import torch
import blingfire as bf
from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

# Model

This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
Given an input text, it outputs a vector which captures the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.

By default, input text longer than 256 word pieces is truncated.

In [None]:
# Encoder checkpoint and encoding settings shared by the cells below.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
dimension = 384   # embedding size of the model, as described in the notes above
max_length = 384
batch_size = 16

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load Data

In [None]:
# Directory holding the Wikipedia dump (parquet files) used for retrieval.
wikipedia_path = "/kaggle/input/wikipedia-20230701"
wiki_files = os.listdir(wikipedia_path) # names of all entries in the dump directory

In [None]:
# Load the training questions; later cells read the `id`, `prompt` and `A`-`E` columns.
train = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv")

In [None]:
# Preview the first rows of the training questions.
train.head()

In [None]:
# Load the sentence encoder on the selected device.
model = SentenceTransformer(model_name, device=device)
# SentenceTransformer takes its truncation limit from `max_seq_length`.
# Assigning to `max_length` only created an unused attribute, so the model's
# default limit stayed in effect instead of the configured value.
model.max_seq_length = max_length
model = model.half()  # change datatype to fp16

In [None]:
# Load the prebuilt FAISS index over the Wikipedia dump from disk.
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [None]:
# Embed every training prompt (normalised, fp16) and move the result to a
# NumPy array on the host; these vectors are the queries for the article search.
prompt_embeddings = (
    model.encode(
        train.prompt.values,
        batch_size=batch_size,
        device=device,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .half()
    .detach()
    .cpu()
    .numpy()
)

We extract all prompts from `train` and encode them. These embeddings are the queries for the semantic search that retrieves the most relevant Wikipedia articles.

In [None]:
# Run a garbage-collection pass before the memory-heavy index search.
_ = gc.collect()

In [None]:
# Query the index with every prompt embedding and keep the 5 nearest entries:
# one row of scores and one row of index positions per prompt.
search_score, search_index = sentence_index.search(prompt_embeddings, 5)

In [None]:
# The index and the prompt embeddings are not used again below; drop them
# and collect garbage to release their memory.
del sentence_index
del prompt_embeddings
_ = gc.collect()

In [None]:
# Load only the `id` and `file` columns of the article index table;
# rows are looked up by the positions returned from the FAISS search.
df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", columns=['id', 'file'])
df.head()

This table is the index of every Wikipedia article: its `id` and the `file` in which it is stored.

In [None]:
## Map every retrieved index position to its article id and source file
per_prompt_matches = [
    # `.assign` returns a new frame, so the rows taken from `df` are never modified in place
    df.loc[neighbour_positions].assign(prompt_id=prompt_position)
    for prompt_position, neighbour_positions in tqdm(
        enumerate(search_index), total=len(search_score)
    )
]

# One row per (article, prompt) pair, ordered by file so each file is read once later.
wikipedia_file_data = (
    pd.concat(per_prompt_matches)
    .reset_index(drop=True)[['id', 'prompt_id', 'file']]
    .drop_duplicates()
    .sort_values(['file', 'id'])
    .reset_index(drop=True)
)

## Save memory - delete df since it is no longer necessary
del df
_ = gc.collect()

What we are doing here:
* For each prompt's search results (`search_score`, `search_index`), we look up the article `id` and `file` that each returned index belongs to.
* We then store that information along with the prompt id.

In [None]:
# Preview the (article id, prompt id, file) mapping.
wikipedia_file_data.head()

In [None]:
## Get the full text data
files_to_read = wikipedia_file_data.file.unique()
article_frames = []

for file_name in tqdm(files_to_read, total=len(files_to_read)):
    # Ids of the retrieved articles stored in this file, as strings for the `isin` match.
    rows_in_file = wikipedia_file_data['file'] == file_name
    wanted_ids = list(map(str, wikipedia_file_data[rows_in_file]['id'].tolist()))

    # Read only the columns we need, then keep just the retrieved articles.
    file_articles = pd.read_parquet(f"{wikipedia_path}/{file_name}", columns=['id', 'text'])
    article_frames.append(file_articles[file_articles['id'].isin(wanted_ids)])
    _ = gc.collect()

wiki_text_data = pd.concat(article_frames).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

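* We open each Wikipedia file that contains at least one article retrieved for a prompt.
* From each file we keep the text of the retrieved articles, one row per unique article id; the link back to the prompt id stays in `wikipedia_file_data`.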

In [None]:
# Preview the retrieved article texts.
wiki_text_data.head()

# Splitting Articles


We split the Wikipedia documents into sentences. We want to retrieve the sentences most similar to each question and provide them as context.

In [None]:
def process_pages(pages: Iterable[str], page_ids: Iterable, split_sentences=True, filter_length=3):
    """Convert raw pages into a DataFrame of sections or of sentences.

    Every page is first wrapped as a single section by ``sectionize_pages``.
    When ``split_sentences`` is true those sections are handed to
    ``sentencize`` and the sentence-level frame is returned instead.

    Args:
        pages: Page texts.
        page_ids: Identifiers aligned with ``pages``.
        split_sentences: Return sentences (True) or whole-page sections (False).
        filter_length: Forwarded to ``sentencize``; sentences whose offset
            span is not greater than this value are dropped.

    Returns:
        The DataFrame produced by ``sentencize`` or ``sectionize_pages``.
    """
    sections = sectionize_pages(pages, page_ids)
    if not split_sentences:
        return sections
    return sentencize(sections.text.values, sections.page_id, sections.offset.values, filter_length)
    
def sectionize_pages(pages, page_ids):
    """Wrap every page as one section row.

    Args:
        pages: Sized collection of page texts (``len(pages)`` is used for the
            progress bar).
        page_ids: Identifiers aligned with ``pages``.

    Returns:
        DataFrame with columns ``page_id``, ``text`` and ``offset``, where
        ``offset`` is ``(0, len(page))``. The columns are declared explicitly
        so they exist even when ``pages`` is empty; previously an empty input
        produced a column-less frame and attribute access on it failed.
    """
    sections = [
        {'page_id': page_id, 'text': page, 'offset': (0, len(page))}
        for page, page_id in tqdm(zip(pages, page_ids), total=len(pages))
    ]
    return pd.DataFrame(sections, columns=['page_id', 'text', 'offset'])

        
def sentencize(pages: Iterable[str], page_ids: Iterable, offsets: Iterable[tuple[int, int]], filter_length: int = 3):
    """Split each page into sentences with blingfire.

    Args:
        pages: Sized collection of texts to split.
        page_ids: Identifiers aligned with ``pages``.
        offsets: ``(start, end)`` of each text within its source page; only
            ``start`` is used, to shift the sentence offsets.
        filter_length: Sentences whose offset span (``end - start``) is not
            greater than this value are dropped.

    Returns:
        DataFrame with columns ``page_id``, ``offset`` and ``text``, one row
        per kept sentence. The columns exist even when nothing was kept.
    """
    import warnings

    page_sentences = []
    for page, page_id, offset in tqdm(zip(pages, page_ids, offsets), total=len(pages)):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(page)
            for start, end in sentence_offsets:
                if end - start > filter_length:
                    page_sentences.append({
                        'page_id': page_id,
                        # shift by the section start so the offset refers to the source page
                        'offset': (start + offset[0], end + offset[0]),
                        'text': page[start:end],
                    })
        except Exception as exc:
            # Still best-effort (a bad page must not abort the run), but no
            # longer silent, and KeyboardInterrupt/SystemExit now propagate.
            warnings.warn(f"sentencize: skipping rest of page {page_id!r} after error: {exc!r}")
            continue

    return pd.DataFrame(page_sentences, columns=['page_id', 'offset', 'text'])

In [None]:
# Split every retrieved article into sentences, keyed by the article id.
processed_wiki_text_data = process_pages(wiki_text_data.text.values, wiki_text_data.id.values)

In [None]:
# Preview the sentence-level table.
processed_wiki_text_data.head()

In [None]:
# Embed every extracted sentence (normalised, fp16) and move the result to NumPy.
# The texts are passed as a plain array, like the other encode cells, rather
# than as a pandas Series, so encoding does not depend on the frame's index labels.
wiki_text_embeddings = (
    model.encode(
        processed_wiki_text_data.text.values,
        batch_size=batch_size,
        device=device,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .half()
    .detach()
    .cpu()
    .numpy()
)

In [None]:
# Run a garbage-collection pass after encoding the sentences.
_ = gc.collect()

We combine each prompt with its answer options into a single text for better semantic search.

In [None]:
# Join the five answer options into one string per question,
# then append that string to the prompt to form the search query text.
option_columns = ['A', 'B', 'C', 'D', 'E']
train['answers'] = train[option_columns].apply(" ".join, axis=1)
train['qna'] = train['prompt'] + ' ' + train['answers']

In [None]:
# Embed the combined "prompt + options" text of every question (normalised, fp16)
# and move the result to a NumPy array on the host.
questions_embeddings = (
    model.encode(
        train.qna.values,
        batch_size=batch_size,
        device=device,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .half()
    .detach()
    .cpu()
    .numpy()
)

In [None]:
# Preview the training table with the new `answers` and `qna` columns.
train.head()

In [None]:
# Re-display the prompt -> article/file mapping used by the context loop below.
wikipedia_file_data.head()

In [None]:
# Re-display the sentence table used by the context loop below.
processed_wiki_text_data.head()

In [None]:
num_sentences = 5
prompt_contexts = []
contexts = []

# `prompt_id` in wikipedia_file_data is the row position of the prompt in `train`
# (it was produced by enumerating the search results), so iterate by position
# instead of assuming the `id` column equals the row position.
for prompt_id, r in enumerate(train.itertuples(index=False)):
    # Reset per prompt: otherwise a prompt without retrieved sentences would
    # reuse the previous prompt's context (or raise NameError on the first row).
    context = ""

    # Sentences belonging to the articles retrieved for this prompt. Ids are
    # compared as strings, mirroring the conversion used when the articles were loaded.
    article_ids = [str(a) for a in wikipedia_file_data.loc[wikipedia_file_data['prompt_id'] == prompt_id, 'id'].tolist()]
    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['page_id'].isin(article_ids)].index.values

    q_context = "Question: " + r.prompt + '\n'
    q_context += '(A) ' + r.A + '\n'
    q_context += '(B) ' + r.B + '\n'
    q_context += '(C) ' + r.C + '\n'
    q_context += '(D) ' + r.D + '\n'
    q_context += '(E) ' + r.E + '\n'

    if prompt_indices.shape[0] > 0:
        q_context += 'Context : \n'
        # Small exact index over this prompt's candidate sentences only.
        # FAISS works on float32, so cast the fp16 arrays explicitly.
        prompt_index = faiss.index_factory(wiki_text_embeddings.shape[1], "Flat")
        prompt_index.add(np.ascontiguousarray(wiki_text_embeddings[prompt_indices], dtype=np.float32))

        ## Get the top matches for this prompt only (searching every question
        ## on every iteration and keeping one row was wasted work)
        query = np.ascontiguousarray(questions_embeddings[prompt_id:prompt_id + 1], dtype=np.float32)
        ss, ii = prompt_index.search(query, num_sentences)

        for _s, _i in zip(ss[0], ii[0]):
            # FAISS pads with index -1 when fewer than `num_sentences` vectors
            # are available; skip those so the last sentence is never picked by accident.
            if _i >= 0 and _s < 2:
                context += processed_wiki_text_data.at[prompt_indices[_i], 'text'] + '\n'
        q_context += context

    contexts.append(context)
    prompt_contexts.append(q_context)
    
                
    

The threshold value of 2 in the condition `if _s < 2` is a heuristic for filtering out less relevant results. The `"Flat"` FAISS index built above uses the default L2 metric, so the scores are squared Euclidean distances (lower is better). Because the embeddings are L2-normalised, the squared distance between a question $q$ and a sentence $s$ is $\lVert q - s \rVert^2 = 2 - 2\cos(q, s)$, which ranges from 0 to 4. A threshold of 2 therefore keeps only sentences whose cosine similarity with the question is positive (up to fp16 rounding).

In [None]:
# Attach the retrieved context (one string per training row) to the training table.
train['context'] = contexts

In [None]:
# Save the training table, including the added columns, without the index.
train.to_csv("./train_context.csv", index=False)

In [None]:
# Show the assembled text (question, options and retrieved context) for the first training row.
print(prompt_contexts[0])