In [1]:
!pip install blingfire
!pip install faiss-cpu
!pip install sentence-transformers


Collecting blingfire
  Downloading blingfire-0.1.8-py3-none-any.whl (42.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: blingfire
Successfully installed blingfire-0.1.8
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[

In [53]:
import os
import gc
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange
import torch
import blingfire as bf
from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

import os

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from torch.utils.data import DataLoader

In [21]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
dimension = 384
max_length = 384
batch_size = 16
if torch.cuda.is_available():
  device = 'cuda'
else:
 device = 'cpu'

In [22]:
wikipedia_path = "/kaggle/input/wikipedia-20230701"
wiki_files = os.listdir(wikipedia_path)

In [23]:
model = SentenceTransformer(model_name,device=device)
model.max_length = 384
model = model 



In [24]:
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [25]:
row = {}
row['prompt'] = "Which of the following philosophers or scientists is associated with the view that space is absolute and exists permanently and independently of matter?"
row['A'] = 'Plato'
row['B'] = 'Aristotle'
row['C'] = 'Alhazen'
row['D'] = 'Isaac Newton'
row['E'] = 'George Berkeley'
row['id'] = 0
train = pd.DataFrame([row])

In [26]:
train.head()

Unnamed: 0,prompt,A,B,C,D,E,id
0,Which of the following philosophers or scienti...,Plato,Aristotle,Alhazen,Isaac Newton,George Berkeley,0


In [27]:
prompt_embeddings = model.encode(train.prompt.values, batch_size=batch_size, device=device, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
search_score, search_index = sentence_index.search(prompt_embeddings, 5)

In [29]:
df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", columns=['id', 'file'])

In [30]:
## Get the article and associated file location using the index
wikipedia_file_data = []

for i, (scr, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score)):
    
    scr_idx = idx
    _df = df.loc[scr_idx].copy()
    _df['prompt_id'] = i
    wikipedia_file_data.append(_df)
wikipedia_file_data = pd.concat(wikipedia_file_data).reset_index(drop=True)
wikipedia_file_data = wikipedia_file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)

  0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
## Get the full text data
wiki_text_data = []

for file in tqdm(wikipedia_file_data.file.unique(), total=len(wikipedia_file_data.file.unique())):
    _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file']==file]['id'].tolist()]
    _df = pd.read_parquet(f"{wikipedia_path}/{file}", columns=['id', 'text'])

    _df = _df[_df['id'].isin(_id)]
    wiki_text_data.append(_df)
    _ = gc.collect()
wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)

  0%|          | 0/3 [00:00<?, ?it/s]

In [32]:
def process_pages(pages: Iterable[str],page_ids : Iterable , split_sentences=True,filter_length = 3):
    df = sectionize_pages(pages,page_ids)
    
    if split_sentences:
        df = sentencize(df.text.values,df.page_id,df.offset.values,filter_length)
    return df
    
def sectionize_pages(pages,page_ids):
    processed_text = []
    for page,page_id in tqdm(zip(pages,page_ids),total=len(pages)):
        row={}
        text,start,end = page,0,len(page)
        row['page_id']=page_id
        row['text']=text
        row['offset']=(start,end)
        processed_text.append(row)
    df = pd.DataFrame(processed_text)
    return df.reset_index(drop=True)

        
def  sentencize(pages : Iterable[str],page_ids : Iterable , offsets : Iterable[tuple[int,int]],filter_length : int = 3 ):
    page_sentences = []
    for page,page_id,offset in tqdm(zip(pages,page_ids,offsets),total=len(pages)):
        try:
            _,sentence_offsets = bf.text_to_sentences_and_offsets(page)
            for i in sentence_offsets:
                if i[1]-i[0] > filter_length:
                    sentence=page[i[0]:i[1]]
                    abs_offset = (i[0]+offset[0],i[1]+offset[0])
                    row={}
                    row['page_id']=page_id
                    row['offset']=abs_offset
                    row['text']=sentence
                    page_sentences.append(row)
        except:
            continue
            
    return pd.DataFrame(page_sentences)

In [33]:
processed_wiki_text_data = process_pages(wiki_text_data.text.values, wiki_text_data.id.values)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
wiki_text_embeddings = model.encode(processed_wiki_text_data.text,batch_size=batch_size,device=device,show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
wiki_text_embeddings=wiki_text_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

In [35]:
train['answers'] = train.apply(lambda x: " ".join([x['A'],x['B'],x['C'],x['D'],x['E']]),axis=1)
train['qna'] = train['prompt']+' '+train['answers']

In [36]:
questions_embeddings = model.encode(train.qna.values,batch_size=batch_size,device=device,show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True).half()
questions_embeddings = questions_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [37]:
num_sentences = 5
prompt_contexts = []
contexts = []

for r in train.itertuples():
    q_context = ""
    prompt_id = r.id
    
    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['page_id'].isin(wikipedia_file_data[wikipedia_file_data['prompt_id'] == prompt_id]['id'].values)].index.values
    
    q_context = "Question: "+train.prompt.iloc[prompt_id]+'\n'
    
    q_context += '(A) ' + train.A.iloc[prompt_id] + '\n'
    q_context += '(B) ' + train.B.iloc[prompt_id] + '\n'
    q_context += '(C) ' + train.C.iloc[prompt_id] + '\n'
    q_context += '(D) ' + train.D.iloc[prompt_id] + '\n'
    q_context += '(E) ' + train.E.iloc[prompt_id] + '\n'
    
    if prompt_indices.shape[0] > 0 :
        q_context += 'Context : \n'
        prompt_index = faiss.index_factory(wiki_text_embeddings.shape[1], "Flat")
        prompt_index.add(wiki_text_embeddings[prompt_indices])
        
        context = ""
        
        ## Get the top matches
        ss, ii = prompt_index.search(questions_embeddings, num_sentences)
        
        for _s,_i in zip(ss[prompt_id],ii[prompt_id]):
            if _s < 2:
                context +=  processed_wiki_text_data.loc[prompt_indices[_i]]['text'] + '\n'
        q_context += context
    contexts.append(context)
    prompt_contexts.append(q_context)

In [38]:
train['context'] = contexts

In [46]:
train = train.drop(columns=['answers','qna'])

# inference

In [47]:
train["prompt"] = train["context"].apply(lambda x: x[:1750]) + " #### " +  train["prompt"]
train['answer'] = 'A'
train.head()

Unnamed: 0,prompt,A,B,C,D,E,id,context,answer
0,The absolute point of view was advocated in ph...,Plato,Aristotle,Alhazen,Isaac Newton,George Berkeley,0,The absolute point of view was advocated in ph...,A


In [48]:
model_dir = "/kaggle/input/llm-science-model/model_v2"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir)
model.eval()

DebertaV2ForMultipleChoice(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_aff

In [49]:
# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

In [50]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [54]:
tokenized_test_dataset = Dataset.from_pandas(train[['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer']].drop(columns=['id'])).map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
#tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

  0%|          | 0/1 [00:00<?, ?ex/s]

In [59]:
test_predictions = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k]
    with torch.no_grad():
        outputs = model(**batch)
    test_predictions.append(outputs.logits.cpu().detach())

test_predictions = torch.cat(test_predictions)
test_predictions = test_predictions.numpy()

In [62]:
index_of_max = np.argmax(test_predictions)
correct = index_to_option[index_of_max]

In [63]:
correct

'D'