In [None]:
# Offline dependency setup: every package is installed from a wheel or source tree
# that ships as a Kaggle input dataset, so no package index is needed.
# NOTE(review): the dataset folder is named "faiss-gpu-173" while the wheel file is
# faiss_gpu-1.7.2 -- confirm which version is intended.
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# sentence-transformers is installed from a source directory; it is copied to the
# working directory first and installed from that copy.
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers
!pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl

# Pinned wheels for the multiple-choice model stack. `--no-index` keeps pip away from
# any package index and `--no-deps` skips dependency resolution.
!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/d/rosekillerx/tokenizer-0-13-3/tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
from __future__ import annotations

import gc
import os
import re
from collections.abc import Iterable
from typing import Optional, Union

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import blingfire as bf
import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer
import torch
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader

In [None]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame of text pieces, optionally split into sentences.

    Each document is first wrapped into a single row by `sectionize_documents`; when
    `split_sentences` is set, those rows are then broken into sentences by `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split documents into sentences
    :param filter_len: Length threshold forwarded to `sentencize`
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
             (an empty, column-less DataFrame when no documents are given)
    """

    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    # With no input documents the frame has no columns at all, so `df.text` below
    # would raise AttributeError; hand the empty frame back unchanged instead.
    if split_sentences and not df.empty:
        df = sentencize(df.text.values,
                        df.document_id.values,
                        df.offset.values,
                        filter_len,
                        disable_progress_bar)
    return df


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap every document into one row covering its full text.

    No actual section splitting happens here: each document becomes a single
    record whose offset spans the whole string, i.e. `(0, len(document))`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset` (returned as-is when it has no rows)
    """
    paired = tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar)
    records = [
        {'document_id': doc_id, 'text': doc, 'offset': (0, len(doc))}
        for doc_id, doc in paired
    ]

    frame = pd.DataFrame(records)
    if frame.shape[0] == 0:
        # Nothing to sort (and no columns to sort by) when there were no documents.
        return frame
    return frame.sort_values(['document_id', 'offset']).reset_index(drop=True)


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split documents into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    so that, after splitting, each sentence can be matched to its location
    in the original document.

    Splitting is best-effort: a document for which sentence splitting raises is
    skipped rather than aborting the whole run.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable of (start, end) tuples; `start` is added to every sentence offset
    :param filter_len: A sentence is kept only when its offset span is strictly greater than this
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
             (an empty, column-less DataFrame when no sentence is kept)
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for start, end in sentence_offsets:
                if end - start > filter_len:
                    document_sentences.append({
                        'document_id': document_id,
                        'text': document[start:end],
                        # shift sentence-local offsets by the start of the enclosing piece
                        'offset': (start + offset[0], end + offset[0]),
                    })
        except Exception:
            # Deliberate best-effort: skip documents the splitter cannot handle.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be interrupted.
            continue
    return pd.DataFrame(document_sentences)

In [None]:
# Path of the sentence-embedding model loaded with SentenceTransformer for retrieval.
SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
# Directory with the Wikipedia parquet files that article text is read from.
WIKI_PATH = "/kaggle/input/wikipedia-20230701"
# NOTE(review): `wiki_files` is not referenced by any of the visible cells.
wiki_files = os.listdir(WIKI_PATH)
# Passed as `device=` to every `model.encode` call.
DEVICE = 0
# When True, the retrieval cells run and a context string is built for each question.
INFER = True
# True  -> load the competition test.csv and fill `answer` with a placeholder.
# False -> load the first 2000 rows of the 15k examples CSV and write the result to disk.
SUBMISSION = False

In [None]:
import ctypes
# Handle to the C library; later cells call `libc.malloc_trim(0)` right after deleting
# large objects and running `gc.collect()`.
libc = ctypes.CDLL("libc.so.6")

# Initial search with the title and the first sentence

In [None]:
# Load the question set. Both sources end up in `trn` with a default integer index,
# which later cells use as the prompt id.
# trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv").drop("id", axis=1)
if SUBMISSION:
    # Competition test set; the `id` column is dropped here.
    trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv").drop("id", axis=1)
else:
    # First 2000 rows of the 15k examples file, with rows containing NaN removed.
    trn = pd.read_csv("/kaggle/input/15k-high-quality-examples/15k_gpt3.5-turbo.csv").iloc[:2000].dropna().reset_index(drop=True)
#     trn = pd.concat([trn.iloc[:400],pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv").drop("id", axis=1)])
    
trn.head()

In [None]:
# Stage 1 retrieval: embed every prompt and look up its 5 nearest entries in the
# prebuilt Wikipedia faiss index.
if INFER:
    title_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")
    model = SentenceTransformer(SIM_MODEL, device='cuda')
    model.max_seq_length = 384
    # Switch the encoder to half precision.
    model = model.half()
    prompt_embeddings = model.encode(trn.prompt.values, batch_size=32, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
    prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
    gc.collect()

    # `ss` holds the search scores, `si` the index positions of the 5 hits per prompt;
    # `si` is used by the next cell to look up article ids.
    ss, si = title_index.search(prompt_embeddings, 5)
    # The index and the prompt embeddings are not needed again: drop them and trim the heap.
    del title_index
    del prompt_embeddings
    gc.collect()
    libc.malloc_trim(0)

# Import details of the articles selected in the initial search

In [None]:
# Stage 2: resolve the faiss hits to article ids/files, load the article text,
# split it into sentences and embed every sentence.
if INFER:
    # Lookup table giving, for each index row, the article `id` and the parquet `file` holding it.
    index_df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet",
                         columns=['id', 'file'])

    # One small frame per prompt: the rows of `index_df` selected by that prompt's hits,
    # tagged with the prompt's position in `si`.
    file_data = []
    for i,index in enumerate(si):
        temp = index_df.loc[index].copy()
        temp["prompt_id"] = i
        file_data.append(temp)
    # Debug output: the hits of the first prompt.
    print(file_data[0])
    file_data = pd.concat(file_data).reset_index(drop=True)
    # Sorted by file so that each parquet file is read once in the loop below.
    file_data = file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)

    del index_df
    gc.collect()

    # Read each referenced parquet file and keep only the articles that were hit.
    text_data = []
    for file in tqdm(file_data.file.unique(),total=len(file_data.file.unique())):
        text_df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])
        query_res = pd.merge(file_data[file_data['file']==file], text_df, on='id',how='inner')
        del text_df
        gc.collect()
        text_data.append(query_res)
    # An article hit by several prompts appears once per prompt after the merge; after
    # dropping `prompt_id`/`file` those rows are identical and are collapsed to one.
    text_data = pd.concat(text_data).drop(["prompt_id","file"],axis=1).drop_duplicates().reset_index(drop=True)
    gc.collect()

    libc.malloc_trim(0)

    # Sentence-level rows with `document_id` equal to the article id.
    processed_text_data = process_documents(text_data.text.values,text_data.id.values)
    print(processed_text_data.head(10))

    # Row i of `text_data_embeddings` corresponds to row i of `processed_text_data`.
    text_data_embeddings = model.encode(processed_text_data.text,
                                        batch_size=32,
                                        device=DEVICE,
                                        show_progress_bar=True,
                                        convert_to_tensor=True,
                                        normalize_embeddings=True)
    text_data_embeddings = text_data_embeddings.detach().cpu().numpy()

    gc.collect()

# Advanced Search

In [None]:
# Stage 3: for every question, rank the sentences of its candidate articles against
# the question (prompt + all five options) and concatenate the best ones into a context.
if INFER:
    trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)
    trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']
    question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=32, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
    question_embeddings = question_embeddings.detach().cpu().numpy()

    # Maximum number of sentences concatenated into one context.
    NUM_SENTENCES_INCLUDE = 20
    contexts = []

    for r in tqdm(trn.itertuples(), total=len(trn)):

        prompt_id = r.Index

        # Reset on every iteration: a question without candidate sentences must get an
        # empty context, not the previous question's (or a NameError on the first row).
        context = ""

        # Rows of `processed_text_data` that belong to the articles retrieved for this prompt.
        candidate_article_ids = file_data[file_data['prompt_id']==prompt_id]['id'].values
        prompt_indices = processed_text_data[processed_text_data['document_id'].isin(candidate_article_ids)].index.values

        if prompt_indices.shape[0] > 0:
            prompt_index = faiss.index_factory(text_data_embeddings.shape[1], "Flat")
            prompt_index.add(text_data_embeddings[prompt_indices])

            # Hoisted out of the inner loop: the candidate sentences in index order.
            candidate_text = processed_text_data.loc[prompt_indices, 'text']

            # Only this question's embedding is searched; querying the full matrix on
            # every iteration made the loop quadratic in the number of questions.
            ss, ii = prompt_index.search(question_embeddings[prompt_id:prompt_id + 1], NUM_SENTENCES_INCLUDE)
            for _i in ii[0]:
                # faiss pads with -1 when fewer than NUM_SENTENCES_INCLUDE sentences exist;
                # `.iloc[-1]` would otherwise append the last sentence again and again.
                if _i < 0:
                    continue
                context += candidate_text.iloc[_i] + " "

        contexts.append(context)

In [None]:
# Build the frame fed to the classifier: the prompt is prefixed with at most the first
# 2000 characters of its retrieved context, separated by " #### ".
new_trn = trn.copy()
if INFER:
    trn["context"] = contexts
    truncated_context = trn["context"].str[:2000]
    new_trn["prompt"] = truncated_context + " #### " + trn["prompt"]
if not SUBMISSION:
    # Outside submission mode the augmented frame is written to disk.
    new_trn.to_csv("train_with_infer_15k.csv", index=False)
else:
    # Submission mode: fill `answer` with a placeholder so downstream preprocessing has a label.
    new_trn["answer"] = "A"

In [None]:
# Column / dtype / non-null overview of the augmented frame.
# (A stray `break` used to follow this call; outside a loop it is a SyntaxError and
# stopped "Run All" before the inference cells could execute.)
new_trn.info()

In [None]:
# Load the multiple-choice classifier. The commented-out block below is an alternative
# that wraps the base model with a PEFT adapter instead of loading full weights.
# from peft import PeftModel,PeftConfig
# peft_model_dir = "/kaggle/input/nlp-project-train/model_v1"
# base_model_dir = "/kaggle/input/deberta-v3-large-hf-weights"
# config = PeftConfig.from_pretrained(peft_model_dir)
# tokenizer = AutoTokenizer.from_pretrained(base_model_dir)
# model = AutoModelForMultipleChoice.from_pretrained(base_model_dir).cuda()
# model = PeftModel.from_pretrained(model, peft_model_dir)
# model.eval()
# The tokenizer comes from the base model directory, the weights from the fine-tuned run.
base_model_dir = "/kaggle/input/deberta-v3-large-hf-weights"
model_dir = "/kaggle/input/llm-science-run-context-2"
tokenizer = AutoTokenizer.from_pretrained(base_model_dir)
# NOTE: this rebinds `model`, which held the SentenceTransformer used by the retrieval cells.
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()

In [None]:
from dataclasses import dataclass

# Bidirectional mapping between the answer letters 'A'..'E' and their positions 0..4.
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """
    Tokenize one question as five (prompt, option) pairs and attach its label.

    :param example: Mapping with the keys `prompt`, `A`..`E` and `answer`
    :return: The tokenizer output for the five pairs, with `label` set to the
             position (0-4) of the letter stored under `answer`
    """
    letters = 'ABCDE'
    # The same prompt is paired with each of the five options.
    prompts = [example['prompt'] for _ in letters]
    candidates = [example[letter] for letter in letters]

    encoded = tokenizer(prompts, candidates, truncation=True)
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """
    Collate tokenized multiple-choice examples into padded tensors.

    Each incoming feature holds, per tokenizer key, one list per answer choice.
    The choices of all examples are flattened, padded together through
    `tokenizer.pad`, and reshaped to `(batch_size, num_choices, -1)`.

    Labels are read from the `label` (or `labels`) key when the first feature has
    one; without labels the batch is returned without a `labels` entry. The
    caller's feature dicts are left untouched.
    """
    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        :param features: Non-empty list of dicts; every non-label value is a list with
                         one entry per answer choice
        :return: Dict of tensors shaped `(batch_size, num_choices, -1)`, plus an int64
                 `labels` tensor when the features carry labels
        """
        label_keys = ('label', 'labels')
        first = features[0]
        label_name = next((key for key in label_keys if key in first), None)
        # Read labels instead of popping them so the input features are not mutated.
        labels = None if label_name is None else [feature[label_name] for feature in features]

        batch_size = len(features)
        num_choices = len(first['input_ids'])
        # One flat record per (example, choice), example-major -- built directly rather
        # than via `sum(list_of_lists, [])`, which copies the accumulator on every step.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k not in label_keys}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        if labels is not None:
            batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# The CSV is only written when SUBMISSION is False; in submission mode the in-memory
# `new_trn` built above is used as-is instead of failing on a missing file.
if not SUBMISSION:
    new_trn = pd.read_csv("/kaggle/working/train_with_infer_15k.csv")
new_trn.head()

In [None]:
# Columns consumed by `preprocess`; they are dropped again once tokenized.
model_input_columns = ["prompt", *"ABCDE", "answer"]

test = Dataset.from_pandas(new_trn[model_input_columns])
tokenized_test = test.map(preprocess, remove_columns=model_input_columns)

# One question per batch, in the original row order.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(
    tokenized_test,
    batch_size=1,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:
# Run the classifier over every batch and collect the per-option logits on the CPU.
logit_chunks = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        gpu_batch = {name: tensor.cuda() for name, tensor in batch.items()}
        logit_chunks.append(model(**gpu_batch).logits.detach().cpu())

# Concatenate along the batch dimension: one row of logits per question.
test_predictions = torch.cat(logit_chunks).numpy()

In [None]:
# Order the option letters of every question from highest to lowest logit.
option_letters = np.array(list('ABCDE'))
ranked_option_idx = np.argsort(-test_predictions, axis=1)
prediction_letter = option_letters[ranked_option_idx]

# Evaluation

In [None]:
def precision_at_k(r, k):
    """
    Precision at k.

    :param r: Sequence of relevance flags (truthy = relevant), best-ranked first
    :param k: Number of leading entries to consider; must satisfy 1 <= k <= len(r)
    :return: Fraction of the first `k` entries that are relevant
    """
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k

def MAP_at_3(predictions, true_items):
    """
    Mean average precision at 3.

    :param predictions: Sequence with one ranked list of predicted items per question
    :param true_items: The correct item for each question, in the same order as
                       `predictions` (matched by position, so a pandas Series with any
                       index works)
    :return: Mean over all questions of the average precision within the top 3
             predictions; 0.0 when there are no predictions
    """
    U = len(predictions)
    if U == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    # Materialise so that lookup is positional; `series[u]` would be label-based.
    true_list = list(true_items)

    map_at_3 = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_list[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), 3)):
            # Only ranks holding a correct item contribute their precision.
            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]
    return map_at_3 / U

In [None]:
# Score the ranked predictions against the `answer` column. When SUBMISSION is True that
# column is the placeholder "A", so the value is only meaningful otherwise.
MAP_at_3(prediction_letter, new_trn["answer"])

In [None]:
# Keep the three best-ranked letters per question, joined by single spaces.
top_three_letters = prediction_letter[:, :3]
predictions_as_string = [' '.join(row) for row in top_three_letters]
trn['prediction'] = predictions_as_string

In [None]:
# The row index becomes the `id` column of the submission file.
submission = (
    trn[['prediction']]
    .reset_index()
    .rename(columns={'index': 'id'})
)
submission.to_csv('submission.csv', index=False)