In [1]:
# Which environment the notebook runs in; switch by swapping the two lines below.
MACHINE = "JAYOO_PC"
# MACHINE = "KAGGLE"

DEVICE = "GPU"

# Prefix prepended to every `/kaggle/input/...` path built further down.
ROOT = '/jayoo' if MACHINE == "JAYOO_PC" else '/'

# When True, later cells also write their intermediate results to CSV files.
SAVE = False

# OpenBook DeBERTaV3-Large with an updated model

This work is based on the great [work](https://www.kaggle.com/code/nlztrk/openbook-debertav3-large-baseline-single-model) of [nlztrk](https://www.kaggle.com/nlztrk).

I trained a model offline using the dataset I shared [here](https://www.kaggle.com/datasets/mgoksu/llm-science-exam-dataset-w-context). I just added my model to the original notebook. The model is available [here](https://www.kaggle.com/datasets/mgoksu/llm-science-run-context-2).

I also addressed the problem of [CSV Not Found at submission](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/434228) with this notebook by clipping the context like so:

`test_df["prompt"] = test_df["context"].apply(lambda x: x[:1500]) + " #### " +  test_df["prompt"]`

You can probably keep more than the first 1500 characters of context without running out of memory (OOM).

In [None]:
# installing offline dependencies
# (the three commented-out lines are the Kaggle-dataset variants of the faiss / sentence-transformers installs)
# !pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# !cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
# !pip install -U /kaggle/working/sentence-transformers
# These two are installed by package name rather than from a local wheel, and no version is pinned.
!pip install faiss-gpu
!pip install sentence_transformers

# NOTE(review): the wheel paths below are hard-coded to /jayoo and do not use ROOT from the config cell.
!pip install -U /jayoo/kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl
# !pip install --no-index --no-deps /jayoo/kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /jayoo/kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
# !pip install --no-index --no-deps /jayoo/kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
!pip install --no-index --no-deps /jayoo/kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

In [2]:
import os
import subprocess
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf
from __future__ import annotations
import pickle

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader

from IPython.display import FileLink, display



In [3]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame of text spans, optionally one row per sentence.

    Every document is first wrapped into a single whole-document row by
    `sectionize_documents`. When `split_sentences` is set, those rows are then
    broken into sentences by `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split documents into sentences
    :param filter_len: Sentences are kept only when their offset span (end - start) is greater than this
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`.
             When there are no documents, the empty frame produced by
             `sectionize_documents` is returned unchanged.
    """

    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    # An empty frame may carry no `text` / `document_id` / `offset` columns, in
    # which case the attribute access below raises AttributeError. There is
    # nothing to split anyway, so return it as-is.
    if split_sentences and not df.empty:
        df = sentencize(df.text.values,
                        df.document_id.values,
                        df.offset.values,
                        filter_len,
                        disable_progress_bar)
    return df


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap each document into one row that spans the whole document.

    No section detection or filtering happens here: the "section" of a
    document is the full text, with offset `(0, len(document))`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset`. Empty input yields an empty
             frame that still has these three columns.
    """
    # `total` only feeds the progress display; generators have no len(), so
    # fall back to an open-ended bar instead of raising TypeError.
    total = len(documents) if hasattr(documents, "__len__") else None

    processed_documents = [
        {'document_id': document_id, 'text': document, 'offset': (0, len(document))}
        for document_id, document in tqdm(zip(document_ids, documents),
                                          total=total,
                                          disable=disable_progress_bar)
    ]

    if not processed_documents:
        # Keep the documented columns so callers can still access `.text` etc.
        return pd.DataFrame(columns=['document_id', 'text', 'offset'])

    return (pd.DataFrame(processed_documents)
            .sort_values(['document_id', 'offset'])
            .reset_index(drop=True))


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    Documents that raise during splitting are skipped (best effort). Rows
    already produced for such a document are kept, and a single warning with
    the number of affected documents is emitted at the end.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Sentences are kept only when their offset span (end - start) is greater than this
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
             (the columns are present even when no sentence was produced)
    """
    import warnings

    # `total` only feeds the progress display; tolerate inputs without len().
    total = len(documents) if hasattr(documents, "__len__") else None

    document_sentences = []
    failed_documents = 0
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets),
                                              total=total,
                                              disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    # Shift the sentence span by the start of the enclosing section
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    document_sentences.append({
                        'document_id': document_id,
                        'text': sentence,
                        'offset': abs_offsets,
                    })
        except Exception:
            # Was a bare `except:`, which also trapped KeyboardInterrupt and
            # SystemExit. Stay best-effort, but count what was dropped.
            failed_documents += 1
            continue

    if failed_documents:
        warnings.warn(
            f"sentencize: {failed_documents} document(s) raised during sentence splitting "
            "and were not fully processed"
        )

    return pd.DataFrame(document_sentences, columns=['document_id', 'text', 'offset'])


# fully clear memory
def clear_mem():
    """Run a garbage-collection pass, call libc's `malloc_trim(0)`, then empty PyTorch's CUDA cache."""
    gc.collect()
    libc.malloc_trim(0)
    torch.cuda.empty_cache()
    
def download_file(path, file_name):
    """
    Zip `path` into `<file_name>.zip` and display a download link for the archive.

    The working directory is changed to `/kaggle/working/` first (as before),
    so a relative `path` and the archive are both resolved there.

    :param path: File or directory to archive; passed to `zip -r` as a single argument
    :param file_name: Archive name without the `.zip` extension
    :return: None. If `zip` is missing or exits non-zero, a message is printed
             and no link is displayed.
    """
    os.chdir('/kaggle/working/')
    archive_name = f"{file_name}.zip"
    # Argument list instead of a shell string: spaces or shell metacharacters
    # in `path` / `file_name` are passed through verbatim, not interpreted.
    command = ["zip", "-r", archive_name, path]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as err:
        # Without a shell, a missing `zip` binary raises instead of returning exit code 127.
        print("Unable to run zip command!")
        print(err)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(archive_name))

In [5]:
# Retrieval settings. DEVICE is rebound here from the "GPU" string of the
# first config cell to the integer 0 that is passed to `model.encode`.
SIM_MODEL = 'BAAI/bge-small-en-v1.5'
DEVICE = 0
MAX_LENGTH = 512
BATCH_SIZE = 1024

# Wikipedia parquet dump and the files it contains
WIKI_PATH = f"{ROOT}/kaggle/input/wikipedia-20230701"
wiki_files = os.listdir(WIKI_PATH)

# Directory used for the saved sentence table and embedding arrays
FILE_DIR = f"{ROOT}/kaggle/input/bge/prefix"

# Number of GPUs faiss can see
ngpus = faiss.get_num_gpus()
print(ngpus)

1


# Relevant Title Retrieval

In [None]:
# Earlier input, kept for reference:
# trn = pd.read_csv("/kaggle/input/53k-cleaned/53k_cleaned.csv").drop("id", 1)

# Load the question set and discard its existing `context` column.
trn = pd.read_csv(f"{ROOT}/kaggle/input/chris_data/54k_nota.csv").drop(columns="context")
trn.head()

In [None]:
## Combine all answers: the five options A..E, stringified and space-joined
trn['answer_all'] = trn.apply(
    lambda row: " ".join(str(row[option]) for option in "ABCDE"),
    axis=1,
)

## Search using the prompt and answers to guide the search;
## the bge retrieval prefix goes in front of the combined text
prefix = 'Represent this sentence for searching relevant passages: '
trn['prompt_answer_stem'] = prefix + (trn['prompt'] + " " + trn['answer_all'])
trn['prompt_answer_stem']

In [None]:
# Sentence-embedding model, moved to CUDA and cast to half precision.
model = SentenceTransformer(SIM_MODEL).cuda().half()
# Maximum sequence length the model is configured with.
model.max_seq_length = MAX_LENGTH

In [None]:
# Load the prebuilt faiss index from disk (the commented line is the previously used index file).
# sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")
sentence_index = read_index(ROOT+"/kaggle/input/faiss-index/bge_wikiAbstract.index")

# move index to gpu
# (currently disabled: every line below is commented out, so the index is used as loaded)
# if ngpus > 1:
    # sentence_index = faiss.index_cpu_to_all_gpus(sentence_index)
# if ngpus == 1:
#     res = faiss.StandardGpuResources()
#     sentence_index = faiss.index_cpu_to_gpu(res, 0, sentence_index)

In [None]:
# Embed every prefixed "prompt + answers" string; embeddings are requested normalized.
prompt_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=32, device=DEVICE,
                                 show_progress_bar=True, normalize_embeddings=True) #convert_to_tensor=True
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
_ = gc.collect()
# torch.cuda.empty_cache()

In [None]:
## Get the top 6 pages (k=6) that are likely to contain the topic of interest
## search_score / search_index hold, per prompt, the scores and index rows of those hits
search_score, search_index = sentence_index.search(np.float32(prompt_embeddings), 6)

In [None]:
## Save memory - delete sentence_index and prompt_embeddings since they are no longer necessary
del sentence_index
del prompt_embeddings
_ = gc.collect()
# Ask libc to trim freed heap memory
libc.malloc_trim(0)

In [None]:
# save

# load
# search_index = np.load('search_index.np.npy')

# Getting Sentences from the Relevant Titles

In [None]:
# Wikipedia index table; only the article `id` and the parquet `file` it is stored in are read.
df = pd.read_parquet(ROOT+"/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet",
                     columns=['id', 'file'])

In [None]:
## Get the article and associated file location using the index:
## one small frame per prompt, tagged with the prompt's position
per_prompt_frames = [
    df.loc[hit_rows].assign(prompt_id=prompt_pos)
    for prompt_pos, hit_rows in enumerate(search_index)
]
wikipedia_file_data = pd.concat(per_prompt_frames).reset_index(drop=True)
wikipedia_file_data = (
    wikipedia_file_data[['id', 'prompt_id', 'file']]
    .drop_duplicates()
    .sort_values(['file', 'id'])
    .reset_index(drop=True)
)

# Save
if SAVE is True:
    wikipedia_file_data.to_csv('wiki_file_data.csv', index=False)


## Save memory - the index table is no longer necessary
del df
_ = gc.collect()
libc.malloc_trim(0)

In [None]:
## Get the full text data, reading each referenced parquet file once
unique_files = wikipedia_file_data.file.unique()
wiki_text_data = []

for file in tqdm(unique_files, total=len(unique_files)):
    ids_in_file = wikipedia_file_data.loc[wikipedia_file_data['file'] == file, 'id']
    # The comparison below is done on the string form of the ids
    _id = list(map(str, ids_in_file.tolist()))
    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    _df_temp = _df[_df['id'].isin(_id)].copy()
    # Release the full file frame before moving on to the next file
    del _df
    _ = gc.collect()
    libc.malloc_trim(0)
    wiki_text_data.append(_df_temp)

wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

if SAVE is True:
    # save wiki_text_data
    wiki_text_data.to_csv('wiki_text_data.csv', index=False)
    download_file("wiki_text_data.csv", "wiki_text_data")

In [None]:
# With SAVE on, continue from the CSV written by the previous cell
if SAVE is True:
    wiki_text_data = pd.read_csv('wiki_text_data.csv')

## Parse documents into sentences
processed_wiki_text_data = process_documents(
    wiki_text_data.text.values,
    wiki_text_data.id.values,
)

if SAVE is True:
    # Save for later
    processed_wiki_text_data.to_csv('processed_wiki_text_data.csv', index=False)
    download_file("processed_wiki_text_data.csv", "processed_wiki_text_data")

In [None]:
# # Save
# processed_wiki_text_data.to_csv('processed_wiki_text_data.csv', index=False)
# wikipedia_file_data.to_csv('wiki_file_data.csv', index=False)

In [None]:
# The raw article text is no longer needed once the sentence table exists.
del wiki_text_data
_ = gc.collect()
libc.malloc_trim(0)

# Continue on GPU

In [None]:
# load
# NOTE(review): read from FILE_DIR, whereas the save cell above writes to the working directory - confirm the file was copied there.
processed_wiki_text_data = pd.read_csv(FILE_DIR+'/processed_wiki_text_data.csv')

# Take the SECOND half of the wiki sentences (rows half_index .. end) for embedding
half_index = len(processed_wiki_text_data) // 2
processed_text_half = processed_wiki_text_data['text'].iloc[half_index:].to_numpy()
processed_text_half

In [None]:
# Embed the second half of the sentences; `half` is only referenced by the commented-out pickle code in the next cell.
half = 2
wiki_data_embeddings2 = model.encode(processed_text_half,
                                    batch_size=BATCH_SIZE,
                                    device=DEVICE,
                                    show_progress_bar=True,
                                    # convert_to_tensor=True,
                                    normalize_embeddings=True)  #.half()
# wiki_data_embeddings1 = wiki_data_embeddings1.detach().cpu().numpy()


In [None]:
# # pickle your list of embeddings
# with open(f"wiki_data_embs{half}.pkl", "wb") as fp: 
#     pickle.dump(wiki_data_embeddings1, fp)  
# # download_file(f"wiki_data_embs{half}.pkl", f"wiki_data_embs{half}")

# np.save appends ".npy"; the array is loaded back as wiki_data_embs2.npy further down.
np.save(FILE_DIR+'/wiki_data_embs2', wiki_data_embeddings2)

In [None]:
# Free the sentence table and the in-memory embeddings now that they are saved to disk.
del processed_text_half
del processed_wiki_text_data
del wiki_data_embeddings2
_ = gc.collect()
libc.malloc_trim(0)

Combine wiki_data_embeddings

# Normal

In [None]:
# ## Get embeddings of the wiki text data
# wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
#                                     batch_size=BATCH_SIZE,
#                                     device=DEVICE,
#                                     show_progress_bar=True,
# #                                     convert_to_tensor=True,
#                                     normalize_embeddings=True)#.half()
# # wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()

# _ = gc.collect()

# Extracting Matching Prompt-Sentence Pairs

In [7]:
# Load saved wiki embeddings
# NOTE(review): wiki_data_embs1.npy is not written by any cell shown here (only half 2 is) - presumably from an earlier run; confirm it exists.
half_embs1 = np.load(FILE_DIR+'/wiki_data_embs1.npy')
half_embs2 = np.load(FILE_DIR+'/wiki_data_embs2.npy')
# Order matters: rows are later selected with row indices of processed_wiki_text_data, so half 1 must come first.
wiki_data_embeddings = np.concatenate((half_embs1, half_embs2))

# save whole embeddings
np.save(FILE_DIR+'/full_wiki_embs', wiki_data_embeddings)

In [None]:
# load wiki data
# NOTE(review): both files are read from the working directory; the cells above only write them when SAVE is True - confirm they are present.
wikipedia_file_data = pd.read_csv('wiki_file_data.csv')
processed_wiki_text_data = pd.read_csv('processed_wiki_text_data.csv')

In [None]:
# clear mem: the two halves are no longer needed after concatenation
del half_embs1
del half_embs2
_ = gc.collect()
libc.malloc_trim(0)

In [None]:
# Embed the prefixed "prompt + answers" strings again (the earlier prompt_embeddings were deleted after the index search).
question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE,
                                   show_progress_bar=True, normalize_embeddings=True)  #convert_to_tensor=True
# question_embeddings = question_embeddings.detach().cpu().numpy()

In [None]:

## Parameter to determine how many relevant sentences to include
NUM_SENTENCES_INCLUDE = 22

## List containing just context (one string per row of `trn`, in row order)
contexts = []

# GPU resources are created once, and only when faiss can see a GPU.
# (`res` was previously used without ever being defined.)
res = faiss.StandardGpuResources() if faiss.get_num_gpus() > 0 else None

# `prompt_id` is the row POSITION in `trn`: that is how it was assigned when
# wikipedia_file_data was built, and how question_embeddings is ordered.
for prompt_id in tqdm(range(len(trn)), total=len(trn)):

    # Sentences belonging to the articles retrieved for this prompt
    article_ids = wikipedia_file_data.loc[wikipedia_file_data['prompt_id'] == prompt_id, 'id'].values
    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(article_ids)].index.values

    # Reset for every prompt so that a prompt without candidate sentences gets
    # an empty context instead of the previous prompt's text (or a NameError).
    context = ""

    if prompt_indices.shape[0] > 0:
        prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
        prompt_index.add(np.float32(wiki_data_embeddings[prompt_indices]))

        if res is not None:
            prompt_index = faiss.index_cpu_to_gpu(res, 0, prompt_index)

        # Candidate sentences in the same order as the vectors added above
        candidate_texts = processed_wiki_text_data.loc[prompt_indices, 'text']

        ## Get the top matches for THIS prompt only (a single query row),
        ## instead of searching every question on every iteration
        query = np.float32(question_embeddings[prompt_id:prompt_id + 1])
        ss, ii = prompt_index.search(query, NUM_SENTENCES_INCLUDE)
        for _s, _i in zip(ss[0], ii[0]):
            # faiss pads the result with -1 when fewer than k vectors are indexed;
            # `.iloc[-1]` would silently repeat the last candidate sentence.
            if _i < 0:
                continue
            context += candidate_texts.iloc[_i] + " "

    contexts.append(context)

In [None]:
# Attach the retrieved context strings (one per row, in row order) to the question table.
trn['context'] = contexts

In [None]:
trn

In [None]:
# Write the final table to the working directory; assumes the input CSV provided `answer` and `source` columns - TODO confirm.
trn[["prompt", "A", "B", "C", "D", "E", "answer", "context", "source"]].to_csv("53k_bge_wikiAbstract.csv", index=False)

In [None]:
!ls

In [None]:
trn.loc[100]

# Inference