# BERT / GELECTRA Implementation

## 1. Environment Setup

### 1.1. Imports

In [1]:
## Importing DS modules
import pandas as pd

## Importing Torch modules
import torch

## Importing HF modules
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

## Importing LangChain modules
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import CohereEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

## Importing other modules
import copy
import os

### 1.2. Global Variables

In [26]:
## Root folder of the shared project library.
## Set the NLP_PROJECT_DIR environment variable to run the notebook on another machine;
## when it is unset, the original location is used and every path below keeps its old value.
PROJECT_DIR = os.environ.get(
    "NLP_PROJECT_DIR",
    "/Users/kaanaydin/Library/CloudStorage/OneDrive-SharedLibraries-UniversitätSt.Gallen/STUD-NLP Group Project - General/02 Group project",
)

## Path for documents to be retrieved
DOCUMENT_PATH = os.path.join(PROJECT_DIR, "data", "raw_data")

## Path for finetuned_model
FINETUNED_MODEL_PATH = os.path.join(PROJECT_DIR, "bert", "bert-finetuned")

## Path for test data
TEST_LOAD_PATH = os.path.join(PROJECT_DIR, "data", "data-preparation", "test.csv")

## Path to save result on test
TEST_SAVE_PATH = os.path.join(PROJECT_DIR, "evaluation")

In [3]:
## Selecting the Torch device: prefer MPS for GPU acceleration, then CUDA, and fall back
## to the CPU so the notebook also runs on machines without a supported GPU
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [4]:
## Setting up environment variables for OpenAI.
## setdefault keeps a key that is already exported in the environment instead of
## overwriting it with the empty placeholder; do not commit a real key to the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")

## 2.0 Vectorstore Setup

In [5]:
## Reading the PDF files found under DOCUMENT_PATH into the list of documents to index
loader = PyPDFDirectoryLoader(path=DOCUMENT_PATH)
documents = loader.load()

In [6]:
## Tokenizer used to measure chunk lengths. It is loaded here because the text splitter
## calls token_length before the pipeline setup cell runs; that cell loads the same
## tokenizer again under the same name.
tokenizer = AutoTokenizer.from_pretrained("deepset/gelectra-large-germanquad")


## Defining the length function used when splitting the documents into chunks

def token_length(text):
    """Return the number of token ids the module-level tokenizer produces for text.

    Args:
        text: String to measure.

    Returns:
        Length of the sequence returned by tokenizer.encode(text).
    """
    return len(tokenizer.encode(text))

In [46]:
## Chunking rules for the documents to be retrieved: chunk size and overlap are
## measured with token_length rather than with the default length function
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=token_length,
    chunk_overlap=50,
    chunk_size=400,
)

In [47]:
## Splitting the loaded documents into the chunks that will be embedded and retrieved
documents_splitted = text_splitter.split_documents(documents=documents)

In [48]:
## Instantiating the embedding models.
## The HuggingFace embedding is the one used to build the vectorstore.
embeddings_huggingface = HuggingFaceEmbeddings()

## The OpenAI embedding needs an API key, so it is only built when OPENAI_API_KEY is
## non-empty; otherwise it is None and the rest of the notebook still runs.
embeddings_openai = OpenAIEmbeddings() if os.environ.get("OPENAI_API_KEY") else None

In [60]:
## Building the Chroma vectorstore from the document chunks, embedded with the HuggingFace model
vs = Chroma.from_documents(
    documents=documents_splitted,
    embedding=embeddings_huggingface,
)

Using embedded DuckDB without persistence: data will be transient


## 3.0 Pipeline Setup

In [50]:
## Loading the tokenizer published as deepset/gelectra-large-germanquad
tokenizer = AutoTokenizer.from_pretrained("deepset/gelectra-large-germanquad")

## Loading the finetuned question-answering model from disk
model = AutoModelForQuestionAnswering.from_pretrained(FINETUNED_MODEL_PATH)

## Moving the model to the selected device (the trailing ';' hides the cell output)
model.to(DEVICE);

In [51]:
## Setting up pipeline for question-answering.
## The pipeline runs on DEVICE, the same device the model was moved to, instead of
## repeating a hard-coded device name here.
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer, device=DEVICE)

In [52]:
## Function to retrieve the context for a query from the vectorstore

def generate_context(query, vectorstore, topk):
    """Concatenate the text of the topk chunks most similar to query.

    The vectorstore is searched once with k=topk. The original version repeated the
    search on every loop iteration and relied on the search's default result count,
    which raised IndexError whenever topk exceeded the number of returned hits.

    Args:
        query: Question text used for the similarity search.
        vectorstore: Object exposing similarity_search(query, k=...) whose results
            carry a page_content attribute.
        topk: Maximum number of chunks to include.

    Returns:
        The page_content of the returned chunks joined without a separator (as in
        the original implementation); an empty string when topk is not positive
        or nothing is found.
    """
    if topk <= 0:
        return ""

    hits = vectorstore.similarity_search(query, k=topk)

    ## Slicing guards against a store that returns more results than requested
    return "".join(hit.page_content for hit in hits[:topk])

## Function to extract the answer text for a query from a given context

def query_model(pipeline, query, context):
    """Run the question-answering pipeline and return only its answer string.

    Args:
        pipeline: Callable taking a dict with 'context' and 'question' keys and
            returning a mapping that contains an "answer" entry.
        query: The question to answer.
        context: The text in which the answer is searched.

    Returns:
        The value stored under "answer" in the pipeline result.
    """
    payload = {'context': context, 'question': query}
    return pipeline(payload)["answer"]

In [53]:
## Setting example query (umlaut restored: the literal had been saved with a broken encoding)
query = "Wie lange darf die Arbeitszeit während der Nacht maximal sein?"

In [None]:
## Running the example query end to end

## Step 1: build the context from the 4 most similar chunks in the vectorstore
context = generate_context(query, vs, topk=4)

## Step 2: let the question-answering pipeline extract the answer from that context
answer = query_model(pipeline=qa_pipeline, query=query, context=context)

## 4.0 Model Evaluation (on test data)

In [55]:
## Loading the test set from the CSV file at TEST_LOAD_PATH
test = pd.read_csv(filepath_or_buffer=TEST_LOAD_PATH)

In [56]:
## Label of the evaluated model; it becomes the prefix of the saved results file
model_name = 'gelectra-finetuned'

In [57]:
## Running retrieval and question answering for every question of the test set

answers = []

for raw_question in test["Frage"]:

    ## Questions are cast to str before they are sent to the retriever and the model
    question = str(raw_question)

    ## Context from the 4 most similar chunks, then the answer extracted from it
    context = generate_context(question, vs, 4)
    answer = query_model(qa_pipeline, question, context)

    ## Keeping the answer in test-set order
    answers.append(answer)

## Copy of the test set with the model answers added as the "ModelAntwort" column
evaluation = test.assign(ModelAntwort=answers)

In [59]:
## Writing the evaluated test set to <TEST_SAVE_PATH>/<model_name>_evaluation.csv without the index column
evaluation_file = os.path.join(TEST_SAVE_PATH, f'{model_name}_evaluation.csv')
evaluation.to_csv(evaluation_file, index=False)