# GPT-LangChain Implementation

## 1. Environment Setup

### 1.1. Imports

In [1]:
## Importing DS modules
import pandas as pd

## Importing OpenAI modules
import tiktoken

## Importing LangChain modules
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

## Importing other modules
import copy
import getpass
import os

### 1.2. Global Variables

In [2]:
## Root folder of the shared project data.
## Export NLP_PROJECT_ROOT before starting the kernel to run the notebook on
## another machine; without it the original author's local folder is used.
PROJECT_ROOT = os.environ.get(
    "NLP_PROJECT_ROOT",
    "/Users/kaanaydin/Library/CloudStorage/OneDrive-SharedLibraries-UniversitätSt.Gallen/STUD-NLP Group Project - General/02 Group project",
)

## Path for documents to be retrieved
DOCUMENT_PATH = os.path.join(PROJECT_ROOT, "data", "raw_data")

## Path for test data
TEST_LOAD_PATH = os.path.join(PROJECT_ROOT, "data", "data-preparation", "test.csv")

## Path to save result on test
TEST_SAVE_PATH = os.path.join(PROJECT_ROOT, "evaluation")

In [3]:
## Setting up environment variables for OpenAI.
## A key that is already exported in the environment is kept as it is. Only when
## none is present is it requested interactively, so the secret is never written
## into the notebook source or its saved outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

## 2. Vectorstore Setup

In [4]:
## Loading the PDF documents under DOCUMENT_PATH; they form the corpus to be retrieved from
loader = PyPDFDirectoryLoader(DOCUMENT_PATH)
documents = loader.load()

In [5]:
## Choosing the tokenizer that belongs to the OpenAI model "text-davinci-003".
## encoding_for_model returns the encoding object itself, so its result is bound
## directly instead of being discarded and the encoding name repeated by hand.
tokenizer = tiktoken.encoding_for_model("text-davinci-003")

In [6]:
## Defining the length function used to measure text in tokens

def token_length(text):
    """Return how many tokens the notebook's ``tokenizer`` produces for ``text``.

    ``disallowed_special=()`` is passed so that no special-token marker found
    in the text is rejected by the encoder.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [7]:
## Defining how the documents are cut into chunks for retrieval.
## Sizes are counted in tokens because token_length is the length function;
## the separators are listed from the coarsest (blank line) to the finest (empty string).

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=token_length,
    chunk_size=400,
    chunk_overlap=50,
)

In [8]:
## Applying the text splitter to the loaded documents to obtain the chunks to be retrieved
documents_splitted = text_splitter.split_documents(documents)

In [9]:
## Instantiating the embedding model from OpenAI (no arguments are passed, so the library defaults apply)
embeddings = OpenAIEmbeddings()

In [10]:
## Creating our vectorstore from the chunks in documents_splitted and the OpenAI embedding.
## No persistence directory is passed; as the notice printed below states, the stored data is transient.
vs = Chroma.from_documents(documents_splitted, embeddings)

Using embedded DuckDB without persistence: data will be transient


## 3. Pipeline Setup

In [11]:
## Setting the conversation memory buffer; the history is kept under the key "chat_history" as message objects
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [12]:
## Building the question-answering chain: an OpenAI LLM (temperature 0), the
## vectorstore as retriever, and the conversation memory defined above
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0),
    retriever=vs.as_retriever(),
    memory=memory,
)

In [13]:
## Function to retrieve the answer from the model
def query_model(pipeline, query):
    """Send ``query`` through ``pipeline`` and return the generated answer.

    Args:
        pipeline: Callable that takes ``{"question": <query>}`` and returns a
            mapping containing an ``"answer"`` entry.
        query: The question to ask.

    Returns:
        The value stored under ``"answer"`` in the pipeline's result.
    """
    return pipeline({"question": query})["answer"]

In [14]:
## Setting example query (German for: "Can I take unpaid leave during my apprenticeship?")
query = "Kann ich in der Lehre unbezahlten Urlaub nehmen?"

In [15]:
## Prompting our model based on example query.
## NOTE: `qa` was built with `memory`, so each run of this cell is recorded in the chat history shown below.
query_model(qa, query)

' Grundsätzlich ist in einer Lehre ein unbezahlter Urlaub nicht vorgesehen. Sie können aber mit dem Lehrbetrieb vereinbaren, dass Sie das Lehrverhältnis nach bestandener Lehrabschlussprüfung auflösen.'

In [16]:
## Retrieving the chat history currently held in the conversation memory
memory.load_memory_variables({})

{'chat_history': [HumanMessage(content='Kann ich in der Lehre unbezahlten Urlaub nehmen?', additional_kwargs={}, example=False),
  AIMessage(content=' Grundsätzlich ist in einer Lehre ein unbezahlter Urlaub nicht vorgesehen. Sie können aber mit dem Lehrbetrieb vereinbaren, dass Sie das Lehrverhältnis nach bestandener Lehrabschlussprüfung auflösen.', additional_kwargs={}, example=False)]}

## 4. Model Evaluation (on test data)

In [17]:
## Retrieving test data; the evaluation loop below reads the questions from its `Frage` column
test = pd.read_csv(TEST_LOAD_PATH)

In [18]:
## Name of model currently to be evaluated; it is used as the prefix of the results file name.
## NOTE(review): the chains are built with OpenAI(temperature=0) and no explicit model name,
## so this label is not tied to the model actually called - confirm that it matches.
model_name = "gpt-3.5-davinci"

In [19]:
## Collecting the model answers for every question in the test data

## Only the memory is specific to a question; the LLM wrapper and the retriever
## are loop-invariant, so they are built once and shared by all iterations.
eval_llm = OpenAI(temperature=0)
eval_retriever = vs.as_retriever()

answers = []

for row in test.itertuples():

    ## Retrieving question from test
    question = str(row.Frage)

    ## Fresh memory for every question so that earlier test questions do not end
    ## up in the chat history. Loop-specific names are used on purpose: the
    ## interactive `memory` and `qa` objects from section 3 stay untouched.
    eval_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    eval_qa = ConversationalRetrievalChain.from_llm(eval_llm, eval_retriever, memory=eval_memory)

    ## Generating the answer based on the pipeline
    answer = query_model(eval_qa, question)

    ## Appending answer to the list
    answers.append(answer)

## Adding the answers as a new column; assign returns a new DataFrame, so `test` is left unchanged
evaluation = test.assign(ModelAntwort=answers)

In [20]:
## Saving the testset results.
## The target folder is created first if it is missing, so that the answers
## collected above are not lost to a failing write.
os.makedirs(TEST_SAVE_PATH, exist_ok=True)
evaluation.to_csv(os.path.join(TEST_SAVE_PATH, f'{model_name}_evaluation.csv'), index=False)