In [None]:
%pip install transformers==4.35.0 --quiet
%pip install xformer==1.0.1 --quiet
%pip install sentence-transformers==2.2.2 --quiet
%pip install openai==0.28.1 --quiet 
%pip install langchain==0.0.320 --quiet
%pip install chromadb==0.4.14 --quiet
%pip install tiktoken==0.5.1--quiet

### Paraméterek

In [None]:
# Folder holding the resource files that are currently in use
res_folder = "res/in_use/"

# Text splitting parameters
chunk_size, chunk_overlap = 500, 50

# Vector store parameters
persist_directory = "res/chroma/"
search_type = "mmr"
search_k, search_fetch_k = 5, 8
lambda_mult = 0.6

# Conversation memory (passed as the window size k of the chat memory)
memory_k = 3

# ChatGPT parameters
model_id = "gpt-3.5-turbo"
temperature = 0.4
max_tokens = 500

## Erőforrásfájlok betöltése

Ha már egyszer megtettük és van mentett vektor adatbázis, akkor nem kell újra futtatni.
### CSV fájlok (nagyrészt kérdések) betöltése

In [None]:
from langchain.document_loaders import CSVLoader, DirectoryLoader

# Load every *.csv file found in the resource folder with CSVLoader (UTF-8).
directory_loader = DirectoryLoader(
    res_folder,
    glob="*.csv",
    loader_cls=CSVLoader,
    loader_kwargs={"encoding": "utf-8"},
    use_multithreading=True,
)
csv_data = directory_loader.load()
print(len(csv_data))


### Szöveges fájlok betöltése, majd feldarabolása

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader

# Load every *.txt file found in the resource folder with TextLoader (UTF-8).
directory_loader = DirectoryLoader(
    res_folder,
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    use_multithreading=True,
)
text_data = directory_loader.load()
len(text_data)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded text documents into overlapping chunks.
# The separators are regular expressions (is_separator_regex=True), so they are
# written as raw strings: "\s" inside a normal string literal is an invalid
# escape sequence and triggers a Deprecation/SyntaxWarning on recent Pythons.
# The patterns match exactly the same text as before.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=True,
    separators=[r"\n\s*\n", r"\n\s*", r"\n"],
)

split_text_data = text_splitter.split_documents(text_data)
print(len(split_text_data))

### VectorStore inicializálása Chroma-val

In [None]:
# OpenAI API Key beállítása
import openai
import os
# Read the key from the environment; this is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Load the embedding models: a local Instructor model (on CPU) and OpenAI embeddings.
instr_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    query_instruction="Represent the query for retrieval: ",
    model_kwargs={"device": "cpu"},
)
embedding = OpenAIEmbeddings()

In [None]:
# Running this once is enough as long as the data does not change, because the
# vector database is saved. After a change, delete the res/chroma folder and re-run.
# Order matters for reproducibility: split text chunks first, CSV documents after.
combined_data = [*split_text_data, *csv_data]

In [None]:
# Instructor-embedding database (no persist_directory is passed here).
instr_vectordb = Chroma.from_documents(embedding=instr_embeddings, documents=combined_data)

In [None]:
# OpenAI-embedding database, stored under persist_directory.
vectordb = Chroma.from_documents(
    embedding=embedding,
    documents=combined_data,
    persist_directory=persist_directory,
)

# Explicitly flush the collection to disk.
vectordb.persist()

Ha le van már mentve, ezt kell használni az előző cella helyett:

In [None]:
# Open the collection stored under persist_directory instead of rebuilding it.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [None]:
# MMR smoke test: run one sample question through both search methods.
question = "Ki a tárgyfelelős?"
search_result = vectordb.max_marginal_relevance_search(
    question,
    k=search_k,
    fetch_k=search_fetch_k,
    lambda_mult=lambda_mult,
)
print(search_result)
print(vectordb.similarity_search_with_relevance_scores(question))

### Memória config a chat historyhoz

In [None]:
from langchain.memory import ConversationBufferWindowMemory

# Windowed chat memory with window size memory_k; messages are returned as
# message objects (return_messages=True) under the "chat_history" key.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    return_messages=True,
    k=memory_k,
)

## Chatbot

### Saját prompttal

Előzmények alapján konkrét, önálló kérdést generálunk:

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

# Chat model used to rewrite follow-up questions into standalone ones.
llm = ChatOpenAI(
    model_name=model_id,
    temperature=temperature,
    max_tokens=max_tokens,
)

# Prompt with two placeholders: {chat_history} and {question}.
template = """A chat előzményekből és egy következő kérdésből álló input alapján alakítsd át a következő kérdést akkor, ha a kérdés teljes értelmezéséhez szükséges a korábbi kontextus is, úgy, hogy az új kérdés értelmezhető legyen magában is.
Nem fogalmmazz új kérdést bele a korábbi kontextus alapján, csak alakítsd át a kérdést, ha szükséges. Ha nem kapcsolódik szakmai gyakorlathoz a beszélgetés, akkor írd be, hogy "Erre sajnos nem tudsz válaszolni".
Chat előzmények:
    {chat_history}

Eredeti kérdés: {question}
Új kérdés:"""

question_generator_prompt = PromptTemplate.from_template(template)
question_generator_chain = LLMChain(prompt=question_generator_prompt, llm=llm)


huBERT pipeline létrehozása és tesztelése:

In [None]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import pipeline

# Extractive question-answering pipeline; the same checkpoint provides model and tokenizer.
model_name = "mcsabai/huBert-fine-tuned-hungarian-squadv2"
pipe = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
    handle_impossible_answer=False,
    top_k=1,
)
qa_llm = HuggingFacePipeline(pipeline=pipe)

# Quick sanity check; the result is displayed as the cell output.
pipe(
    {
        "question": "Ki a tárgyfelelős?",
        "context": "A tárgyfelelős Blázovics László.",
    }
)

Kérdést megválaszoló függvény:

In [None]:
# Start from an empty conversation whenever this cell is (re)run.
memory.clear()

def qa_answer(question) -> str:
    """Answer `question` using retrieved documents and the huBERT QA pipeline.

    Steps:
      1. If the conversation memory is non-empty, rewrite the question into a
         standalone one with `question_generator_chain`.
      2. Retrieve documents from `instr_vectordb` with an MMR search.
      3. Run the extractive QA pipeline `pipe` over the concatenated documents.
      4. Store the (possibly rewritten) question and the answer in `memory`.

    Args:
        question: The user's question.

    Returns:
        The extracted answer, or "Nem tudom a választ erre." when no answer
        could be extracted.
    """
    # `memory` was created with return_messages=True, so `memory.buffer` is a list
    # of message objects whose Python repr would end up in the string prompt.
    # Pass the plain-text rendering of the history instead.
    history = memory.buffer_as_str
    if history:
        question = question_generator_chain({"question": question, "chat_history": history})["text"]
        print(f'Új kérdés: {question}')

    docs = instr_vectordb.max_marginal_relevance_search(
        question, k=search_k, fetch_k=search_fetch_k, lambda_mult=lambda_mult
    )
    print(f'Keresési eredmények: {docs}')

    all_content = "".join(doc.page_content + "\n\n" for doc in docs)

    # Without any retrieved text there is nothing to extract an answer from;
    # skip the pipeline call and fall through to the default answer.
    if all_content.strip():
        answer = pipe({"question": question, "context": all_content})["answer"]
    else:
        answer = ""
    print(f'Válasz: {answer}')

    if answer == "":
        answer = "Nem tudom a választ erre."
    memory.save_context({"input": question}, {"output": answer})
    return answer

## Gradio UI a chatbothoz

In [None]:
%pip install gradio==3.47.1 --quiet

In [None]:
def qa(message, history) -> str:
    """Chat callback: forward `message` to `qa_answer` and return its reply.

    `history` is accepted but not used by this function.
    """
    reply = qa_answer(message)
    return reply

In [None]:
import gradio as gr

# Build the chat interface around the `qa` callback (no undo button) and start it.
chat_ui = gr.ChatInterface(
    qa,
    title="Lacibot",
    description="Kérdezz a VIK-es szakmai gyakorlatról!",
    undo_btn=None,
)
chat_ui.launch()

## Tesztelés

#### Teszt kérdések betöltése

In [None]:
%pip install pandas==2.1.1 --quiet
%pip install matplotlib==3.8.0 --quiet

In [None]:
import pandas as pd

# Read the test questions and report how many rows were loaded.
test_questions = pd.read_csv("testing/test_questions.csv")
print(test_questions.shape[0])

Instructor vektoradatbázis tesztelése:

In [None]:
import pandas as pd
# Query the Instructor vector store with every test question and collect the
# top search_k hits together with their relevance scores.
# Fixed: iterating a DataFrame directly yields its column labels, not the
# questions, so the loop must go over the 'Question' column.
records = []
for question in test_questions['Question']:
    search_result = instr_vectordb.similarity_search_with_relevance_scores(question, k=search_k)
    for document, score in search_result:
        records.append({'Question': question, 'Document': document, 'Score': score})

# Build the frame once instead of appending row by row.
df = pd.DataFrame(records, columns=['Question', 'Document', 'Score'])

df

In [None]:
# Save the results; the file name records the search_k and chunk_size used.
results_path = f"testing/results_k{search_k}_size{chunk_size}.csv"
df.to_csv(results_path, index=False)

#### Gráf

In [None]:
import matplotlib.pyplot as plt

# One line per question: x is the 1-based position of a hit within its group,
# y is the relevance score of that hit.
fig, ax = plt.subplots()
grouped = df.groupby('Question')

for group_no, (_question, hits) in enumerate(grouped, start=1):
    hits = hits.reset_index(drop=True)
    positions = hits.index + 1
    ax.plot(positions, hits['Score'], label=f"Group {group_no}")

ax.legend(loc='upper right', fontsize=8)

# x axis: one tick per hit position, with a small margin on both sides
ax.set_xlabel('Index')
ax.set_xlim(0.75, search_k + 0.25)
ax.set_xticks(range(1, search_k + 1))

# y axis: tight range around the observed scores
ax.set_ylabel('Score')
ax.set_ylim(min(df['Score']) - 0.005, max(df['Score']) + 0.005)

ax.set_title('Score for Each Question')
plt.show()

### Full tesztek

In [None]:
import pandas as pd
from langchain.evaluation.qa import QAEvalChain

# Grader model with temperature 0. Note that this rebinds the global `llm`.
llm = ChatOpenAI(model=model_id, temperature=0)
eval_chain = QAEvalChain.from_llm(llm)


Kérdések egyenként

In [None]:
# Evaluate each test question on its own: the memory is reset before every
# question so that earlier questions cannot influence the next one.
df = pd.DataFrame(columns=['Question', 'Human Answer', 'AI Answer', 'AI Evaluation'])
for question, human_ans in zip(test_questions['Question'], test_questions['Answer']):
    memory.clear()
    ai_ans = qa_answer(question)
    eval_result = eval_chain({"query": question, "result": ai_ans, "answer": human_ans})
    df.loc[len(df)] = [question, human_ans, ai_ans, eval_result['results']]

df

In [None]:
# Save the results; the file name records the search_k and chunk_size used.
single_eval_path = f"testing/models/hubert_INSTRUCTOR_k{search_k}_size{chunk_size}.csv"
df.to_csv(single_eval_path, index=False)

Összefüggő beszélgetés

In [None]:
# Read the scripted conversation and report how many rows were loaded.
test_chat = pd.read_csv("testing/test_chat.csv")
print(test_chat.shape[0])

In [None]:
# Evaluate a connected conversation: the memory is cleared once up front and
# then kept across the questions.
memory.clear()
df = pd.DataFrame(columns=['Question', 'Human Answer', 'AI Answer', 'AI Evaluation'])
for question, human_ans in zip(test_chat['Question'], test_chat['Answer']):
    ai_ans = qa_answer(question)
    eval_result = eval_chain({"query": question, "result": ai_ans, "answer": human_ans})
    df.loc[len(df)] = [question, human_ans, ai_ans, eval_result['results']]

df

In [None]:
# Save the results; the file name records the search_k and chunk_size used.
chat_eval_path = f"testing/models/hubert_INSTRUCTOR_CHAT_k{search_k}_size{chunk_size}.csv"
df.to_csv(chat_eval_path, index=False)

### Rouge score

In [None]:
%pip install rouge-score==0.1.2 --quiet

In [None]:
from typing import List
import pandas as pd
from rouge_score import rouge_scorer

def calculate_rouge(df, rouge_types: List[str]):
    """Compute ROUGE scores for every row of `df`.

    Args:
        df: Frame with a "Human Answer" column (passed first to the scorer)
            and an "AI Answer" column (passed second).
        rouge_types: ROUGE variants handed to `RougeScorer`, e.g. ["rouge2"].

    Returns:
        A list with one `scorer.score(...)` result per row, in row order.
    """
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    return [
        scorer.score(row["Human Answer"], row["AI Answer"])
        for _, row in df.iterrows()
    ]

In [None]:
# Result files for the plain huBERT runs (single questions, then chat) ...
df, chat_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "testing/models/hubert_k5_size500.csv",
        "testing/models/hubert_CHAT_k5_size500.csv",
    )
)

# ... and for the Instructor-embedding runs.
instr_df, instr_chat_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "testing/models/hubert_INSTRUCTOR_k5_size500.csv",
        "testing/models/hubert_INSTRUCTOR_CHAT_k5_size500.csv",
    )
)

In [None]:
def _score_and_save(frame, csv_path):
    """Add ROUGE2/ROUGEL columns to `frame`, write it to `csv_path`, return (frame, rouge2, rougeL)."""
    rouge2 = calculate_rouge(frame, ["rouge2"])
    rougeL = calculate_rouge(frame, ["rougeL"])
    scored = frame.assign(ROUGE2=rouge2, ROUGEL=rougeL)
    scored.to_csv(csv_path, index=False)
    return scored, rouge2, rougeL


# Plain huBERT runs: single questions, then the chat.
df, rouge2_scores, rougeL_scores = _score_and_save(
    df, "testing/models/hubert_k5_size500.csv"
)
chat_df, rouge2_chat_scores, rougeL_chat_scores = _score_and_save(
    chat_df, "testing/models/hubert_CHAT_k5_size500.csv"
)

all_rouge2_scores = rouge2_scores + rouge2_chat_scores
all_rougeL_scores = rougeL_scores + rougeL_chat_scores

# Instructor-embedding runs: single questions, then the chat.
instr_df, rouge2_instr_scores, rougeL_instr_scores = _score_and_save(
    instr_df, "testing/models/hubert_INSTRUCTOR_k5_size500.csv"
)
instr_chat_df, rouge2_instr_chat_scores, rougeL_instr_chat_scores = _score_and_save(
    instr_chat_df, "testing/models/hubert_INSTRUCTOR_CHAT_k5_size500.csv"
)

all_rouge2_instr_scores = rouge2_instr_scores + rouge2_instr_chat_scores
all_rougeL_instr_scores = rougeL_instr_scores + rougeL_instr_chat_scores

In [None]:
import os

def calculate_average(scores, rouge_type: str, version: str = ""):
    """Average one ROUGE metric over `scores` and write the result to a text file.

    Args:
        scores: Non-empty sequence of mappings; `score[rouge_type]` must be
            indexable as (precision, recall, fmeasure) at positions 0, 1, 2.
        rouge_type: Key to read from every score mapping, e.g. "rouge2".
        version: Optional suffix inserted into the output file name.

    Raises:
        ValueError: If `scores` is empty (there is nothing to average).

    Side effects:
        Creates "testing/rouge/" if needed and (over)writes
        "testing/rouge/average_hubert{version}_{rouge_type}.txt".
    """
    if not scores:
        # Previously this surfaced as a bare ZeroDivisionError from the division below.
        raise ValueError(f"cannot average an empty list of {rouge_type} scores")

    count = len(scores)
    p_avg = sum(score[rouge_type][0] for score in scores) / count
    r_avg = sum(score[rouge_type][1] for score in scores) / count
    f_avg = sum(score[rouge_type][2] for score in scores) / count

    directory = "testing/rouge/"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    file_name = f"{directory}average_hubert{version}_{rouge_type}.txt"

    # Explicit encoding so the output does not depend on the platform default.
    with open(file_name, 'w', encoding="utf-8") as f:
        f.write(f"Recall: {r_avg}\n")
        f.write(f"Precision: {p_avg}\n")
        f.write(f"F-measure: {f_avg}\n")


In [None]:


# Write the averaged ROUGE-2 and ROUGE-L files for the plain and the
# Instructor-embedding runs (single-question and chat scores combined).
for scores, rouge_type, version in (
    (all_rouge2_scores, "rouge2", ""),
    (all_rougeL_scores, "rougeL", ""),
    (all_rouge2_instr_scores, "rouge2", "_INSTRUCTOR"),
    (all_rougeL_instr_scores, "rougeL", "_INSTRUCTOR"),
):
    calculate_average(scores, rouge_type, version)