In [1]:

from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import ast  
import openai  
import pandas as pd  
import tiktoken 
from scipy import spatial 
import numpy as np


In [2]:

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

openai.api_key = "##############"
df = pd.read_csv("informatika_data.csv")
final_table=pd.DataFrame()
final_table['text']=df['text']

# Assuming you have a DataFrame called 'final_table' with a column 'text'
def get_openai_embedding(text):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",  # or use your model
        input=text
    )
    print(response)  # Print the response to see its structure
    return response['data'][0]['embedding']

# Apply the function to each row in the 'text' column and create a new column 'embedding'
final_table['embedding'] = final_table['text'].apply(get_openai_embedding)
# Display the DataFrame with the new 'embedding' column
print(final_table)


def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]


def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Si skolsky asistent na otazku skus odpovedat vedomostami ktore ziskat v contexte. maximalna odpoved 150 tokenov.'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nWikipedia article section:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question




{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        -0.005962275434285402,
        -0.01009698398411274,
        0.013389197178184986,
        -0.031366754323244095,
        -0.05148295313119888,
        0.0034283085260540247,
        -0.02522301860153675,
        -0.013071641325950623,
        -0.0347885824739933,
        -0.025780361145734787,
        0.004679090343415737,
        -0.008237013593316078,
        -0.008774914778769016,
        -0.0013812065590173006,
        -0.018547862768173218,
        0.01510011125355959,
        0.03294805437326431,
        -0.012248587794601917,
        -0.004811945371329784,
        -0.0021273011807352304,
        -0.0038916810881346464,
        0.008826760575175285,
        -0.018923744559288025,
        0.0044198608957231045,
        -0.021360499784350395,
        0.005310961976647377,
        0.0020543928258121014,
        -0.019299626350402832,
        -0.014218730852007866,
   

In [3]:
def ask(
    query: str,
    df: pd.DataFrame = final_table,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "Si skolsky asistent na otazku skus odpovedat vedomostami ktore ziskat v contexte. maximalna odpoved 150 tokenov"},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        max_tokens=256,
        messages=messages,
        temperature=0.1
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message

In [4]:
import openai
api_key =("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")
openai.api_key = api_key

from langchain.document_loaders.unstructured import UnstructuredFileLoader 
loader = UnstructuredFileLoader('testivacie.txt')
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key="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"))
retriever = vectorstore.as_retriever()

prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1,max_tokens=256,openai_api_key="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")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [6]:
df = pd.read_csv("final_dataset_q&a.csv")


def get_openai_embedding(text):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text
    )

    return response['data'][0]['embedding']


df['embedding'] = df['questions'].apply(get_openai_embedding)

In [7]:
def calculate_similarity(embedding1, embedding2):
    embedding1 = np.array(embedding1, dtype=np.float32)
    embedding2 = np.array(embedding2, dtype=np.float32)

    # Calculate cosine similarity between two embeddings
    dot_product = np.dot(embedding1, embedding2)
    norm1 = np.linalg.norm(embedding1)
    norm2 = np.linalg.norm(embedding2)

    # Check for zero division to avoid runtime warnings
    if norm1 == 0 or norm2 == 0:
        return 0.0
    
    similarity = dot_product / (norm1 * norm2)
    return similarity

In [9]:
text="Ake programy sa nachadaju na fakulte elektrotechniky a informatiky"
def custombased(text):
    response = openai.Embedding.create(
            model="text-embedding-ada-002",  
            input=text
        )
    question_embedding=response['data'][0]['embedding']


    best_match = None
    best_similarity = 0.0

    for index, row in df.iterrows():
            entry_embedding = row['embedding']   
        
            similarity = calculate_similarity(question_embedding, entry_embedding)


            if similarity > best_similarity:
                best_similarity = similarity
                best_match=row['answers']
                

    conversation = [
            {"role": "system", "content": "Si skolsky asistent na otazku skus odpovedat vedomostami ktore ziskat v contexte. maximalna odpoved 150 tokenov"},
            {"role": "user", "content": f"Question: {text}"},
            {"role": "assistant", "content": f"Context: {best_match}"}
        ]


    response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=conversation,
    max_tokens=256,
    temperature=0.1
    )


    answer = response['choices'][0]['message']['content'].strip()
    return answer,best_match



In [67]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

genai.configure(api_key='#########################################################')

for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    print(m.name)

model = 'models/embedding-001'

data = pd.read_csv("informatika_data.csv")
final_table=pd.DataFrame()
final_table['Text']=data['text']
data=final_table

embeddings_list = []
for index, row in data.iterrows():
    
    embdeddings=genai.embed_content(model=model,
                             content=row,
                             task_type="retrieval_document"
                             )["embedding"]
    embeddings_list.append(embdeddings)

if len(embeddings_list) == len(data):
# Add the embeddings to the DataFrame
  data['Embeddings'] = embeddings_list
else:
  print("Lengths do not match. Something went wrong with embedding generation.")

import numpy as np
model='models/embedding-001'
def find_best_passage(query, dataframe):
  model='models/embedding-001'
  """
  Compute the distances between the query and each document in the dataframe
  using the dot product.
  """
  query_embedding = genai.embed_content(model=model,
                                        content=query,
                                        task_type="retrieval_query")
  dot_products = np.dot(np.stack(dataframe['Embeddings']), query_embedding["embedding"])
  idx = np.argmax(dot_products)
  return dataframe.iloc[idx]['Text'] # Return text from index with max value

def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  maxOutputTokens=150
  prompt = textwrap.dedent("""si pomocnik ktory pomaha odpovedat studentom ohladom studia.Informacie ziskaj z passage ktore je pridane odpoved maximalne 150 tokenov
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

models/gemini-pro
models/gemini-pro-vision
models/embedding-001


In [68]:
def googleai(otazka):
    passage = find_best_passage(otazka, data)
    prompt = make_prompt(otazka, passage)
    model = genai.GenerativeModel('models/gemini-pro')
    answer = model.generate_content(prompt)
    return answer.text

In [None]:
Otazky=[]

In [89]:
data_table = pd.DataFrame(columns=['Otazka', 'OpenAI_embdeddings', 'vector_db', 'custom_embdeddings','googleai', 'answer',"time_OpenAI_embdeddings","time_vector_db","time_custom_embdeddings","time_googleai"])

In [88]:
programy = [
    "Automobilová elektronika - bakalarskeho studia",
    "Elektroenergetika - inzinierskeho studia",
    "Elektroenergetika - bakalarskeho studia",
    "Fyzikálne inžinierstvo progresívnych materiálov - bakalarskeho studia",
    "Fyzikálne inžinierstvo progresívnych materiálov - inzinierskeho studia",
    "Hospodárska informatika - bakalarskeho studia",
    "Hospodárska informatika - inzinierskeho studia",
    "Informatika - bakalarskeho studia",
    "Informatika - inzinierskeho studia",
    "Inteligentné systémy - bakalarskeho studia",
    "Inteligentné systémy - inzinierskeho studia",
    "Kyberbezpečnosť - bakalarskeho studia",
    "Kyberbezpečnosť - inzinierskeho studia",
    "Počítačové modelovanie - bakalarskeho studia",
    "Počítačové modelovanie inzinierskeho studia",
    "Počítačové siete - bakalarskeho studia",
    "Počítačové siete - inzinierskeho studia",
    "Priemyselná elektrotechnika - bakalarskeho studia",
    "Priemyselná elektrotechnika -inzinierskeho studia"
]


In [98]:
import time

#programy = [
#    "Automobilová elektronika - 1. stupeň, denná forma, slovenský jazyk"
#]


for predmet in programy:
    temp_table=pd.DataFrame(columns=['Otazka', 'OpenAI_embdeddings', 'vector_db', 'custom_embdeddings','googleai', 'answer'])

    otazka=f"opis studijny program:{predmet}"
    temp_table['Otazka']=otazka
    print(f"Aktualny predmet :{predmet}")
    zaciatok_casu = time.time()
    custom_embdeddings,qa_answer=custombased(otazka)
    end_time = time.time()
    time_custom_ombdeedings=end_time-zaciatok_casu
    zaciatok_casu = time.time()
    OpenAI_embdeddings=ask(otazka)
    end_time = time.time()
    time_OpenAI_embdeddings=end_time-zaciatok_casu
    zaciatok_casu = time.time()
    vector_db=rag_chain.invoke(otazka)
    end_time = time.time()
    time_vector_db=end_time-zaciatok_casu
    zaciatok_casu = time.time()
    googleaianswer=googleai(otazka)
    end_time = time.time()
    time_googleai=end_time-zaciatok_casu
    programy.remove(predmet)
    temp_table = pd.DataFrame({
        'Otazka': [otazka],
        'OpenAI_embdeddings': OpenAI_embdeddings,
        'vector_db': vector_db,
        'custom_embdeddings': custom_embdeddings,
        'googleai': googleaianswer,
        'answer': qa_answer,
        'time_OpenAI_embdeddings':time_OpenAI_embdeddings,
        'time_vector_db':time_vector_db,
        'time_custom_embdeddings':time_custom_ombdeedings,
        'time_googleai':time_googleai

    })
    data_table=data_table._append(temp_table, ignore_index=True)
    

Aktualny predmet :Informatika - bakalarskeho studia


In [85]:
googleaianswer

'Študijný program Informatika v bakalárskom štúdiu na Fakulte elektrotechniky a informatiky ponúka vedomosti z kľúčových oblastí informatiky, ako sú algoritmizácia, formálne počítačové jazyky, databázové systémy a počítačové siete. Absolventi sú pripravení pre prácu v oblasti návrhu, implementácie, testovania a údržby softvérových a počítačových systémov a sietí.'

In [99]:
data_table

Unnamed: 0,Otazka,OpenAI_embdeddings,vector_db,custom_embdeddings,googleai,answer,time_OpenAI_embdeddings,time_vector_db,time_custom_embdeddings,time_googleai
0,opis studijny program:Automobilová elektronika...,Bakalárske štúdium Automobilovej elektroniky j...,"The study program ""Automobilová elektronika"" i...",Bakalárske štúdium automobilovej elektroniky j...,Bakalársky študijný program Automobilová elekt...,Obsah študijného programu automobilová elektro...,12.186146,6.452972,14.939081,12.331352
1,opis studijny program:Elektroenergetika - baka...,Bakalárske štúdium elektroenergetiky je zamera...,"The study program ""Elektroenergetika"" is a bac...",Bakalársky študijný program elektroenergetika ...,Elektroenergetika bakalársky stupeň štúdia je ...,- Absolvent bakalárskeho študijného programu e...,12.65651,12.889066,13.86787,7.137296
2,opis studijny program:Fyzikálne inžinierstvo p...,Študijný program Fyzikálne inžinierstvo progre...,"The study program ""Fyzikálne inžinierstvo prog...","Studijný program ""Fyzikálne inžinierstvo progr...",Štúdium v odbore Fyzikálne inžinierstvo progre...,2. Absolvent získava titul: PhD.\n3. Jazyky po...,15.611129,7.583582,15.181135,7.348681
3,opis studijny program:Hospodárska informatika ...,Hospodárska informatika je študijný program in...,"The study program ""Hospodárska informatika"" is...",Študijný program Hospodárska informatika - inž...,Študijný program v odbore hospodárska informat...,Odpoveď: Študijný program hospodárska informat...,12.700673,5.709196,12.928659,7.816226
4,opis studijny program:Informatika - inziniersk...,Študijný program Informatika na inžinierskom s...,Absolvent Študijného programu Informatika - in...,Študijný program Informatika - inžinierskeho š...,Študijný program informatika poskytuje študent...,2. Dĺžka štúdia pre študijný program informati...,14.928457,7.917399,15.676998,6.81366
5,opis studijny program:Inteligentné systémy - i...,Študijný program Inteligentné systémy v inžini...,"The study program ""Inteligentné systémy"" is an...","Študijný program ""Inteligentné systémy"" je inž...",Študijný program Inteligentné systémy na Fakul...,Odpoveď: Obsah štúdia zahŕňa vývoj systémov na...,14.489426,6.624651,14.596315,6.97815
6,opis studijny program:Kyberbezpečnosť - inzini...,Študijný program Kyberbezpečnosť je inžiniersk...,The study program in question is focused on cy...,Študijný program Kyberbezpečnosť je inžiniersk...,Študijný program kyberbezpečnosť na bakalársko...,2. Študenti študijného programu Kyberbezpečnos...,14.454009,6.378105,14.868958,11.611411
7,opis studijny program:Počítačové modelovanie i...,Študijný program Počítačové modelovanie vychov...,"The study program ""Počítačové modelovanie"" foc...",Študijný program Počítačové modelovanie inžini...,Študijný program Počítačové modelovanie vychov...,- Absolvent získa zručnosti na rozvoj tvorivéh...,14.69141,6.06405,14.276293,12.069966
8,opis studijny program:Počítačové siete - inzin...,Studijný program Počítačové siete je súčasťou ...,"The study program ""Počítačové siete"" (Computer...",Studijný program Počítačové siete - inžiniersk...,Študijný program počítačové siete v rámci baka...,Odpoveď: Absolvent získa schopnosť špecifikova...,13.392088,13.038662,14.045063,7.73074
9,opis studijny program:Priemyselná elektrotechn...,Studijný program Priemyselná elektrotechnika j...,"The study program ""Priemyselná elektrotechnika...",Studijný program Priemyselná elektrotechnika s...,Študijný program Priemyselná elektrotechnika v...,Odpoveď: Absolvent bude mať rozsiahle vedomost...,14.218992,8.271997,15.003988,8.207701


In [100]:
data_table['custom_embdeddings'][0]

'Bakalárske štúdium automobilovej elektroniky je zamerané na poskytnutie študentom vedomostí a zručností potrebných pre prácu v oblasti elektroniky v automobilovom priemysle. Študijný program sa zaoberá rôznymi aspektmi automobilovej elektroniky, vrátane návrhu, implementácie a diagnostiky elektronických systémov v automobiloch.\n\nŠtudenti sa naučia o základných princípoch elektroniky, ako aj o špecifických technológiách a komponentoch používaných v automobiloch. Budú sa venovať aj problematike riadenia a regulácie elektronických systémov v automobiloch, ako je napríklad riadenie motorov, brzdových systémov, bezpečnostných systémov a ďalších.\n\nOkrem toho sa študenti budú zao'

In [101]:
test_data=data_table

In [102]:
test_data

Unnamed: 0,Otazka,OpenAI_embdeddings,vector_db,custom_embdeddings,googleai,answer,time_OpenAI_embdeddings,time_vector_db,time_custom_embdeddings,time_googleai
0,opis studijny program:Automobilová elektronika...,Bakalárske štúdium Automobilovej elektroniky j...,"The study program ""Automobilová elektronika"" i...",Bakalárske štúdium automobilovej elektroniky j...,Bakalársky študijný program Automobilová elekt...,Obsah študijného programu automobilová elektro...,12.186146,6.452972,14.939081,12.331352
1,opis studijny program:Elektroenergetika - baka...,Bakalárske štúdium elektroenergetiky je zamera...,"The study program ""Elektroenergetika"" is a bac...",Bakalársky študijný program elektroenergetika ...,Elektroenergetika bakalársky stupeň štúdia je ...,- Absolvent bakalárskeho študijného programu e...,12.65651,12.889066,13.86787,7.137296
2,opis studijny program:Fyzikálne inžinierstvo p...,Študijný program Fyzikálne inžinierstvo progre...,"The study program ""Fyzikálne inžinierstvo prog...","Studijný program ""Fyzikálne inžinierstvo progr...",Štúdium v odbore Fyzikálne inžinierstvo progre...,2. Absolvent získava titul: PhD.\n3. Jazyky po...,15.611129,7.583582,15.181135,7.348681
3,opis studijny program:Hospodárska informatika ...,Hospodárska informatika je študijný program in...,"The study program ""Hospodárska informatika"" is...",Študijný program Hospodárska informatika - inž...,Študijný program v odbore hospodárska informat...,Odpoveď: Študijný program hospodárska informat...,12.700673,5.709196,12.928659,7.816226
4,opis studijny program:Informatika - inziniersk...,Študijný program Informatika na inžinierskom s...,Absolvent Študijného programu Informatika - in...,Študijný program Informatika - inžinierskeho š...,Študijný program informatika poskytuje študent...,2. Dĺžka štúdia pre študijný program informati...,14.928457,7.917399,15.676998,6.81366
5,opis studijny program:Inteligentné systémy - i...,Študijný program Inteligentné systémy v inžini...,"The study program ""Inteligentné systémy"" is an...","Študijný program ""Inteligentné systémy"" je inž...",Študijný program Inteligentné systémy na Fakul...,Odpoveď: Obsah štúdia zahŕňa vývoj systémov na...,14.489426,6.624651,14.596315,6.97815
6,opis studijny program:Kyberbezpečnosť - inzini...,Študijný program Kyberbezpečnosť je inžiniersk...,The study program in question is focused on cy...,Študijný program Kyberbezpečnosť je inžiniersk...,Študijný program kyberbezpečnosť na bakalársko...,2. Študenti študijného programu Kyberbezpečnos...,14.454009,6.378105,14.868958,11.611411
7,opis studijny program:Počítačové modelovanie i...,Študijný program Počítačové modelovanie vychov...,"The study program ""Počítačové modelovanie"" foc...",Študijný program Počítačové modelovanie inžini...,Študijný program Počítačové modelovanie vychov...,- Absolvent získa zručnosti na rozvoj tvorivéh...,14.69141,6.06405,14.276293,12.069966
8,opis studijny program:Počítačové siete - inzin...,Studijný program Počítačové siete je súčasťou ...,"The study program ""Počítačové siete"" (Computer...",Studijný program Počítačové siete - inžiniersk...,Študijný program počítačové siete v rámci baka...,Odpoveď: Absolvent získa schopnosť špecifikova...,13.392088,13.038662,14.045063,7.73074
9,opis studijny program:Priemyselná elektrotechn...,Studijný program Priemyselná elektrotechnika j...,"The study program ""Priemyselná elektrotechnika...",Studijný program Priemyselná elektrotechnika s...,Študijný program Priemyselná elektrotechnika v...,Odpoveď: Absolvent bude mať rozsiahle vedomost...,14.218992,8.271997,15.003988,8.207701


In [103]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import meteor_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize

def custom_similarity_metric(ocakavana_odpoved, vytvorena_odpoved):
    # Vytvorenie TF-IDF vektorizátorov pre očakávanú a vytvorenú odpoveď
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([ocakavana_odpoved, vytvorena_odpoved])

    # Výpočet kosínusovej podobnosti medzi vektormi TF-IDF
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Kosínusová podobnosť pre konkrétny pár odpovedí
    custom_similarity_score = similarity_matrix[0, 1]

    return custom_similarity_score


def compute_bleu_score_OpenAI_embdeddings(row):
    #print(row)
    return sentence_bleu([row['answer']], row['OpenAI_embdeddings'])

def compute_bleu_score_custom_embdeddings(row):
    #print(row)
    return sentence_bleu([row['answer']], row['custom_embdeddings'])

def compute_bleu_score_vector_db(row):
    #print(row)
    return sentence_bleu([row['answer']], row['vector_db'])

def compute_bleu_score_googleai(row):
    #print(row)
    return sentence_bleu([row['answer']], row['googleai'])

def compute_meteor_score_score_OpenAI_embdeddings(row):
    answer=word_tokenize(row['answer'])
    odpoved=word_tokenize(row['OpenAI_embdeddings'])
    
    
    return meteor_score.single_meteor_score(answer, odpoved)

def compute_meteor_score_score_custom_embdeddings(row):
    answer=word_tokenize(row['answer'])
    odpoved=word_tokenize(row['custom_embdeddings'])
    
    
    return meteor_score.single_meteor_score(answer, odpoved)
    
def compute_meteor_score_score_vector_db(row):
    answer=word_tokenize(row['answer'])
    odpoved=word_tokenize(row['vector_db'])
    
    return meteor_score.single_meteor_score(answer, odpoved)

def compute_meteor_score_score_googleai(row):
    answer=word_tokenize(row['answer'])
    odpoved=word_tokenize(row['googleai'])
    
    
    return meteor_score.single_meteor_score(answer, odpoved)
    

def compute_rouge_score_OpenAI_embdeddings(row):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(row['OpenAI_embdeddings'], row['answer'])
    return scores['rougeL'].fmeasure
def compute_rouge_score_custom_embdeddings(row):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(row['custom_embdeddings'], row['answer'])
    return scores['rougeL'].fmeasure
def compute_rouge_score_vector_db(row):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(row['vector_db'], row['answer'])
    return scores['rougeL'].fmeasure

def compute_rouge_score_googleai(row):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(row['googleai'], row['answer'])
    return scores['rougeL'].fmeasure

# Vypocet rouge score
test_data['ROUGE_Score_OpenAI_embdeddings'] = test_data.apply(compute_rouge_score_OpenAI_embdeddings, axis=1)
test_data['ROUGE_Score_custom_embdeddings'] = test_data.apply(compute_rouge_score_custom_embdeddings, axis=1)
test_data['ROUGE_Score_vector_db'] = test_data.apply(compute_rouge_score_vector_db, axis=1)
test_data['ROUGE_googleai'] = test_data.apply(compute_rouge_score_googleai, axis=1)
# Vypocet bleu score
test_data['BLEU_Score_OpenAI_embdeddings'] = test_data.apply(compute_bleu_score_OpenAI_embdeddings, axis=1)
test_data['BLEU_Score_custom_embdeddings'] = test_data.apply(compute_bleu_score_custom_embdeddings, axis=1)
test_data['BLEU_Score_vector_db'] = test_data.apply(compute_bleu_score_vector_db, axis=1)
test_data['BLEU_Score_googleai'] = test_data.apply(compute_bleu_score_googleai, axis=1)
# Vypocet METEOR SCORE
test_data['METEOR_Score_OpenAI_embdeddings'] = test_data.apply(compute_meteor_score_score_OpenAI_embdeddings, axis=1)
test_data['METEOR_Score_custom_embdeddings'] = test_data.apply(compute_meteor_score_score_custom_embdeddings, axis=1)
test_data['METEOR_Score_vector_db'] = test_data.apply(compute_meteor_score_score_vector_db, axis=1)
test_data['METEOR_Score_googleai'] = test_data.apply(compute_meteor_score_score_googleai, axis=1)
#Vypocet similarity_score
test_data['Similarity_Score_OpenAI_embdeddings'] = test_data.apply(lambda row: custom_similarity_metric(row['answer'], row['OpenAI_embdeddings']), axis=1)
test_data['Similarity_Score_custom_embdeddings'] = test_data.apply(lambda row: custom_similarity_metric(row['answer'], row['custom_embdeddings']), axis=1)
test_data['Similarity_Score_vector_db'] = test_data.apply(lambda row: custom_similarity_metric(row['answer'], row['vector_db']), axis=1)
test_data['Similarity_Score_googleai'] = test_data.apply(lambda row: custom_similarity_metric(row['answer'], row['googleai']), axis=1)
test_data


Unnamed: 0,Otazka,OpenAI_embdeddings,vector_db,custom_embdeddings,googleai,answer,time_OpenAI_embdeddings,time_vector_db,time_custom_embdeddings,time_googleai,...,BLEU_Score_vector_db,BLEU_Score_googleai,METEOR_Score_OpenAI_embdeddings,METEOR_Score_custom_embdeddings,METEOR_Score_vector_db,METEOR_Score_googleai,Similarity_Score_OpenAI_embdeddings,Similarity_Score_custom_embdeddings,Similarity_Score_vector_db,Similarity_Score_googleai
0,opis studijny program:Automobilová elektronika...,Bakalárske štúdium Automobilovej elektroniky j...,"The study program ""Automobilová elektronika"" i...",Bakalárske štúdium automobilovej elektroniky j...,Bakalársky študijný program Automobilová elekt...,Obsah študijného programu automobilová elektro...,12.186146,6.452972,14.939081,12.331352,...,0.185117,0.274224,0.231056,0.218946,0.11625,0.330361,0.103696,0.184179,0.018279,0.135338
1,opis studijny program:Elektroenergetika - baka...,Bakalárske štúdium elektroenergetiky je zamera...,"The study program ""Elektroenergetika"" is a bac...",Bakalársky študijný program elektroenergetika ...,Elektroenergetika bakalársky stupeň štúdia je ...,- Absolvent bakalárskeho študijného programu e...,12.65651,12.889066,13.86787,7.137296,...,0.03303,0.111507,0.030488,0.083333,0.034722,0.1,0.0,0.039645,0.020384,0.056366
2,opis studijny program:Fyzikálne inžinierstvo p...,Študijný program Fyzikálne inžinierstvo progre...,"The study program ""Fyzikálne inžinierstvo prog...","Studijný program ""Fyzikálne inžinierstvo progr...",Štúdium v odbore Fyzikálne inžinierstvo progre...,2. Absolvent získava titul: PhD.\n3. Jazyky po...,15.611129,7.583582,15.181135,7.348681,...,0.070255,0.050035,0.056741,0.216998,0.025023,0.049194,0.06465,0.182945,0.016636,0.035809
3,opis studijny program:Hospodárska informatika ...,Hospodárska informatika je študijný program in...,"The study program ""Hospodárska informatika"" is...",Študijný program Hospodárska informatika - inž...,Študijný program v odbore hospodárska informat...,Odpoveď: Študijný program hospodárska informat...,12.700673,5.709196,12.928659,7.816226,...,0.246803,0.346623,0.276876,0.228547,0.116674,0.221881,0.173657,0.106878,0.069233,0.151619
4,opis studijny program:Informatika - inziniersk...,Študijný program Informatika na inžinierskom s...,Absolvent Študijného programu Informatika - in...,Študijný program Informatika - inžinierskeho š...,Študijný program informatika poskytuje študent...,2. Dĺžka štúdia pre študijný program informati...,14.928457,7.917399,15.676998,6.81366,...,0.008352,0.032816,0.142604,0.238137,0.064729,0.049365,0.174773,0.385732,0.166107,0.096797
5,opis studijny program:Inteligentné systémy - i...,Študijný program Inteligentné systémy v inžini...,"The study program ""Inteligentné systémy"" is an...","Študijný program ""Inteligentné systémy"" je inž...",Študijný program Inteligentné systémy na Fakul...,Odpoveď: Obsah štúdia zahŕňa vývoj systémov na...,14.489426,6.624651,14.596315,6.97815,...,0.095587,0.136457,0.163934,0.167064,0.135802,0.150376,0.036072,0.044267,0.0,0.074208
6,opis studijny program:Kyberbezpečnosť - inzini...,Študijný program Kyberbezpečnosť je inžiniersk...,The study program in question is focused on cy...,Študijný program Kyberbezpečnosť je inžiniersk...,Študijný program kyberbezpečnosť na bakalársko...,2. Študenti študijného programu Kyberbezpečnos...,14.454009,6.378105,14.868958,11.611411,...,0.044023,0.142329,0.093061,0.16839,0.035603,0.178991,0.185776,0.284617,0.01699,0.226147
7,opis studijny program:Počítačové modelovanie i...,Študijný program Počítačové modelovanie vychov...,"The study program ""Počítačové modelovanie"" foc...",Študijný program Počítačové modelovanie inžini...,Študijný program Počítačové modelovanie vychov...,- Absolvent získa zručnosti na rozvoj tvorivéh...,14.69141,6.06405,14.276293,12.069966,...,0.10667,0.165463,0.201108,0.142405,0.066667,0.135135,0.066412,0.050319,0.020903,0.048545
8,opis studijny program:Počítačové siete - inzin...,Studijný program Počítačové siete je súčasťou ...,"The study program ""Počítačové siete"" (Computer...",Studijný program Počítačové siete - inžiniersk...,Študijný program počítačové siete v rámci baka...,Odpoveď: Absolvent získa schopnosť špecifikova...,13.392088,13.038662,14.045063,7.73074,...,0.056068,0.216604,0.182615,0.506818,0.076087,0.172178,0.036691,0.168751,0.014406,0.071617
9,opis studijny program:Priemyselná elektrotechn...,Studijný program Priemyselná elektrotechnika j...,"The study program ""Priemyselná elektrotechnika...",Studijný program Priemyselná elektrotechnika s...,Študijný program Priemyselná elektrotechnika v...,Odpoveď: Absolvent bude mať rozsiahle vedomost...,14.218992,8.271997,15.003988,8.207701,...,0.141164,0.318864,0.153133,0.599198,0.083732,0.266682,0.031828,0.317151,0.0,0.132743


In [104]:
test_data.to_csv('final_metrics_20.1.2024.csv', index=True)

In [105]:
test_data.columns.tolist()

['Otazka',
 'OpenAI_embdeddings',
 'vector_db',
 'custom_embdeddings',
 'googleai',
 'answer',
 'time_OpenAI_embdeddings',
 'time_vector_db',
 'time_custom_embdeddings',
 'time_googleai',
 'ROUGE_Score_OpenAI_embdeddings',
 'ROUGE_Score_custom_embdeddings',
 'ROUGE_Score_vector_db',
 'ROUGE_googleai',
 'BLEU_Score_OpenAI_embdeddings',
 'BLEU_Score_custom_embdeddings',
 'BLEU_Score_vector_db',
 'BLEU_Score_googleai',
 'METEOR_Score_OpenAI_embdeddings',
 'METEOR_Score_custom_embdeddings',
 'METEOR_Score_vector_db',
 'METEOR_Score_googleai',
 'Similarity_Score_OpenAI_embdeddings',
 'Similarity_Score_custom_embdeddings',
 'Similarity_Score_vector_db',
 'Similarity_Score_googleai']

In [106]:
old_column_name = 'time_custom_ombdeedings'
new_column_name = 'time_custom_embdeddings'

# Prepísanie názvu stĺpca
test_data.columns = test_data.columns.str.replace(old_column_name, new_column_name)


In [111]:
old_column_name = 'ROUGE_googleai'
new_column_name = 'ROUGE_Score_googleai'

# Prepísanie názvu stĺpca
test_data.columns = test_data.columns.str.replace(old_column_name, new_column_name)

In [112]:
import pandas as pd
import matplotlib.pyplot as plt
from math import pi

# Predpokladajme, že máte DataFrame s názvom 'df' obsahujúci uvedené stĺpce

# Vybrané stĺpce a kategórie
selected_columns = [
    'time_OpenAI_embdeddings',
    'time_vector_db',
    'time_custom_ombdeedings',
    'ROUGE_Score_OpenAI_embdeddings',
    'ROUGE_Score_custom_embdeddings',
    'ROUGE_Score_vector_db',
    'ROUGE_Score_googleai',
    'BLEU_Score_OpenAI_embdeddings',
    'BLEU_Score_custom_embdeddings',
    'BLEU_Score_vector_db',
    'BLEU_Score_googleai',
    'METEOR_Score_OpenAI_embdeddings',
    'METEOR_Score_custom_embdeddings',
    'METEOR_Score_vector_db',
    'METEOR_Score_googleai',
    'Similarity_Score_OpenAI_embdeddings',
    'Similarity_Score_custom_embdeddings',
    'Similarity_Score_vector_db',
    'Similarity_Score_googleai'
]

# Vytvorenie nového DataFrame so zjednotenými hodnotami
aggregated_df = pd.DataFrame()

# Pre každú kategóriu vytvorte priemernú hodnotu
for category in ['OpenAI_embdeddings', 'vector_db', 'custom_embdeddings','googleai']:
    category_columns = [f'{metric}_{category}' for metric in ['time', 'ROUGE_Score', 'BLEU_Score', 'METEOR_Score', 'Similarity_Score']]
    category_data = test_data[category_columns].mean().to_frame(name=category).T
    aggregated_df = pd.concat([aggregated_df, category_data])

# Normalizácia hodnôt pre "spider chart"
normalized_data = (aggregated_df - aggregated_df.min()) / (aggregated_df.max() - aggregated_df.min())



In [113]:
aggregated_df

Unnamed: 0,time_OpenAI_embdeddings,ROUGE_Score_OpenAI_embdeddings,BLEU_Score_OpenAI_embdeddings,METEOR_Score_OpenAI_embdeddings,Similarity_Score_OpenAI_embdeddings,time_vector_db,ROUGE_Score_vector_db,BLEU_Score_vector_db,METEOR_Score_vector_db,Similarity_Score_vector_db,time_custom_embdeddings,ROUGE_Score_custom_embdeddings,BLEU_Score_custom_embdeddings,METEOR_Score_custom_embdeddings,Similarity_Score_custom_embdeddings,time_googleai,ROUGE_Score_googleai,BLEU_Score_googleai,METEOR_Score_googleai,Similarity_Score_googleai
OpenAI_embdeddings,13.810143,0.112301,0.181143,0.135572,0.083492,,,,,,,,,,,,,,,
vector_db,,,,,,8.147847,0.054367,0.12065,0.076617,0.031739,,,,,,,,,,
custom_embdeddings,,,,,,,,,,,14.36799,0.209367,0.211747,0.221799,0.159559,,,,,
googleai,,,,,,,,,,,,,,,,9.496549,0.143213,0.184096,0.152912,0.109613


In [3]:
import plotly.graph_objects as go

categories = ['Time','ROUGE_Score','BLEU_Score',
              'METEOR_Score', 'Similarity_Score']

fig = go.Figure()

fig.add_trace(go.Scatterpolar(
      r=[13.810143, (0.112301	*100), (0.181143*100), (0.135572*100),(0.083492	*100)],
      theta=categories,
      fill='toself',
      name='Priama integrácia s OpenAI API (Prvý prístup)'
))
fig.add_trace(go.Scatterpolar(
      r=[8.147847, 0.054367*100,0.12065*100, 0.076617*100, 0.031739*100],
      theta=categories,
      fill='toself',
      name='Rozšírené vyhľadávanie s LangChain (Druhý prístup)'
))

fig.add_trace(go.Scatterpolar(
      r=[14.36799, 0.209367*100, 0.211747	*100,0.221799*100, 0.159559*100],
      theta=categories,
      fill='toself',
      name='Dynamické vyhľadávanie (Tretí prístup)'
))



fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 25]
    )),
  showlegend=True
)

fig.show()