##### Getting the data

In [30]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange
from rouge import Rouge

In [3]:
# GitHub "blob" page of the results CSV; appending ?raw=1 makes GitHub serve the file itself
github_url = (
    "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/"
    "04-monitoring/data/results-gpt4o-mini.csv"
)
url = github_url + "?raw=1"
df = pd.read_csv(url)

In [4]:
# Keep only the first 300 rows for the rest of the notebook
df = df.head(300)

##### Q1. Getting the embeddings model

In [5]:
# Name of the pretrained sentence-transformers checkpoint to load
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name_or_path=model_name)

In [6]:
# LLM-generated answer of the first record
answer_llm = df["answer_llm"].iloc[0]

In [46]:
# Encode the first LLM answer and report component 0 of its embedding
print(
    "The question: What's the first value of the resulting vector?\n"
    f"The answer: {embedding_model.encode(answer_llm)[0]}"
)

The question: What's the first value of the resulting vector?
The answer: -0.4224465489387512


##### Q2. Computing the dot product

In [8]:
# Original (reference) answer of the first record
answer_orig = df["answer_orig"].iloc[0]

In [10]:
# Embed both answers of the first record
answer_llm_emb = embedding_model.encode(answer_llm)
answer_orig_emb = embedding_model.encode(answer_orig)

# Last expression of the cell: the dot product of the two embeddings is displayed
np.dot(answer_llm_emb, answer_orig_emb)

17.515987

In [49]:
def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    Parameters
    ----------
    record : mapping
        Must provide the keys ``'answer_orig'`` and ``'answer_llm'``.

    Returns
    -------
    The result of ``.dot`` between the embedding of ``answer_llm`` and the
    embedding of ``answer_orig``, both produced by the notebook-level
    ``embedding_model``.
    """
    orig_text, llm_text = record['answer_orig'], record['answer_llm']

    llm_vec = embedding_model.encode(llm_text)
    orig_vec = embedding_model.encode(orig_text)
    return llm_vec.dot(orig_vec)

In [50]:
# One plain dict per row, so each record can be passed to compute_similarity
results = df.to_dict(orient='records')

# Score every record; tqdm renders the progress bar
evaluations = [compute_similarity(row) for row in tqdm(results)]

100%|██████████| 300/300 [02:23<00:00,  2.09it/s]


In [52]:
# Collect the dot-product scores in a single-column frame and report the 75% row of describe()
eval_df = pd.DataFrame({"evaluations": evaluations})
print(
    "The question: What's the 75% percentile of the score?\n"
    f"The answer: {eval_df['evaluations'].describe()['75%']}"
)

The question: What's the 75% percentile of the score?
The answer: 31.67430877685547


##### Q3. Computing the cosine

In [24]:
def vector_normalization(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Numeric vector; converted with ``np.asarray`` so plain lists work too.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. If the norm is zero, an all-zero vector
        of the same shape is returned instead of the NaNs that 0/0 would
        produce, so a dot product with it evaluates to 0.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # No direction to preserve; multiplying by 0.0 yields float zeros of the same shape.
        return v * 0.0
    return v / norm

def compute_similarity_norm(record):
    """Return the cosine similarity between a record's two answer embeddings.

    Each answer is embedded with the notebook-level ``embedding_model``,
    passed through ``vector_normalization``, and the two normalised vectors
    are combined with a dot product.

    Parameters
    ----------
    record : mapping
        Must provide the keys ``'answer_orig'`` and ``'answer_llm'``.
    """
    orig_text, llm_text = record['answer_orig'], record['answer_llm']

    llm_unit = vector_normalization(embedding_model.encode(llm_text))
    orig_unit = vector_normalization(embedding_model.encode(orig_text))
    return llm_unit.dot(orig_unit)

In [28]:
# One plain dict per row, so each record can be passed to compute_similarity_norm
results_norm = df.to_dict(orient='records')

evaluations_norm = []

# Iterate over this cell's own `results_norm`, so the cell does not depend on
# a `results` variable left behind by a different cell.
for record in tqdm(results_norm):
    sim = compute_similarity_norm(record)
    evaluations_norm.append(sim)



100%|██████████| 300/300 [02:21<00:00,  2.13it/s]


In [53]:
# Collect the normalised scores in a single-column frame and report the 75% row of describe()
eval_norm_df = pd.DataFrame({"evaluations": evaluations_norm})
print(
    "The question: What's the 75% cosine in the scores?\n"
    f"The answer: {eval_norm_df['evaluations'].describe()['75%']}"
)

The question: What's the 75% cosine in the scores?
The answer: 0.8362348973751068


##### Q4. Rouge

In [54]:
rouge_scorer = Rouge()

# Record at position 10 is the one scored here
r = df.iloc[10]

# get_scores returns a list; its first element holds the score dict for this pair
scores = rouge_scorer.get_scores(
    r['answer_llm'],
    r['answer_orig'],
)[0]
print(
    "The question: What's the F score for rouge-1?\n"
    f"The answer: {scores['rouge-1']['f']}"
)

The question: What's the F score for rouge-1?
The answer: 0.45454544954545456


##### Q5. Average rouge score

In [56]:
# Pull the "f" entry of each ROUGE variant out of the `scores` dict
rouge_1_f, rouge_2_f, rouge_l_f = (
    scores[variant]["f"] for variant in ("rouge-1", "rouge-2", "rouge-l")
)

# Plain arithmetic mean of the three values
rouge_f_avg = (rouge_1_f + rouge_2_f + rouge_l_f) / 3
print(
    "The task: Let's compute the average F-score between rouge-1, rouge-2 and rouge-l for the same record from Q4.\n"
    f"The answer:{rouge_f_avg}"
)

The task: Let's compute the average F-score between rouge-1, rouge-2 and rouge-l for the same record from Q4.
The answer:0.35490034990035496


##### Q6. Average rouge score for all the data points

In [44]:
def rouge_scores(rouge_type):
    """Collect the ``"f"`` value of one ROUGE variant for every row of ``df``.

    Parameters
    ----------
    rouge_type : str
        Key selecting the variant inside each score dict (the notebook
        uses ``"rouge-1"``, ``"rouge-2"`` and ``"rouge-l"``).

    Returns
    -------
    list
        One value per row of the notebook-level ``df``, in row order, taken
        from scoring ``answer_llm`` against ``answer_orig``.
    """
    scorer = Rouge()
    return [
        scorer.get_scores(row['answer_llm'], row['answer_orig'])[0][rouge_type]["f"]
        for row in df.to_dict(orient="records")
    ]

In [58]:
# Score every record and keep the rouge-2 "f" values as a column.
rouge_df = pd.DataFrame()
rouge_df["rouge_2"] = rouge_scores("rouge-2")
# The "mean" row of describe() is the average over all records.
print(f"The question: What's the average rouge_2 across all the records?\nThe answer: {rouge_df['rouge_2'].describe()['mean']}")

The question: What's the average rouge_2 across all the records?
The answer: 0.20696501983423318
