##### Getting the data

In [30]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange
from rouge import Rouge

In [3]:
# GitHub "blob" page of the results CSV; the ?raw=1 query string is appended
# so that the request returns the CSV file itself.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [4]:
# Work with the first 300 rows only for the rest of the notebook.
df = df.head(300)

##### Q1. Getting the embeddings model

In [5]:
# Name of the SentenceTransformer model that `embedding_model` wraps;
# every embedding computed in this notebook goes through `embedding_model`.
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

In [6]:
# `answer_llm` field of the first row; embedded in the cells that follow.
answer_llm = df.iloc[0].answer_llm

In [7]:
# First component of the embedding vector produced for `answer_llm`.
embedding_model.encode(answer_llm)[0]

-0.42244655

##### Q2. Computing the dot product

In [8]:
# `answer_orig` field of the same first row, to be compared with `answer_llm`.
answer_orig = df.iloc[0].answer_orig

In [10]:
# Embed both answers of the first row, then display their dot product.
answer_orig_emb = embedding_model.encode(answer_orig)
answer_llm_emb = embedding_model.encode(answer_llm)

np.dot(answer_llm_emb, answer_orig_emb)

17.515987

In [13]:
def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    Parameters
    ----------
    record : mapping
        Must provide the keys ``'answer_orig'`` and ``'answer_llm'``; both
        values are passed to ``embedding_model.encode``.

    Returns
    -------
    The (un-normalized) dot product of the two embedding vectors.
    """
    original_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = embedding_model.encode(generated_text)
    original_vec = embedding_model.encode(original_text)

    return generated_vec.dot(original_vec)

In [18]:
# One dict per DataFrame row; `results` is kept as its own name because a
# later cell reads it as well.
results = df.to_dict(orient='records')

# Dot-product similarity for every record, with a progress bar.
evaluations = [compute_similarity(record) for record in tqdm(results)]

  0%|          | 0/300 [00:00<?, ?it/s]

100%|██████████| 300/300 [02:21<00:00,  2.12it/s]


In [21]:
# Collect the per-record dot products in a frame and show summary statistics.
eval_df = pd.DataFrame({"evaluations": evaluations})
eval_df["evaluations"].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547924
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: evaluations, dtype: float64

##### Q3. Computing the cosine similarity

In [24]:
def vector_normalization(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Numeric vector (anything ``np.asarray`` accepts).

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. A vector whose norm is zero has no
        direction, so it is returned as all zeros rather than the NaNs (and
        RuntimeWarning) that a division by zero would produce.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Guard against 0/0 -> NaN, which would silently contaminate any
        # statistics computed from the similarity scores.
        return np.zeros_like(v, dtype=np.result_type(v, norm))
    v_norm = v / norm
    return v_norm

def compute_similarity_norm(record):
    """Return the dot product of the unit-normalized answer embeddings.

    Both answers of ``record`` are embedded with ``embedding_model`` and
    passed through ``vector_normalization`` before the dot product is taken,
    so the result is the cosine of the angle between the two embeddings.

    Parameters
    ----------
    record : mapping
        Must provide the keys ``'answer_orig'`` and ``'answer_llm'``.
    """
    original_text = record['answer_orig']
    generated_text = record['answer_llm']

    generated_unit, original_unit = (
        vector_normalization(embedding_model.encode(text))
        for text in (generated_text, original_text)
    )

    return generated_unit.dot(original_unit)

In [28]:
# One dict per DataFrame row, built in this cell so the loop below does not
# depend on a `results` variable left over from an earlier cell.
results_norm = df.to_dict(orient='records')

# Normalized (cosine) similarity for every record, with a progress bar.
evaluations_norm = [
    compute_similarity_norm(record) for record in tqdm(results_norm)
]



100%|██████████| 300/300 [02:21<00:00,  2.13it/s]


In [29]:
# Collect the per-record normalized similarities and show summary statistics.
eval_norm_df = pd.DataFrame({"evaluations": evaluations_norm})
eval_norm_df["evaluations"].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: evaluations, dtype: float64

##### Q4. ROUGE scores for a single record

In [35]:
# Single row (positional index 10) used as a worked ROUGE example.
r = df.iloc[10]

rouge_scorer = Rouge()

# `answer_llm` is passed first and `answer_orig` second; [0] takes the first
# entry of the result, a dict keyed by 'rouge-1' / 'rouge-2' / 'rouge-l' whose
# values hold the 'r', 'p' and 'f' scores (see the output below).
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

##### Q5. Average of the ROUGE F-scores

In [36]:
# Pull the F-score of each ROUGE variant out of `scores` ...
rouge_1_f, rouge_2_f, rouge_l_f = (
    scores[variant]["f"] for variant in ("rouge-1", "rouge-2", "rouge-l")
)

# ... and display their arithmetic mean.
rouge_f_avg = sum((rouge_1_f, rouge_2_f, rouge_l_f)) / 3
rouge_f_avg

0.35490034990035496

##### Q6. ROUGE-2 F-scores across all records

In [44]:
def rouge_scores(rouge_type):
    """Return the F-score of one ROUGE variant for every row of ``df``.

    Parameters
    ----------
    rouge_type : str
        Key into the per-record score dict, e.g. ``"rouge-2"``.

    Returns
    -------
    list
        One ``"f"`` value per row of the module-level ``df``, in row order.
    """
    scorer = Rouge()

    return [
        scorer.get_scores(row['answer_llm'], row['answer_orig'])[0][rouge_type]["f"]
        for row in df.to_dict(orient="records")
    ]

In [45]:
# ROUGE-2 F-score of every record, followed by its summary statistics.
rouge_df = pd.DataFrame({"rouge_2": rouge_scores("rouge-2")})
rouge_df["rouge_2"].describe()

count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rouge_2, dtype: float64