In [1]:
import pandas as pd

# Results file with LLM-generated answers next to the original answers,
# hosted in the course repository on GitHub.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
# The "?raw=1" query suffix asks GitHub for the file contents rather than the HTML page.
url = github_url + "?raw=1"
df = pd.read_csv(url)

In [2]:
# Restrict the evaluation to the first 300 rows of the results.
df = df.head(300)

### Q1. Getting the embeddings model



In [3]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-transformers checkpoint used for all embeddings below.
model_name = "multi-qa-mpnet-base-dot-v1"
# Instantiating the model downloads its files on first use (see the progress bars in the output).
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# LLM answer of the first record, embedded on its own to inspect the vector.
answer_llm = df["answer_llm"].iloc[0]
first_component = embedding_model.encode(answer_llm)[0]

print(f"The first value of the resulting vector is {first_component:.2f}")

The first value of the resulting vector is -0.42


### Q2. Computing the dot product



In [12]:
import numpy as np

# Dot-product similarity between the embedding of every LLM answer and the
# embedding of its matching original answer.
#
# Each column is encoded with one batched `encode` call instead of one call per
# row, and the scores are gathered in a list and converted once: growing an
# array with np.append copies the whole array on every iteration.
embeddings_llm = embedding_model.encode(df["answer_llm"].tolist())
embeddings_orig = embedding_model.encode(df["answer_orig"].tolist())

evaluations = np.array(
    [np.dot(v_llm, v_orig) for v_llm, v_orig in zip(embeddings_llm, embeddings_orig)],
    dtype=np.float64,  # keep the scores in double precision for the statistics below
)

In [14]:
# 75th percentile of the dot-product scores.
score_p75 = np.percentile(evaluations, 75)
print(f"The 75% percentile of the score is {score_p75:.2f}")

The 75% percentile of the score is 31.67


### Q3. Computing the cosine



In [16]:
def norm(v):
    """Return ``v`` divided by its L2 norm.

    The norm is the square root of the sum of all squared entries of ``v``,
    so for a 1-D vector the result has unit length.

    Args:
        v: NumPy array to rescale.

    Returns:
        Array of the same shape as ``v``. No guard is applied for an all-zero
        input, which divides by zero.
    """
    squared_total = np.sum(v * v)
    length = np.sqrt(squared_total)
    return v / length

# Cosine similarity between every LLM answer and its matching original answer:
# the dot product of the two embeddings after each is scaled by `norm`.
#
# Each column is encoded with one batched `encode` call instead of one call per
# row, and the scores are gathered in a list and converted once: growing an
# array with np.append copies the whole array on every iteration.
llm_vectors = embedding_model.encode(df["answer_llm"].tolist())
orig_vectors = embedding_model.encode(df["answer_orig"].tolist())

evaluations_cosine = np.array(
    [np.dot(norm(v_llm), norm(v_orig)) for v_llm, v_orig in zip(llm_vectors, orig_vectors)],
    dtype=np.float64,  # keep the scores in double precision for the statistics below
)

In [18]:
# 75th percentile of the cosine-similarity scores.
cosine_p75 = np.percentile(evaluations_cosine, 75)
print(f"The 75% cosine in the scores is {cosine_p75:.2f}")

The 75% cosine in the scores is 0.84


### Q4. Rouge



In [None]:
# Install the `rouge` package, which provides the `Rouge` scorer imported in a later cell.
%pip install rouge

In [20]:
# Record used for the single-example ROUGE questions (positional index 10).
record_position = 10
r = df.iloc[record_position]
r

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [21]:
from rouge import Rouge
# Compare the LLM answer of the selected record against its original answer.
hypothesis = r.answer_llm
reference = r.answer_orig

rouge_scorer = Rouge()
# get_scores returns a list; keep its first entry.
scores = rouge_scorer.get_scores(hypothesis, reference)[0]

In [24]:
# F-score of the rouge-1 metric for the selected record.
rouge_1_f = scores['rouge-1']['f']
print(f"The F score for rouge-1 is {rouge_1_f:.2f}")

The F score for rouge-1 is 0.45


### Q5. Average rouge score



In [26]:
# Mean of the F-scores of the three ROUGE variants for the selected record.
rouge_f_scores = [scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')]
res = np.mean(rouge_f_scores)

print(f"The average F-score between rouge-1, rouge-2 and rouge-l for the same record from Q4 is {res:.2f}")

The average F-score between rouge-1, rouge-2 and rouge-l for the same record from Q4 is 0.35


### Q6. Average rouge score for all the data points



In [27]:
# ROUGE scores for every (LLM answer, original answer) pair in the frame.
# The result is stored under its own name so that `scores` keeps holding the
# single-record dict from the earlier cell; rebinding it to a list here would
# make the earlier `scores['rouge-1']` lookups fail if those cells are re-run.
all_scores = rouge_scorer.get_scores(
    df['answer_llm'].tolist(),
    df['answer_orig'].tolist(),
)

# One row per record, one column per ROUGE variant; each cell holds that variant's score dict.
scores_df = pd.DataFrame(all_scores)

In [54]:
# Mean rouge-2 F-score over all records.
rouge_2_f_scores = [entry['f'] for entry in scores_df['rouge-2']]
print(f"The average rouge_2 across all the records is {np.mean(rouge_2_f_scores):.2f}")

The average rouge_2 across all the records is 0.21
