# Homework: Evaluation and Monitoring


### Imports

In [1]:
from sentence_transformers import SentenceTransformer
import requests 
import numpy as np
from tqdm.notebook import trange, tqdm
import pandas as pd
from elasticsearch import Elasticsearch
from rouge import Rouge

  from tqdm.autonotebook import tqdm, trange


## Q1. Getting the embeddings model

First, let's get the data.

In [2]:
# CSV with the gpt-4o-mini answers, hosted in the course repository
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
# Appending ?raw=1 requests the file contents rather than the GitHub HTML page
url = github_url + '?raw=1'
df = pd.read_csv(url)
# Show the first rows as a quick sanity check of the columns
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


We will use only the first 300 documents:

In [3]:
# Keep only the first 300 rows for the rest of the notebook
df = df.head(300)

Now that we have the data, we can get the model that will generate the embeddings.

In [4]:
# Name of the model handed to SentenceTransformer; every embedding below comes from it
model_name = 'multi-qa-mpnet-base-dot-v1'
# Build the embedding model once and reuse it in the following cells
embedding_model = SentenceTransformer(model_name)

We will use this model to compute the embedding of the LLM answer in the first row of our dataset.

In [5]:
# Embed the LLM answer of the first row and print the first component of the vector
first_row = df.iloc[0]
answer_llm = first_row['answer_llm']
answer_llm_emb = embedding_model.encode(answer_llm)
print(answer_llm_emb[0])

-0.42244655


**Answer**: -0.42


## Q2. Computing the dot product

We will compute the similarity between each pair (LLM answer and original answer) using the dot product.

In [6]:
# One plain dict per dataframe row, so the embeddings can be stored next to the text
df_dict = df.to_dict(orient="records")

# One dot-product similarity per (LLM answer, original answer) pair
evaluations = []

for doc in tqdm(df_dict):
    llm_emb = embedding_model.encode(doc["answer_llm"])
    orig_emb = embedding_model.encode(doc["answer_orig"])
    # Keep the embeddings on the record; the normalization cell reads them back
    doc["answer_llm_emb"], doc["answer_orig_emb"] = llm_emb, orig_emb
    evaluations.append(np.dot(llm_emb, orig_emb))

  0%|          | 0/300 [00:00<?, ?it/s]

In [7]:
# 75th percentile (0.75 quantile) of the dot-product scores
eval_array = np.asarray(evaluations)
percentile = np.quantile(eval_array, 0.75)
print(percentile)

31.67430877685547


**Answer**: 31.67


## Q3. Computing the cosine

We can see that the results are not between 0 and 1.

In [8]:
# Peek at the first ten dot-product scores to see their scale
print(evaluations[:10])

[17.515987, 13.418402, 25.313255, 12.147415, 18.747736, 33.970406, 30.251705, 29.521576, 35.272198, 27.751772]


The model used to generate the embeddings does not produce normalized vectors, so let's normalize them: the dot product of two unit-length vectors is their cosine similarity.

In [9]:
def normalize_vec(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Input vector (NumPy array, list or tuple of numbers).

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. A vector whose norm is zero is
        returned as zeros of the same shape instead of NaNs.
    """
    # Accept lists/tuples as well as arrays; arrays are passed through as-is
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # 0/0 would give NaNs plus a RuntimeWarning; a zero vector has no
        # direction, so hand back zeros in the dtype the division would use
        return np.zeros_like(v, dtype=norm.dtype)
    v_norm = v / norm
    return v_norm

In [10]:
# Start a fresh score list: it now collects dot products of the normalized embeddings
evaluations = []

for doc in tqdm(df_dict):
    # Normalize the embeddings computed in the previous section
    llm_unit = normalize_vec(doc["answer_llm_emb"])
    orig_unit = normalize_vec(doc["answer_orig_emb"])
    doc["answer_llm_emb_norm"] = llm_unit
    doc["answer_orig_emb_norm"] = orig_unit
    evaluations.append(np.dot(llm_unit, orig_unit))

  0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
# 75th percentile (0.75 quantile) of the scores from the normalized embeddings
eval_array = np.asarray(evaluations)
percentile = np.quantile(eval_array, 0.75)
print(percentile)

0.8362348973751068


**Answer**: 0.83


## Q4. Rouge

Let's now use a different similarity metric, Rouge score.

In [12]:
rouge_scorer = Rouge()
# Score record 10: the LLM answer is passed first, the original answer second
hypothesis = df_dict[10]['answer_llm']
reference = df_dict[10]['answer_orig']
# Keep the first element of the result, which holds the rouge-1/2/l metrics used below
scores = rouge_scorer.get_scores(hypothesis, reference)[0]
print(scores['rouge-1']['f'])

0.45454544954545456


**Answer**: 0.45


## Q5. Average rouge score

We need to compute the average of rouge-1, rouge-2 and rouge-l for the same record as in Q4, but the task does not specify whether to average only the f-scores or all the metrics (f-score, precision and recall). So let's compute both cases.

In [13]:
def avg_rouge_f_score(scores):
    """Return the mean of the 'f' values of rouge-1, rouge-2 and rouge-l.

    `scores` maps each Rouge variant name to a dict that contains an 'f' entry.
    """
    rouge_variants = ('rouge-1', 'rouge-2', 'rouge-l')
    f_total = sum(scores[variant]['f'] for variant in rouge_variants)
    return f_total / 3

def avg_rouge_total_score(scores):
    """Return the mean of every metric value found in `scores`.

    `scores` is a dict of dicts; all inner values, across all outer keys,
    are averaged together with equal weight.
    """
    all_values = [
        value
        for metrics in scores.values()
        for value in metrics.values()
    ]
    return sum(all_values) / len(all_values)


# Print both interpretations of the average for the current `scores` dict:
# first the f-score-only mean, then the mean over every metric value
print(avg_rouge_f_score(scores))
print(avg_rouge_total_score(scores))

0.35490034990035496
0.3549003532336883


In this case, both averages are more or less the same.

**Answer**: 0.35


## Q6. Average rouge score for all the data points

In this case it is specified that we need to calculate the Rouge-2 f-score average for all the records.

In [16]:
# Rouge-2 f-score of every record (LLM answer scored against the original answer).
# A comprehension keeps the loop names local, so the notebook-level `scores`
# holding the record-10 result is not overwritten; with a plain for-loop,
# re-running the averaging cell afterwards would silently use the last record.
rouge_2_f_scores = [
    rouge_scorer.get_scores(doc['answer_llm'], doc['answer_orig'])[0]['rouge-2']['f']
    for doc in tqdm(df_dict)
]

  0%|          | 0/300 [00:00<?, ?it/s]

In [19]:
# Arithmetic mean of the Rouge-2 f-scores collected above
n_records = len(rouge_2_f_scores)
avg = sum(rouge_2_f_scores) / n_records
print(avg)

0.20696501983423318


**Answer**: 0.20