In [2]:
import pandas as pd

In [3]:
# GitHub blob-page URL of the results CSV that this notebook analyses.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

In [4]:
# Append the "?raw=1" query so the request returns the CSV file itself
# rather than the GitHub HTML page, then load it into a DataFrame.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [5]:
# Keep only the first 300 rows. The explicit .copy() yields an independent
# frame, so the columns added to `df` later in the notebook are never written
# into a slice of the originally loaded DataFrame.
df = df.iloc[:300].copy()

In [6]:
# Identifier of the sentence-transformers model loaded in the next cell.
model_name = "multi-qa-mpnet-base-dot-v1"

In [7]:
from sentence_transformers import SentenceTransformer
# Load the embedding model once; every encode() call in this notebook reuses it.
embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





## Question 1. Embedding first value

In [9]:
# Embed the LLM answer of the first row and display the first component
# of the resulting vector.
answer_llm = df['answer_llm'].iloc[0]
first_embedding = embedding_model.encode(answer_llm)
first_embedding[0]

-0.42244658

## Q2. Computing the dot product

In [20]:
from tqdm.auto import tqdm

In [37]:
import numpy as np
def cosine_similarity(vec1, vec2, dot_product=None):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        The vectors to compare. Their norms are taken with
        ``np.linalg.norm`` using its default arguments.
    dot_product : scalar, optional
        A precomputed dot product of ``vec1`` and ``vec2``. When omitted
        (``None``) it is computed here with ``np.dot``, so callers cannot
        accidentally pass a value that does not match the vectors.

    Returns
    -------
    scalar
        ``dot_product / (||vec1|| * ||vec2||)``.

    Notes
    -----
    There is no guard for a zero vector: its norm is 0, so the denominator
    is 0 as well.
    """
    if dot_product is None:
        dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [39]:
# Per-row similarity between the LLM answer and the original answer.
evaluations, dot_products = [], []

records = df.to_dict(orient='records')
for record in tqdm(records):
    # Embed both answers with the same model so the vectors are comparable.
    llm_vec = embedding_model.encode(record["answer_llm"])
    orig_vec = embedding_model.encode(record["answer_orig"])

    # The raw dot product is stored on its own and reused for the cosine score.
    score = llm_vec.dot(orig_vec)
    dot_products.append(score)
    evaluations.append(cosine_similarity(llm_vec, orig_vec, score))

100%|██████████| 300/300 [01:33<00:00,  3.22it/s]


In [40]:
# Attach both per-row metrics to the frame as new columns.
df = df.assign(dot_products=dot_products, cosine=evaluations)

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dot products.
df['dot_products'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547924
25%       24.307847
50%       28.336872
75%       31.674312
max       39.476013
Name: dot_products, dtype: float64

## Question 3. 75th percentile of the cosine similarity

In [47]:
# Summary statistics of the cosine column; the "75%" row is the value asked for here.
df['cosine'].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine, dtype: float64

## Q4. Rouge

In [48]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [56]:
# Row at position 10, used as the single example pair for the ROUGE scores below.
r = df.iloc[10]

In [57]:
from rouge import Rouge
rouge_scorer = Rouge()

# Score the example row's LLM answer against its original answer.
# The result is indexable; entry 0 holds the scores for this one pair.
pair_scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])
scores = pair_scores[0]

In [59]:
# The 'f' (F-score) value of the 'rouge-1' entry for the example pair.
scores['rouge-1']['f']

0.45454544954545456

## Q5. Average rouge score

In [61]:
# Mean of the 'f' values over every metric present in `scores`.
f_scores = [metric['f'] for metric in scores.values()]
np.average(f_scores)

0.35490034990035496

## Q6. Average ROUGE-2 score for all the data points

In [64]:
# Score every LLM answer against its original answer in a single call.
hypotheses = df['answer_llm']
references = df['answer_orig']
all_scores = rouge_scorer.get_scores(hypotheses, references)

In [66]:
# Mean 'rouge-2' 'f' value across all scored pairs.
rouge2_f = [entry['rouge-2']['f'] for entry in all_scores]
np.average(rouge2_f)

0.20696501983423318