## Getting the data


In [1]:
import pandas as pd
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = f'{github_url}?raw=1'
df = pd.read_csv(url)
df = df.iloc[:300]

In [2]:
df

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
295,An alternative way to load the data using the ...,Above users showed how to load the dataset dir...,8d209d6d,What is an alternative way to load the data us...,machine-learning-zoomcamp
296,You can directly download the dataset from Git...,Above users showed how to load the dataset dir...,8d209d6d,How can I directly download the dataset from G...,machine-learning-zoomcamp
297,You can fetch data for homework using the `req...,Above users showed how to load the dataset dir...,8d209d6d,Could you share a method to fetch data for hom...,machine-learning-zoomcamp
298,If the status code is 200 when downloading dat...,Above users showed how to load the dataset dir...,8d209d6d,What should I do if the status code is 200 whe...,machine-learning-zoomcamp


##  Q1. Getting the embeddings model


In [3]:
from sentence_transformers import SentenceTransformer
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [4]:
answer_llm = df.iloc[0].answer_llm
embedding = embedding_model.encode(answer_llm)
embedding

array([-4.22446549e-01, -2.24856257e-01, -3.24058414e-01, -2.84758478e-01,
        7.25642918e-03,  1.01186566e-01,  1.03716910e-01, -1.89983174e-01,
       -2.80599259e-02,  2.71588802e-01, -1.15337655e-01,  1.14666030e-01,
       -8.49586725e-02,  3.32365334e-01,  5.52720726e-02, -2.22195774e-01,
       -1.42540857e-01,  1.02519155e-01, -1.52333647e-01, -2.02912465e-01,
        1.98422875e-02,  8.38149190e-02, -5.68632066e-01,  2.32844148e-02,
       -1.67292684e-01, -2.39256918e-01, -8.05464387e-02,  2.57084146e-02,
       -8.15464780e-02, -7.39290118e-02, -2.61550009e-01,  1.92575473e-02,
        3.22909206e-01,  1.90357104e-01, -9.34726413e-05, -2.13165611e-01,
        2.88943425e-02, -1.79530401e-02, -5.92756271e-02,  1.99918285e-01,
       -4.75170948e-02,  1.71634093e-01, -2.45917086e-02, -9.38061550e-02,
       -3.57002735e-01,  1.33263692e-01,  1.94045901e-01, -1.18530318e-01,
        4.56915230e-01,  1.47728190e-01,  3.35945129e-01, -1.86959356e-01,
        2.45954901e-01, -

### What's the first value of the resulting vector?



In [5]:
embedding[0]

-0.42244655

### Answer
-0.42


## Q2. Computing the dot product


Now for each answer pair, let's create embeddings and compute dot product between them

We will put the results (scores) into the `evaluations` list

In [6]:
from tqdm import tqdm
def get_embedding(model, text_list):
    embeddings = []
    for text in tqdm(text_list):
        emb = model.encode(text)
        embeddings.append(emb)
    return embeddings
    
# def get_embedding_dict(model, text_list, name):
#     embedding_dict = {name: []}
#     for text in tqdm(text_list):
#         embedding = model.encode(text)
#         embedding_dict[name].append(embedding)
#     return embedding_dict


In [7]:
# from functools import partial
# def get_embedding_pd(model, record):
#     return model.encode(record)
# model_get_embedding = partial(get_embedding_pd, embedding_model)
# df['answer_llm_embedding'] = df['answer_llm'].apply(model_get_embedding)

In [8]:
df.columns

Index(['answer_llm', 'answer_orig', 'document', 'question', 'course'], dtype='object')

In [9]:
df['answer_llm_embedding'] = get_embedding(model=embedding_model, text_list=df['answer_llm'].to_list())
df['answer_orig_embedding'] = get_embedding(model=embedding_model, text_list=df['answer_orig'].to_list())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:36<00:00,  8.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:49<00:00,  6.08it/s]


In [10]:
# answer_llm_embedding_dict = get_embedding_dict(model=embedding_model, text_list=df['answer_llm'].to_list(), name='answer_llm_embedding')
# answer_orig_embedding_dict = get_embedding_dict(model=embedding_model, text_list=df['answer_orig'].to_list(), name='answer_orig_embedding')

In [12]:
import numpy as np
answer_llm_embedding =  np.stack(df['answer_llm_embedding'].to_numpy())
answer_orig_embedding =  np.stack(df['answer_orig_embedding'].to_numpy())
print(answer_llm_embedding.shape)
print(answer_orig_embedding.shape)

(300, 768)
(300, 768)


In [13]:
evaluations = answer_llm_embedding.dot(answer_orig_embedding.T)

In [14]:
df['dot_product'] = [a.dot(b) for a, b in zip(df['answer_llm_embedding'], df['answer_orig_embedding'])]

In [15]:
df['dot_product'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: dot_product, dtype: float64

### What's the 75% percentile of the score?

In [16]:
df['dot_product'].describe()['75%']

31.67430877685547

### Answer

31.67


## Q3. Computing the cosine


From Q2, we can see that the results are not within the [0, 1] range. It's because the vectors coming from this model are not normalized.

So we need to normalize them.

To do it, we 

* Compute the norm of a vector
* Divide each element by this norm

So, for vector `v`, it'll be `v / ||v||`

In numpy, this is how you do it:

```python
norm = np.sqrt((v * v).sum())
v_norm = v / norm
```

Let's put it into a function and then compute dot product 
between normalized vectors. This will give us cosine similarity

What's the 75% cosine in the scores?

* 0.63
* 0.73
* 0.83
* 0.93

In [17]:
def cacl_cosine(a, b):
    numerator = a.dot(b)
    denominator = np.sqrt((a*a).sum()) * np.sqrt((b*b).sum())
    return numerator / denominator
df['cosine_score'] =  [cacl_cosine(a, b) for a, b in zip(df['answer_llm_embedding'], df['answer_orig_embedding'])]

### What's the 75% cosine in the scores?

In [18]:
df['cosine_score'].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_score, dtype: float64

In [19]:
df['cosine_score'].describe()['75%']

0.836234837770462

### Answer

0.83


## Q4. Rouge


## Q4. Rouge

Now we will explore an alternative metric - the ROUGE score.  

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

We don't need to implement it ourselves, there's a python package for it:

```bash
pip install rouge
```

(The latest version at the moment of writing is `1.0.1`)

Let's compute the ROUGE score between the answers at the index 10 of our dataframe (`doc_id=5170565b`)

```
from rouge import Rouge
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
```

There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and precision, recall and F1 score for each.

* `rouge-1` - the overlap of unigrams,
* `rouge-2` - bigrams,
* `rouge-l` - the longest common subsequence

What's the F score for `rouge-1`?

- 0.35
- 0.45
- 0.55
- 0.65


In [20]:
# !pip install rouge

In [21]:
from rouge import Rouge
rouge_scorer = Rouge()
scores = rouge_scorer.get_scores(df['answer_llm'].loc[10], df['answer_orig'].loc[10])
scores2 = rouge_scorer.get_scores(df['answer_llm'], df['answer_orig'])[10]
scores_full = rouge_scorer.get_scores(df['answer_llm'], df['answer_orig'])



In [22]:
scores

[{'rouge-1': {'r': 0.45454545454545453,
   'p': 0.45454545454545453,
   'f': 0.45454544954545456},
  'rouge-2': {'r': 0.21621621621621623,
   'p': 0.21621621621621623,
   'f': 0.21621621121621637},
  'rouge-l': {'r': 0.3939393939393939,
   'p': 0.3939393939393939,
   'f': 0.393939388939394}}]

In [23]:
scores2

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

### What's the F score for rouge-1?



In [24]:
scores2['rouge-1']['f']

0.45454544954545456

### Answer`

0.45


### Q5. Average rouge score


In [25]:
scores2

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

Let's compute the average between `rouge-1`, `rouge-2` and `rouge-l` for the same record from Q4

- 0.35
- 0.45
- 0.55
- 0.65

### Answer

In [26]:
np.mean([scores2['rouge-1']['f'], scores2['rouge-2']['f'], scores2['rouge-l']['f']])

0.35490034990035496

0.35

## Q6. Average rouge score for all the data points


In [27]:
rouge_1 = scores_full['rouge-1']['f']
rouge_2 = scores_full['rouge-2']['f']
rouge_l = scores_full['rouge-l']['f']
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

TypeError: list indices must be integers or slices, not str

In [28]:
rouge_2 = [record['rouge-2']['f'] for record in scores_full]

In [29]:
np.mean(rouge_2)

0.20696501983423318

### Answer

0.20
