# Homework

## Getting the data

In [1]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '04-monitoring/data/results-gpt4o-mini.csv'
generated_gpt4_mini = f'{base_url}/{relative_url}?raw=1'

df = pd.read_csv(generated_gpt4_mini)
#df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
#ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
df = df.iloc[:300]

In [3]:
df.shape

(300, 5)

## Q1. Getting the embeddings model

In [4]:
model_name = "multi-qa-mpnet-base-dot-v1"

In [6]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name)

In [7]:
df.columns

Index(['answer_llm', 'answer_orig', 'document', 'question', 'course'], dtype='object')

In [8]:
answer_llm = df.iloc[0].answer_llm


In [9]:
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [10]:
v_llm = embedding_model.encode(answer_llm)


In [11]:
v_llm

array([-4.22446787e-01, -2.24855945e-01, -3.24058473e-01, -2.84758747e-01,
        7.25685246e-03,  1.01186700e-01,  1.03716828e-01, -1.89983383e-01,
       -2.80596819e-02,  2.71588862e-01, -1.15337133e-01,  1.14666067e-01,
       -8.49587619e-02,  3.32365334e-01,  5.52725159e-02, -2.22195953e-01,
       -1.42540991e-01,  1.02519318e-01, -1.52333513e-01, -2.02912480e-01,
        1.98424123e-02,  8.38148072e-02, -5.68632185e-01,  2.32843738e-02,
       -1.67292967e-01, -2.39256978e-01, -8.05461779e-02,  2.57081706e-02,
       -8.15464854e-02, -7.39289895e-02, -2.61550128e-01,  1.92576721e-02,
        3.22909206e-01,  1.90357178e-01, -9.34726413e-05, -2.13165879e-01,
        2.88942400e-02, -1.79529544e-02, -5.92760742e-02,  1.99918360e-01,
       -4.75170761e-02,  1.71634018e-01, -2.45914217e-02, -9.38058272e-02,
       -3.57002944e-01,  1.33263648e-01,  1.94046125e-01, -1.18530720e-01,
        4.56915259e-01,  1.47728026e-01,  3.35945129e-01, -1.86959922e-01,
        2.45954573e-01, -

In [12]:

v_orig = embedding_model.encode(df.iloc[0].answer_orig)

v_llm.dot(v_orig)

17.515997

## Q2. Computing the dot product

In [13]:
results_gpt4omini_list = df.to_dict(orient="records")
results_gpt4omini_list

[{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
  'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
  'document': '0227b872',
  'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp'},
 {'answer_llm': 'You can sign up using the link provided in the course GitHub repository: [https://airtable.com/shryxwLd0COOEaqXo](https://airtable.com/shryxwLd0COOEaqXo).',
  'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our

In [14]:
results_gpt4omini_list[0]['answer_llm']

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [15]:
df.columns

Index(['answer_llm', 'answer_orig', 'document', 'question', 'course'], dtype='object')

In [16]:
from tqdm.auto import tqdm


In [17]:
evaluations = []

for record in tqdm(results_gpt4omini_list):
    v_llm = embedding_model.encode(record['answer_llm'])
    v_orig = embedding_model.encode(record['answer_orig'])
    dp = v_llm.dot(v_orig)
    evaluations.append(dp) #dotproduct = compute_similarity(record)
    

  0%|          | 0/300 [00:00<?, ?it/s]

In [18]:
df_evaluations= pd.DataFrame()
df_evaluations['score'] = evaluations

In [19]:
df_evaluations.describe()

Unnamed: 0,score
count,300.0
mean,27.495996
std,6.384743
min,4.547926
25%,24.307844
50%,28.336861
75%,31.674304
max,39.476013


## Q3. Computing the cosine

In [24]:
import numpy as np

In [25]:
def normalizing(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm
    

In [26]:
normalizing(v_llm)

array([-2.11306959e-02, -1.53223593e-02, -3.43620703e-02,  1.98937692e-02,
        2.64813774e-04, -2.77096275e-02, -1.63180511e-02,  3.27728651e-02,
        1.02764834e-02,  4.32049185e-02,  2.97470968e-02, -1.64741985e-02,
        4.28259596e-02,  8.87097139e-03,  2.40928703e-03, -4.06071218e-03,
       -2.75038555e-02,  2.57525593e-02, -6.17438108e-02,  1.62241049e-02,
       -3.53289098e-02,  5.32247871e-02,  5.25180344e-03,  2.28471104e-02,
        3.92429205e-03,  1.27627961e-02,  1.35364877e-02,  7.77296051e-02,
       -3.22006419e-02, -6.01691008e-02, -1.34758176e-02,  1.85390767e-02,
       -1.31161604e-03,  2.39917058e-02, -1.60573945e-05, -1.42468023e-03,
        9.87560004e-02, -1.14630526e-02, -3.72214094e-02, -3.54782380e-02,
       -5.72972447e-02, -4.96582240e-02, -2.75296974e-03, -4.27222028e-02,
        5.11458032e-02,  1.45730404e-02,  4.68182117e-02,  3.17602456e-02,
        5.42251505e-02,  4.59975377e-02,  5.68841100e-02,  1.83609016e-02,
        2.98353862e-02, -

In [27]:
evaluations

[17.515997,
 13.4184065,
 25.31325,
 12.147419,
 18.747728,
 33.970398,
 30.251701,
 29.521585,
 35.2722,
 27.751759,
 32.34471,
 31.441847,
 36.38073,
 33.340515,
 30.606163,
 32.503044,
 29.674446,
 24.353462,
 20.13246,
 23.995468,
 30.880272,
 32.692432,
 30.049168,
 16.078167,
 31.796417,
 37.98001,
 20.839043,
 32.612854,
 38.894203,
 34.051826,
 28.26388,
 27.124832,
 23.975266,
 26.34014,
 18.658117,
 25.016396,
 21.101133,
 33.726788,
 29.340345,
 28.65449,
 29.608582,
 30.810736,
 33.3312,
 26.220486,
 26.550077,
 13.148602,
 12.962549,
 12.275612,
 9.974444,
 10.883927,
 29.84507,
 32.36179,
 22.187178,
 30.268929,
 25.091877,
 32.742783,
 28.22099,
 27.274975,
 24.208641,
 22.568905,
 19.767452,
 18.679333,
 20.422318,
 22.051323,
 18.188011,
 28.455889,
 25.919704,
 23.332329,
 22.205935,
 28.296305,
 39.23055,
 36.758514,
 31.913893,
 31.202858,
 36.91305,
 30.514198,
 36.261467,
 27.397549,
 37.7928,
 23.297688,
 34.25258,
 34.550613,
 30.316456,
 35.70352,
 31.012527,
 

In [28]:
evaluations_q3 = []

for record in tqdm(results_gpt4omini_list):
    v_llm = embedding_model.encode(record['answer_llm'])
    v_orig = embedding_model.encode(record['answer_orig'])
    #dp = v_llm.dot(v_orig)
    n_v_llm= normalizing(v_llm)
    n_v_orig= normalizing(v_orig) 
    dp2 = n_v_llm.dot(n_v_orig)
    evaluations_q3.append(dp2) #dotproduct = compute_similarity(record)

  0%|          | 0/300 [00:00<?, ?it/s]

In [29]:
type(evaluations_q3)

list

In [30]:
df_evaluations_q3= pd.DataFrame()
df_evaluations_q3['normalized_score'] = evaluations_q3

In [32]:
df_evaluations_q3.describe()

Unnamed: 0,normalized_score
count,300.0
mean,0.728392
std,0.157755
min,0.125357
25%,0.651273
50%,0.763761
75%,0.836235
max,0.958796


## Q4. Rouge

In [33]:
#!pip install rouge


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [35]:
r = results_gpt4omini_list[10]
r

{'answer_llm': "Yes, all sessions are recorded, so if you miss one, you won't miss anything. You can catch up on the content later. Additionally, you can submit your questions in advance for office hours, and those sessions are also recorded.",
 'answer_orig': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
 'document': '5170565b',
 'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp'}

In [36]:
from rouge import Rouge
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [37]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

#### What's the F score for rouge-1?

 'f': 0.45454544954545456},

## Q5. Average rouge score

In [None]:
average_F_score = scores['rouge1']

In [39]:
scores['rouge-1']['f']

0.45454544954545456

In [49]:
fscores = [scores[k]['f'] for k in scores.keys() ]

In [50]:
fscores

[0.45454544954545456, 0.21621621121621637, 0.393939388939394]

In [51]:
average_f_score = sum(fscores)/len(fscores)

In [52]:
average_f_score

0.35490034990035496

## Q6. Average rouge score for all the data points

In [53]:
rouge_scores = []


for record in tqdm(results_gpt4omini_list):
   #v_llm = embedding_model.encode(record['answer_llm'])
    #v_orig = embedding_model.encode(record['answer_orig'])

    rouge_score = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    rouge_scores.append(rouge_score)
    #dp = v_llm.dot(v_orig)
    #n_v_llm= normalizing(v_llm)
    #n_v_orig= normalizing(v_orig) 
    #dp2 = n_v_llm.dot(n_v_orig)
    #evaluations_q3.append(dp2) #dotproduct = compute_similarity(record)

  0%|          | 0/300 [00:00<?, ?it/s]

In [56]:
len(rouge_scores), rouge_scores

(300,
 [{'rouge-1': {'r': 0.061224489795918366,
    'p': 0.21428571428571427,
    'f': 0.09523809178130524},
   'rouge-2': {'r': 0.017543859649122806,
    'p': 0.07142857142857142,
    'f': 0.028169010918468917},
   'rouge-l': {'r': 0.061224489795918366,
    'p': 0.21428571428571427,
    'f': 0.09523809178130524}},
  {'rouge-1': {'r': 0.08163265306122448,
    'p': 0.26666666666666666,
    'f': 0.12499999641113292},
   'rouge-2': {'r': 0.03508771929824561,
    'p': 0.13333333333333333,
    'f': 0.05555555225694465},
   'rouge-l': {'r': 0.061224489795918366, 'p': 0.2, 'f': 0.09374999641113295}},
  {'rouge-1': {'r': 0.32653061224489793,
    'p': 0.5714285714285714,
    'f': 0.41558441095631643},
   'rouge-2': {'r': 0.14035087719298245,
    'p': 0.24242424242424243,
    'f': 0.17777777313333343},
   'rouge-l': {'r': 0.30612244897959184,
    'p': 0.5357142857142857,
    'f': 0.3896103849822905}},
  {'rouge-1': {'r': 0.16326530612244897, 'p': 0.32, 'f': 0.2162162117421476},
   'rouge-2': {'r

In [58]:
rouge_scores_dict = {}
for i in range(len(rouge_scores)):
    rouge_scores_dict[i]= rouge_scores[i]

type(rouge_scores_dict)

dict

In [60]:
rouge_scores_dict[0]

{'rouge-1': {'r': 0.061224489795918366,
  'p': 0.21428571428571427,
  'f': 0.09523809178130524},
 'rouge-2': {'r': 0.017543859649122806,
  'p': 0.07142857142857142,
  'f': 0.028169010918468917},
 'rouge-l': {'r': 0.061224489795918366,
  'p': 0.21428571428571427,
  'f': 0.09523809178130524}}

In [61]:

rouge_scores_df = pd.DataFrame.from_dict(rouge_scores_dict, orient='index')


In [62]:
rouge_scores_df

Unnamed: 0,rouge-1,rouge-2,rouge-l
0,"{'r': 0.061224489795918366, 'p': 0.21428571428...","{'r': 0.017543859649122806, 'p': 0.07142857142...","{'r': 0.061224489795918366, 'p': 0.21428571428..."
1,"{'r': 0.08163265306122448, 'p': 0.266666666666...","{'r': 0.03508771929824561, 'p': 0.133333333333...","{'r': 0.061224489795918366, 'p': 0.2, 'f': 0.0..."
2,"{'r': 0.32653061224489793, 'p': 0.571428571428...","{'r': 0.14035087719298245, 'p': 0.242424242424...","{'r': 0.30612244897959184, 'p': 0.535714285714..."
3,"{'r': 0.16326530612244897, 'p': 0.32, 'f': 0.2...","{'r': 0.03508771929824561, 'p': 0.071428571428...","{'r': 0.14285714285714285, 'p': 0.28, 'f': 0.1..."
4,"{'r': 0.2653061224489796, 'p': 0.0970149253731...","{'r': 0.07017543859649122, 'p': 0.022346368715...","{'r': 0.22448979591836735, 'p': 0.082089552238..."
...,...,...,...
295,"{'r': 0.6428571428571429, 'p': 0.6666666666666...","{'r': 0.559322033898305, 'p': 0.52380952380952...","{'r': 0.6071428571428571, 'p': 0.6296296296296..."
296,"{'r': 0.6428571428571429, 'p': 0.5454545454545...","{'r': 0.5423728813559322, 'p': 0.4, 'f': 0.460...","{'r': 0.6071428571428571, 'p': 0.5151515151515..."
297,"{'r': 0.6607142857142857, 'p': 0.6491228070175...","{'r': 0.5932203389830508, 'p': 0.5384615384615...","{'r': 0.6428571428571429, 'p': 0.6315789473684..."
298,"{'r': 0.2857142857142857, 'p': 0.3265306122448...","{'r': 0.13559322033898305, 'p': 0.129032258064...","{'r': 0.2857142857142857, 'p': 0.3265306122448..."


In [69]:
rouge2 = rouge_scores_df['rouge-2'].apply(pd.Series)

In [70]:
rouge2.describe()

Unnamed: 0,r,p,f
count,300.0,300.0,300.0
mean,0.198613,0.258626,0.206965
std,0.164964,0.174559,0.15355
min,0.0,0.0,0.0
25%,0.074632,0.138093,0.097809
50%,0.159075,0.230769,0.178671
75%,0.260995,0.335366,0.286181
max,0.805556,1.0,0.73913


In [71]:
average_r = rouge2['r'].mean()
average_p = rouge2['p'].mean()
average_f = rouge2['f'].mean()
average_r, average_p, average_f

(0.19861258009846788, 0.25862646516998544, 0.20696501983423318)