# Test metrics on Toy Example

In [103]:
# Imports, autoreload setup and the list of metrics under test:
import sys
from tqdm import tqdm
import pandas as pd
%load_ext autoreload
%autoreload 2
sys.path.append("./code")
from metrics import METEOR, BLEU, WACC, CHRF, TER, compute_metrics
from data_processing import WMT22, dataset

# Metric callables exercised throughout this notebook. Each one is invoked as
# `metric(reference=..., candidate=...)` and identified by its `__name__`.
metrics = [METEOR, BLEU, WACC, CHRF, TER]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
# Toy French example: one reference sentence and three hypotheses of
# decreasing closeness to it, used to sanity-check each metric.
reference = "Je me présente je m'appelle Henri"
# Near-identical wording: only "m'appelle" is replaced by "me prénomme".
good_candidate = "Je me présente je me prénomme Henri"
# Related sentence, but only the word "Henri" is shared with the reference.
avg_candidate = "Bonjour mon nom est Henri"
# Unrelated sentence with no word in common with the reference.
bad_candidate = "J'aime les pizzas"

In [105]:
# Pair each printed label with the hypothesis it describes, so the scoring
# call is written once instead of once per candidate.
toy_candidates = (
    ("good candidate : ", good_candidate),
    ("avg candidate : ", avg_candidate),
    ("bad candidate : ", bad_candidate),
)

for metric in metrics:
    print("METRIC : ", metric.__name__)
    for label, hypothesis in toy_candidates:
        print(label, metric(reference=reference, candidate=hypothesis))

METRIC :  METEOR
good candidate :  0.6098360655737705
avg candidate :  0.0847457627118644
bad candidate :  0.0
METRIC :  BLEU
good candidate :  0.7142857313156128
avg candidate :  0.16374613344669342
bad candidate :  0.0
METRIC :  WACC
good candidate :  0.6666666567325592
avg candidate :  0.1666666865348816
bad candidate :  0.0
METRIC :  CHRF
good candidate :  0.6306954117328964
avg candidate :  0.15168410023582476
bad candidate :  0.08836230975871608
METRIC :  TER
good candidate :  0.6666666567325592
avg candidate :  0.1666666865348816
bad candidate :  0.0


## Test metrics on WMT22 data

In [106]:
# Load the WMT22 data through the project helper. The cells below rely on the
# `hyp`, `ref` and `score` columns of the returned frame.
df = WMT22.read_data()

In [107]:
# Fixed seed so that the sampled rows, and therefore the correlation tables
# computed further down from `sample`, are the same on every
# "Restart Kernel -> Run All". Without it each run draws different rows.
SEED = 42
# Number of rows to score; kept small so the metric loop below stays fast.
N_SAMPLES = 1000

sample = df.sample(N_SAMPLES, random_state=SEED)
# Show two of the sampled rows as a quick visual check.
sample.sample(2, random_state=SEED)

Unnamed: 0,seg_id,sys,hyp,domain,doc,source,ref,score,sltl
9415,1392,JDExploreAcademy,"Wenn möglich, sichern Sie alle Bücher oder Dok...",conversation,conversation_en_fr_CLIENT-02_default_2020-12-2...,"If possible, back up any books or documents th...",Sichern Sie nach Möglichkeit alle Bücher oder ...,0.0,ende
7390,1351,eTranslation,У меня есть несколько избранных семейных отнош...,social,t1_hqorx78,"I do have a select few familial relationships,...","У меня есть несколько избранных членов семьи, ...",-15.0,enru


In [108]:
# Add one column per metric to `sample`, named after the metric function and
# holding the score of each row's hypothesis against its reference.
for metric in tqdm(metrics):
    column = metric.__name__
    scores = sample.apply(
        lambda row: metric(reference=row.ref, candidate=row.hyp),
        axis=1,
    )
    sample[column] = scores

100%|██████████| 5/5 [00:19<00:00,  3.85s/it]


In [109]:
# Spearman rank correlation between the `score` column and every metric column.
spearman_columns = ["score"] + [metric.__name__ for metric in metrics]
sample[spearman_columns].corr(method="spearman")

Unnamed: 0,score,METEOR,BLEU,WACC,CHRF,TER
score,1.0,0.143539,0.129894,0.126364,0.148412,0.124266
METEOR,0.143539,1.0,0.884189,0.782927,0.871562,0.834735
BLEU,0.129894,0.884189,1.0,0.896821,0.839349,0.922585
WACC,0.126364,0.782927,0.896821,1.0,0.74942,0.968463
CHRF,0.148412,0.871562,0.839349,0.74942,1.0,0.782937
TER,0.124266,0.834735,0.922585,0.968463,0.782937,1.0


In [110]:
# Kendall rank correlation between the `score` column and every metric column.
kendall_columns = ["score"] + [metric.__name__ for metric in metrics]
sample[kendall_columns].corr(method="kendall")

Unnamed: 0,score,METEOR,BLEU,WACC,CHRF,TER
score,1.0,0.111506,0.100801,0.098154,0.115328,0.096912
METEOR,0.111506,1.0,0.723389,0.610296,0.703017,0.664192
BLEU,0.100801,0.723389,1.0,0.744026,0.669145,0.783374
WACC,0.098154,0.610296,0.744026,1.0,0.574675,0.898617
CHRF,0.115328,0.703017,0.669145,0.574675,1.0,0.607663
TER,0.096912,0.664192,0.783374,0.898617,0.607663,1.0


# Test dataset and dataloader

In [7]:
# Build the project's dataset wrapper for the "WMT22" set.
# NOTE(review): presumably a map-style dataset (it is indexed with an integer
# in the next cell) — confirm against `data_processing.dataset`.
dataset_WMT22 = dataset(set_name = "WMT22")

In [9]:
# Inspect the first item using the subscription operator, which is the
# idiomatic way to invoke `__getitem__`. The recorded output is a 5-tuple:
# two sentences, a domain string, an integer and a float.
dataset_WMT22[0]

('Sie können jederzeit zurückkehren, da unser Chat-Service-Fenster rund um die Uhr geöffnet ist',
 'Sie können jederzeit wiederkommen, da unser Chat-Service-Fenster täglich rund um die Uhr geöffnet ist',
 'conversation',
 1,
 0.0)

In [111]:
# Score the "WMT22" set with every metric in `metrics`.
# NOTE: this is slow; the recorded run took about 52 minutes.
# NOTE(review): `path` is presumably where the scores are written as CSV (the
# next cell reads the same file) — confirm against `metrics.compute_metrics`.
compute_metrics_kwargs = {
    "set_name": "WMT22",
    "metrics": metrics,
    "batch_size": 20,
    "path": "./data/metrics_scores.csv",
}
metrics_scores = compute_metrics(**compute_metrics_kwargs)

100%|██████████| 4987/4987 [52:49<00:00,  1.57it/s]


In [112]:
# Reload the scores from disk, using the `gold_score` column as the index.
scores_csv = "./data/metrics_scores.csv"
metrics_scores = pd.read_csv(scores_csv, index_col="gold_score")

In [113]:
# Display the reloaded scores (pandas truncates the middle rows of long frames).
metrics_scores

Unnamed: 0_level_0,sltl,hyp,ref,domain,seg_id,METEOR,BLEU,WACC,CHRF,TER
gold_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.386671,ende,"Sie können jederzeit zurückkehren, da unser Ch...","Sie können jederzeit wiederkommen, da unser Ch...",conversation,1,1.376832,1.642982,0.358519,0.979707,0.335239
0.386671,ende,"Ich hoffe sehr, dass Sie eine Lösung finden","Ich hoffe wirklich, dass Sie eine Lösung finde...",conversation,2,1.061923,1.260124,0.301252,0.357702,0.278000
0.386671,ende,"Vielen Dank, dass Sie #PRS_ORG# kontaktiert ha...","Vielen Dank, dass Sie #PRS_ORG# kontaktiert ha...",conversation,3,0.521747,-0.002266,0.076767,0.530673,0.101705
0.386671,ende,Ich wünsche Ihnen einen schönen Abend.,Ich wünsche Ihnen noch einen schönen Abend.,conversation,4,1.418371,1.604712,0.358519,1.297007,0.335239
0.117933,ende,Der Iran meldet die niedrigste Anzahl tägliche...,Iran meldet niedrigste Zahl täglicher COVID-19...,news,5,1.167222,0.532886,0.100819,1.134610,0.077665
...,...,...,...,...,...,...,...,...,...,...
0.386671,zhen,The exercise achievements have been confirmed ...,The results of the exercise have won recogniti...,news,1871,-1.637440,-1.262939,-0.115648,-1.791407,-0.138698
0.117933,zhen,"Ruan Xiongsheng, the director of the Exercise ...","Ruan Xiongsheng, Director of the Exercise Guid...",news,1872,1.562744,1.725648,0.358519,1.667488,0.369582
0.386671,zhen,The achievements made from exercise proved spe...,The achievements made from the exercise proved...,news,1873,1.655621,1.913690,0.401469,1.875915,0.378168
0.386671,zhen,"According to Chen Chunming, leader of Guidance...","According to Chen Chunming, leader of Guidance...",news,1874,1.070302,1.081698,0.259562,0.919349,0.265178
