# Test metrics on a toy example

In [91]:
# FUNCTIONS / CLASSES :
import sys
from tqdm import tqdm
import pandas as pd
%load_ext autoreload
%autoreload 2
sys.path.append("./code")
from metrics import METEOR, BLEU, WACC, CHRF, TER, compute_metrics
from data_processing import WMT22, dataset

# Metric callables exercised throughout this notebook. Later cells call each one as
# metric(reference = ..., candidate = ...) and label its results with metric.__name__.
# NOTE(review): the recorded outputs further down also list SACRE_BLEU and NIST, which
# are neither imported above nor part of this list — confirm whether the list or the
# saved outputs are stale.
metrics = [METEOR, BLEU, WACC, CHRF, TER]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [86]:
# Toy example: one French reference sentence and three hypotheses whose names
# (good / avg / bad) indicate how close each is meant to be to the reference.
(
    reference,
    good_candidate,
    avg_candidate,
    bad_candidate,
) = (
    "Je me présente je m'appelle Henri",
    "Je me présente je me prénomme Henri",
    "Bonjour mon nom est Henri",
    "J'aime les pizzas",
)

In [100]:
# For every metric, print its name followed by the value it assigns to each toy
# hypothesis when compared against the shared reference sentence.
labelled_candidates = (
    ("good candidate : ", good_candidate),
    ("avg candidate : ", avg_candidate),
    ("bad candidate : ", bad_candidate),
)
for metric in metrics:
    print("METRIC : ", metric.__name__)
    for label, hypothesis in labelled_candidates:
        print(label, metric(reference = reference, candidate = hypothesis))

METRIC :  METEOR
good candidate :  0.6098360655737705
avg candidate :  0.0847457627118644
bad candidate :  0.0
METRIC :  BLEU
good candidate :  0.7142857313156128
avg candidate :  0.16374613344669342
bad candidate :  0.0
METRIC :  SACRE_BLEU
good candidate :  0.43472087383270264
avg candidate :  0.0
bad candidate :  0.0
METRIC :  NIST
good candidate :  1.846401786229397
avg candidate :  0.4493840682593886
bad candidate :  0.0
METRIC :  WACC
good candidate :  0.6666666567325592
avg candidate :  0.1666666865348816
bad candidate :  0.0
METRIC :  CHRF
good candidate :  0.6306954117328964
avg candidate :  0.15168410023582476
bad candidate :  0.08836230975871608
METRIC :  TER
good candidate :  0.6666666567325592
avg candidate :  0.1666666865348816
bad candidate :  0.0


## Test on WMT Data

In [98]:
# Load the WMT22 data through the project helper imported from data_processing.
# Rows sampled from it below show columns such as hyp, ref, score, domain and sltl.
# presumably a pandas DataFrame (.sample() is called on it in the next cell) — confirm.
df = WMT22.read_data()

In [99]:
# Work on a random subset so the row-by-row metric computation below stays fast.
# The draw is seeded: without random_state every run picks different rows, so the
# correlation tables further down could never be reproduced.
SAMPLE_SIZE = 1000
SEED = 42

# Cap n at the frame length; DataFrame.sample raises when asked for more rows than
# exist (sampling without replacement).
sample = df.sample(n = min(SAMPLE_SIZE, len(df)), random_state = SEED)

# Preview two rows of the subset (seeded as well, so the displayed rows are stable).
sample.sample(min(2, len(sample)), random_state = SEED)

Unnamed: 0,seg_id,sys,hyp,domain,doc,source,ref,score,sltl
29843,246,Online-Y,"Recently, an emergency release platform for in...",news,news_xinhua-zh-01.104266:Chinese-English,近日，公安部刑事侦查局打拐办开发的儿童失踪信息紧急发布平台上线。,"Recently, the platform of emergency release of...",0.0,zhen
23434,1291,Online-A,"Please ask, what is the order situation now?",conversation,conversation_en_zh-TW_CLIENT-05_2020-12-13-19_...,请问，订单情况现在是什么样,May I ask what the status of the order is now?,-6.0,zhen


In [101]:
# Add one column per metric to `sample`. The column is named after the metric
# function and holds that metric's value for each row's (ref, hyp) pair.
for metric in tqdm(metrics):

    def score_row(row, scorer = metric):
        """Return scorer's value for one row: `ref` is the reference, `hyp` the candidate."""
        return scorer(reference = row.ref, candidate = row.hyp)

    row_scores = sample.apply(score_row, axis = 1)
    sample[str(metric.__name__)] = row_scores

100%|██████████| 7/7 [00:23<00:00,  3.29s/it]


In [102]:
# Spearman rank correlation between the `score` column and every metric column
# that the previous cell added to `sample`.
metric_columns = [str(metric.__name__) for metric in metrics]
sample[["score"] + metric_columns].corr(method = "spearman")

Unnamed: 0,score,METEOR,BLEU,SACRE_BLEU,NIST,WACC,CHRF,TER
score,1.0,0.151769,0.141547,0.089533,0.029616,0.145502,0.160688,0.141055
METEOR,0.151769,1.0,0.847539,0.720235,0.676411,0.73999,0.827712,0.785328
BLEU,0.141547,0.847539,1.0,0.713094,0.743387,0.883074,0.792563,0.905119
SACRE_BLEU,0.089533,0.720235,0.713094,1.0,0.716497,0.678674,0.661901,0.677029
NIST,0.029616,0.676411,0.743387,0.716497,1.0,0.602287,0.620638,0.625635
WACC,0.145502,0.73999,0.883074,0.678674,0.602287,1.0,0.694773,0.966769
CHRF,0.160688,0.827712,0.792563,0.661901,0.620638,0.694773,1.0,0.731358
TER,0.141055,0.785328,0.905119,0.677029,0.625635,0.966769,0.731358,1.0


In [28]:
# Kendall rank correlation between the `score` column and every metric column of `sample`.
# NOTE(review): the table recorded under this cell has fewer metric columns than the
# Spearman table and an out-of-order execution count — it looks stale; re-run to refresh.
score_and_metric_columns = ["score", *(str(metric.__name__) for metric in metrics)]
sample[score_and_metric_columns].corr(method = "kendall")

Unnamed: 0,score,METEOR,BLEU,NIST,WACC,CHRF
score,1.0,0.138642,0.104777,0.034244,0.101007,0.117543
METEOR,0.138642,1.0,0.703407,0.499913,0.595935,0.716147
BLEU,0.104777,0.703407,1.0,0.574815,0.736617,0.724203
NIST,0.034244,0.499913,0.574815,1.0,0.445493,0.544694
WACC,0.101007,0.595935,0.736617,0.445493,1.0,0.618921
CHRF,0.117543,0.716147,0.724203,0.544694,0.618921,1.0


# Test dataset and dataloader

In [7]:
# Build the project's dataset object (imported from data_processing) for the WMT22 set.
# The next cell indexes it to inspect a single item.
dataset_WMT22 = dataset(set_name = "WMT22")

In [9]:
# Inspect the first item using subscript syntax instead of calling __getitem__ directly.
# The recorded output is a 5-tuple: two German sentences, a domain label, an int and a float.
# NOTE(review): presumably (hyp, ref, domain, seg_id, score) — confirm against the dataset class.
dataset_WMT22[0]

('Sie können jederzeit zurückkehren, da unser Chat-Service-Fenster rund um die Uhr geöffnet ist',
 'Sie können jederzeit wiederkommen, da unser Chat-Service-Fenster täglich rund um die Uhr geöffnet ist',
 'conversation',
 1,
 0.0)

In [5]:
# Run every metric over the WMT22 set via the project helper compute_metrics.
# NOTE(review): the saved run of this cell was interrupted (KeyboardInterrupt at ~1%);
# presumably the helper writes its results to `path`, since a later cell reads the
# same CSV — confirm.
WMT_SET_NAME = "WMT22"
METRICS_BATCH_SIZE = 20
METRICS_SCORES_CSV = "./data/metrics_scores.csv"

metrics_scores = compute_metrics(
    set_name = WMT_SET_NAME,
    metrics = metrics,
    batch_size = METRICS_BATCH_SIZE,
    path = METRICS_SCORES_CSV,
)

  1%|▏         | 66/4987 [00:06<07:58, 10.29it/s]


KeyboardInterrupt: 

In [9]:
# Reload the metric scores from the CSV on disk, using the gold_score column as the index.
scores_csv_path = "./data/metrics_scores.csv"
metrics_scores = pd.read_csv(scores_csv_path, index_col = "gold_score")

In [10]:
# Display the scores table loaded from the CSV (a bare last expression triggers rich display).
metrics_scores

Unnamed: 0_level_0,sltl,hyp,ref,domain,seg_id,METEOR,BLEU,NIST,WACC
gold_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.386671,ende,"Sie können jederzeit zurückkehren, da unser Ch...","Sie können jederzeit wiederkommen, da unser Ch...",conversation,1,1.376832,1.642982,1.380845,0.358519
0.386671,ende,"Ich hoffe sehr, dass Sie eine Lösung finden","Ich hoffe wirklich, dass Sie eine Lösung finde...",conversation,2,1.061923,1.260124,0.571660,0.301252
0.386671,ende,"Vielen Dank, dass Sie #PRS_ORG# kontaktiert ha...","Vielen Dank, dass Sie #PRS_ORG# kontaktiert ha...",conversation,3,0.521747,-0.002266,0.475743,0.076767
0.386671,ende,Ich wünsche Ihnen einen schönen Abend.,Ich wünsche Ihnen noch einen schönen Abend.,conversation,4,1.418371,1.604712,0.533203,0.358519
0.117933,ende,Der Iran meldet die niedrigste Anzahl tägliche...,Iran meldet niedrigste Zahl täglicher COVID-19...,news,5,1.167222,0.532886,0.299814,0.100819
...,...,...,...,...,...,...,...,...,...
0.386671,zhen,The exercise achievements have been confirmed ...,The results of the exercise have won recogniti...,news,1871,-1.637440,-1.262939,-1.654093,-0.115648
0.117933,zhen,"Ruan Xiongsheng, the director of the Exercise ...","Ruan Xiongsheng, Director of the Exercise Guid...",news,1872,1.562744,1.725648,2.346062,0.358519
0.386671,zhen,The achievements made from exercise proved spe...,The achievements made from the exercise proved...,news,1873,1.655621,1.913690,2.241715,0.401469
0.386671,zhen,"According to Chen Chunming, leader of Guidance...","According to Chen Chunming, leader of Guidance...",news,1874,1.070302,1.081698,2.083909,0.259562
