# Test metrics on Toy Example

In [62]:
# FUNCTIONS / CLASSES :
import sys
from tqdm import tqdm
%load_ext autoreload
%autoreload 2
sys.path.append("./code")
from metrics import METEOR, BLEU, NIST, WACC
from data_processing import WMT22

metrics = [METEOR, BLEU, NIST, WACC]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
reference = "Je me présente je m'appelle Henri"
good_candidate = "Je me présente je me prénomme Henri"
avg_candidate = "Bonjour mon nom est Henri"
bad_candidate = "J'aime les pizzas"

## 1 - METEOR

In [47]:
print(METEOR(good_candidate, reference))
print(METEOR(avg_candidate, reference))
print(METEOR(bad_candidate, reference))

0.5391304347826087
0.09803921568627452
0.0


## 2 - BLEU

In [48]:
print(BLEU(good_candidate, reference))
print(BLEU(avg_candidate, reference))
print(BLEU(bad_candidate, reference))

0.7054014205932617
0.1666666567325592
0.0


## 3 - NIST

In [49]:
print(NIST(good_candidate, reference))
print(NIST(avg_candidate, reference))
print(NIST(bad_candidate, reference))

1.9656602059902528
0.3869880158145604
0.0


## 4 - WAcc

In [50]:
print(WACC(good_candidate, reference))
print(WACC(avg_candidate, reference))
print(WACC(bad_candidate, reference))

0.7142857015132904
0.0
-1.0


## Test on WMT Data

In [69]:
df = WMT22.read_data()

In [70]:
sample = df.sample(1000)
sample.sample(2)

Unnamed: 0,seg_id,sys,hyp,domain,doc,source,ref,score,sltl
29200,911,QUARTZ_TuneReranking,Возврат товара должен быть произведен в течени...,ecommerce,ecommerce_en_24,Returns must be within 30 days of delivery in ...,Возвраты осуществляются в течение 30 дней с мо...,0.0,enru
1085,1102,bleu_bestmbr,"Через несколько недель после того, как я верну...",social,t1_hoh4p6r,"A few weeks after I returned home, I received ...","Спустя несколько недель после того, как я верн...",-3.0,enru


In [78]:
for metric in tqdm(metrics):
    
    sample[str(metric.__name__)] = sample.apply(lambda x : metric(
        reference = x.ref, 
        candidate = x.hyp,), axis = 1)

100%|█████████████████████████████████████████████| 4/4 [00:01<00:00,  2.94it/s]


In [79]:
sample[["score"] + [str(metric.__name__) for metric in metrics]].corr(method = "spearman")

Unnamed: 0,score,METEOR,BLEU,NIST,WACC
score,1.0,0.129131,0.107183,0.035658,0.102204
METEOR,0.129131,1.0,0.876642,0.72835,0.762269
BLEU,0.107183,0.876642,1.0,0.777584,0.889027
NIST,0.035658,0.72835,0.777584,1.0,0.636838
WACC,0.102204,0.762269,0.889027,0.636838,1.0


In [81]:
sample[["score"] + [str(metric.__name__) for metric in metrics]].corr(method = "kendall")

Unnamed: 0,score,METEOR,BLEU,NIST,WACC
score,1.0,0.100935,0.083766,0.02728,0.079863
METEOR,0.100935,1.0,0.710016,0.538894,0.590097
BLEU,0.083766,0.710016,1.0,0.593039,0.732003
NIST,0.02728,0.538894,0.593039,1.0,0.460533
WACC,0.079863,0.590097,0.732003,0.460533,1.0
