In [1]:
import nltk
from nltk import bigrams
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
from nltk.corpus import wordnet as wn

import re
import itertools
import random
import pickle
import pandas as pd
from tqdm import tqdm

# WMT16 and WMT17 segment-level data 

In [2]:
wmt16 = pickle.load(open('../wmt16-19-metrics-shared-task/data/pickles/wmt16-seg_level-agg.pkl', 'rb'))
wmt16['year'] = len(wmt16)*[2016]

wmt17 = pickle.load(open('../wmt16-19-metrics-shared-task/data/pickles/wmt17-seg_level-agg.pkl', 'rb'))
wmt17['year'] = len(wmt17)*[2017]

wmt_mst_seg = pd.concat([wmt16, wmt17])
wmt_mst_seg_toen = wmt_mst_seg[wmt_mst_seg.lp.str.endswith('de-en') & (wmt_mst_seg.year == 2016)].copy()

In [3]:
wmt_mst_seg.head(5)

Unnamed: 0,lp,system,sid,sentBLEU,score,output,reference,source,year
0,en-ru,jhu-pbmt,1092,0.273012,0.363122,43 закусочных нарушил требования к организации...,43 закусочных нарушили требования к устройству...,43 eateries violated requirements for the orga...,2016
1,en-ru,online-G,750,0.076668,-0.450232,"Учитывая, что он представляет собой сугубо пра...","Кажется маловероятным, что Кэмерон, будучи, по...","Given he is an avowedly hands-on parent, it se...",2016
2,en-ru,AFRL-MITLL-phrase-based,2786,0.252464,0.113451,Печать процессы глобализации должны помогли ум...,Распечатать Процессы глобализации должны были ...,Printing the Processes of Globalisation should...,2016
3,en-ru,LIMSI,250,0.531697,-0.257524,"Это нечто значительное, странное или необычное...","Что-то важное, странное или необычное происход...","Is something significant, bizarre or unusual h...",2016
4,en-ru,AFRL-MITLL-phrase-based,88,0.097414,-0.695001,"В то время как сами праздничные дни месяца, в ...","Хотя до праздников еще несколько месяцев, сезо...","While the holidays themselves are months away,...",2016


In [4]:
reference = wmt_mst_seg[wmt_mst_seg.lp == 'de-en'].iloc[8]['reference'].strip()
hypothesis = wmt_mst_seg[wmt_mst_seg.lp == 'de-en'].iloc[8]['output'].strip()
print(reference.strip())
print(hypothesis)

through minor borders on the Austrian side, says a spokesperson for the Federal police.
Out on the Austrian side, a spokesman for the federal police said.


# Level sets

### Reference response

In [5]:
def identity(hypothesis):
    return hypothesis

### Removing punctuation

In [6]:
def remove_punct(hypothesis):
    return re.sub(r'[^\w\s]', '', hypothesis)

remove_punct(hypothesis)

'Out on the Austrian side a spokesman for the federal police said'

### Simplifying response

In [7]:
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
d = TreebankWordDetokenizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /nas/home/jwei/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /nas/home/jwei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def get_pos(hypothesis):
    return nltk.pos_tag(word_tokenize(hypothesis))

def remove_all_modifiers(hypothesis):
    tags = get_pos(hypothesis)
    bad_tags = ['JJ', 'PRP$', 'RB']
    retained = [ word for word, tag in tags if tag not in bad_tags]
    return d.detokenize(retained)

remove_all_modifiers(hypothesis)

'Out on the side, a spokesman for the police said.'

### Synonyms

In [9]:
nltk.download('wordnet')

def replace_synonyms(hypothesis):
    tags = get_pos(hypothesis)
    
    adv_hypothesis = word_tokenize(hypothesis).copy()
    for i, (tok, tag) in enumerate(tags):
        if tag == 'JJ': 
            pos = wn.ADJ
        elif tag == 'RB':
            pos = wn.ADV
        elif tag == 'N':
            pos = wn.NOUN
        else:
            continue
            
        synset = wn.synsets(tok, pos=pos)
        if len(synset) > 0:
            lemmas = [ i.lemma_names() for i in synset ]
            adv_hypothesis[i] = random.choice(random.choice(lemmas))
            
    return d.detokenize(adv_hypothesis)

replace_synonyms(hypothesis)

[nltk_data] Downloading package wordnet to /nas/home/jwei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'Out on the Austrian side, a spokesman for the federal police said.'

# Attacks

### Removing NLTK stopwords

In [10]:
nltk.download('stopwords')
sw = set(stopwords.words("english"))
sw_25 = set(stopwords.words("english")[:25])

[nltk_data] Downloading package stopwords to
[nltk_data]     /nas/home/jwei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def remove_all_stopwords(hypothesis):
    toks = word_tokenize(hypothesis)
    return d.detokenize([ i for i in toks if i.lower() not in sw ])

print(reference)
print(remove_all_stopwords(reference))

through minor borders on the Austrian side, says a spokesperson for the Federal police.
minor borders Austrian side, says spokesperson Federal police.


In [12]:
def remove_top25_stopwords(hypothesis):
    toks = word_tokenize(hypothesis)
    return d.detokenize([ i for i in toks if i.lower() not in sw_25 ])

print(reference)
print(remove_all_stopwords(reference))

through minor borders on the Austrian side, says a spokesperson for the Federal police.
minor borders Austrian side, says spokesperson Federal police.


### Order perturbation

In [13]:
def jumble(hypothesis):
    adv_hypothesis = word_tokenize(hypothesis).copy()
    random.shuffle(adv_hypothesis)
    return d.detokenize(adv_hypothesis)

jumble(hypothesis)

'said Austrian . the on a spokesman, police for federal the Out side'

In [14]:
def reverse(hypothesis):
    adv_hypothesis = word_tokenize(hypothesis)
    return d.detokenize(adv_hypothesis[::-1])

reverse(hypothesis)

'. said police federal the for spokesman a, side Austrian the on Out'

# Output

In [15]:
level_sets = ['identity', 'remove_punct', 'remove_all_modifiers', 'replace_synonyms']

for func_name in tqdm(level_sets):
    func = globals()[func_name]
    wmt_mst_seg_toen['level_ref:%s' % func_name] = wmt_mst_seg_toen.reference.apply(lambda x: func(x))
    wmt_mst_seg_toen['level_out:%s' % func_name] = wmt_mst_seg_toen.output.apply(lambda x: func(x))

100%|██████████| 4/4 [00:05<00:00,  1.26s/it]


In [16]:
attacks = ['remove_all_stopwords', 'remove_top25_stopwords', 'jumble', 'reverse']

for func_name in tqdm(attacks):
    func = globals()[func_name]
    wmt_mst_seg_toen['attack_ref:%s' % func_name] = wmt_mst_seg_toen.reference.apply(lambda x: func(x))
    wmt_mst_seg_toen['attack_out:%s' % func_name] = wmt_mst_seg_toen.output.apply(lambda x: func(x))

100%|██████████| 4/4 [00:01<00:00,  2.30it/s]


In [17]:
wmt_mst_seg_toen

Unnamed: 0,lp,system,sid,sentBLEU,score,output,reference,source,year,level_ref:identity,...,level_ref:replace_synonyms,level_out:replace_synonyms,attack_ref:remove_all_stopwords,attack_out:remove_all_stopwords,attack_ref:remove_top25_stopwords,attack_out:remove_top25_stopwords,attack_ref:jumble,attack_out:jumble,attack_ref:reverse,attack_out:reverse
3360,de-en,jhu-syntax,677,0.158512,0.691142,At the moment the men predominate among the st...,Currently the majority of staff are men.\n,Im Augenblick überwiegen bei den Mitarbeitern ...,2016,Currently the majority of staff are men.\n,...,presently the majority of staff are men.,At the moment the men predominate among the st...,Currently majority staff men.,moment men predominate among staff.,Currently the majority of staff are men.,At the moment the men predominate among the st...,the are Currently men majority staff of.,men moment the the . among predominate At staf...,. men are staff of majority the Currently,. staff the among predominate men the moment t...
3361,de-en,jhu-pbmt,1092,0.421134,-0.762188,"A good prank is funny, but it takes only momen...","A good prank is funny, but takes moments to re...","Ein guter Streich ist lustig, aber es dauert n...",2016,"A good prank is funny, but takes moments to re...",...,"A estimable prank is queer, but takes moments ...","A good prank is fishy, but it takes solely mom...","good prank funny, takes moments reverse.","good prank funny, takes moments becomes boomer...","A good prank is funny, but takes moments to re...","A good prank is funny, but it takes only momen...","takes, is good . A to reverse moments but pran...","becomes moments before A funny is, takes it a ...",". reverse to moments takes but, funny is prank...",. boomerang a becomes he before moments only t...
3362,de-en,online-B,1375,0.505926,-0.599309,Threatening is above them at one end of the or...,Looming over them at one end of the central ba...,Drohend über ihnen an einem Ende des verzierte...,2016,Looming over them at one end of the central ba...,...,Looming over them at one end of the fundamenta...,Threatening is above them at one end of the or...,"Looming one end central bank's ornate, two-sto...","Threatening one end ornate, two-story courtroo...",Looming over them at one end of the central ba...,Threatening is above them at one end of the or...,Looming mural's of boardroom ornate over one b...,". at is a two-story end, United the one States...",. States United the of mural a be will boardro...,. States United the of mural a courtroom two-s...
3363,de-en,online-G,616,0.178942,-0.305292,"You see, why this program is not often discuss...",Can you see why that programme is often discus...,"Sehen Sie ein, warum dieses Programm oft disku...",2016,Can you see why that programme is often discus...,...,Can you see why that programme is often discus...,"You see, why this program is non ofttimes disc...",see programme often discussed original particu...,"see, program often discussed, original, partic...",Can see why that programme is often discussed ...,"see, why this program is not often discussed, ...",? Can original discussed because is it is is s...,"not original ,? program, and why it is, not is...",? distinctive particularly not is it and origi...,"? striking particularly not and, original is i..."
3364,de-en,jhu-syntax,679,0.547704,0.362249,"Mrs Laury said: ""It is still a majority of men...","Mrs Laury said: ""We still have a majority of m...","Mrs Laury sagte: ""Es arbeitet immer noch eine ...",2016,"Mrs Laury said: ""We still have a majority of m...",...,Mrs Laury said:``We still have a majority of m...,Mrs Laury said:``It is nonetheless a majority ...,Mrs Laury said:``still majority men working st...,Mrs Laury said:``still majority men stores.,Mrs Laury said:``still have a majority of men ...,Mrs Laury said:``It is still a majority of men...,Laury of working our We a have said majority``...,Laury of still It is stores in: our a majority...,. stores our in working men of majority a have...,. stores our in men of majority a still is It ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,de-en,uedin-syntax,2921,0.316149,-0.893829,"It is fashionable for the first, tentative ste...","He represents the designer's first, tentative ...","Er steht für die ersten, zaghaften modischen G...",2016,"He represents the designer's first, tentative ...",...,"He represents the designer's beginning, tentat...","It is fashionable for the first, doubtful step...","represents designer's first, tentative steps f...","fashionable first, tentative steps designer.","represents the designer's first, tentative ste...","It is fashionable for the first, tentative ste...","the designer represents steps's first, He tent...",of It for tentative fashionable the designer f...,". fashion at steps tentative, first's designer...",". designer the of steps tentative, first the f..."
3916,de-en,KIT,2090,0.772290,1.019740,"If you want to know more about the case, pleas...","If you want to know more about the case, pleas...","Wenn Sie mehr über den Fall wissen wollen, hör...",2016,"If you want to know more about the case, pleas...",...,"If you want to know more about the case, pleas...","If you want to know more about the case, pleas...","want know case, please listen series podcasts'...","want know case, please listen series podcasts,...","If want to know more about the case, please li...","If want to know more about the case, please li...",If the series . the podcasts to made about've ...,"have want the I of you, know listen If case . ...",. made've I podcasts of series the to listen p...,". made have I which, podcasts of series the to..."
3917,de-en,KIT,2158,0.526244,0.934524,"The cyclist, a 40-year-old man from Cobram, re...","The cyclist, a 40-year-old Cobram man, remains...","Der Radfahrer, ein 40-jähriger Mann aus Cobram...",2016,"The cyclist, a 40-year-old Cobram man, remains...",...,"The cyclist, a 40-year-old Cobram man, remains...","The cyclist, a 40-year-old man from Cobram, re...","cyclist , 40-year-old Cobram man, remains hosp...","cyclist , 40-year-old man Cobram, remains hosp...","The cyclist, a 40-year-old Cobram man, remains...","The cyclist, a 40-year-old man from Cobram, re...",". The a, remains hospital in in condition cycl...",condition stable remains in Cobram he and a in...,". condition stable a in hospital in remains, m...",. condition stable a in is he and hospital in ...
3918,de-en,KIT,2097,0.292517,0.867003,"And from this perspective, I will see him die.\n",And it's from this perspective that I will wat...,Und aus dieser Perspektive werde ich ihn sterb...,2016,And it's from this perspective that I will wat...,...,And it's from this perspective that I will wat...,"And from this perspective, I will see him die.",'s perspective watch die.,"perspective, see die.",And it's from this perspective that will watch...,"And from this perspective, will see die.",from I that it him die's . perspective will An...,"perspective him this I die And, will from see.",. die him watch will I that perspective this f...,". die him see will I, perspective this from And"


In [18]:
pickle.dump(wmt_mst_seg_toen, open('wmt_mst_seg_toen.pkl', 'wb'))