In [1]:
import pandas as pd
from collections import defaultdict
import os
import pickle
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import numpy as np
import itertools
import scipy
from statsmodels.stats.power import TTestIndPower
from collections import Counter

In [2]:
# Root folder of the public data release that load_data() walks: one
# sub-directory per campaign, each holding the Excel workbooks of its systems.
# Forward slashes only: the former "...\public_release" spelling relied on
# "\p" being an unrecognised escape sequence, which newer Python versions
# warn about.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
PUBLIC_RELEASE_PATH = "C:/Users/t-johnnywei/Documents/GitHub/ToShipOrNotToShip/public_release"

In [3]:
def load_data(use_cache=True):
    """Load every campaign/system workbook found under ``PUBLIC_RELEASE_PATH``.

    The release folder is read as one sub-directory per campaign, and each
    file directly inside a campaign directory is opened as an Excel workbook
    for one system. Sheets named ``hum_annotations`` and ``full_test`` are
    kept as DataFrames; every other sheet is reduced to a plain dict keyed by
    its ``Unnamed: 0`` column and holding the values of the first remaining
    column.

    Args:
        use_cache: when True, return the content of ``data.pickle`` (looked
            up in the current working directory) if that file exists, and
            write that file after a fresh load.

    Returns:
        Nested mapping ``data[campaign][system][sheet_name]``.

    Raises:
        FileNotFoundError: if ``PUBLIC_RELEASE_PATH`` cannot be listed.
    """
    cache_filename = "data.pickle"

    if use_cache and os.path.isfile(cache_filename):
        # SECURITY: unpickling can execute arbitrary code. Only load a cache
        # file that this notebook wrote itself.
        with open(cache_filename, 'rb') as handle:
            data = pickle.load(handle)
        print("Annotated data loaded")
        return data

    # os.walk() silently yields nothing for an unreadable/missing directory,
    # so turn that into an explicit error instead of a bare StopIteration.
    top_level = next(os.walk(PUBLIC_RELEASE_PATH), None)
    if top_level is None:
        raise FileNotFoundError(
            f"Cannot list release directory: {PUBLIC_RELEASE_PATH}")
    campaigns_list = top_level[1]

    data = defaultdict(dict)
    for counter, campaign in enumerate(campaigns_list, start=1):
        data[campaign] = defaultdict(dict)
        campaign_dir = f"{PUBLIC_RELEASE_PATH}/{campaign}"

        # Only files directly inside the campaign directory are systems: the
        # workbook path is built as <campaign>/<file>, so files from nested
        # folders could never be opened through it.
        _, _, systems_list = next(os.walk(campaign_dir), (campaign_dir, [], []))

        for system in systems_list:
            data[campaign][system] = defaultdict(dict)
            print(f"Loading {counter}/{len(campaigns_list)} campaign")

            # Context manager closes the workbook handle once all sheets are read.
            with pd.ExcelFile(f"{campaign_dir}/{system}") as xls:
                for datatype in xls.sheet_names:
                    df = pd.read_excel(xls, datatype)
                    if datatype in ["hum_annotations", "full_test"]:
                        data[campaign][system][datatype] = df
                    else:
                        # transform to dictionary: after the transpose, row 0
                        # is the first remaining column, keyed by "Unnamed: 0"
                        df_dict = df.set_index("Unnamed: 0").transpose()
                        data[campaign][system][datatype] = df_dict.iloc[0].to_dict()

    # save the cache data
    if use_cache:
        with open(cache_filename, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Annotated data loaded")
    return data

In [4]:
# Nested campaign -> system -> sheet mapping; the on-disk cache is used when present.
data = load_data(use_cache=True)

Annotated data loaded


In [5]:
def pairs():
    """Yield the human-annotation entries of every unordered system pair.

    For each campaign in the module-level ``data`` mapping, every
    2-combination of its systems is visited and the two systems'
    ``'hum_annotations'`` entries are yielded together as a tuple.
    """
    for systems in data.values():
        yield from (
            (systems[first]['hum_annotations'], systems[second]['hum_annotations'])
            for first, second in itertools.combinations(systems, 2)
        )

In [6]:
# Language pair (first Source value, first Target value) of every system
# pair, read from the first frame of the pair.
lps = [
    (first['Source'].unique()[0], first['Target'].unique()[0])
    for first, _second in pairs()
]

# Keep the ten language pairs that occur most often.
st = [lp for lp, _count in Counter(lps).most_common(10)]

st

[('ENU', 'FRA'),
 ('ENU', 'DEU'),
 ('FRA', 'ENU'),
 ('DEU', 'ENU'),
 ('JPN', 'ENU'),
 ('ENU', 'JPN'),
 ('ITA', 'ENU'),
 ('CHS', 'ENU'),
 ('ENU', 'PTB'),
 ('ENU', 'SVE')]

In [7]:
def mde(sorted_results, p_test=0.05, threshold=0.95):
    """Scan ordered test results for the observed minimum detectable effect.

    Walks ``sorted_results`` in the given order while keeping running counts
    of significant (``pvalue < p_test``) and non-significant comparisons. As
    soon as the running share of significant comparisons drops below
    ``threshold``, the scan stops and the difference of the previously
    visited entry is reported.

    Args:
        sorted_results: iterable of ``(diff, pvalue)`` pairs in scan order.
        p_test: significance level a p-value must be strictly below to count
            as significant.
        threshold: minimum running share of significant comparisons.

    Returns:
        Tuple ``(last_diff, sig, insig)``. ``last_diff`` is the ``diff`` of
        the entry visited just before the share fell below ``threshold``
        (``0`` if that happens on the very first entry), or ``None`` if the
        share never falls below ``threshold``. ``sig`` / ``insig`` are the
        counts accumulated up to the point where the scan stopped.
    """
    sig, insig = 0, 0
    last_diff = 0
    for diff, pvalue in sorted_results:
        if pvalue < p_test:
            sig += 1
        else:
            insig += 1

        # sig + insig >= 1 here, so the ratio is always defined.
        if sig / (sig + insig) < threshold:
            return last_diff, sig, insig
        last_diff = diff

    # no mde found
    return None, sig, insig

In [8]:
# One tab-separated row per language pair: label, "significant / total"
# system comparisons, observed MDE, and median score difference.
for source, target in st:
    records = ['%s $\\rightarrow$ %s' % (source, target)]

    # System pairs belonging to this language pair. The codes are taken from
    # the first unique value of each column (the same way `st` was built), so
    # this is a plain scalar comparison; comparing against the whole
    # `.unique()` array raises as soon as a sheet holds more than one code.
    lp_pairs = [
        (i, j)
        for i, j in pairs()
        if i['Source'].unique()[0] == source and i['Target'].unique()[0] == target
    ]

    results = []
    for df1, df2 in lp_pairs:
        diff = np.abs(df1['Score'].mean() - df2['Score'].mean())
        _statistic, pvalue = scipy.stats.mannwhitneyu(df1['Score'], df2['Score'])
        results.append((diff, pvalue))
    results = np.array(results)
    # 0.05 is the same significance level mde() applies by default.
    records.append('%d / %d' % (np.sum(results[:, 1] < 0.05), len(results)))

    # Largest differences first: that is the order mde() scans in.
    order = (-results[:, 0]).argsort()
    sorted_results = results[order]
    obs_mde, sig, insig = mde(sorted_results)
    # mde() returns None when no MDE is found; '%.1f' % None would raise.
    records.append('n/a' if obs_mde is None else '%.1f' % obs_mde)

    # Median difference: the element at the midpoint of the descending order.
    q_diff = sorted_results[int(len(sorted_results) * 0.5), 0]
    records.append('%.1f' % q_diff)

    print('\t'.join(records))

ENU $\rightarrow$ FRA	30 / 153	3.8	1.2
ENU $\rightarrow$ DEU	19 / 151	3.5	0.7
FRA $\rightarrow$ ENU	3 / 140	2.4	0.6
DEU $\rightarrow$ ENU	27 / 130	1.9	0.6
JPN $\rightarrow$ ENU	78 / 127	2.9	3.2
ENU $\rightarrow$ JPN	40 / 94	3.8	1.8
ITA $\rightarrow$ ENU	2 / 81	2.8	0.5
CHS $\rightarrow$ ENU	30 / 78	2.6	1.5
ENU $\rightarrow$ PTB	28 / 74	1.0	0.6
ENU $\rightarrow$ SVE	31 / 73	4.4	1.4


In [10]:
# One tab-separated row per language pair: label, mean of the per-system mean
# scores, and mean of the per-system score standard deviations. Both frames of
# every system pair contribute, so a system is counted once per pair it is in.
for source, target in st:
    records = ['%s $\\rightarrow$ %s' % (source, target)]

    # System pairs belonging to this language pair. The codes are taken from
    # the first unique value of each column (the same way `st` was built), so
    # this is a plain scalar comparison; comparing against the whole
    # `.unique()` array raises as soon as a sheet holds more than one code.
    lp_pairs = [
        (i, j)
        for i, j in pairs()
        if i['Source'].unique()[0] == source and i['Target'].unique()[0] == target
    ]

    results = [df['Score'].mean() for pair in lp_pairs for df in pair]
    records.append('%f' % np.mean(results))

    results = [df['Score'].std() for pair in lp_pairs for df in pair]
    records.append('%f' % np.mean(results))

    print('\t'.join(records))

ENU $\rightarrow$ FRA	90.294452	18.246645
ENU $\rightarrow$ DEU	91.991749	14.313352
FRA $\rightarrow$ ENU	92.475884	13.612182
DEU $\rightarrow$ ENU	91.650683	14.927010
JPN $\rightarrow$ ENU	70.862999	22.906902
ENU $\rightarrow$ JPN	74.447273	21.982557
ITA $\rightarrow$ ENU	88.314317	13.256383
CHS $\rightarrow$ ENU	79.394058	15.501552
ENU $\rightarrow$ PTB	92.392836	11.026530
ENU $\rightarrow$ SVE	85.327524	19.834718
