In [1]:
import pandas as pd
from collections import defaultdict
import os
import pickle
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import numpy as np
import itertools
import scipy
from statsmodels.stats.power import TTestIndPower

In [2]:
# Root directory of the public data release; load_data() expects one
# sub-directory per campaign directly underneath it.
# NOTE: this is a machine-specific absolute path - point it at your own checkout.
# Forward slashes are used throughout: the previous "\p" was an invalid escape
# sequence (a SyntaxWarning on recent Python versions).
PUBLIC_RELEASE_PATH = "C:/Users/t-johnnywei/Documents/GitHub/ToShipOrNotToShip/public_release"

In [3]:
def load_data(use_cache=True):
    """Load every campaign/system workbook found under PUBLIC_RELEASE_PATH.

    The result is a nested mapping ``data[campaign][system][sheet_name]``
    where ``campaign`` is a sub-directory of PUBLIC_RELEASE_PATH and
    ``system`` is the name of a workbook file directly inside it. Sheets
    named ``hum_annotations`` and ``full_test`` are stored as DataFrames;
    any other sheet is reduced to a plain dict mapping the values of its
    ``Unnamed: 0`` column to the values of its first remaining column.

    Args:
        use_cache: when True, return the contents of ``data.pickle`` (in the
            current working directory) if that file exists, and write the
            freshly loaded data to it otherwise.

    Returns:
        The nested mapping described above.

    Raises:
        FileNotFoundError: if the data has to be read from disk and
            PUBLIC_RELEASE_PATH cannot be listed.
    """
    cache_filename = "data.pickle"
    if use_cache and os.path.isfile(cache_filename):
        # NOTE(security): unpickling can execute arbitrary code. Only use a
        # cache file that this notebook wrote itself.
        with open(cache_filename, 'rb') as handle:
            data = pickle.load(handle)
    else:
        data = defaultdict(dict)

        # os.walk yields nothing for a missing/unreadable directory; turn the
        # resulting bare StopIteration into an explicit, readable error.
        top_level = next(os.walk(PUBLIC_RELEASE_PATH), None)
        if top_level is None:
            raise FileNotFoundError(
                f"Cannot list release directory: {PUBLIC_RELEASE_PATH}")
        campaigns_list = top_level[1]

        for counter, campaign in enumerate(campaigns_list, start=1):
            print(f"Loading {counter}/{len(campaigns_list)} campaign")
            campaign_dir = f"{PUBLIC_RELEASE_PATH}/{campaign}"
            data[campaign] = defaultdict(dict)

            # Only files directly inside the campaign directory are systems.
            # Recursing (a full os.walk) would pick up files from nested
            # folders whose path is not "<campaign>/<file>".
            _, _, systems_list = next(os.walk(campaign_dir),
                                      (campaign_dir, [], []))
            for system in systems_list:
                sheets = defaultdict(dict)
                # The context manager closes the workbook handle when done.
                with pd.ExcelFile(f"{campaign_dir}/{system}") as xls:
                    for datatype in xls.sheet_names:
                        df = pd.read_excel(xls, datatype)
                        if datatype in ["hum_annotations", "full_test"]:
                            sheets[datatype] = df
                        else:
                            # transform to dictionary: key column ->
                            # first value column
                            df_dict = df.set_index("Unnamed: 0").transpose()
                            sheets[datatype] = df_dict.iloc[0].to_dict()
                data[campaign][system] = sheets

        # save the cache data
        if use_cache:
            with open(cache_filename, 'wb') as handle:
                pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Annotated data loaded")
    return data

In [4]:
data = load_data()

Annotated data loaded


In [5]:
def pairs():
    """Yield the human-annotation tables of every pair of systems.

    Walks the module-level ``data`` mapping campaign by campaign and, for
    each unordered pair of system keys inside a campaign, yields a 2-tuple
    with the two systems' ``'hum_annotations'`` entries.
    """
    for systems in data.values():
        for left, right in itertools.combinations(systems, 2):
            left_annotations = systems[left]['hum_annotations']
            right_annotations = systems[right]['hum_annotations']
            yield left_annotations, right_annotations

In [6]:
# Language pairs to report on, in report order. Each line of ``lps`` is
# "<source>\t<target>" using three-letter language codes; the string is built
# from explicit tuples so the tab separators are visible in the source.
lps = "\n".join(
    "\t".join(language_pair)
    for language_pair in (
        ("ENU", "FRA"),
        ("ENU", "DEU"),
        ("FRA", "ENU"),
        ("DEU", "ENU"),
        ("JPN", "ENU"),
        ("ENU", "JPN"),
        ("ITA", "ENU"),
        ("CHS", "ENU"),
        ("ENU", "PTB"),
        ("ENU", "SVE"),
        ("ENU", "ITA"),
        ("ENU", "DAN"),
        ("ENU", "PLK"),
        ("ARA", "ENU"),
        ("ENU", "CHS"),
        ("IND", "ENU"),
        ("PLK", "ENU"),
        ("PTB", "ENU"),
        ("ESN", "ENU"),
        ("HUN", "ENU"),
        ("ENU", "KOR"),
        ("CSY", "ENU"),
        ("ENU", "HIN"),
        ("NLD", "ENU"),
        ("KOR", "ENU"),
        ("ENU", "ARA"),
        ("ENU", "IND"),
        ("ENU", "NLD"),
        ("ENU", "CSY"),
        ("TRK", "ENU"),
        ("ENU", "THA"),
        ("SVE", "ENU"),
        ("DAN", "ENU"),
        ("ENU", "HUN"),
        ("ENU", "TRK"),
        ("ENU", "ESN"),
        ("HIN", "ENU"),
        ("RUS", "ENU"),
        ("THA", "ENU"),
        ("ENU", "RUS"),
    )
)

In [7]:
def test_all(pairs, test='wilcoxon'):
    results = []
    for df1, df2 in pairs:
        diff = df1['Score'].mean() - df2['Score'].mean()
        try:
            if test == 'wilcoxon':
                # if len(df1) != len(df2), that means there is some repeat or lost sentences
                # look in Tom's code
                
                s, pvalue = scipy.stats.wilcoxon(df1['Score'].head(len(df2)), df2['Score'].head(len(df1)))
            elif test == 'mannwhitneyu':
                s, pvalue = scipy.stats.mannwhitneyu(df1['Score'], df2['Score'])
            elif test == 'ttest_ind':
                s, pvalue = scipy.stats.ttest_ind(df1['Score'].head(len(df2)), df2['Score'].head(len(df1)))
            results.append((diff, pvalue))
        except Exception as e:
            results.append((0, 0.))
            print('error', e)
    return np.array(results)

In [14]:
# Bucket every system pair by its (source, target) language pair ONCE.
# Previously pairs() was regenerated and re-validated for each of the
# language pairs listed in ``lps``.
pairs_by_lp = defaultdict(list)
for i, j in pairs():
    sources_i, sources_j = i['Source'].unique(), j['Source'].unique()
    # Each annotation table must cover a single source language, and both
    # systems of a pair must share it.
    assert len(sources_i) == 1 and len(sources_j) == 1
    assert sources_i[0] == sources_j[0]

    targets_i, targets_j = i['Target'].unique(), j['Target'].unique()
    assert len(targets_i) == 1 and len(targets_j) == 1
    assert targets_i[0] == targets_j[0]

    pairs_by_lp[(sources_i[0], targets_i[0])].append((i, j))

p_test = 0.05      # significance level applied to each pairwise test
threshold = 0.95   # required running fraction of significant pairs

# One tab-separated row per language pair:
#   source, target, mean score, mean std-dev, #pairs, #significant pairs,
#   #pairs scanned before the significant fraction fell below ``threshold``,
#   |score difference| of the last pair scanned before that happened.
for line in lps.split('\n'):
    lp_source, lp_target = tuple(line.split('\t'))
    records = [lp_source, lp_target]

    lp_pairs = pairs_by_lp.get((lp_source, lp_target), [])
    if not lp_pairs:
        # No data for this language pair: emit a placeholder row instead of
        # failing on empty means / an empty result array.
        records.extend([float('nan'), float('nan'), 0, 0, 0, 0])
        print('\t'.join([str(value) for value in records]))
        continue

    # Average of the per-system mean scores (each system counted once per
    # pair it takes part in).
    means = [df['Score'].mean() for pair in lp_pairs for df in pair]
    records.append(np.mean(means))

    # Average of the per-system score standard deviations.
    stddevs = [df['Score'].std() for pair in lp_pairs for df in pair]
    records.append(np.mean(stddevs))

    results = test_all(lp_pairs, test='mannwhitneyu')
    records.append(len(results))
    records.append(np.sum(results[:, 1] < p_test))

    # NOTE(review): rows are ordered by the SIGNED mean difference, whose sign
    # depends on the order in which the two systems were paired. Sorting by
    # absolute difference may be what was intended - confirm before relying
    # on the last two columns.
    order = results[:, 0].argsort()
    sorted_results = results[order]

    sig, insig = 0, 0
    last_diff = 0
    for diff, pvalue in sorted_results:
        if pvalue < p_test:
            sig += 1
        else:
            insig += 1

        # Stop as soon as the running share of significant pairs drops below
        # the threshold. The pair that triggers the stop is still included in
        # ``sig + insig`` but not in ``last_diff``.
        if sig / (sig + insig) < threshold:
            break
        last_diff = diff

    records.append(sig + insig)
    records.append(np.abs(last_diff))

    print('\t'.join([str(value) for value in records]))

ENU	FRA	90.2944515816907	18.24664511172684	153	30	12	3.356028680976266
ENU	DEU	91.99174926180744	14.313352089031278	151	19	2	3.5540559382666004
FRA	ENU	92.47588423965051	13.612181541453348	140	3	2	2.3981481481481524
DEU	ENU	91.65068250646254	14.927009713545386	130	27	21	1.6799116997792538
JPN	ENU	70.86299938309037	22.90690183714039	127	78	32	3.220665499124337
ENU	JPN	74.44727258576047	21.982557300912102	94	40	12	4.480153459528154
ITA	ENU	88.31431698780304	13.25638260289833	81	2	3	2.8288683850327345
CHS	ENU	79.39405772082303	15.501551770980456	78	30	16	2.3196242826659557
ENU	PTB	92.39283645280048	11.026530154692553	74	28	3	4.705882352941174
ENU	SVE	85.32752367548807	19.834717905201533	73	31	9	2.1598654221836284
ENU	ITA	89.16265450948461	13.866297533053874	72	14	5	3.148196590139719
ENU	DAN	81.17120631199288	18.589673563382313	72	24	11	1.7514704106280163
ENU	PLK	74.66834414297048	25.557977615077935	71	54	25	2.2176255230125435
ARA	ENU	80.36705116543119	15.908248414002884	71	3	4	2.259554140