In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from glob import glob
import json

### Note

1. getting the connective string from the feature names is the best way to ensure that the connective string label for the datapoint is accurate. 
2. we have a feature that is Prev_C_ that contains the connective string. we can use a few pandas filter tricks to get the connective string. for each filtered df for a connective, we can sum each row and if it sums to more than 1, it means that the connective is present in this datapoint. 
3. the problem is that we did one-hot encoding during the feature building step, and in order to avoid multi-collinearity, we dropped the first column. this means that some of the connectives would not be reflected here. 
4. fortunately we have the connective stored before shuffling. and we only shuffled before the train step. this way we can easily get the connective string for each datapoint. 
5. possible time wasted doing steps 1 to 3 above. but now since we have these, we can easily go 1 more step further to verify that the stored connective string is mapped correctly to datapoint. 
6. with this step we have full confidence in our subsequent analysis of the per-connective error rate. 

In [2]:
# experiments = ["PTB_Auto_Run2", "PTB_Gold_Run2", "UD1_Auto_Run2", "UD1_Gold_Run2"]
experiments = ['UD1_Auto_Run2']

In [7]:
FEATNAMEPATH = '../02_output/en/'
for exp in experiments:
    __datalist = glob("results/{}_L_16*".format(exp))
    __featlist = glob(FEATNAMEPATH+exp+"/Li_etal16/*.json")
    if not __featlist:  # slight issue with folder naming convention further upstream
        __featlist = glob(FEATNAMEPATH+exp+"/Li_etal16_UD1/*.json")

    
    for set_ in ['train', 'test']:
        X_fn = [i for i in __datalist if "X_"+set_ in i][0]

        y_fn = [i for i in __datalist if "y_{}.".format(set_) in i][0]

        y_pred_fn = [i for i in __datalist if "y_{}_".format(set_) in i][0]
        
        col_fn = [i for i in __featlist if set_ in i][0]
        
        cstring = [i for i in __datalist if "cstring_"+set_ in i][0]
        
        print(X_fn)
        print(y_fn)
        print(y_pred_fn)
        print(col_fn)
        print(cstring, '\n')

        # load the X file that is saved in npz format
        X = sparse.load_npz(X_fn)
        
        with open(col_fn) as f:
            cols_dict = json.load(f)
        
        # do the same with y and y_pred labels
        y_labels = np.load(y_fn)
        y_pred_labels = np.load(y_pred_fn)
        
        # load the connective strings
        with open (cstring) as f:
            cstrings = [i.strip('\n') for i in f.readlines()]
        
        
        df = pd.DataFrame(X.todense(), columns=cols_dict.values())
        # the Previous + Connective feature in Li et al 16 has the connective string we want
        df = df.filter(regex=r'Prev_C_', axis=1).copy()
        
        connectives = set([i.split('^')[-1] for i in df.columns])
        
        results = pd.DataFrame(index=range(len(df)))
        for conn in connectives:
            # filter for cols with the connective in them
            _df = df.filter(regex=r'\^{}'.format(conn), axis=1).copy()
            # sum each row, replace with the value of 1 if connective present, else 0
            _df[conn] = _df.apply(lambda x: 1 if sum(x)>0 else 0, axis=1)
            # join
            results = results.join(_df[conn], how='left')
        
        # results = pd.DataFrame(cstrings, columns = ['cstring'])
        results = results.join(pd.DataFrame(y_labels, columns=['y_labels']), how='left')
        results = results.join(pd.DataFrame(y_pred_labels, columns=['y_pred_labels']), how='left')
        results = results.join(pd.DataFrame(cstrings, columns = ['cstring']), how='left')


        globals()['df_' + exp + '_' + set_] = results.copy()
    
    

results/UD1_Auto_Run2_L_16_X_train.npz
results/UD1_Auto_Run2_L_16_y_train.npy
results/UD1_Auto_Run2_L_16_y_train_pred.npy
../02_output/en/UD1_Auto_Run2/Li_etal16_UD1/train_featnames.json
results/UD1_Auto_Run2_L_16_cstring_train.txt 

results/UD1_Auto_Run2_L_16_X_test.npz
results/UD1_Auto_Run2_L_16_y_test.npy
results/UD1_Auto_Run2_L_16_y_test_pred.npy
../02_output/en/UD1_Auto_Run2/Li_etal16_UD1/test_featnames.json
results/UD1_Auto_Run2_L_16_cstring_test.txt 



In [47]:
for conn in connectives:
    # these are the columns we want 
    cols = [conn, 'y_labels','y_pred_labels','cstring']
    # these are the conditions
    # 1. the value in the cstring column is not the connective string of interest
    cond1 = df_UD1_Auto_Run2_test['cstring']!=conn
    # 2. the value of the column for the connective is more than 1
    cond2 = df_UD1_Auto_Run2_test[conn] > 1
    
    # putting these two conditions together, if our cstring labels are correct
    # there should be no datapoints that meet these conditions. i.e. empty result
    result = df_UD1_Auto_Run2_test[cond1 & cond2][cols]
    if len(result) > 1:
        print(result)

    

Empty DataFrame
Columns: [before and after, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [in sum, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [afterward, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [instead, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [accordingly, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [before, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [nevertheless, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [if and when, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [lest, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [additionally, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [next, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [then, y_labels, y_pred_labels, cstring]
Index: []
Empty DataFrame
Columns: [largely 