In [278]:
%matplotlib inline
import pickle
import pandas as pd
from IPython.display import display

# Loading 2017 train and test data

In [279]:
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/processed/data.pkl', 'rb') as pf:
    objs = pickle.load(pf)

# The pickled list is read positionally: grammar dictionary, train DF, test DF.
grammar_dic, df_17_train, df_17_test = objs[0], objs[1], objs[2]
# df_17_train: using RecResult
# df_17_test:  using Transcription


def recode(x):
    """Map the label 'correct' to 1 and every other value to 0."""
    return 1 if x == 'correct' else 0


# Binarise both judgement columns on both frames.
for label_col in ('language', 'meaning'):
    for frame in (df_17_train, df_17_test):
        frame[label_col] = frame[label_col].apply(recode)

# load various types of features

df_17_train_vecsim = pd.read_csv('../data/processed/df17_train_fasttext.csv')
df_17_test_vecsim = pd.read_csv('../data/processed/df17_test_fasttext.csv')
df_17_train_grmerr = pd.read_csv('../data/processed/df17_train_grmerror.csv')
df_17_test_grmerr = pd.read_csv('../data/processed/df17_test_grmerror.csv')

# Quick sanity check: a peek at the train rows and the test-side shapes.
print(df_17_train.head(5))
for frame in (df_17_test, df_17_test_vecsim, df_17_test_grmerr):
    print(frame.shape)

     Id                                  Prompt   Wavfile  \
0  5835               Frag: Zimmer für 4 Nächte  5835.wav   
1  5836                      Frag: Junior Suite  5836.wav   
2  5837                      Frag: Junior Suite  5837.wav   
3  5839  Sag: Ich möchte mit Postkarte bezahlen  5839.wav   
4  5840           Frag: Gibt es einen Coiffeur?  5840.wav   

                          RecResult                     Transcription  \
0            a room for four nights            a room for four nights   
1                      junior suite                    junior*z suite   
2                            junior                    kinderzimmer*v   
3  i would like to pay by post card  i would like to pay by post card   
4            is there a hairdresser            is there a hairdresser   

   language  meaning  
0         1        1  
1         1        1  
2         0        0  
3         1        1  
4         1        1  
(995, 7)
(995, 5)
(995, 12)


## Form a DF for modeling
- df_17_X: keep only the Id, language, and meaning columns
- use Id as the key to merge in the other feature DFs


- 1/16/2018 Add more features provided by Huy

In [280]:
# Huy's feature tables are tab-separated. The CLASS column is dropped right
# after reading so that only Id plus the feature columns remain.
df_17_train_huy = pd.read_csv(
    '../data/processed/textProcessing_trainingKaldi_features.csv', sep='\t'
).drop(['CLASS'], axis=1)
df_17_test_huy = pd.read_csv(
    '../data/processed/textProcessing_testKaldi_annotated_features.csv', sep='\t'
).drop(['CLASS'], axis=1)

print(df_17_train_huy.shape)
print(df_17_train_huy.columns)
print(df_17_test_huy.shape)

(5222, 50)
Index(['Id', 'ppl-ref', 'ppl-ref_pos', 'ppl-ref_prod', 'ppl-ref_dep',
       'ppl-prompt', 'ppl-prompt_pos', 'ppl-correct', 'ppl-correct_pos',
       'ppl-correct_prod', 'ppl-correct_dep', 'ppl-ge', 'ppl-ge_pos',
       'ppl-incorrect', 'ppl-incorrect_pos', 'ppl-incorrect_prod',
       'ppl-incorrect_dep', 'maxsim_15_skip', 'maxsim_30_skip',
       'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw',
       'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'ngram_match',
       'ngram_match-lem', 'ngram_unseen-1', 'ngram_unseen-2', 'ngram_unseen-3',
       'error_yn', 'error_count', 'error_ratio', 'parse_score-ratio',
       'length_ratio', 'length_01', 'length_under-min', 'length_above-max',
       'length_unknown', 'length_unknown-ratio', 'length_sounds',
       'length_sounds-ratio', 'prompt_missing', 'prompt_missing-pct',
       'prompt_DT', 'prompt_IN', 'prompt_MD', 'prompt_NN', 'prompt_VB'],
      dtype='object')
(995, 50)


In [281]:
# Summary statistics of the training vector-similarity features (cos_*, wmd_*).
# The rendered output shows inf values in the wmd_* columns and fewer non-null
# cos_* values than rows, i.e. this table needs inf/NaN handling before modeling.
df_17_train_vecsim.describe()

Unnamed: 0,Id,cos_mean,cos_max,wmd_mean,wmd_min
count,5221.0,5139.0,5139.0,5221.0,5221.0
mean,8797.797931,0.845628,0.975997,inf,inf
std,1695.111124,0.095431,0.066667,,
min,5835.0,0.215128,0.26132,0.0,0.0
25%,7344.0,0.786995,0.992128,1.147014,0.0
50%,8832.0,0.869039,1.0,1.427193,0.0
75%,10262.0,0.914371,1.0,1.760802,0.253958
max,11676.0,1.0,1.0,inf,inf


In [282]:
def gen_ml_df(df_main, df_vecsim, df_grmerr, df_huy, wmd_fill=10.0, other_fill=999):
    """Build the modeling DataFrame by joining labels with feature tables on 'Id'.

    Parameters
    ----------
    df_main : DataFrame
        Must contain the columns 'Id', 'language' and 'meaning'; only these
        three columns are kept from it.
    df_vecsim : DataFrame
        Vector-similarity features keyed by 'Id' (the 'wmd_mean' and 'wmd_min'
        columns, when present, receive ``wmd_fill`` for missing values).
    df_grmerr : DataFrame
        Currently unused: the merge of this table is disabled. The parameter
        is kept so existing call sites keep working.
    df_huy : DataFrame
        Additional features keyed by 'Id'.
    wmd_fill : float, default 10.0
        Replacement for NaN/inf in 'wmd_mean' and 'wmd_min'.
    other_fill : scalar, default 999
        Replacement for NaN/inf in every other column.

    Returns
    -------
    DataFrame
        Inner join of the three tables on 'Id' with no NaN or inf values left.
        Rows whose 'Id' is missing from any table are dropped by the join.
    """
    df_ml = df_main[['Id', 'language', 'meaning']]
    df_ml = pd.merge(df_ml, df_vecsim, on='Id')
    # df_ml = pd.merge(df_ml, df_grmerr, on='Id')
    df_ml = pd.merge(df_ml, df_huy, on='Id')

    # Treat +/-inf as missing explicitly instead of relying on the deprecated
    # 'mode.use_inf_as_null' pandas option.
    inf = float('inf')
    df_ml = df_ml.replace([inf, -inf], float('nan'))

    # Fill via the frame (not df_ml[col].fillna(..., inplace=True)): the chained
    # in-place form may operate on a temporary and leave df_ml unchanged.
    # Missing values are filled rather than dropped to keep all rows for test.
    df_ml = df_ml.fillna({'wmd_mean': wmd_fill, 'wmd_min': wmd_fill})

    # NOTE(review): a large catch-all sentinel such as 999 distorts the
    # statistics of bounded features (e.g. cosine similarities); pass a
    # different other_fill if that matters downstream.
    df_ml = df_ml.fillna(other_fill)
    return df_ml

## Process 2017 train DF

In [283]:
# Join the 2017 training labels with their feature tables, then inspect the result.
df_17_train_ml = gen_ml_df(
    df_main=df_17_train,
    df_vecsim=df_17_train_vecsim,
    df_grmerr=df_17_train_grmerr,
    df_huy=df_17_train_huy,
)
display(df_17_train_ml.describe())
print(df_17_train_ml.shape)


Unnamed: 0,Id,language,meaning,cos_mean,cos_max,wmd_mean,wmd_min,ppl-ref,ppl-ref_pos,ppl-ref_prod,...,length_unknown-ratio,length_sounds,length_sounds-ratio,prompt_missing,prompt_missing-pct,prompt_DT,prompt_IN,prompt_MD,prompt_NN,prompt_VB
count,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,...,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0,5221.0
mean,8797.797931,0.742961,0.896572,16.522444,16.650766,1.485734,0.235018,-28.47089,-6.312515,-13.310218,...,0.0414,5.513886,0.793526,1.473473,0.311583,0.154951,0.149397,0.08868,0.316031,0.279257
std,1695.111124,0.437043,0.304547,124.117202,124.100973,0.604347,0.55661,6.698166,3.372112,4.983533,...,0.122776,1.871967,0.404813,1.576745,0.324866,0.361893,0.356513,0.284309,0.46497,0.448677
min,5835.0,0.0,0.0,0.215128,0.26132,0.0,0.0,-86.41401,-39.53238,-47.33208,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7344.0,0.0,1.0,0.788851,0.993169,1.147014,0.0,-30.96626,-6.56634,-16.36454,...,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8832.0,1.0,1.0,0.870139,1.0,1.427193,0.0,-26.21445,-5.486407,-12.81166,...,0.0,5.0,1.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0
75%,10262.0,1.0,1.0,0.917168,1.0,1.760802,0.253958,-24.05841,-4.201643,-9.623078,...,0.0,7.0,1.0,2.0,0.5,0.0,0.0,0.0,1.0,1.0
max,11676.0,1.0,1.0,999.0,999.0,10.0,10.0,-22.0346,-0.924857,-3.28985,...,1.0,20.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0


(5221, 56)


## Process 2017 test DF

In [284]:
# Join the 2017 test labels with their feature tables, then inspect the result.
df_17_test_ml = gen_ml_df(
    df_main=df_17_test,
    df_vecsim=df_17_test_vecsim,
    df_grmerr=df_17_test_grmerr,
    df_huy=df_17_test_huy,
)
display(df_17_test_ml.describe())
print(df_17_test_ml.shape)

Unnamed: 0,Id,language,meaning,cos_mean,cos_max,wmd_mean,wmd_min,ppl-ref,ppl-ref_pos,ppl-ref_prod,...,length_unknown-ratio,length_sounds,length_sounds-ratio,prompt_missing,prompt_missing-pct,prompt_DT,prompt_IN,prompt_MD,prompt_NN,prompt_VB
count,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,...,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0
mean,4149.836181,0.719598,0.879397,20.8625,20.980447,1.729,0.644203,-29.179244,-6.759619,-13.4526,...,0.053771,5.571859,0.718593,1.633166,0.348945,0.167839,0.119598,0.090452,0.368844,0.307538
std,325.004628,0.449422,0.325829,140.162276,140.145371,0.758546,0.869964,7.122178,4.169517,5.646571,...,0.123538,2.0827,0.449912,1.652723,0.334286,0.373911,0.324654,0.286973,0.482734,0.461706
min,3571.0,0.0,0.0,0.166202,0.2454,0.0,0.0,-104.0234,-63.01341,-65.30461,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3869.5,0.0,1.0,0.742943,0.876706,1.265484,0.0,-31.67411,-7.734252,-16.6706,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4152.0,1.0,1.0,0.835361,0.990026,1.596772,0.345527,-26.92076,-5.545169,-12.55646,...,0.0,5.0,1.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0
75%,4435.5,1.0,1.0,0.897703,1.0,2.052417,1.124859,-24.30088,-4.223809,-9.576643,...,0.0,7.0,1.0,3.0,0.571429,0.0,0.0,0.0,1.0,1.0
max,4701.0,1.0,1.0,999.0,999.0,10.0,10.0,-22.0346,-0.924857,-3.477604,...,1.0,27.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0


(995, 56)


## Correlation analysis
Only show features with $|R| > 0.1$ (the `CORR_CUT` threshold used below)

In [285]:
tb_corr = df_17_train_ml.corr()
CORR_CUT = 0.1

# Correlation of each feature with the 'language' label; the two label
# columns themselves are excluded.
tb_l = tb_corr['language'].drop(['language', 'meaning'])
lang_pos = tb_l[tb_l > CORR_CUT]
lang_neg = tb_l[tb_l < -CORR_CUT]
display(lang_pos)
display(lang_neg)

# Selected feature names: positively correlated first, then negatively correlated.
col_l = lang_pos.index.tolist() + lang_neg.index.tolist()
display(col_l)

ppl-ref                0.412027
ppl-ref_pos            0.252779
ppl-ref_prod           0.150436
ppl-ref_dep            0.254020
ppl-prompt             0.161701
ppl-prompt_pos         0.158218
ppl-correct            0.448417
ppl-correct_pos        0.335484
ppl-correct_prod       0.202784
ppl-correct_dep        0.109184
ppl-ge                 0.102196
ppl-ge_pos             0.115325
ppl-incorrect_prod     0.110023
ppl-incorrect_dep      0.109184
maxsim_15_skip         0.295474
maxsim_30_skip         0.304698
maxsim_50_skip         0.301699
maxsim_15_cbw          0.299532
maxsim_30_cbw          0.309556
maxsim_50_cbw          0.307635
lda_sim-max            0.241508
lda_sim-min            0.111859
lda_sim-avg            0.180788
ngram_match            0.175150
ngram_match-lem        0.165776
length_sounds-ratio    0.104937
Name: language, dtype: float64

wmd_mean             -0.186249
wmd_min              -0.644960
ppl-incorrect        -0.135479
ngram_unseen-1       -0.368065
ngram_unseen-2       -0.417670
length_under-min     -0.127047
length_above-max     -0.125318
length_sounds        -0.140345
prompt_missing       -0.370462
prompt_missing-pct   -0.280889
prompt_IN            -0.269888
prompt_MD            -0.134119
prompt_NN            -0.266681
prompt_VB            -0.223953
Name: language, dtype: float64

['ppl-ref',
 'ppl-ref_pos',
 'ppl-ref_prod',
 'ppl-ref_dep',
 'ppl-prompt',
 'ppl-prompt_pos',
 'ppl-correct',
 'ppl-correct_pos',
 'ppl-correct_prod',
 'ppl-correct_dep',
 'ppl-ge',
 'ppl-ge_pos',
 'ppl-incorrect_prod',
 'ppl-incorrect_dep',
 'maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'lda_sim-max',
 'lda_sim-min',
 'lda_sim-avg',
 'ngram_match',
 'ngram_match-lem',
 'length_sounds-ratio',
 'wmd_mean',
 'wmd_min',
 'ppl-incorrect',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'length_under-min',
 'length_above-max',
 'length_sounds',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_IN',
 'prompt_MD',
 'prompt_NN',
 'prompt_VB']

## Debug
Compare the feature–score correlations on the train vs. test sets. `wmd_min` shows a big loss in correlation from train to test, which may be why adding `wmd_min` to the model made the results worse.


In [286]:
test_corr = df_17_test_ml.corr()

# Side-by-side correlation with 'language' for the selected features.
lang_corr_test = test_corr['language'].loc[col_l]
lang_corr_train = tb_l.loc[col_l]
pd.DataFrame({'test': lang_corr_test, 'train': lang_corr_train})

Unnamed: 0,test,train
ppl-ref,0.396517,0.412027
ppl-ref_pos,0.289611,0.252779
ppl-ref_prod,0.219674,0.150436
ppl-ref_dep,0.266814,0.25402
ppl-prompt,0.186512,0.161701
ppl-prompt_pos,0.237671,0.158218
ppl-correct,0.38808,0.448417
ppl-correct_pos,0.322818,0.335484
ppl-correct_prod,0.246284,0.202784
ppl-correct_dep,0.136569,0.109184


In [287]:
# Correlation of each feature with the 'meaning' label; the two label
# columns themselves are excluded.
tb_m = tb_corr['meaning'].drop(['language', 'meaning'])
meaning_pos = tb_m[tb_m > CORR_CUT]
meaning_neg = tb_m[tb_m < -CORR_CUT]
display(meaning_pos)
display(meaning_neg)

# Selected feature names: positively correlated first, then negatively correlated.
col_m = meaning_pos.index.tolist() + meaning_neg.index.tolist()
display(col_m)

ppl-ref            0.245756
ppl-correct        0.271550
ppl-correct_pos    0.142141
maxsim_15_skip     0.319802
maxsim_30_skip     0.327217
maxsim_50_skip     0.328028
maxsim_15_cbw      0.324346
maxsim_30_cbw      0.331116
maxsim_50_cbw      0.333937
lda_sim-max        0.285927
lda_sim-min        0.189392
lda_sim-avg        0.262375
ngram_match        0.211007
ngram_match-lem    0.209511
Name: meaning, dtype: float64

wmd_mean             -0.297998
wmd_min              -0.645492
ngram_unseen-1       -0.373347
ngram_unseen-2       -0.352869
length_under-min     -0.186060
length_above-max     -0.155768
prompt_missing       -0.284977
prompt_missing-pct   -0.285272
prompt_IN            -0.134670
prompt_NN            -0.275093
prompt_VB            -0.123656
Name: meaning, dtype: float64

['ppl-ref',
 'ppl-correct',
 'ppl-correct_pos',
 'maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'lda_sim-max',
 'lda_sim-min',
 'lda_sim-avg',
 'ngram_match',
 'ngram_match-lem',
 'wmd_mean',
 'wmd_min',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'length_under-min',
 'length_above-max',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_IN',
 'prompt_NN',
 'prompt_VB']

In [288]:
# Side-by-side correlation with 'meaning' for the selected features.
pd.DataFrame(
    {
        'test': test_corr['meaning'].loc[col_m],
        'train': tb_m.loc[col_m],
    }
)

Unnamed: 0,test,train
ppl-ref,0.270159,0.245756
ppl-correct,0.279,0.27155
ppl-correct_pos,0.194917,0.142141
maxsim_15_skip,0.251806,0.319802
maxsim_30_skip,0.269112,0.327217
maxsim_50_skip,0.265597,0.328028
maxsim_15_cbw,0.251862,0.324346
maxsim_30_cbw,0.272667,0.331116
maxsim_50_cbw,0.26774,0.333937
lda_sim-max,0.302258,0.285927


In [289]:
# convert pandas DF to numpy array
# remove wmd_min from both selected-feature lists.
# The lists are rebuilt instead of calling list.remove(), which mutates in
# place and raises ValueError when this cell is run a second time.
col_l = [col for col in col_l if col != 'wmd_min']
col_m = [col for col in col_m if col != 'wmd_min']

def get_langauge_X(df, cols=None):
    """Return the 'language' feature matrix of ``df`` as a numpy array.

    Parameters
    ----------
    df : DataFrame
        Frame containing the feature columns.
    cols : list of str, optional
        Columns to extract, in output order. Defaults to the notebook-level
        ``col_l`` list as it is when the function is called.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``df`` and one column per entry of ``cols``.

    Note: the misspelled name is kept so existing callers keep working.
    """
    if cols is None:
        cols = col_l  # resolved at call time, after 'wmd_min' has been removed
    X = df.loc[:, list(cols)].values
    return X

def get_langauge_y(df):
    """Return the 'language' column of ``df`` as a numpy array."""
    language_labels = df['language']
    return language_labels.values

def get_meaning_X(df, cols=None):
    """Return the 'meaning' feature matrix of ``df`` as a numpy array.

    Parameters
    ----------
    df : DataFrame
        Frame containing the feature columns.
    cols : list of str, optional
        Columns to extract, in output order. Defaults to the notebook-level
        ``col_m`` list as it is when the function is called.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``df`` and one column per entry of ``cols``.
    """
    if cols is None:
        cols = col_m  # resolved at call time, after 'wmd_min' has been removed
    X = df.loc[:, list(cols)].values
    return X

def get_meaning_y(df):
    """Return the 'meaning' column of ``df`` as a numpy array."""
    meaning_labels = df['meaning']
    return meaning_labels.values

## Scale all features to [-1, 1] (min-max; z-norm is the commented-out alternative)

In [290]:
# Import from the public sklearn.preprocessing namespace: the private
# sklearn.preprocessing.data module path is not importable in newer
# scikit-learn releases, while the public path works in old and new ones.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1)) # for SVM
#scaler = StandardScaler()

In [291]:
# --- language task -----------------------------------------------------------
year17_lang_train_y = get_langauge_y(df_17_train_ml)
year17_lang_test_y = get_langauge_y(df_17_test_ml)
# Fit the scaler on the training features only, then apply it to the test set.
year17_lang_train_X = scaler.fit_transform(get_langauge_X(df_17_train_ml))
year17_lang_test_X = scaler.transform(get_langauge_X(df_17_test_ml))

# --- meaning task ------------------------------------------------------------
year17_meaning_train_y = get_meaning_y(df_17_train_ml)
year17_meaning_test_y = get_meaning_y(df_17_test_ml)
# The same scaler object is refit here; after this point it holds the
# parameters learned from the meaning features, not the language features.
year17_meaning_train_X = scaler.fit_transform(get_meaning_X(df_17_train_ml))
year17_meaning_test_X = scaler.transform(get_meaning_X(df_17_test_ml))

# Shape check for every array that is pickled afterwards (labels for test omitted).
for year17_array in (
    year17_lang_train_X,
    year17_meaning_train_X,
    year17_lang_train_y,
    year17_meaning_train_y,
    year17_lang_test_X,
    year17_meaning_test_X,
):
    print(year17_array.shape)

(5221, 39)
(5221, 24)
(5221,)
(5221,)
(995, 39)
(995, 24)


Pickle all year17 numpy arrays

In [292]:
# The arrays are stored as one list; the position of each entry is the only
# key a reader of the pickle has, so the order below must not change.
year17_arrays = [
    year17_lang_train_X,
    year17_lang_train_y,
    year17_lang_test_X,
    year17_lang_test_y,
    year17_meaning_train_X,
    year17_meaning_train_y,
    year17_meaning_test_X,
    year17_meaning_test_y,
]
with open('../data/processed/numpy/year17_withHuy.pkl', 'wb') as pf:
    pickle.dump(year17_arrays, pf)
