In [180]:
%matplotlib inline
import pickle
import pandas as pd
from IPython.display import display

# Loading 2017 train and test data

In [181]:
# Load the preprocessed 2017 bundle.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only use it on trusted, locally generated data.
with open('../data/processed/data.pkl', 'rb') as pf:
    objs = pickle.load(pf)

# Bundle layout: [grammar dictionary, train frame, test frame].
# Both frames are the "using RecResult" variants.
grammar_dic, df_17_train, df_17_test = objs[0], objs[1], objs[2]


def recode(x):
    """Map the label 'correct' to 1 and every other value to 0."""
    return 1 if x == 'correct' else 0


# Binarise both judgement columns in the train and test frames.
for _label_col in ('language', 'meaning'):
    df_17_train[_label_col] = df_17_train[_label_col].apply(recode)
    df_17_test[_label_col] = df_17_test[_label_col].apply(recode)

# load various types of features
# (the vector-similarity CSVs are currently disabled)
# df_17_train_vecsim = pd.read_csv('../data/processed/df17_train_fasttext.csv')
# df_17_test_vecsim = pd.read_csv('../data/processed/df17_test_fasttext.csv')
df_17_train_grmerr = pd.read_csv('../data/processed/df17_train_grmerror.csv')
df_17_test_grmerr = pd.read_csv('../data/processed/df17_test_grmerror.csv')

# Quick sanity check of what was loaded.
print(df_17_train.head())
print(df_17_test.shape)
# print(df_17_test_vecsim.shape)
print(df_17_test_grmerr.shape)

     Id                                  Prompt   Wavfile  \
0  5835               Frag: Zimmer für 4 Nächte  5835.wav   
1  5836                      Frag: Junior Suite  5836.wav   
2  5837                      Frag: Junior Suite  5837.wav   
3  5839  Sag: Ich möchte mit Postkarte bezahlen  5839.wav   
4  5840           Frag: Gibt es einen Coiffeur?  5840.wav   

                          RecResult                     Transcription  \
0            a room for four nights            a room for four nights   
1                      junior suite                    junior*z suite   
2                            junior                    kinderzimmer*v   
3  i would like to pay by post card  i would like to pay by post card   
4            is there a hairdresser            is there a hairdresser   

   language  meaning  
0         1        1  
1         1        1  
2         0        0  
3         1        1  
4         1        1  
(995, 7)
(993, 12)


## Form a DF for modeling
- df_17_X, only using Id, language, and meaning cols
- using Id to merge other features DFs


- 1/16/2018 Add more features provided by Huy
- 1/24/2018 Just keep one DF for language prediction
- 1/24/2018 Added updated grammar error count features (based on kaldi ASR)
- 1/26/2018 Updated Huy's features to his latest version (1/24)

### Options for pickled file output

In [182]:
# Huy's features diff versions
df_17_train_huy = pd.read_csv('../data/processed/Huy/v1/textProcessing_trainingKaldi_features.csv', sep='\t')
df_17_test_huy = pd.read_csv('../data/processed/Huy/v1/textProcessing_testKaldi_annotated_features.csv',
                          sep = '\t')

# feature selection cutoff
CORR_CUT = 0.2

# output pickle name
# pkl_file = 'year17_withHuy.pil"  # 1/25/2018 using Huy's version 1 features and |R| cutoff is 0.1
# pkl_file = 'y17_text_v1_r01.pkl' # 1/26/2018 same to year17_withHuy #feature=40  
pkl_file = 'y17_text_v1_r02.pkl' # 1/26/2018 same to year17_withHuy  |R| cutoff is 0.2 => #feature=21  
# pkl_file = 'y17_text_r01.pkl'    # 1/26/2018 using Huy's updated features and |R| cutoff is 0.1 => #feature=46
# pkl_file = 'y17_text_r02.pkl'    # 1/26/2018 using Huy's updated features and |R| cutoff is 0.2 => #feature=20

In [183]:
# Use the same key name ('Id') as the main frames and remove the CLASS column
# so it does not get merged in as a feature.
df_17_train_huy = df_17_train_huy.rename(columns={'ID': 'Id'}).drop(['CLASS'], axis=1)
df_17_test_huy = df_17_test_huy.rename(columns={'ID': 'Id'}).drop(['CLASS'], axis=1)
print(df_17_train_huy.columns)

print('Huy features train:{}\ttest:{}'.format(df_17_train_huy.shape, df_17_test_huy.shape))

# Re-index the train columns by the test columns so both frames expose the
# same features in the same order.
shared_cols = df_17_test_huy.columns
df_17_train_huy = df_17_train_huy.loc[:, shared_cols]
print('updated train shape:{}'.format(df_17_train_huy.shape))

Index(['Id', 'ppl-ref', 'ppl-ref_pos', 'ppl-ref_prod', 'ppl-ref_dep',
       'ppl-prompt', 'ppl-prompt_pos', 'ppl-correct', 'ppl-correct_pos',
       'ppl-correct_prod', 'ppl-correct_dep', 'ppl-ge', 'ppl-ge_pos',
       'ppl-incorrect', 'ppl-incorrect_pos', 'ppl-incorrect_prod',
       'ppl-incorrect_dep', 'maxsim_15_skip', 'maxsim_30_skip',
       'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw',
       'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'ngram_match',
       'ngram_match-lem', 'ngram_unseen-1', 'ngram_unseen-2', 'ngram_unseen-3',
       'error_yn', 'error_count', 'error_ratio', 'parse_score-ratio',
       'length_ratio', 'length_01', 'length_under-min', 'length_above-max',
       'length_unknown', 'length_unknown-ratio', 'length_sounds',
       'length_sounds-ratio', 'prompt_missing', 'prompt_missing-pct',
       'prompt_DT', 'prompt_IN', 'prompt_MD', 'prompt_NN', 'prompt_VB'],
      dtype='object')
Huy features train:(5222, 50)	test:(995, 50)
updated tra

In [184]:
def gen_ml_df(df_main, df_grmerr, df_huy):
    """Build the modelling frame: labels plus grammar-error and Huy features.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Must contain the columns 'Id', 'language' and 'meaning'; only these
        three columns are carried over.
    df_grmerr : pandas.DataFrame
        Grammar-error features keyed by 'Id'. It may lack some Ids (e.g. when
        the ASR produced no output).
    df_huy : pandas.DataFrame
        Huy's text features keyed by 'Id'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_main`` (in their original order, assuming Ids are
        unique in the feature tables) with the feature columns appended.
        Feature values missing for an Id are filled with 0.
    """
    df_ml = df_main[['Id', 'language', 'meaning']]
    # Left joins keep every labelled Id even when a feature table lacks it.
    # An outer join would also add rows for Ids that exist only in a feature
    # table, and fillna(0) would then give those rows fabricated 0 labels.
    df_ml = pd.merge(df_ml, df_grmerr, on='Id', how='left')
    df_ml = pd.merge(df_ml, df_huy, on='Id', how='left')
    return df_ml.fillna(0)

## Process 2017 train DF

In [185]:
# Combine the 2017 training labels with the grammar-error and Huy features,
# then show summary statistics and the resulting shape.
df_17_train_ml = gen_ml_df(
    df_main=df_17_train,
    df_grmerr=df_17_train_grmerr,
    df_huy=df_17_train_huy,
)
display(df_17_train_ml.describe())
print(df_17_train_ml.shape)

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,length_unknown-ratio,length_sounds,length_sounds-ratio,prompt_missing,prompt_missing-pct,prompt_DT,prompt_IN,prompt_MD,prompt_NN,prompt_VB
count,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,...,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0
mean,8797.815396,0.742819,0.8964,0.0,0.0,0.00134,0.015128,0.0,0.000766,0.022788,...,0.041392,5.514171,0.793566,1.473765,0.311595,0.154921,0.14956,0.088855,0.315971,0.279395
std,1694.94925,0.437122,0.30477,0.0,0.0,0.036592,0.122075,0.0,0.027669,0.15052,...,0.122766,1.871901,0.404785,1.576736,0.324836,0.361864,0.356673,0.284561,0.464946,0.448745
min,5835.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7344.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8832.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,1.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0
75%,10261.75,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,1.0,2.0,0.5,0.0,0.0,0.0,1.0,1.0
max,11676.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,...,1.0,20.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0


(5222, 63)


## Process 2017 test DF

In [186]:
# Combine the 2017 test labels with the grammar-error and Huy features,
# then show summary statistics and the resulting shape.
df_17_test_ml = gen_ml_df(
    df_main=df_17_test,
    df_grmerr=df_17_test_grmerr,
    df_huy=df_17_test_huy,
)
display(df_17_test_ml.describe())
print(df_17_test_ml.shape)

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,length_unknown-ratio,length_sounds,length_sounds-ratio,prompt_missing,prompt_missing-pct,prompt_DT,prompt_IN,prompt_MD,prompt_NN,prompt_VB
count,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,...,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0
mean,4149.836181,0.719598,0.879397,0.003015,0.0,0.001005,0.021106,0.0,0.00201,0.032161,...,0.053771,5.571859,0.718593,1.633166,0.348945,0.167839,0.119598,0.090452,0.368844,0.307538
std,325.004628,0.449422,0.325829,0.054854,0.0,0.031702,0.150642,0.0,0.044811,0.176516,...,0.123538,2.0827,0.449912,1.652723,0.334286,0.373911,0.324654,0.286973,0.482734,0.461706
min,3571.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3869.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4152.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,1.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0
75%,4435.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,1.0,3.0,0.571429,0.0,0.0,0.0,1.0,1.0
max,4701.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,...,1.0,27.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0


(995, 63)


## Correlation analysis
Only show features whose correlation with `language` satisfies $|R| > \mathrm{CORR\_CUT}$

In [187]:
# Pairwise correlations on the training frame; keep only the column that
# relates each feature to the 'language' label, without the labels themselves.
tb_corr = df_17_train_ml.corr()
tb_l = tb_corr['language'].drop(['language', 'meaning'])

# Split the selected features into positively and negatively correlated ones.
tb_l_pos = tb_l[tb_l > CORR_CUT]
tb_l_neg = tb_l[tb_l < -CORR_CUT]
display(tb_l_pos)
display(tb_l_neg)

# Selected feature names: positive correlations first, then negative ones.
col_l = tb_l_pos.index.tolist() + tb_l_neg.index.tolist()
display(col_l)
len(col_l)

ppl-ref             0.411854
ppl-ref_pos         0.252639
ppl-ref_dep         0.254200
ppl-correct         0.448171
ppl-correct_pos     0.335305
ppl-correct_prod    0.202965
maxsim_15_skip      0.295467
maxsim_30_skip      0.304685
maxsim_50_skip      0.301643
maxsim_15_cbw       0.299530
maxsim_30_cbw       0.309548
maxsim_50_cbw       0.307581
lda_sim-max         0.241274
Name: language, dtype: float64

Article_error_count   -0.255004
ngram_unseen-1        -0.367958
ngram_unseen-2        -0.417572
prompt_missing        -0.370641
prompt_missing-pct    -0.280874
prompt_IN             -0.270443
prompt_NN             -0.266374
prompt_VB             -0.224359
Name: language, dtype: float64

['ppl-ref',
 'ppl-ref_pos',
 'ppl-ref_dep',
 'ppl-correct',
 'ppl-correct_pos',
 'ppl-correct_prod',
 'maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'lda_sim-max',
 'Article_error_count',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_IN',
 'prompt_NN',
 'prompt_VB']

21

In [188]:
# convert pandas DF to numpy array

def get_langauge_X(df, cols=None):
    """Return the selected feature columns of ``df`` as a numpy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns.
    cols : list of str, optional
        Feature columns to extract, in output order. When omitted, the
        notebook-level ``col_l`` (built in the correlation cell) is used,
        which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df`` and one column per feature.
    """
    if cols is None:
        # Fall back to the globally selected features for existing callers.
        cols = col_l
    return df.loc[:, cols].values

def get_langauge_y(df):
    """Return the 'language' label column of ``df`` as an array."""
    language_labels = df['language']
    return language_labels.values

def get_meaning_y(df):
    """Return the 'meaning' label column of ``df`` as an array."""
    meaning_labels = df['meaning']
    return meaning_labels.values

## Scale all features (min-max to [-1, 1]; z-norm via StandardScaler is the commented-out alternative)

In [189]:
# Import the scalers from the public package. `sklearn.preprocessing.data` is
# a private module path that newer scikit-learn releases no longer provide.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Min-max scaling to [-1, 1] (chosen for SVM); StandardScaler is the
# z-normalisation alternative.
scaler = MinMaxScaler(feature_range=(-1,1)) # for SVM
#scaler = StandardScaler()

In [190]:
# Feature matrices for the language task (columns selected via col_l).
year17_lang_train_X = get_langauge_X(df_17_train_ml)
year17_lang_test_X = get_langauge_X(df_17_test_ml)

# Label vectors for both tasks.
year17_lang_train_y = get_langauge_y(df_17_train_ml)
year17_lang_test_y = get_langauge_y(df_17_test_ml)
year17_meaning_train_y = get_meaning_y(df_17_train_ml)
year17_meaning_test_y = get_meaning_y(df_17_test_ml)

# Fit the scaler on the training features only, then apply the same
# transformation to train and test.
year17_lang_train_X = scaler.fit(year17_lang_train_X).transform(year17_lang_train_X)
year17_lang_test_X = scaler.transform(year17_lang_test_X)

# Report the shapes of the language-task arrays.
for _arr in (year17_lang_train_X, year17_lang_train_y,
             year17_lang_test_X, year17_lang_test_y):
    print(_arr.shape)

(5222, 21)
(5222,)
(995, 21)
(995,)


Pickle all year17 numpy arrays

In [191]:

# Persist the arrays in a fixed order: language train X/y, language test X/y,
# then meaning train y and meaning test y. Readers must unpack in this order.
year17_arrays = [
    year17_lang_train_X,
    year17_lang_train_y,
    year17_lang_test_X,
    year17_lang_test_y,
    year17_meaning_train_y,
    year17_meaning_test_y,
]
with open('../data/processed/numpy/' + pkl_file, 'wb') as pf:
    pickle.dump(year17_arrays, pf)
