In [57]:
%matplotlib inline
import pickle
import pandas as pd
from IPython.display import display

# Data loading
## Loading 2017 text task data for Id and scores

In [58]:
with open('../data/processed/data.pkl', 'rb') as pf:
    objs = pickle.load(pf)

grammar_dic = objs[0]
df_train = objs[1]   # using RecResult
df_test = objs[2]    # using RecResult

recode = lambda x: 1 if x=='correct' else 0
df_train['language'] = df_train['language'].apply(recode)
df_test['language'] = df_test['language'].apply(recode)
df_train['meaning'] = df_train['meaning'].apply(recode)
df_test['meaning'] = df_test['meaning'].apply(recode)

print(df_train.head(5))
print('train: {} test: {}'.format(df_train.shape, df_test.shape))

     Id                                  Prompt   Wavfile  \
0  5835               Frag: Zimmer für 4 Nächte  5835.wav   
1  5836                      Frag: Junior Suite  5836.wav   
2  5837                      Frag: Junior Suite  5837.wav   
3  5839  Sag: Ich möchte mit Postkarte bezahlen  5839.wav   
4  5840           Frag: Gibt es einen Coiffeur?  5840.wav   

                          RecResult                     Transcription  \
0            a room for four nights            a room for four nights   
1                      junior suite                    junior*z suite   
2                            junior                    kinderzimmer*v   
3  i would like to pay by post card  i would like to pay by post card   
4            is there a hairdresser            is there a hairdresser   

   language  meaning  
0         1        1  
1         1        1  
2         0        0  
3         1        1  
4         1        1  
train: (5221, 7) test: (995, 7)


## Loading Grammar Error Count Features

In [59]:
df_train_grmerr = pd.read_csv('../data/processed/df17_train_grmerror.csv')
df_test_grmerr = pd.read_csv('../data/processed/df17_test_grmerror.csv')
print('grammar error: train:{} test:{}'.format(df_train_grmerr.shape, df_test_grmerr.shape))

grammar error: train:(5218, 12) test:(993, 12)


## Loading Huy's Features

In [60]:
# Huy's features diff versions
df_train_huy = pd.read_csv('../data/processed/Huy/textProcessing_trainingKaldi_features.csv', sep='\t')
df_test_huy = pd.read_csv('../data/processed/Huy/textProcessing_testKaldi_annotated_features.csv',
                          sep = '\t')
df_train_huy.rename(columns={'ID' : 'Id'}, inplace=True)
df_test_huy.rename(columns={'ID' : 'Id'}, inplace=True)

df_train_huy = df_train_huy.drop(['CLASS'], axis=1)
df_test_huy = df_test_huy.drop(['CLASS'], axis=1)
print(df_train_huy.columns)

print('Huy features train:{}\ttest:{}'.format(df_train_huy.shape, df_test_huy.shape))

Index(['Id', 'maxsim_15_skip', 'maxsim_30_skip', 'maxsim_50_skip',
       'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw', 'lda_sim-max',
       'lda_sim-min', 'lda_sim-avg',
       ...
       'edit_pattern-VBP-OO', 'edit_pattern-VBP-VB', 'edit_pattern-VBP-VBZ',
       'edit_pattern-VBZ-IN', 'edit_pattern-VBZ-NN', 'edit_pattern-VBZ-OO',
       'edit_pattern-VBZ-VBP', 'edit_pattern-WRB-DT', 'edit_pattern-WRB-NN',
       'edit_pattern-WRB-OO'],
      dtype='object', length=170)
Huy features train:(5222, 170)	test:(995, 170)


# Form a DF for modeling
- df_17_X, only using Id, language, and meaning cols
- using Id to merge other features DFs


- 1/16/2018 Add more features provided by Huy
- 1/24/2018 Just keep one DF for langauge prediction
- 1/24/2018 Added updated grammar error count features (based on kaldi ASR)
- 1/26/2018 Updated Huy's features to his latest version (1/24)

## Options for pickled file output

In [61]:
# feature selection cutoff
CORR_CUT = 0.2

# output pickle name
# pkl_file = 'year17_withHuy.pil"  # 1/25/2018 using Huy's version 1 features and |R| cutoff is 0.1
# pkl_file = 'y17_text_v1_r01.pkl' # 1/26/2018 same to year17_withHuy #feature=40  
# pkl_file = 'y17_text_v1_r02.pkl' # 1/26/2018 same to year17_withHuy  |R| cutoff is 0.2 => #feature=21  
# pkl_file = 'y17_text_r01.pkl'    # 1/26/2018 using Huy's updated features and |R| cutoff is 0.1 => #feature=46
# pkl_file = 'y17_text_r02.pkl'    # 1/26/2018 using Huy's updated features and |R| cutoff is 0.2 => #feature=20

pkl_file = 'y17_text_RFECV.pkl'    # 1/29/2018 using Huy's updated features and RFECV(RandomForest)

In [62]:
def gen_ml_df(df_main, df_grmerr, df_huy):
    df_ml = df_main[['Id', 'language', 'meaning']]
    df_ml = pd.merge(df_ml, df_grmerr, on='Id', how='outer')  # grmerr may miss some Ids due to ASR null outputs. use outer merging to keep all Ids.
    df_ml = pd.merge(df_ml, df_huy, on='Id', how='outer')
    df_ml.fillna(0, inplace=True)
    return df_ml

## Process 2017 train DF

In [63]:
df_train_ml = gen_ml_df(df_train, df_train_grmerr, df_train_huy)
display(df_train_ml.describe())
print(df_train_ml.shape)

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,edit_pattern-VBP-OO,edit_pattern-VBP-VB,edit_pattern-VBP-VBZ,edit_pattern-VBZ-IN,edit_pattern-VBZ-NN,edit_pattern-VBZ-OO,edit_pattern-VBZ-VBP,edit_pattern-WRB-DT,edit_pattern-WRB-NN,edit_pattern-WRB-OO
count,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,...,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0
mean,8797.815396,0.742819,0.8964,0.0,0.0,0.00134,0.015128,0.0,0.000766,0.022788,...,0.008043,0.02049,0.002298,0.001915,0.000957,0.007851,0.002106,0.000957,0.001915,0.00383
std,1694.94925,0.437122,0.30477,0.0,0.0,0.036592,0.122075,0.0,0.027669,0.15052,...,0.089329,0.141684,0.047887,0.043723,0.030931,0.088268,0.045852,0.030931,0.043723,0.061774
min,5835.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7344.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8832.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10261.75,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11676.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


(5222, 183)


## Process 2017 test DF

In [64]:
df_test_ml = gen_ml_df(df_test, df_test_grmerr, df_test_huy)
display(df_test_ml.describe())
print(df_test_ml.shape)

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,edit_pattern-VBP-OO,edit_pattern-VBP-VB,edit_pattern-VBP-VBZ,edit_pattern-VBZ-IN,edit_pattern-VBZ-NN,edit_pattern-VBZ-OO,edit_pattern-VBZ-VBP,edit_pattern-WRB-DT,edit_pattern-WRB-NN,edit_pattern-WRB-OO
count,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,...,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0
mean,4149.836181,0.719598,0.879397,0.003015,0.0,0.001005,0.021106,0.0,0.00201,0.032161,...,0.017085,0.021106,0.003015,0.00201,0.0,0.023116,0.00201,0.00402,0.0,0.011055
std,325.004628,0.449422,0.325829,0.054854,0.0,0.031702,0.150642,0.0,0.044811,0.176516,...,0.129655,0.143808,0.054854,0.044811,0.0,0.150346,0.044811,0.063309,0.0,0.104614
min,3571.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3869.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4152.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4435.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4701.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0


(995, 183)


## Correlation analysis
Only show |R| more than $CORR_{CUT}$

In [65]:
tb_corr = df_train_ml.corr()

tb_l = tb_corr['language'].drop(['language', 'meaning'])
display(tb_l[tb_l > CORR_CUT])
display(tb_l[tb_l < -1 * CORR_CUT])
col_l = list(tb_l[tb_l > CORR_CUT].index) + list(tb_l[tb_l < -1 * CORR_CUT].index)
display(col_l)
len(col_l)

maxsim_15_skip      0.295467
maxsim_30_skip      0.304685
maxsim_50_skip      0.301643
maxsim_15_cbw       0.299530
maxsim_30_cbw       0.309548
maxsim_50_cbw       0.307581
lda_sim-max         0.241274
ppl-ref             0.420239
ppl-ref_pos         0.270682
ppl-ref_dep         0.254200
ppl-correct         0.449269
ppl-correct_pos     0.352131
ppl-correct_prod    0.202965
Name: language, dtype: float64

Article_error_count   -0.255004
ngram_unseen-1        -0.367958
ngram_unseen-2        -0.417572
prompt_missing        -0.370641
prompt_missing-pct    -0.280874
prompt_IN             -0.270443
prompt_NN             -0.266374
prompt_VB             -0.224359
edit_distance         -0.364061
Name: language, dtype: float64

['maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'lda_sim-max',
 'ppl-ref',
 'ppl-ref_pos',
 'ppl-ref_dep',
 'ppl-correct',
 'ppl-correct_pos',
 'ppl-correct_prod',
 'Article_error_count',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_IN',
 'prompt_NN',
 'prompt_VB',
 'edit_distance']

22

## Feature Selection

In [66]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier

X = df_train_ml.iloc[:, 3:].values
y = df_train_ml['language'].values

selector = RFECV(estimator=RandomForestClassifier(random_state=2018), cv=5, step=1, verbose=5, n_jobs=-1)
selector.fit(X, y)
feat_names = df_train_ml.columns[3:]
col_l = list(feat_names[selector.get_support()])

Fitting estimator with 180 features.
Fitting estimator with 180 features.
Fitting estimator with 180 features.
Fitting estimator with 180 features.
Fitting estimator with 180 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 179 features.
Fitting estimator with 178 features.
Fitting estimator with 178 features.
Fitting estimator with 178 features.
Fitting estimator with 178 features.
Fitting estimator with 178 features.
Fitting estimator with 177 features.
Fitting estimator with 177 features.
Fitting estimator with 177 features.
Fitting estimator with 177 features.
Fitting estimator with 177 features.
Fitting estimator with 176 features.
Fitting estimator with 176 features.
Fitting estimator with 176 features.
Fitting estimator with 176 features.
Fitting estimator with 176 features.
Fitting estimator with 175 features.
Fitting estimator with 175 features.
F

Fitting estimator with 136 features.
Fitting estimator with 137 features.
Fitting estimator with 134 features.
Fitting estimator with 135 features.
Fitting estimator with 136 features.
Fitting estimator with 135 features.
Fitting estimator with 136 features.
Fitting estimator with 133 features.
Fitting estimator with 135 features.
Fitting estimator with 134 features.
Fitting estimator with 134 features.
Fitting estimator with 135 features.
Fitting estimator with 133 features.
Fitting estimator with 132 features.
Fitting estimator with 133 features.
Fitting estimator with 134 features.
Fitting estimator with 134 features.
Fitting estimator with 133 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 132 features.
Fitting estimator with 133 features.
Fitting estimator with 130 features.
Fitting estimator with 131 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 132 features.
F

Fitting estimator with 90 features.
Fitting estimator with 92 features.
Fitting estimator with 90 features.
Fitting estimator with 91 features.
Fitting estimator with 92 features.
Fitting estimator with 89 features.
Fitting estimator with 91 features.
Fitting estimator with 89 features.
Fitting estimator with 90 features.
Fitting estimator with 91 features.
Fitting estimator with 88 features.
Fitting estimator with 88 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 90 features.
Fitting estimator with 87 features.
Fitting estimator with 87 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 89 features.
Fitting estimator with 86 features.
Fitting estimator with 86 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 88 features.
Fitting estimator with 85 features.
Fitting estimator with 85 features.
Fitting estimator with 87 fe

Fitting estimator with 47 features.
Fitting estimator with 44 features.
Fitting estimator with 46 features.
Fitting estimator with 44 features.
Fitting estimator with 46 features.
Fitting estimator with 46 features.
Fitting estimator with 43 features.
Fitting estimator with 45 features.
Fitting estimator with 43 features.
Fitting estimator with 45 features.
Fitting estimator with 45 features.
Fitting estimator with 42 features.
Fitting estimator with 44 features.
Fitting estimator with 42 features.
Fitting estimator with 44 features.
Fitting estimator with 44 features.
Fitting estimator with 41 features.
Fitting estimator with 43 features.
Fitting estimator with 41 features.
Fitting estimator with 43 features.
Fitting estimator with 43 features.
Fitting estimator with 40 features.
Fitting estimator with 42 features.
Fitting estimator with 40 features.
Fitting estimator with 42 features.
Fitting estimator with 42 features.
Fitting estimator with 39 features.
Fitting estimator with 41 fe

In [67]:
print(col_l)
print('RFECV found {} features'.format(len(col_l)))

['noun_error_count', 'Article_error_count', 'VerbCollocation_error_count', 'maxsim_15_skip', 'maxsim_30_skip', 'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw', 'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'ppl-ref', 'ppl-ref_pos', 'ppl-ref_prod', 'ppl-ref_dep', 'ppl-prompt', 'ppl-prompt_pos', 'ppl-correct', 'ppl-correct_pos', 'ppl-correct_prod', 'ppl-correct_dep', 'ppl-ge', 'ppl-ge_pos', 'ppl-incorrect', 'ppl-incorrect_pos', 'ppl-incorrect_prod', 'ppl-incorrect_dep', 'ngram_match', 'ngram_match-lem', 'ngram_unseen-1', 'ngram_unseen-2', 'error_yn', 'error_count', 'error_ratio', 'parse_score-ratio', 'length_ratio', 'length_under-min', 'length_unknown', 'length_unknown-ratio', 'length_sounds', 'length_sounds-ratio', 'prompt_missing', 'prompt_missing-pct', 'prompt_DT', 'prompt_IN', 'prompt_MD', 'prompt_NN', 'prompt_VB', 'edit_distance', 'edit_pattern-LS-NN', 'edit_pattern-OO-DT', 'edit_pattern-OO-JJ', 'edit_pattern-OO-NN', 'edit_pattern-OO-VB']
RFECV found 55 features


In [68]:
# convert pandas DF to numpy array

def get_langauge_X(df):
    X = df.loc[:, col_l].values
    return X

def get_langauge_y(df):
    return df['language'].values

def get_meaning_y(df):
    return df['meaning'].values

## z-norm all features

In [69]:
from sklearn.preprocessing.data import StandardScaler
scaler = StandardScaler()

In [70]:
lang_train_X = get_langauge_X(df_train_ml)
lang_train_y = get_langauge_y(df_train_ml)
lang_test_X = get_langauge_X(df_test_ml)
lang_test_y = get_langauge_y(df_test_ml)

scaler.fit(lang_train_X)
lang_train_X = scaler.transform(lang_train_X)
lang_test_X = scaler.transform(lang_test_X)

meaning_train_y = get_meaning_y(df_train_ml)
meaning_test_y = get_meaning_y(df_test_ml)

print(lang_train_X.shape)
print(lang_train_y.shape)
print(lang_test_X.shape)
print(lang_test_y.shape)

(5222, 55)
(5222,)
(995, 55)
(995,)


## Pickle all year17 numpy arrays

In [71]:

with open('../data/processed/numpy/' + pkl_file, 'wb') as pf:
    pickle.dump([lang_train_X,
                 lang_train_y,
                 lang_test_X,
                 lang_test_y,
                 meaning_train_y,
                 meaning_test_y], pf)
