In [16]:
%matplotlib inline
import pickle
import pandas as pd
from IPython.display import display

# Data loading
## Loading 2017 text task data for Id and scores

In [17]:
with open('../data/processed/data.pkl', 'rb') as pf:
    objs = pickle.load(pf)

grammar_dic = objs[0]
df_train = objs[1]   # using RecResult
df_test = objs[2]    # using RecResult

recode = lambda x: 1 if x=='correct' else 0
df_train['language'] = df_train['language'].apply(recode)
df_test['language'] = df_test['language'].apply(recode)
df_train['meaning'] = df_train['meaning'].apply(recode)
df_test['meaning'] = df_test['meaning'].apply(recode)

print(df_train.head(5))
print('train: {} test: {}'.format(df_train.shape, df_test.shape))

     Id                                  Prompt   Wavfile  \
0  5835               Frag: Zimmer für 4 Nächte  5835.wav   
1  5836                      Frag: Junior Suite  5836.wav   
2  5837                      Frag: Junior Suite  5837.wav   
3  5839  Sag: Ich möchte mit Postkarte bezahlen  5839.wav   
4  5840           Frag: Gibt es einen Coiffeur?  5840.wav   

                          RecResult                     Transcription  \
0            a room for four nights            a room for four nights   
1                      junior suite                    junior*z suite   
2                            junior                    kinderzimmer*v   
3  i would like to pay by post card  i would like to pay by post card   
4            is there a hairdresser            is there a hairdresser   

   language  meaning  
0         1        1  
1         1        1  
2         0        0  
3         1        1  
4         1        1  
train: (5221, 7) test: (995, 7)


## Loading Grammar Error Count Features

In [18]:
# TODO: need update !!!

df_train_grmerr = pd.read_csv('../data/processed/df17_train_grmerror.csv')
df_test_grmerr = pd.read_csv('../data/processed/df17_test_grmerror.csv')
print('grammar error: train:{} test:{}'.format(df_train_grmerr.shape, df_test_grmerr.shape))

grammar error: train:(5218, 12) test:(993, 12)


## Loading Huy's Features

In [19]:
# Huy's features diff versions
df_train_huy = pd.read_csv('../data/processed/Huy/ASR/textProcessing_trainingKaldi_features.csv', sep='\t')
df_test_huy = pd.read_csv('../data/processed/Huy/ASR/textProcessing_testKaldi_annotated_features.csv',
                          sep = '\t')
df_train_huy.rename(columns={'ID' : 'Id'}, inplace=True)
df_test_huy.rename(columns={'ID' : 'Id'}, inplace=True)

df_train_huy = df_train_huy.drop(['CLASS'], axis=1)
df_test_huy = df_test_huy.drop(['CLASS'], axis=1)
print(df_train_huy.columns)

print('Huy features train:{}\ttest:{}'.format(df_train_huy.shape, df_test_huy.shape))

Index(['Id', 'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'maxsim_15_skip',
       'maxsim_30_skip', 'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw',
       'maxsim_50_cbw',
       ...
       'edit_pattern-VBN-VBP', 'edit_pattern-VBP-FW', 'edit_pattern-VBP-NN',
       'edit_pattern-VBP-OO', 'edit_pattern-VBP-VB', 'edit_pattern-VBZ-IN',
       'edit_pattern-VBZ-OO', 'edit_pattern-VBZ-VBP', 'edit_pattern-WP-WRB',
       'edit_pattern-WRB-OO'],
      dtype='object', length=146)
Huy features train:(5222, 146)	test:(995, 146)


# Form a DF for modeling


## Options for pickled file output

In [20]:
# feature selection cutoff
CORR_CUT = 0.2

pkl_file = 'y17_asr_RFECV.pkl'    # 1/29/2018 using Huy's updated features and RFECV(RandomForest)

In [21]:
def gen_ml_df(df_main, df_grmerr, df_huy):
    df_ml = df_main[['Id', 'language', 'meaning']]
    df_ml = pd.merge(df_ml, df_grmerr, on='Id', how='outer')  # grmerr may miss some Ids due to ASR null outputs. use outer merging to keep all Ids.
    df_ml = pd.merge(df_ml, df_huy, on='Id', how='outer')
    df_ml.fillna(0, inplace=True)
    return df_ml

## Process 2017 train DF

In [22]:
df_train_ml = gen_ml_df(df_train, df_train_grmerr, df_train_huy)
display(df_train_ml.describe())
print(df_train_ml.shape)

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,edit_pattern-VBN-VBP,edit_pattern-VBP-FW,edit_pattern-VBP-NN,edit_pattern-VBP-OO,edit_pattern-VBP-VB,edit_pattern-VBZ-IN,edit_pattern-VBZ-OO,edit_pattern-VBZ-VBP,edit_pattern-WP-WRB,edit_pattern-WRB-OO
count,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,...,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0,5222.0
mean,8797.815396,0.742819,0.8964,0.0,0.0,0.00134,0.015128,0.0,0.000766,0.022788,...,0.002106,0.002489,0.00134,0.005362,0.02432,0.001149,0.002681,0.001915,0.001149,0.004213
std,1694.94925,0.437122,0.30477,0.0,0.0,0.036592,0.122075,0.0,0.027669,0.15052,...,0.045852,0.049837,0.036592,0.073036,0.154056,0.03388,0.051714,0.043723,0.03388,0.064777
min,5835.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7344.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8832.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10261.75,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11676.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


(5222, 159)


## Process 2017 test DF

In [23]:
df_test_ml = gen_ml_df(df_test, df_test_grmerr, df_test_huy)
display(df_test_ml.describe())
print(df_test_ml.shape)

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,edit_pattern-VBN-VBP,edit_pattern-VBP-FW,edit_pattern-VBP-NN,edit_pattern-VBP-OO,edit_pattern-VBP-VB,edit_pattern-VBZ-IN,edit_pattern-VBZ-OO,edit_pattern-VBZ-VBP,edit_pattern-WP-WRB,edit_pattern-WRB-OO
count,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,...,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0
mean,4149.836181,0.719598,0.879397,0.003015,0.0,0.001005,0.021106,0.0,0.00201,0.032161,...,0.0,0.00804,0.001005,0.005025,0.025126,0.001005,0.005025,0.00603,0.0,0.00201
std,325.004628,0.449422,0.325829,0.054854,0.0,0.031702,0.150642,0.0,0.044811,0.176516,...,0.0,0.089351,0.031702,0.070745,0.156585,0.031702,0.070745,0.077458,0.0,0.044811
min,3571.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3869.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4152.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4435.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4701.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0


(995, 159)


## Correlation analysis
Only show |R| more than $CORR_{CUT}$

In [24]:
tb_corr = df_train_ml.corr()

tb_l = tb_corr['language'].drop(['language', 'meaning'])
display(tb_l[tb_l > CORR_CUT])
display(tb_l[tb_l < -1 * CORR_CUT])
col_l = list(tb_l[tb_l > CORR_CUT].index) + list(tb_l[tb_l < -1 * CORR_CUT].index)
display(col_l)
len(col_l)

lda_sim-max        0.255541
maxsim_15_skip     0.273850
maxsim_30_skip     0.279651
maxsim_50_skip     0.275131
maxsim_15_cbw      0.278338
maxsim_30_cbw      0.284162
maxsim_50_cbw      0.281034
ppl-ref            0.413557
ppl-ref_pos        0.252694
ppl-ref_dep        0.233459
ppl-correct        0.476343
ppl-correct_pos    0.328655
Name: language, dtype: float64

Article_error_count   -0.255004
ngram_unseen-1        -0.340773
ngram_unseen-2        -0.379758
prompt_missing        -0.362701
prompt_missing-pct    -0.270236
prompt_IN             -0.273897
prompt_NN             -0.276729
edit_distance         -0.396058
Name: language, dtype: float64

['lda_sim-max',
 'maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'ppl-ref',
 'ppl-ref_pos',
 'ppl-ref_dep',
 'ppl-correct',
 'ppl-correct_pos',
 'Article_error_count',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_IN',
 'prompt_NN',
 'edit_distance']

20

## Feature Selection

In [25]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier

X = df_train_ml.iloc[:, 3:].values
y = df_train_ml['language'].values

selector = RFECV(estimator=RandomForestClassifier(random_state=2018), cv=5, step=1, verbose=5, n_jobs=-1)
selector.fit(X, y)
feat_names = df_train_ml.columns[3:]
col_l = list(feat_names[selector.get_support()])

Fitting estimator with 156 features.
Fitting estimator with 156 features.
Fitting estimator with 156 features.
Fitting estimator with 156 features.
Fitting estimator with 156 features.
Fitting estimator with 155 features.
Fitting estimator with 155 features.
Fitting estimator with 155 features.
Fitting estimator with 155 features.
Fitting estimator with 155 features.
Fitting estimator with 154 features.
Fitting estimator with 154 features.
Fitting estimator with 154 features.
Fitting estimator with 154 features.
Fitting estimator with 154 features.
Fitting estimator with 153 features.
Fitting estimator with 153 features.
Fitting estimator with 153 features.
Fitting estimator with 153 features.
Fitting estimator with 153 features.
Fitting estimator with 152 features.
Fitting estimator with 152 features.
Fitting estimator with 152 features.
Fitting estimator with 152 features.
Fitting estimator with 152 features.
Fitting estimator with 151 features.
Fitting estimator with 151 features.
F

Fitting estimator with 112 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 111 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 109 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 108 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
F

Fitting estimator with 66 features.
Fitting estimator with 68 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 67 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 66 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 65 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 62 features.
Fitting estimator with 63 features.
Fitting estimator with 64 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 61 features.
Fitting estimator with 62 features.
Fitting estimator with 63 fe

Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 23 features.
Fitting estimator with 21 features.
Fitting estimator with 19 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 22 features.
Fitting estimator with 20 features.
Fitting estimator with 18 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 21 features.
Fitting estimator with 19 features.
Fitting estimator with 17 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 20 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 19 features.
Fitting estimator with 17 features.
Fitting estimator with 15 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 18 fe

In [26]:
print(col_l)
print('RFECV found {} features'.format(len(col_l)))

['noun_error_count', 'Article_error_count', 'Wordform_error_count', 'VerbCollocation_error_count', 'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'maxsim_15_skip', 'maxsim_30_skip', 'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw', 'ppl-ref', 'ppl-ref_pos', 'ppl-ref_prod', 'ppl-ref_dep', 'ppl-prompt', 'ppl-prompt_pos', 'ppl-correct', 'ppl-correct_pos', 'ppl-correct_prod', 'ppl-correct_dep', 'ppl-ge', 'ppl-ge_pos', 'ppl-incorrect', 'ppl-incorrect_pos', 'ppl-incorrect_prod', 'ppl-incorrect_dep', 'ngram_match', 'ngram_match-lem', 'ngram_unseen-1', 'ngram_unseen-2', 'error_yn', 'error_ratio', 'parse_score-ratio', 'length_ratio', 'length_under-min', 'length_unknown', 'length_unknown-ratio', 'length_sounds', 'length_sounds-ratio', 'prompt_missing', 'prompt_missing-pct', 'prompt_DT', 'prompt_IN', 'prompt_MD', 'prompt_NN', 'prompt_VB', 'edit_distance', 'edit_pattern-DT-OO', 'edit_pattern-OO-DT', 'edit_pattern-OO-IN', 'edit_pattern-OO-NN']
RFECV found 54 features


In [27]:
# convert pandas DF to numpy array

def get_langauge_X(df):
    X = df.loc[:, col_l].values
    return X

def get_langauge_y(df):
    return df['language'].values

def get_meaning_y(df):
    return df['meaning'].values

## z-norm all features

In [28]:
from sklearn.preprocessing.data import StandardScaler
scaler = StandardScaler()

In [29]:
lang_train_X = get_langauge_X(df_train_ml)
lang_train_y = get_langauge_y(df_train_ml)
lang_test_X = get_langauge_X(df_test_ml)
lang_test_y = get_langauge_y(df_test_ml)

scaler.fit(lang_train_X)
lang_train_X = scaler.transform(lang_train_X)
lang_test_X = scaler.transform(lang_test_X)

meaning_train_y = get_meaning_y(df_train_ml)
meaning_test_y = get_meaning_y(df_test_ml)

print(lang_train_X.shape)
print(lang_train_y.shape)
print(lang_test_X.shape)
print(lang_test_y.shape)

(5222, 54)
(5222,)
(995, 54)
(995,)


## Pickle all year17 numpy arrays

In [30]:

with open('../data/processed/numpy/' + pkl_file, 'wb') as pf:
    pickle.dump([lang_train_X,
                 lang_train_y,
                 lang_test_X,
                 lang_test_y,
                 meaning_train_y,
                 meaning_test_y], pf)
