In [21]:
%matplotlib inline
import pickle
import pandas as pd
from IPython.display import display

# Data loading
## Loading 2018 text task data for Id and scores

In [22]:
with open('../data/processed/data.pkl', 'rb') as pf:
    objs = pickle.load(pf)

grammar_dic = objs[0]
df_18_A_train = objs[3]    # using RecResult
df_18_B_train = objs[4]
df_18_C_train = objs[5]

print('A shape {}'.format(df_18_A_train.shape))
print('B shape {}'.format(df_18_B_train.shape))
print('C shape {}'.format(df_18_C_train.shape))

A shape (5526, 8)
B shape (873, 8)
C shape (299, 8)


In [23]:
df_18_train = pd.concat([df_18_A_train, df_18_B_train, df_18_C_train])

recode = lambda x: 1 if x=='correct' else 0
df_18_train['language'] = df_18_train['language'].apply(recode)
df_18_train['meaning'] = df_18_train['meaning'].apply(recode)
print(df_18_train.shape)

(6698, 8)


## Loading Grammar Error Count Features

In [24]:
# grammar error count
train_grmerr = pd.read_csv('../data/processed/df18_train_asr_grmerror.csv')
print('grammar error: train:{} test:{}'.format(train_grmerr.shape, 0))

grammar error: train:(6090, 12) test:0


## Loading Huy's Features

In [25]:
dfA = pd.read_csv('../data/processed/Huy/ASR/scst2_training_data_A_text_features.csv', sep='\t')
dfB = pd.read_csv('../data/processed/Huy/ASR/scst2_training_data_B_text_features.csv', sep='\t')
dfC = pd.read_csv('../data/processed/Huy/ASR/scst2_training_data_C_text_features.csv', sep='\t')

train_huy = pd.concat([dfA, dfB, dfC])
train_huy.rename(columns={'ID' : 'Id'}, inplace=True)
train_huy = train_huy.drop(['CLASS'], axis=1)

print('Huy features train:{}\ttest:{}'.format(train_huy.shape, 0))

Huy features train:(6698, 225)	test:0


# Form a DF for modeling


## Options for pickled file output

In [26]:
# feature selection cutoff
CORR_CUT = 0.2
pkl_file = 'y18_asr_RFECV.pkl'    # 1/29/2018 using Huy's updated features and RFECV(RandomForest)

In [27]:
def gen_ml_df(df_main, df_grmerr, df_huy):
    df_ml = df_main[['Id', 'language', 'meaning']]
    df_ml = pd.merge(df_ml, df_grmerr, on='Id', how='outer')  # grmerr may miss some Ids due to ASR null outputs. use outer merging to keep all Ids.
    df_ml = pd.merge(df_ml, df_huy, on='Id', how='outer')
    df_ml.fillna(0, inplace=True)
    return df_ml

## Process 2018 train DF

In [28]:
df_train_ml = gen_ml_df(df_18_train, train_grmerr, train_huy)
display(df_train_ml.describe())
print(df_train_ml.shape)
print('#NA: {}'.format(df_train_ml.isnull().sum().sum()))

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,edit_pattern-VBZ-IN,edit_pattern-VBZ-MD,edit_pattern-VBZ-NN,edit_pattern-VBZ-OO,edit_pattern-VBZ-VB,edit_pattern-VBZ-VBP,edit_pattern-WDT-OO,edit_pattern-WP-WRB,edit_pattern-WRB-NN,edit_pattern-WRB-OO
count,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,...,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0
mean,3349.5,0.6596,0.822186,0.000149,0.0,0.001792,0.003732,0.0,0.000597,0.021798,...,0.000149,0.000149,0.000597,0.002986,0.0,0.00209,0.000448,0.001045,0.000448,0.002389
std,1933.690384,0.473879,0.382385,0.012219,0.0,0.042292,0.060984,0.0,0.024432,0.146033,...,0.012219,0.012219,0.024432,0.054566,0.0,0.045674,0.02116,0.032313,0.02116,0.04882
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1675.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3349.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5023.75,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6698.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


(6698, 238)
#NA: 0


## Correlation analysis
Only show |R| more than $CORR_{CUT}$

In [29]:
tb_corr = df_train_ml.corr()

tb_l = tb_corr['language'].drop(['language', 'meaning'])
display(tb_l[tb_l > CORR_CUT])
display(tb_l[tb_l < -1 * CORR_CUT])
col_l = list(tb_l[tb_l > CORR_CUT].index) + list(tb_l[tb_l < -1 * CORR_CUT].index)
display(col_l)
len(col_l)

lda_sim-max        0.419308
lda_sim-min        0.242943
lda_sim-avg        0.356028
maxsim_15_skip     0.450217
maxsim_30_skip     0.454153
maxsim_50_skip     0.452030
maxsim_15_cbw      0.456011
maxsim_30_cbw      0.460003
maxsim_50_cbw      0.456979
ppl-ref            0.398008
ppl-correct        0.395582
ppl-ge             0.250824
ngram_match        0.264760
ngram_match-lem    0.256467
length_ratio       0.233891
length_01          0.295596
Name: language, dtype: float64

ngram_unseen-1       -0.491586
ngram_unseen-2       -0.515482
error_yn             -0.203738
error_count          -0.295766
error_ratio          -0.292764
length_under-min     -0.356059
prompt_missing       -0.470177
prompt_missing-pct   -0.413166
prompt_DT            -0.205629
prompt_IN            -0.276814
prompt_NN            -0.366054
prompt_VB            -0.261571
edit_distance        -0.454613
Name: language, dtype: float64

['lda_sim-max',
 'lda_sim-min',
 'lda_sim-avg',
 'maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'ppl-ref',
 'ppl-correct',
 'ppl-ge',
 'ngram_match',
 'ngram_match-lem',
 'length_ratio',
 'length_01',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'error_yn',
 'error_count',
 'error_ratio',
 'length_under-min',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_DT',
 'prompt_IN',
 'prompt_NN',
 'prompt_VB',
 'edit_distance']

29

## Feature Selection

In [30]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier

X = df_train_ml.iloc[:, 3:].values
y = df_train_ml['language'].values

selector = RFECV(estimator=RandomForestClassifier(random_state=2018), cv=5, step=1, verbose=5, n_jobs=-1)
selector.fit(X, y)
feat_names = df_train_ml.columns[3:]
col_l = list(feat_names[selector.get_support()])

Fitting estimator with 235 features.
Fitting estimator with 235 features.
Fitting estimator with 235 features.
Fitting estimator with 235 features.
Fitting estimator with 235 features.
Fitting estimator with 234 features.
Fitting estimator with 234 features.
Fitting estimator with 234 features.
Fitting estimator with 234 features.
Fitting estimator with 234 features.
Fitting estimator with 233 features.
Fitting estimator with 233 features.
Fitting estimator with 233 features.
Fitting estimator with 233 features.
Fitting estimator with 233 features.
Fitting estimator with 232 features.
Fitting estimator with 232 features.
Fitting estimator with 232 features.
Fitting estimator with 232 features.
Fitting estimator with 232 features.
Fitting estimator with 231 features.
Fitting estimator with 231 features.
Fitting estimator with 231 features.
Fitting estimator with 231 features.
Fitting estimator with 231 features.
Fitting estimator with 230 features.
Fitting estimator with 230 features.
F

Fitting estimator with 192 features.
Fitting estimator with 190 features.
Fitting estimator with 192 features.
Fitting estimator with 191 features.
Fitting estimator with 190 features.
Fitting estimator with 189 features.
Fitting estimator with 189 features.
Fitting estimator with 191 features.
Fitting estimator with 190 features.
Fitting estimator with 189 features.
Fitting estimator with 188 features.
Fitting estimator with 188 features.
Fitting estimator with 190 features.
Fitting estimator with 187 features.
Fitting estimator with 188 features.
Fitting estimator with 189 features.
Fitting estimator with 187 features.
Fitting estimator with 189 features.
Fitting estimator with 187 features.
Fitting estimator with 186 features.
Fitting estimator with 188 features.
Fitting estimator with 186 features.
Fitting estimator with 186 features.
Fitting estimator with 185 features.
Fitting estimator with 188 features.
Fitting estimator with 185 features.
Fitting estimator with 187 features.
F

Fitting estimator with 150 features.
Fitting estimator with 148 features.
Fitting estimator with 144 features.
Fitting estimator with 146 features.
Fitting estimator with 143 features.
Fitting estimator with 149 features.
Fitting estimator with 147 features.
Fitting estimator with 143 features.
Fitting estimator with 145 features.
Fitting estimator with 142 features.
Fitting estimator with 148 features.
Fitting estimator with 142 features.
Fitting estimator with 146 features.
Fitting estimator with 144 features.
Fitting estimator with 141 features.
Fitting estimator with 147 features.
Fitting estimator with 141 features.
Fitting estimator with 145 features.
Fitting estimator with 143 features.
Fitting estimator with 140 features.
Fitting estimator with 146 features.
Fitting estimator with 140 features.
Fitting estimator with 144 features.
Fitting estimator with 142 features.
Fitting estimator with 139 features.
Fitting estimator with 145 features.
Fitting estimator with 139 features.
F

Fitting estimator with 107 features.
Fitting estimator with 98 features.
Fitting estimator with 104 features.
Fitting estimator with 102 features.
Fitting estimator with 98 features.
Fitting estimator with 106 features.
Fitting estimator with 97 features.
Fitting estimator with 97 features.
Fitting estimator with 103 features.
Fitting estimator with 101 features.
Fitting estimator with 105 features.
Fitting estimator with 96 features.
Fitting estimator with 96 features.
Fitting estimator with 100 features.
Fitting estimator with 102 features.
Fitting estimator with 104 features.
Fitting estimator with 95 features.
Fitting estimator with 95 features.
Fitting estimator with 101 features.
Fitting estimator with 99 features.
Fitting estimator with 103 features.
Fitting estimator with 94 features.
Fitting estimator with 94 features.
Fitting estimator with 98 features.
Fitting estimator with 100 features.
Fitting estimator with 102 features.
Fitting estimator with 93 features.
Fitting estima

Fitting estimator with 63 features.
Fitting estimator with 51 features.
Fitting estimator with 57 features.
Fitting estimator with 51 features.
Fitting estimator with 59 features.
Fitting estimator with 62 features.
Fitting estimator with 50 features.
Fitting estimator with 56 features.
Fitting estimator with 50 features.
Fitting estimator with 58 features.
Fitting estimator with 61 features.
Fitting estimator with 49 features.
Fitting estimator with 55 features.
Fitting estimator with 49 features.
Fitting estimator with 57 features.
Fitting estimator with 60 features.
Fitting estimator with 48 features.
Fitting estimator with 54 features.
Fitting estimator with 48 features.
Fitting estimator with 56 features.
Fitting estimator with 47 features.
Fitting estimator with 59 features.
Fitting estimator with 53 features.
Fitting estimator with 47 features.
Fitting estimator with 55 features.
Fitting estimator with 46 features.
Fitting estimator with 58 features.
Fitting estimator with 52 fe

Fitting estimator with 2 features.
Fitting estimator with 3 features.
Fitting estimator with 16 features.
Fitting estimator with 13 features.
Fitting estimator with 2 features.
Fitting estimator with 19 features.
Fitting estimator with 12 features.
Fitting estimator with 15 features.
Fitting estimator with 18 features.
Fitting estimator with 11 features.
Fitting estimator with 14 features.
Fitting estimator with 17 features.
Fitting estimator with 10 features.
Fitting estimator with 13 features.
Fitting estimator with 9 features.
Fitting estimator with 12 features.
Fitting estimator with 16 features.
Fitting estimator with 8 features.
Fitting estimator with 11 features.
Fitting estimator with 15 features.
Fitting estimator with 7 features.
Fitting estimator with 10 features.
Fitting estimator with 14 features.
Fitting estimator with 6 features.
Fitting estimator with 9 features.
Fitting estimator with 5 features.
Fitting estimator with 13 features.
Fitting estimator with 8 features.
Fi

In [31]:
print(col_l)
print('RFECV found {} features'.format(len(col_l)))

['noun_error_count', 'Article_error_count', 'Wordform_error_count', 'VerbCollocation_error_count', 'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'maxsim_15_skip', 'maxsim_30_skip', 'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw', 'ppl-ref', 'ppl-ref_pos', 'ppl-ref_prod', 'ppl-ref_dep', 'ppl-prompt', 'ppl-prompt_pos', 'ppl-correct', 'ppl-correct_pos', 'ppl-correct_prod', 'ppl-correct_dep', 'ppl-ge', 'ppl-ge_pos', 'ppl-incorrect', 'ppl-incorrect_pos', 'ppl-incorrect_prod', 'ppl-incorrect_dep', 'ngram_match', 'ngram_match-lem', 'ngram_unseen-1', 'ngram_unseen-2', 'error_yn', 'error_count', 'error_ratio', 'parse_score-ratio', 'length_ratio', 'length_under-min', 'length_above-max', 'length_sounds', 'length_sounds-ratio', 'prompt_missing', 'prompt_missing-pct', 'prompt_DT', 'prompt_IN', 'prompt_MD', 'prompt_NN', 'prompt_VB', 'edit_distance', 'edit_pattern-CD-JJ', 'edit_pattern-CD-OO', 'edit_pattern-DT-JJ', 'edit_pattern-DT-NN', 'edit_pattern-DT-OO', 'edit_pattern-FW-LS', '

In [32]:
# convert pandas DF to numpy array

def get_langauge_X(df):
    X = df.loc[:, col_l].values
    return X

def get_langauge_y(df):
    return df['language'].values

def get_meaning_y(df):
    return df['meaning'].values

def get_both_y(df):
    return df.loc[:,['meaning', 'language']].values

In [33]:
# Split entire train set to train and dev

In [34]:
from sklearn.model_selection import train_test_split

X = get_langauge_X(df_train_ml)
y = get_both_y(df_train_ml)

# using seed 0 will generate same split between text and asr tasks
lang_train_X, lang_test_X, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# from two labels to one label
lang_train_y = y_train[:,1]
meaning_train_y = y_train[:,0]
lang_test_y = y_test[:,1]
meaning_test_y = y_test[:,0]

print(lang_train_X.shape)
print(lang_train_y.shape)

print(lang_test_X.shape)
print(lang_test_y.shape)

print(meaning_train_y.shape)
print(meaning_test_y.shape)

(5358, 95)
(5358,)
(1340, 95)
(1340,)
(5358,)
(1340,)


## z-norm all features

In [35]:
from sklearn.preprocessing.data import StandardScaler
scaler = StandardScaler()

In [36]:
scaler.fit(lang_train_X)
lang_train_X = scaler.transform(lang_train_X)
lang_test_X = scaler.transform(lang_test_X)

print(lang_train_X.shape)
print(lang_train_y.shape)
print(lang_test_X.shape)
print(lang_test_y.shape)

(5358, 95)
(5358,)
(1340, 95)
(1340,)


## Pickle all year18 numpy arrays

In [37]:

with open('../data/processed/numpy/' + pkl_file, 'wb') as pf:
    pickle.dump([lang_train_X,
                 lang_train_y,
                 lang_test_X,
                 lang_test_y,
                 meaning_train_y,
                 meaning_test_y], pf)
