In [41]:
%matplotlib inline
import pickle
import pandas as pd
from IPython.display import display

# Data loading
## Loading 2018 text task data for Id and scores

In [42]:
# Load the pickled collection of preprocessed objects.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# use it on trusted, locally generated data.
with open('../data/processed/data.pkl', 'rb') as pf:
    objs = pickle.load(pf)

# The pickle is read positionally: slot 0 -> grammar_dic, slots 3-5 -> the
# A/B/C training frames (using RecResult).
grammar_dic = objs[0]
df_18_A_train, df_18_B_train, df_18_C_train = objs[3], objs[4], objs[5]

# Report the shape of each partition.
for _fmt, _frame in (('A shape {}', df_18_A_train),
                     ('B shape {}', df_18_B_train),
                     ('C shape {}', df_18_C_train)):
    print(_fmt.format(_frame.shape))

A shape (5526, 8)
B shape (873, 8)
C shape (299, 8)


In [43]:
# Stack the A/B/C partitions into one training frame. The original row
# indices are kept, so index values may repeat across partitions.
df_18_train = pd.concat([df_18_A_train, df_18_B_train, df_18_C_train])


def recode(x):
    """Map the label 'correct' to 1 and every other value to 0."""
    return 1 if x == 'correct' else 0


# Binarize both label columns with the same rule.
for _label_col in ('language', 'meaning'):
    df_18_train[_label_col] = df_18_train[_label_col].apply(recode)
print(df_18_train.shape)

(6698, 8)


## Loading Grammar Error Count Features

In [44]:
# Grammar error count features: one CSV per partition (A, B, C).
dfA, dfB, dfC = (
    pd.read_csv(_csv_path)
    for _csv_path in ('../data/processed/df18_A_train_grmerror.csv',
                      '../data/processed/df18_B_train_grmerror.csv',
                      '../data/processed/df18_C_train_grmerror.csv')
)
train_grmerr = pd.concat([dfA, dfB, dfC])
# No test-set features are loaded in this cell, so the test slot prints 0.
grmerr_report = 'grammar error: train:{} test:{}'.format(train_grmerr.shape, 0)
print(grmerr_report)

grammar error: train:(6410, 12) test:0


## Loading Huy's Features

In [45]:
# Huy's text features: one tab-separated file per partition (A, B, C).
# Note that dfA/dfB/dfC are rebound here, replacing the grammar-error frames.
huy_frames = [
    pd.read_csv(_tsv_path, sep='\t')
    for _tsv_path in ('../data/processed/Huy/scst2_training_data_A_text_features.csv',
                      '../data/processed/Huy/scst2_training_data_B_text_features.csv',
                      '../data/processed/Huy/scst2_training_data_C_text_features.csv')
]
dfA, dfB, dfC = huy_frames

# Align the key column name with the other frames ('ID' -> 'Id') and drop
# the 'CLASS' column so that only Id + features remain.
train_huy = (
    pd.concat(huy_frames)
    .rename(columns={'ID': 'Id'})
    .drop(['CLASS'], axis=1)
)

print('Huy features train:{}\ttest:{}'.format(train_huy.shape, 0))

Huy features train:(6671, 195)	test:0


# Form a DF for modeling


## Options for pickled file output

In [46]:
# feature selection cutoff: threshold on the correlation with the 'language'
# label used by the correlation-analysis cell (kept if R > CORR_CUT or
# R < -CORR_CUT).
CORR_CUT = 0.2
# File name (not a full path) of the pickle that receives the final numpy
# arrays; it is appended to '../data/processed/numpy/' when dumping.
pkl_file = 'y18_text_RFECV.pkl'    # 1/29/2018 using Huy's updated features and RFECV(RandomForest)

In [47]:
def gen_ml_df(df_main, df_grmerr, df_huy):
    """Build the modeling frame: labels from ``df_main`` plus all features.

    Parameters
    ----------
    df_main : DataFrame
        Must contain the columns 'Id', 'language' and 'meaning'.
    df_grmerr : DataFrame
        Grammar-error features keyed by 'Id'.
    df_huy : DataFrame
        Additional text features keyed by 'Id'.

    Returns
    -------
    DataFrame
        One row per labeled Id of ``df_main`` ('Id', 'language', 'meaning'
        followed by the feature columns); missing feature values are 0.
    """
    df_ml = df_main[['Id', 'language', 'meaning']]
    # grmerr may miss some Ids due to ASR null outputs. Use outer merging to
    # keep all Ids (the join type is kept as-is so the resulting row order
    # does not change).
    df_ml = pd.merge(df_ml, df_grmerr, on='Id', how='outer')
    df_ml = pd.merge(df_ml, df_huy, on='Id', how='outer')
    # An outer join also adds rows for Ids that exist only in a feature
    # frame. Those rows have no labels, and filling them with 0 would
    # fabricate "incorrect" training examples, so drop them first.
    has_label = df_ml['Id'].isin(df_main['Id'])
    df_ml = df_ml[has_label].reset_index(drop=True)
    # Remaining NaNs are features missing for a labeled Id; treat them as 0.
    return df_ml.fillna(0)

## Process 2018 train DF

In [48]:
# Assemble the modeling frame from the labels and both feature sets.
df_train_ml = gen_ml_df(df_18_train, train_grmerr, train_huy)

# Sanity checks: summary statistics, overall shape, and remaining NaN count.
train_summary = df_train_ml.describe()
display(train_summary)
print(df_train_ml.shape)
n_missing = df_train_ml.isnull().sum().sum()
print('#NA: {}'.format(n_missing))

Unnamed: 0,Id,language,meaning,ofto_error_count,Spell,Article_a_an_error_count,sva_error_count,RuleRepeatedLinkWords_error_count,Verbform_error_count,noun_error_count,...,length_unknown-ratio,length_sounds,length_sounds-ratio,prompt_missing,prompt_missing-pct,prompt_DT,prompt_IN,prompt_MD,prompt_NN,prompt_VB
count,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,...,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0,6698.0
mean,3349.5,0.6596,0.822186,0.000299,0.0,0.001194,0.015826,0.0,0.000597,0.023589,...,0.0,4.984324,0.907435,1.480591,0.325643,0.172887,0.093759,0.075694,0.331442,0.265303
std,1933.690384,0.473879,0.382385,0.017279,0.0,0.034542,0.12481,0.0,0.024432,0.153732,...,0.0,2.495729,0.289843,1.737344,0.360292,0.378178,0.291515,0.264528,0.470767,0.441527
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1675.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3349.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,1.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0
75%,5023.75,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,1.0,2.0,0.5,0.0,0.0,0.0,1.0,1.0
max,6698.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,...,0.0,89.0,1.0,9.0,1.0,1.0,1.0,1.0,1.0,1.0


(6698, 208)
#NA: 0


## Correlation analysis
Only show features whose correlation with the `language` label satisfies $|R| > \mathrm{CORR\_CUT}$.

In [49]:
# Pairwise correlation matrix over all columns of the modeling frame.
tb_corr = df_train_ml.corr()

# Correlation of each remaining column with the 'language' label; the two
# label columns themselves are excluded.
tb_l = tb_corr['language'].drop(['language', 'meaning'])

# Split into strongly positive and strongly negative correlations.
strong_pos = tb_l[tb_l > CORR_CUT]
strong_neg = tb_l[tb_l < -1 * CORR_CUT]
display(strong_pos)
display(strong_neg)

# Positive entries first, then negative ones.
# NOTE(review): col_l is reassigned by the RFECV cell further down, so this
# correlation-based list is shown here but not used for the final features.
col_l = list(strong_pos.index) + list(strong_neg.index)
display(col_l)
len(col_l)

ppl-ref                0.380749
ppl-correct            0.379440
ppl-correct_pos        0.315042
ppl-ge                 0.235393
maxsim_15_skip         0.451242
maxsim_30_skip         0.454551
maxsim_50_skip         0.451025
maxsim_15_cbw          0.456418
maxsim_30_cbw          0.460333
maxsim_50_cbw          0.456142
lda_sim-max            0.419830
lda_sim-min            0.244492
lda_sim-avg            0.359437
ngram_match            0.269970
ngram_match-lem        0.261940
length_ratio           0.248441
length_01              0.306777
length_sounds-ratio    0.246730
Name: language, dtype: float64

edit_distance        -0.452037
ngram_unseen-1       -0.479919
ngram_unseen-2       -0.489517
error_yn             -0.210882
error_count          -0.296754
error_ratio          -0.285889
length_under-min     -0.349375
prompt_missing       -0.469614
prompt_missing-pct   -0.410011
prompt_IN            -0.272637
prompt_NN            -0.371023
prompt_VB            -0.263420
Name: language, dtype: float64

['ppl-ref',
 'ppl-correct',
 'ppl-correct_pos',
 'ppl-ge',
 'maxsim_15_skip',
 'maxsim_30_skip',
 'maxsim_50_skip',
 'maxsim_15_cbw',
 'maxsim_30_cbw',
 'maxsim_50_cbw',
 'lda_sim-max',
 'lda_sim-min',
 'lda_sim-avg',
 'ngram_match',
 'ngram_match-lem',
 'length_ratio',
 'length_01',
 'length_sounds-ratio',
 'edit_distance',
 'ngram_unseen-1',
 'ngram_unseen-2',
 'error_yn',
 'error_count',
 'error_ratio',
 'length_under-min',
 'prompt_missing',
 'prompt_missing-pct',
 'prompt_IN',
 'prompt_NN',
 'prompt_VB']

30

## Feature Selection

In [50]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier

# The first three columns are Id / language / meaning; every later column is
# a candidate feature.
feature_frame = df_train_ml.iloc[:, 3:]
feat_names = feature_frame.columns
X = feature_frame.values
y = df_train_ml['language'].values

# Recursive feature elimination with 5-fold CV, removing one feature per step.
# NOTE(review): the selector is fitted on every row of df_train_ml, i.e.
# before the train/dev split performed later, so the dev rows influence
# which features are kept -- consider selecting on the training split only.
rf_estimator = RandomForestClassifier(random_state=2018)
selector = RFECV(estimator=rf_estimator, cv=5, step=1, verbose=5, n_jobs=-1)
selector.fit(X, y)

# Keep the names of the columns flagged by the selector's support mask.
col_l = list(feat_names[selector.get_support()])

Fitting estimator with 205 features.
Fitting estimator with 205 features.
Fitting estimator with 205 features.
Fitting estimator with 205 features.
Fitting estimator with 205 features.
Fitting estimator with 204 features.
Fitting estimator with 204 features.
Fitting estimator with 204 features.
Fitting estimator with 204 features.
Fitting estimator with 204 features.
Fitting estimator with 203 features.
Fitting estimator with 203 features.
Fitting estimator with 203 features.
Fitting estimator with 203 features.
Fitting estimator with 203 features.
Fitting estimator with 202 features.
Fitting estimator with 202 features.
Fitting estimator with 202 features.
Fitting estimator with 202 features.
Fitting estimator with 202 features.
Fitting estimator with 201 features.
Fitting estimator with 201 features.
Fitting estimator with 201 features.
Fitting estimator with 201 features.
Fitting estimator with 201 features.
Fitting estimator with 200 features.
Fitting estimator with 200 features.
F

Fitting estimator with 162 features.
Fitting estimator with 161 features.
Fitting estimator with 160 features.
Fitting estimator with 159 features.
Fitting estimator with 161 features.
Fitting estimator with 160 features.
Fitting estimator with 159 features.
Fitting estimator with 161 features.
Fitting estimator with 158 features.
Fitting estimator with 160 features.
Fitting estimator with 158 features.
Fitting estimator with 159 features.
Fitting estimator with 160 features.
Fitting estimator with 157 features.
Fitting estimator with 157 features.
Fitting estimator with 158 features.
Fitting estimator with 159 features.
Fitting estimator with 159 features.
Fitting estimator with 156 features.
Fitting estimator with 156 features.
Fitting estimator with 157 features.
Fitting estimator with 158 features.
Fitting estimator with 158 features.
Fitting estimator with 155 features.
Fitting estimator with 155 features.
Fitting estimator with 156 features.
Fitting estimator with 154 features.
F

Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 117 features.
Fitting estimator with 118 features.
Fitting estimator with 119 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 116 features.
Fitting estimator with 117 features.
Fitting estimator with 118 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 115 features.
Fitting estimator with 116 features.
Fitting estimator with 117 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 114 features.
Fitting estimator with 115 features.
Fitting estimator with 116 features.
Fitting estimator with 109 features.
Fitting estimator with 110 features.
Fitting estimator with 113 features.
Fitting estimator with 114 features.
Fitting estimator with 115 features.
Fitting estimator with 108 features.
Fitting estimator with 109 features.
F

Fitting estimator with 76 features.
Fitting estimator with 67 features.
Fitting estimator with 74 features.
Fitting estimator with 72 features.
Fitting estimator with 66 features.
Fitting estimator with 75 features.
Fitting estimator with 66 features.
Fitting estimator with 73 features.
Fitting estimator with 71 features.
Fitting estimator with 65 features.
Fitting estimator with 74 features.
Fitting estimator with 65 features.
Fitting estimator with 72 features.
Fitting estimator with 70 features.
Fitting estimator with 64 features.
Fitting estimator with 73 features.
Fitting estimator with 64 features.
Fitting estimator with 71 features.
Fitting estimator with 69 features.
Fitting estimator with 63 features.
Fitting estimator with 72 features.
Fitting estimator with 63 features.
Fitting estimator with 70 features.
Fitting estimator with 68 features.
Fitting estimator with 62 features.
Fitting estimator with 71 features.
Fitting estimator with 62 features.
Fitting estimator with 69 fe

Fitting estimator with 30 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 27 features.
Fitting estimator with 31 features.
Fitting estimator with 29 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 26 features.
Fitting estimator with 30 features.
Fitting estimator with 28 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 25 features.
Fitting estimator with 29 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 27 features.
Fitting estimator with 24 features.
Fitting estimator with 15 features.
Fitting estimator with 28 features.
Fitting estimator with 16 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Fitting estimator with 14 features.
Fitting estimator with 15 features.
Fitting estimator with 27 features.
Fitting estimator with 25 fe

In [51]:
# Show the selected feature names, followed by how many were selected.
n_selected = len(col_l)
print(col_l)
print('RFECV found {} features'.format(n_selected))

['noun_error_count', 'Article_error_count', 'AdjAdv_error_count', 'Wordform_error_count', 'VerbCollocation_error_count', 'ppl-ref', 'ppl-ref_pos', 'ppl-ref_prod', 'ppl-ref_dep', 'ppl-prompt', 'ppl-prompt_pos', 'ppl-correct', 'ppl-correct_pos', 'ppl-correct_prod', 'ppl-correct_dep', 'ppl-ge', 'ppl-ge_pos', 'ppl-incorrect', 'ppl-incorrect_pos', 'ppl-incorrect_prod', 'ppl-incorrect_dep', 'maxsim_15_skip', 'maxsim_30_skip', 'maxsim_50_skip', 'maxsim_15_cbw', 'maxsim_30_cbw', 'maxsim_50_cbw', 'lda_sim-max', 'lda_sim-min', 'lda_sim-avg', 'edit_distance', 'edit_pattern-CD-DT', 'edit_pattern-CD-JJ', 'edit_pattern-CD-NN', 'edit_pattern-CD-OO', 'edit_pattern-DT-CD', 'edit_pattern-DT-JJ', 'edit_pattern-DT-NN', 'edit_pattern-DT-NNS', 'edit_pattern-DT-OO', 'edit_pattern-DT-PRP', 'edit_pattern-DT-TO', 'edit_pattern-FW-LS', 'edit_pattern-FW-OO', 'edit_pattern-IN-NN', 'edit_pattern-IN-OO', 'edit_pattern-JJ-NN', 'edit_pattern-JJ-NNS', 'edit_pattern-JJ-OO', 'edit_pattern-JJ-VBZ', 'edit_pattern-JJR-OO', 

In [52]:
# convert pandas DF to numpy array

def get_langauge_X(df, cols=None):
    """Return the feature matrix of ``df`` as a numpy array.

    Parameters
    ----------
    df : DataFrame
        Frame holding the feature columns.
    cols : list of column labels, optional
        Columns to extract, in order. When omitted, the notebook-level
        ``col_l`` (the selected feature names) is used, which is the
        original behavior.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df), len(cols)).
    """
    if cols is None:
        # Fall back to the global selection so existing callers keep working.
        cols = col_l
    X = df.loc[:, cols].values
    return X

def get_langauge_y(df):
    """Return the 'language' column of ``df`` as a 1-D numpy array."""
    language_labels = df.loc[:, 'language']
    return language_labels.values

def get_meaning_y(df):
    """Return the 'meaning' column of ``df`` as a 1-D numpy array."""
    meaning_labels = df.loc[:, 'meaning']
    return meaning_labels.values

def get_both_y(df):
    """Return both labels as a 2-D numpy array.

    Column 0 holds 'meaning' and column 1 holds 'language'.
    """
    label_columns = ['meaning', 'language']
    both_labels = df[label_columns]
    return both_labels.values

In [53]:
# Split the entire training set into train and dev subsets

In [54]:
from sklearn.model_selection import train_test_split

# Selected features and the two-column label matrix (meaning, language).
X = get_langauge_X(df_train_ml)
y = get_both_y(df_train_ml)

# using seed 0 will generate same split between text and asr tasks
lang_train_X, lang_test_X, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# from two labels to one label: column 0 is 'meaning', column 1 is 'language'
meaning_train_y, lang_train_y = y_train[:, 0], y_train[:, 1]
meaning_test_y, lang_test_y = y_test[:, 0], y_test[:, 1]

# Print the shape of every resulting array, one per line.
for _arr in (lang_train_X, lang_train_y,
             lang_test_X, lang_test_y,
             meaning_train_y, meaning_test_y):
    print(_arr.shape)

(5358, 125)
(5358,)
(1340, 125)
(1340,)
(5358,)
(1340,)


## Z-score normalization of all features

In [55]:
# Import from the public package: `sklearn.preprocessing.data` is a private
# module path that was deprecated and later removed from scikit-learn, while
# `sklearn.preprocessing` exposes the same StandardScaler class.
from sklearn.preprocessing import StandardScaler

# Scaler used to z-normalize the features; it is fitted in the next cell.
scaler = StandardScaler()

In [56]:
# Learn the scaling statistics on the training split only, then apply the
# same transform to both splits.
# NOTE(review): the scaled arrays overwrite the unscaled names, so re-running
# this cell without re-running the split would scale already-scaled data.
lang_train_X = scaler.fit_transform(lang_train_X)
lang_test_X = scaler.transform(lang_test_X)

# Shapes must be unchanged by scaling.
for _arr in (lang_train_X, lang_train_y, lang_test_X, lang_test_y):
    print(_arr.shape)

(5358, 125)
(5358,)
(1340, 125)
(1340,)


## Pickle all year18 numpy arrays

In [57]:

# Persist the arrays as a single list; readers must unpack them in this order:
# language train X/y, language test X/y, then meaning train/test labels.
out_path = '../data/processed/numpy/' + pkl_file
year18_arrays = [
    lang_train_X,
    lang_train_y,
    lang_test_X,
    lang_test_y,
    meaning_train_y,
    meaning_test_y,
]
with open(out_path, 'wb') as pf:
    pickle.dump(year18_arrays, pf)
