In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('./dataframe.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df = df[df.language != 'Dutch']

In [3]:
list(df)

['head',
 'language',
 'score',
 'exam_score',
 'line',
 'error_position',
 'error_type',
 'correct_sentence',
 'error_trigram',
 'correct_trigram',
 'error_bigram',
 'correct_bigram',
 'error_pos',
 'correct_pos']

In [4]:
df.dtypes

head                 object
language             object
score               float64
exam_score           object
line                 object
error_position        int64
error_type           object
correct_sentence     object
error_trigram        object
correct_trigram      object
error_bigram         object
correct_bigram       object
error_pos            object
correct_pos          object
dtype: object

In [5]:
def concat_dummies(df, dummies):
    for dummy in list(dummies):
        df[dummy] = dummies[dummy]

In [6]:
def merge_wrong_corrected_columns(df, wrong_column, corrected_column):
    wrong = pd.get_dummies(df[wrong_column])
    corrected = pd.get_dummies(df[corrected_column])
    for wc in list(set(wrong) - set(corrected)):
        corrected[wc] = 0
    for cc in list(set(corrected) - set(wrong)):
        wrong[cc] = 0
    wrong = wrong.where(wrong == 1, -100)
    wrong = wrong.where(wrong == -100, -1)
    corrected = corrected.where(corrected != 1, 101)
    merged = wrong.add(corrected)
    merged = merged.where(merged != 100, 0)
    return merged

In [7]:
languages = pd.get_dummies(df['language'], prefix='lang')

In [8]:
merged_pos = merge_wrong_corrected_columns(df, 'error_pos', 'correct_pos')
merged_bigrams = merge_wrong_corrected_columns(df, 'error_bigram', 'correct_bigram')
merged_trigrams = merge_wrong_corrected_columns(df, 'error_trigram', 'correct_trigram')
concat_dummies(df, merged_pos)
concat_dummies(df, merged_bigrams)
concat_dummies(df, merged_trigrams)
concat_dummies(df, languages)
df.describe()

Unnamed: 0,score,error_position,*,CC,CD,DT,EX,FW,IN,JJ,...,lang_Italian,lang_Japanese,lang_Korean,lang_Polish,lang_Portuguese,lang_Russian,lang_Spanish,lang_Swedish,lang_Thai,lang_Turkish
count,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,...,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0,45268.0
mean,26.375762,32.776398,-98.700473,-97.995803,-99.531501,-83.82206,-99.374812,-99.977865,-81.143413,-91.333459,...,0.053901,0.055072,0.05114,0.062517,0.052885,0.073606,0.160577,0.009808,0.06097,0.064549
std,5.52249,33.920929,11.328355,14.032864,6.83035,36.888335,7.896733,1.489188,39.121069,28.15599,...,0.225825,0.228123,0.220285,0.242094,0.223806,0.261132,0.367144,0.098551,0.239278,0.245731
min,0.0,0.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,10.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,22.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,30.0,43.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40.0,341.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
y = df['error_type']
data = df[list(languages) + list(merged_pos) + list(merged_bigrams) 
        + list(merged_trigrams) + ['score']]

In [10]:
x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.15)

In [None]:
depth = []
for i in range(3,20):
    dtc = DecisionTreeClassifier(max_depth=i)
    scores = cross_val_score(estimator=dtc, X=data, y=y, cv=10, n_jobs=4)
    depth.append((i, scores.mean()))
print(depth)

In [15]:
dtc = DecisionTreeClassifier(random_state=0)
cross_val_score(dtc, data, y, cv=10)



array([0.52796666, 0.54901099, 0.55183799, 0.54230939, 0.54603665,
       0.55275382, 0.53566681, 0.54634146, 0.53383625, 0.53680231])

In [16]:
dtc.fit(x_train, y_train)
dtc.score(x_test, y_test)

0.5632454719481667

In [17]:
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(x_train, y_train)
neigh.score(x_test, y_test)

0.5093506111029303

In [18]:
clf = RandomForestClassifier(n_estimators=500, max_depth=100, random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [19]:
clf.score(x_test, y_test)

0.6115446915034605

In [13]:
print(clf.feature_importances_)

[6.12577924e-03 6.64447558e-03 9.97879323e-03 ... 1.28302513e-05
 7.84922654e-06 8.72163282e-02]


In [21]:
data

Unnamed: 0,lang_Catalan,lang_Chinese,lang_French,lang_German,lang_Greek,lang_Italian,lang_Japanese,lang_Korean,lang_Polish,lang_Portuguese,...,_ WRB EX,_ WRB JJ,_ WRB MD,_ WRB NN,_ WRB PRP,_ WRB PRP$,_ WRB RB,_ WRB TO,_ WRB VBP,score
0,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
1,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
2,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
3,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
4,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
5,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
6,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
7,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
8,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
9,0,0,0,0,0,0,0,0,0,0,...,-100,-100,-100,-100,-100,-100,-100,-100,-100,25.0
