In [8]:
import json
import pandas as pd
import nltk
import numpy as np
import warnings

from itertools import combinations
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split

In [9]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [10]:
def linear_model(x, y, x_test, y_test):
    reg = LogisticRegression(random_state=42).fit(x,y)
    predictions = reg.predict(x_test)
    print(classification_report(reg.predict(x_test), y_test))
    print(reg.score(x_test, y_test))
    return predictions

In [11]:
def random_forest(x, y, x_test, y_test):
    clf = RandomForestClassifier(n_estimators=400, max_depth=100, random_state=42)
    clf.fit(x, y)
    predictions = clf.predict(x_test)
    print(classification_report(clf.predict(x_test), y_test))
    print(clf.score(x_test, y_test))
    return predictions

In [12]:
x_train = pd.read_csv('./data/x_train.csv')
x_train['error_length'] = x_train['error_length'].fillna(0)
x_test = pd.read_csv('./data/x_test.csv')
y_train = pd.read_csv('./data/y_train.csv')
y_test = pd.read_csv('./data/y_test.csv')

In [13]:
features_dict = json.load(open('./data/feature_dict.json', 'r'))
columns = []
for column in ['error_length', 'error_type_dummies', 'incorrect_ptb_tags_dummies']:
    if column in features_dict:
        columns.extend(features_dict[column])
    else:
        columns.append(column)

In [14]:
rf_predictions = random_forest(x_train[columns], y_train['Negative transfer?'], x_test[columns], y_test['Negative transfer?'])

              precision    recall  f1-score   support

           0       0.73      0.77      0.75       141
           1       0.82      0.79      0.80       187

    accuracy                           0.78       328
   macro avg       0.78      0.78      0.78       328
weighted avg       0.78      0.78      0.78       328

0.7804878048780488


In [15]:
error_type_columns = features_dict['error_type_dummies']

In [16]:
lm_predictions = linear_model(x_train[error_type_columns], y_train['Negative transfer?'], x_test[error_type_columns], y_test['Negative transfer?'])

              precision    recall  f1-score   support

           0       0.65      0.71      0.68       133
           1       0.79      0.73      0.76       195

    accuracy                           0.73       328
   macro avg       0.72      0.72      0.72       328
weighted avg       0.73      0.73      0.73       328

0.725609756097561


In [77]:
def error_type_analysis(x_test, predictions):
    error_types_incorrect = {}
    for index, row in x_test.iterrows():
        if row['error_type'] not in error_types_incorrect:
            error_types_incorrect[row['error_type']] = 0
        if row['Negative transfer?'] != predictions[index]:
            error_types_incorrect[row['error_type']] += 1
    error_type_counts = x_test.groupby('error_type')['error_type'].count()
    error_type_counts = error_type_counts.to_frame().rename(columns={'error_type':'count'}).reset_index()
    error_type_counts['incorrect_percent'] = error_type_counts.apply(
        lambda row: error_types_incorrect[row['error_type']]/row['count'], axis=1)
    return error_type_counts
    
print(error_type_analysis(x_test, rf_predictions))
print(error_type_analysis(x_test, lm_predictions))

   error_type  count  incorrect_percent
0         AGA      3           0.333333
1         AGN      6           0.000000
2         AGV      9           0.000000
3          AS      3           0.333333
4          CN      1           0.000000
5          DA      1           0.000000
6          DD      2           0.000000
7          DJ      9           0.111111
8          DN      1           0.000000
9          DV      1           0.000000
10         DY      2           1.000000
11         FD      2           0.000000
12         FN      7           0.142857
13         FV     11           0.090909
14         ID      2           1.000000
15         IQ      1           0.000000
16         IV      1           0.000000
17          L      2           1.000000
18          M      6           0.333333
19         MA      7           0.285714
20         MC      1           1.000000
21         MD     17           0.000000
22         MN      1           0.000000
23         MP     10           0.000000
