In [205]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
%matplotlib inline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix

In [206]:
trainSet  = pd.read_csv('train.csv')
trainingEncoded = pd.get_dummies(trainSet)
x = trainingEncoded.drop(['hand'], axis=1)
y = trainingEncoded['hand']
xTrain, xVal, yTrain, yVal = train_test_split(x, 
                                              y,
                                              test_size=.1,
                                              random_state=12)

In [207]:
# Trying out SMOTE and Tomek Chains for over and under sampling of data
smt = SMOTETomek(random_state=42, smote=SMOTE(random_state=12, ratio='all', k_neighbors=4))
xTrain, yTrain = smt.fit_sample(xTrain, yTrain)

In [208]:
chi2 = SelectKBest(chi2, k=7)
xTrain = chi2.fit_transform(xTrain, yTrain);
xVal = chi2.transform(xVal);

In [209]:
model = RandomForestClassifier(n_estimators=30, random_state=12)

In [213]:
clf = Pipeline([
    ('feature_selection', RFE(model, 5)),
    ('classification', RandomForestClassifier(n_estimators=30))
])
clf.fit(xTrain, yTrain)

Pipeline(memory=None,
     steps=[('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [214]:
pipePredicted = clf.predict(xVal)

pdPipePredicted = pd.DataFrame(data=pipePredicted, columns=['hand'])
print pdPipePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdPipePredicted)

0    1423
1     962
2      72
3      31
4       5
6       4
5       3
9       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.71      0.82      0.76      1220
          1       0.66      0.58      0.62      1088
          2       0.35      0.20      0.25       126
          3       0.45      0.25      0.33        55
          4       0.20      0.17      0.18         6
          5       0.00      0.00      0.00         6
          6       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0

avg / total       0.66      0.67      0.66      2501



In [215]:
# Printout the confusion matrix
print confusion_matrix(yVal, pdPipePredicted)

[[1006  208    3    0    1    2    0    0]
 [ 399  632   38   13    2    1    2    1]
 [  10   85   25    4    1    0    1    0]
 [   2   33    5   14    0    0    1    0]
 [   1    3    1    0    1    0    0    0]
 [   5    1    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]]


In [218]:
# Here on is submission generation, only run if needed (don't run over this point, if on .py file delete this)

testSet  = pd.read_csv('test.csv')
testEncoded = pd.get_dummies(testSet)
testX = testEncoded.drop(['id'], axis=1)
testX = chi2.transform(testX);

In [219]:
resultDataRF = clf.predict(testX)

In [220]:
pdResultDataRF = pd.DataFrame(data=resultDataRF, columns=['hand'])
pdResultDataRF.index += 1
print pdResultDataRF.hand.value_counts()
pdResultDataRF.to_csv("submission.csv", index_label='id', columns=['hand'])

0    574349
1    379248
2     29148
3     11163
4      3251
6      1213
5      1000
9       273
8       182
7       173
Name: hand, dtype: int64
