In [25]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
%matplotlib inline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

In [3]:
# Generate Training Data
trainSet  = pd.read_csv('train.csv')
trainingEncoded = pd.get_dummies(trainSet)
x = trainingEncoded.drop(['hand'], axis=1)
y = trainingEncoded['hand']
xTrain, xVal, yTrain, yVal = train_test_split(x, 
                                              y,
                                              test_size=.1,
                                              random_state=12)

In [4]:
# Convert to panda data frame for printing contents
pdTraining = pd.DataFrame(data=yTrain, columns=['hand'])
print pdTraining.hand.value_counts()

0    11273
1     9511
2     1080
3      458
4       87
5       48
6       36
7        6
9        5
8        5
Name: hand, dtype: int64


In [5]:
# Print out the testing set contents
pdTest = pd.DataFrame(data=yVal, columns=['hand'])
print pdTest.hand.value_counts()

0    1220
1    1088
2     126
3      55
5       6
4       6
Name: hand, dtype: int64


In [6]:
# Make the classifier
model = RandomForestClassifier(n_estimators=30, random_state=12)

In [7]:
# Fit the model with training data and do benchmarking with different number of attributes
rfe = RFE(model, 2)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    2170
1     331
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.51      0.91      0.66      1220
          1       0.56      0.17      0.26      1088
          2       0.00      0.00      0.00       126
          3       0.00      0.00      0.00        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.49      0.52      0.43      2501



  'precision', 'predicted', average, warn_for)


In [8]:
rfe = RFE(model, 3)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1628
1     846
2      22
3       5
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.60      0.80      0.68      1220
          1       0.56      0.44      0.49      1088
          2       0.14      0.02      0.04       126
          3       1.00      0.09      0.17        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.56      0.58      0.55      2501



In [9]:
rfe = RFE(model, 4)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1357
1    1047
2      64
3      25
4       5
5       2
6       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.69      0.77      0.73      1220
          1       0.60      0.58      0.59      1088
          2       0.27      0.13      0.18       126
          3       0.44      0.20      0.28        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6
          6       0.00      0.00      0.00         0

avg / total       0.62      0.64      0.63      2501



  'recall', 'true', average, warn_for)


In [10]:
rfe = RFE(model, 5)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1450
1    1008
2      33
3       7
4       3
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.70      0.83      0.76      1220
          1       0.65      0.61      0.63      1088
          2       0.45      0.12      0.19       126
          3       0.71      0.09      0.16        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.66      0.68      0.66      2501



In [11]:
rfe = RFE(model, 6)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1441
1    1038
2      18
3       3
4       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.68      0.80      0.73      1220
          1       0.62      0.59      0.60      1088
          2       0.44      0.06      0.11       126
          3       1.00      0.05      0.10        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.64      0.65      0.63      2501



In [12]:
rfe = RFE(model, 7)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1475
1    1018
2       6
3       2
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.65      0.78      0.71      1220
          1       0.59      0.55      0.57      1088
          2       0.50      0.02      0.05       126
          3       1.00      0.04      0.07        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.62      0.62      0.60      2501



In [13]:
rfe = RFE(model, 8)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1539
1     958
3       2
2       2
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.60      0.75      0.67      1220
          1       0.54      0.48      0.51      1088
          2       0.50      0.01      0.02       126
          3       0.50      0.02      0.04        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.56      0.57      0.55      2501



In [14]:
rfe = RFE(model, 9)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1534
1     962
2       4
3       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.62      0.78      0.69      1220
          1       0.57      0.50      0.53      1088
          2       0.50      0.02      0.03       126
          3       1.00      0.02      0.04        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.60      0.60      0.57      2501



In [15]:
rfe = RFE(model, 10)
rfe = rfe.fit(xTrain, yTrain)
# print rfe.support_
# print rfe.ranking_
rfePredicted = rfe.predict(xVal)

pdRfePredicted = pd.DataFrame(data=rfePredicted, columns=['hand'])
print pdRfePredicted.hand.value_counts()

print metrics.classification_report(yVal, pdRfePredicted)

0    1544
1     951
2       5
3       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.60      0.75      0.67      1220
          1       0.55      0.48      0.52      1088
          2       0.60      0.02      0.05       126
          3       1.00      0.02      0.04        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.58      0.58      0.55      2501



In [16]:
# Create a new classifier, using pipeline
# Using RFE for feature selection and final classification model using RF
clf = Pipeline([
    ('feature_selection', RFE(model, 6)),
    ('classification', RandomForestClassifier(n_estimators=30))
])
clf.fit(xTrain, yTrain)

Pipeline(memory=None,
     steps=[('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [17]:
# Predict the result
pipePredicted = clf.predict(xVal)

pdPipePredicted = pd.DataFrame(data=pipePredicted, columns=['hand'])
print pdPipePredicted.hand.value_counts()

# Print out the score report
print metrics.classification_report(yVal, pdPipePredicted)

0    1449
1    1036
2      14
3       1
4       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.67      0.80      0.73      1220
          1       0.61      0.58      0.60      1088
          2       0.36      0.04      0.07       126
          3       1.00      0.02      0.04        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.63      0.65      0.62      2501



In [18]:
# Load up rebalanced dataset
balancedSet  = pd.read_csv('balence.csv')
encodedBalanced = pd.get_dummies(balancedSet)
balancedX = encodedBalanced.drop(['hand'], axis=1)
balancedY = encodedBalanced['hand']
xBalTrain, xBalVal, yBalTrain, yBalVal = train_test_split(balancedX, 
                                              balancedY,
                                              test_size=.1,
                                              random_state=12)


In [19]:
# Print out contents of dataset
pdBalanced = pd.DataFrame(data=yBalTrain, columns=['hand'])
print pdBalanced.hand.value_counts()

5    11275
3    11273
8    11270
1    11265
9    11244
0    11243
2    11227
6    11226
7    11220
4    11194
Name: hand, dtype: int64


In [20]:
# Fit the training data to the pipeline, replacing previous model
clf.fit(xBalTrain, yBalTrain)

Pipeline(memory=None,
     steps=[('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [21]:
# Predict the result of the test set
balPredicted = clf.predict(xBalVal)

pdBalPredicted = pd.DataFrame(data=balPredicted, columns=['hand'])
print pdBalPredicted.hand.value_counts()

print metrics.classification_report(yBalVal, pdBalPredicted)

6    1536
7    1374
0    1353
8    1338
9    1265
4    1243
5    1204
3    1144
2    1109
1     927
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.41      0.44      0.43      1250
          1       0.41      0.31      0.36      1228
          2       0.69      0.60      0.64      1266
          3       0.76      0.71      0.73      1220
          4       0.52      0.50      0.51      1299
          5       0.43      0.42      0.42      1218
          6       0.80      0.97      0.87      1267
          7       0.93      1.00      0.96      1273
          8       0.49      0.53      0.51      1223
          9       0.99      1.00      0.99      1249

avg / total       0.64      0.65      0.64     12493



In [22]:
# Printout the confusion matrix
print confusion_matrix(yBalVal, pdBalPredicted)

[[ 554  153   15   15    3  507    0    0    3    0]
 [ 236  384  257  154   13  160    4    0   17    3]
 [  24  172  760   87    4   18  178   15    4    4]
 [  10   81   32  865    3    5  130   87    1    6]
 [   0    0    0    0  646    0    0    0  652    1]
 [ 529  137   15    9    5  514    0    0    7    2]
 [   0    0   30   13    0    0 1223    1    0    0]
 [   0    0    0    1    0    0    1 1271    0    0]
 [   0    0    0    0  569    0    0    0  654    0]
 [   0    0    0    0    0    0    0    0    0 1249]]


In [31]:
# Trying out SMOTE and Tomek Chains for over and under sampling of data
smt = SMOTETomek(random_state=42, smote=SMOTE(random_state=12, ratio='minority', k_neighbors=10))
resX, resY = smt.fit_sample(xBalTrain, yBalTrain)

In [32]:
# Print out the resulting training dataset values
pdBalanced = pd.DataFrame(data=resY, columns=['hand'])
print pdBalanced.hand.value_counts()

9    11244
8    11180
5    10635
4    10569
7    10447
6     9557
0     9028
3     8949
2     8660
1     8265
Name: hand, dtype: int64


In [33]:
# Fit the SMOTETomek rebalanced data
clf.fit(resX, resY)

Pipeline(memory=None,
     steps=[('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [31]:
# Predict results with new model
balPredicted = clf.predict(xBalVal)

pdBalPredicted = pd.DataFrame(data=balPredicted, columns=['hand'])
print pdBalPredicted.hand.value_counts()

print metrics.classification_report(yBalVal, pdBalPredicted)

8    1615
6    1526
5    1426
7    1326
9    1254
0    1215
3    1206
2    1111
4     950
1     864
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.42      0.40      0.41      1250
          1       0.41      0.29      0.34      1228
          2       0.72      0.63      0.67      1266
          3       0.77      0.76      0.77      1220
          4       0.54      0.39      0.46      1299
          5       0.41      0.48      0.45      1218
          6       0.83      1.00      0.91      1267
          7       0.96      1.00      0.98      1273
          8       0.50      0.66      0.57      1223
          9       1.00      1.00      1.00      1249

avg / total       0.66      0.66      0.66     12493



In [15]:
# Here on is submission generation, only run if needed (don't run over this point, if on .py file delete this)

testSet  = pd.read_csv('test.csv')
testEncoded = pd.get_dummies(testSet)
testX = testEncoded.drop(['id'], axis=1)

In [17]:
resultDataRF = clf.predict(testX)

In [19]:
pdResultDataRF = pd.DataFrame(data=resultDataRF, columns=['hand'])
pdResultDataRF.index += 1
print pdResultDataRF.hand.value_counts()
pdResultDataRF.to_csv("submission.csv", index_label='id', columns=['hand'])

0    578784
1    405665
2     11129
3      3184
4       792
5       242
6       123
8        44
9        32
7         5
Name: hand, dtype: int64
