In [80]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
%matplotlib inline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [67]:
# Load the raw training CSV and pass it through pandas' dummy encoding.
trainSet = pd.read_csv('train.csv')
trainingEncoded = pd.get_dummies(trainSet)

# 'hand' is the label column; every remaining column is used as a feature.
y = trainingEncoded['hand']
x = trainingEncoded.drop('hand', axis=1)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
xTrain, xVal, yTrain, yVal = train_test_split(
    x, y, test_size=0.1, random_state=12
)

In [68]:
# Wrap the label column in a one-column data frame and print how often each
# class occurs. Note: `y` is the full label column taken before the
# train/validation split, so these counts cover every row of the CSV.
pdTraining = pd.DataFrame(data=y, columns=['hand'])
print(pdTraining['hand'].value_counts())

0    12493
1    10599
2     1206
3      513
4       93
5       54
6       36
7        6
9        5
8        5
Name: hand, dtype: int64


In [69]:
# Class counts for the held-out validation labels (`yVal`).
pdTest = pd.DataFrame(data=yVal, columns=['hand'])
print(pdTest['hand'].value_counts())

0    1220
1    1088
2     126
3      55
5       6
4       6
Name: hand, dtype: int64


In [70]:
# Keep only the 2 features with the highest chi-squared score against the label.
# The selector is bound to its own name: assigning it to `chi2` would shadow the
# imported scoring function, so a later SelectKBest(chi2, ...) would be handed a
# selector object instead of a score function.
selector = SelectKBest(chi2, k=2)
xChi2Train = selector.fit_transform(xTrain, yTrain)
# Apply the selection fitted on the training split to the validation features.
xChi2Test = selector.transform(xVal)

# Make the classifier
model = RandomForestClassifier(n_estimators=30, random_state=12)
model.fit(xChi2Train, yTrain)

predicted = model.predict(xChi2Test)

# How many validation rows were assigned to each class.
pdChi2Predicted = pd.DataFrame(data=predicted, columns=['hand'])
print(pdChi2Predicted['hand'].value_counts())

# Per-class precision / recall / f1 on the validation split.
print(metrics.classification_report(yVal, predicted))


0    2221
1     280
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.52      0.95      0.68      1220
          1       0.61      0.16      0.25      1088
          2       0.00      0.00      0.00       126
          3       0.00      0.00      0.00        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.52      0.53      0.44      2501



In [73]:
# Keep only the 3 features with the highest chi-squared score against the label.
# The selector is bound to its own name: assigning it to `chi2` would shadow the
# imported scoring function, so a later SelectKBest(chi2, ...) would be handed a
# selector object instead of a score function.
selector = SelectKBest(chi2, k=3)
xChi2Train = selector.fit_transform(xTrain, yTrain)
# Apply the selection fitted on the training split to the validation features.
xChi2Test = selector.transform(xVal)

# Make the classifier
model = RandomForestClassifier(n_estimators=30, random_state=12)
model.fit(xChi2Train, yTrain)

predicted = model.predict(xChi2Test)

# How many validation rows were assigned to each class.
pdChi2Predicted = pd.DataFrame(data=predicted, columns=['hand'])
print(pdChi2Predicted['hand'].value_counts())

# Per-class precision / recall / f1 on the validation split.
print(metrics.classification_report(yVal, predicted))

0    1598
1     877
2      21
3       4
7       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.60      0.79      0.68      1220
          1       0.56      0.45      0.50      1088
          2       0.19      0.03      0.05       126
          3       0.25      0.02      0.03        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6
          7       0.00      0.00      0.00         0

avg / total       0.55      0.58      0.55      2501



  'recall', 'true', average, warn_for)


In [75]:
# Keep only the 4 features with the highest chi-squared score against the label.
# The selector is bound to its own name: assigning it to `chi2` would shadow the
# imported scoring function, so a later SelectKBest(chi2, ...) would be handed a
# selector object instead of a score function.
selector = SelectKBest(chi2, k=4)
xChi2Train = selector.fit_transform(xTrain, yTrain)
# Apply the selection fitted on the training split to the validation features.
xChi2Test = selector.transform(xVal)

# Make the classifier
model = RandomForestClassifier(n_estimators=30, random_state=12)
model.fit(xChi2Train, yTrain)

predicted = model.predict(xChi2Test)

# How many validation rows were assigned to each class.
pdChi2Predicted = pd.DataFrame(data=predicted, columns=['hand'])
print(pdChi2Predicted['hand'].value_counts())

# Per-class precision / recall / f1 on the validation split.
print(metrics.classification_report(yVal, predicted))

0    1324
1    1069
2      75
3      23
4       7
5       3
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.70      0.75      0.72      1220
          1       0.59      0.58      0.59      1088
          2       0.29      0.17      0.22       126
          3       0.30      0.13      0.18        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.62      0.63      0.62      2501



In [78]:
# Keep only the 5 features with the highest chi-squared score against the label.
# The selector is bound to its own name: assigning it to `chi2` would shadow the
# imported scoring function, so a later SelectKBest(chi2, ...) would be handed a
# selector object instead of a score function.
selector = SelectKBest(chi2, k=5)
xChi2Train = selector.fit_transform(xTrain, yTrain)
# Apply the selection fitted on the training split to the validation features.
xChi2Test = selector.transform(xVal)

# Make the classifier
model = RandomForestClassifier(n_estimators=30, random_state=12)
model.fit(xChi2Train, yTrain)

predicted = model.predict(xChi2Test)

# How many validation rows were assigned to each class.
pdChi2Predicted = pd.DataFrame(data=predicted, columns=['hand'])
print(pdChi2Predicted['hand'].value_counts())

# Per-class precision / recall / f1 on the validation split.
print(metrics.classification_report(yVal, predicted))

0    1450
1    1008
2      33
3       7
4       3
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.70      0.83      0.76      1220
          1       0.65      0.61      0.63      1088
          2       0.45      0.12      0.19       126
          3       0.71      0.09      0.16        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.66      0.68      0.66      2501



In [81]:
# Keep only the 6 features with the highest chi-squared score against the label.
# The selector is bound to its own name: assigning it to `chi2` would shadow the
# imported scoring function, so a later SelectKBest(chi2, ...) would be handed a
# selector object instead of a score function.
selector = SelectKBest(chi2, k=6)
xChi2Train = selector.fit_transform(xTrain, yTrain)
# Apply the selection fitted on the training split to the validation features.
xChi2Test = selector.transform(xVal)

# Make the classifier
model = RandomForestClassifier(n_estimators=30, random_state=12)
model.fit(xChi2Train, yTrain)

predicted = model.predict(xChi2Test)

# How many validation rows were assigned to each class.
pdChi2Predicted = pd.DataFrame(data=predicted, columns=['hand'])
print(pdChi2Predicted['hand'].value_counts())

# Per-class precision / recall / f1 on the validation split.
print(metrics.classification_report(yVal, predicted))

0    1496
1     989
2      14
3       2
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.66      0.81      0.73      1220
          1       0.61      0.56      0.58      1088
          2       0.57      0.06      0.11       126
          3       0.50      0.02      0.04        55
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         6

avg / total       0.63      0.64      0.61      2501

