In [11]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

In [12]:
# Load the raw training data; get_dummies one-hot encodes whatever categorical
# columns the file contains and leaves numeric columns as they are.
trainSet = pd.read_csv('train.csv')
trainingEncoded = pd.get_dummies(trainSet)
# 'hand' is the prediction target; every other column is used as a feature.
y = trainingEncoded['hand']
x = trainingEncoded.drop('hand', axis=1)
# Hold out 10% of the rows for validation. The fixed seed makes the split
# repeatable, so later cells can rely on getting the same rows again.
xTrain, xVal, yTrain, yVal = train_test_split(x, y, test_size=.1, random_state=12)

In [13]:
# Over-sample the training split with SMOTE ('all' = resample every class).
# Only the training data is resampled; xVal/yVal stay untouched for evaluation.
# NOTE: this rebinds xTrain/yTrain, so re-run the split cell before re-running
# this one, otherwise already-resampled data is resampled again.
try:
    # imbalanced-learn >= 0.4 calls the argument `sampling_strategy`.
    smt = SMOTE(random_state=12, sampling_strategy='all', k_neighbors=4)
except TypeError:
    # Older releases only accept the original `ratio` spelling.
    smt = SMOTE(random_state=12, ratio='all', k_neighbors=4)
# `fit_sample` was renamed to `fit_resample`; use whichever this release offers.
smoteResample = getattr(smt, 'fit_resample', None) or smt.fit_sample
xTrain, yTrain = smoteResample(xTrain, yTrain)

In [14]:
# Base estimator that RFE uses to rank features.
# max_features='sqrt' is what 'auto' meant for a classifier forest; spelling it
# out keeps the behaviour identical and still works on scikit-learn releases
# that no longer accept 'auto'.
model = RandomForestClassifier(n_estimators=140,
                               max_features='sqrt',
                               random_state=22337,
                               criterion='gini')

In [15]:
# Two-stage classifier: RFE keeps the 5 best-ranked features (ranked with
# `model`), then a fresh random forest is trained on just those features.
clf = Pipeline([
    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # reject it as a positional argument, and older ones accept the keyword too.
    ('feature_selection', RFE(model, n_features_to_select=5)),
    # 'sqrt' is the explicit spelling of the former 'auto' setting for classifiers.
    ('classification', RandomForestClassifier(n_estimators=140,
                                              max_features='sqrt',
                                              random_state=22337,
                                              criterion='gini'))
])
# Fit on the (resampled) training data; the fitted pipeline is the cell output.
clf.fit(xTrain, yTrain)

Pipeline(memory=None,
     steps=[('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl..._jobs=1,
            oob_score=False, random_state=22337, verbose=0,
            warm_start=False))])

In [16]:
# Predict on the held-out validation split (never resampled).
smotePred = clf.predict(xVal)

# Wrap the predictions in a labelled frame so the class counts are easy to read.
pdSmotePred = pd.DataFrame(data=smotePred, columns=['hand'])

# Validation metrics. print(...) with a single argument behaves the same on
# Python 2 and Python 3, unlike the print statement.
print(pdSmotePred.hand.value_counts())
# The report gets the 1-D prediction array directly instead of a one-column frame.
# Classes that are predicted but absent from yVal show up with support 0 and
# trigger scikit-learn's "ill-defined" metric warning.
print(metrics.classification_report(yVal, smotePred))

0    1369
1    1033
2      60
3      25
4       7
6       4
5       3
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.73      0.82      0.78      1220
          1       0.67      0.63      0.65      1088
          2       0.35      0.17      0.23       126
          3       0.44      0.20      0.28        55
          4       0.14      0.17      0.15         6
          5       0.00      0.00      0.00         6
          6       0.00      0.00      0.00         0

avg / total       0.68      0.69      0.68      2501



  'recall', 'true', average, warn_for)


In [18]:
# Second experiment: SMOTE over-sampling followed by Tomek-link cleaning.
# xTrain/yTrain were overwritten with SMOTE output earlier in the notebook, so
# rebuild the untouched training split first. The arguments and seed match the
# original split, so the same rows come back. This keeps the experiment
# independent of the plain-SMOTE run and makes the cell safe to re-run.
rawSplit = train_test_split(x, y, test_size=.1, random_state=12)
xTrainRaw, yTrainRaw = rawSplit[0], rawSplit[2]
try:
    # imbalanced-learn >= 0.4 calls the argument `sampling_strategy`.
    innerSmote = SMOTE(random_state=12, sampling_strategy='all', k_neighbors=4)
except TypeError:
    # Older releases only accept the original `ratio` spelling.
    innerSmote = SMOTE(random_state=12, ratio='all', k_neighbors=4)
smtTmk = SMOTETomek(random_state=42, smote=innerSmote)
# `fit_sample` was renamed to `fit_resample`; use whichever this release offers.
smtTmkResample = getattr(smtTmk, 'fit_resample', None) or smtTmk.fit_sample
xTrain, yTrain = smtTmkResample(xTrainRaw, yTrainRaw)

In [19]:
# Same two-stage classifier as before, trained on the SMOTE + Tomek data:
# RFE keeps the 5 best-ranked features (ranked with `model`), then a fresh
# random forest is trained on just those features.
clf2 = Pipeline([
    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # reject it as a positional argument, and older ones accept the keyword too.
    ('feature_selection', RFE(model, n_features_to_select=5)),
    # 'sqrt' is the explicit spelling of the former 'auto' setting for classifiers.
    ('classification', RandomForestClassifier(n_estimators=140,
                                              max_features='sqrt',
                                              random_state=22337,
                                              criterion='gini'))
])
# Fit on the resampled training data; the fitted pipeline is the cell output.
clf2.fit(xTrain, yTrain)

Pipeline(memory=None,
     steps=[('feature_selection', RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl..._jobs=1,
            oob_score=False, random_state=22337, verbose=0,
            warm_start=False))])

In [20]:
# Predict on the held-out validation split (never resampled).
smtTmkPred = clf2.predict(xVal)

# Wrap the predictions in a labelled frame so the class counts are easy to read.
pdSmtTmkPred = pd.DataFrame(data=smtTmkPred, columns=['hand'])

# Validation metrics. print(...) with a single argument behaves the same on
# Python 2 and Python 3, unlike the print statement.
print(pdSmtTmkPred.hand.value_counts())
# The report gets the 1-D prediction array directly instead of a one-column frame.
# Classes that are predicted but absent from yVal show up with support 0 and
# trigger scikit-learn's "ill-defined" metric warning.
print(metrics.classification_report(yVal, smtTmkPred))

0    1399
1     987
2      70
3      28
4      10
5       3
6       3
7       1
Name: hand, dtype: int64
             precision    recall  f1-score   support

          0       0.72      0.82      0.76      1220
          1       0.66      0.60      0.63      1088
          2       0.33      0.18      0.23       126
          3       0.46      0.24      0.31        55
          4       0.20      0.33      0.25         6
          5       0.00      0.00      0.00         6
          6       0.00      0.00      0.00         0
          7       0.00      0.00      0.00         0

avg / total       0.66      0.68      0.66      2501

