In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

%matplotlib inline 

# define project path
projectpath = 'projectpath'
train_data_csv = projectpath + 'train.csv'
test_data_csv = projectpath + 'test.csv'

df_train = pd.read_csv(train_data_csv,header=None)

# save target values with index into new dataframe 'labels'
df_labels = pd.read_csv(train_data_csv, names=['target'], header=None, usecols=[1])

# save as array
df_labels = df_labels['target'].values

# save data into new dataframe 'training_data'
df_training_data_raw = df_train.drop(columns=[0,1])

# split dataset into train (80%) and test data (20%); random_state=1 makes it reproducible (could be any number); stratify ensures that the proportion stays in test and training data sets
data_train, data_test, labels_train, labels_test = train_test_split(df_training_data_raw, df_labels, test_size=0.2, random_state=1, shuffle=True, stratify=df_labels)

# Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

In [19]:
# create a KNN model and use it with GridSearchCV
decision_tree_model = DecisionTreeClassifier(max_depth=50)
decision_tree_model.fit(data_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## use gridsearchcv

In [22]:
# create a model and use it with GridSearchCV
decision_tree_model = DecisionTreeClassifier()

# create a dictionary of all values we want to test
param_grid = {'max_depth': np.arange(1, 50)}

# use gridsearch to test all values
optimal_model = GridSearchCV(decision_tree_model, param_grid, cv=5)

## fit model on training data

In [23]:
# fit model to data
optimal_model.fit(data_train, labels_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## check results

In [24]:
# check top performing value for n
optimal_model.best_params_

{'max_depth': 36}

In [25]:
# check mean score for the top performing value of n
optimal_model.best_score_

0.8068242521367521

## see how model performs on test data

In [26]:
# predict the labels for the test slice
labels_pred = optimal_model.predict(data_test)
print(labels_pred[:100])

['H' 'F' 'J' 'J' 'C' 'J' 'B' 'D' 'G' 'J' 'J' 'C' 'G' 'C' 'G' 'C' 'H' 'E'
 'D' 'G' 'I' 'H' 'D' 'E' 'E' 'B' 'H' 'B' 'E' 'H' 'H' 'B' 'J' 'D' 'I' 'F'
 'J' 'B' 'A' 'A' 'E' 'H' 'I' 'I' 'I' 'D' 'A' 'H' 'E' 'A' 'I' 'B' 'D' 'J'
 'B' 'D' 'B' 'D' 'B' 'G' 'H' 'G' 'A' 'A' 'B' 'C' 'E' 'I' 'B' 'J' 'H' 'A'
 'I' 'I' 'A' 'H' 'F' 'J' 'D' 'F' 'G' 'B' 'G' 'G' 'I' 'E' 'F' 'B' 'B' 'H'
 'A' 'E' 'F' 'D' 'A' 'H' 'J' 'I' 'J' 'F']


In [27]:
# check the accuracy for the test data
print("Accuracy:", metrics.accuracy_score(labels_test, labels_pred))
# check the precision for the test data
print("Precision:",metrics.precision_score(labels_test, labels_pred, average=None))
# check the recall for the test data
print("Recall:",metrics.recall_score(labels_test, labels_pred, average=None))

Accuracy: 0.8125
Precision: [0.8245614  0.8057971  0.84453228 0.79923274 0.79004038 0.8342246
 0.80810811 0.80559254 0.79434447 0.81878307]
Recall: [0.81575434 0.7423231  0.85695187 0.83444593 0.78371162 0.83311081
 0.79839786 0.80774366 0.82510013 0.82754011]


In [28]:
df_test=pd.read_csv(test_data_csv, sep=',', header=None)
test_data_new = df_test.drop(columns=[0])
# predict values for new data
predicted = optimal_model.predict(test_data_new)
print(predicted[0:100])

['H' 'B' 'A' 'B' 'B' 'D' 'I' 'B' 'A' 'I' 'A' 'A' 'A' 'G' 'A' 'A' 'A' 'E'
 'A' 'A' 'A' 'I' 'A' 'I' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'I' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'D' 'J' 'H' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'J' 'C' 'I' 'A' 'A' 'A' 'A' 'A' 'J'
 'A' 'A' 'B' 'A' 'H' 'A' 'A' 'A' 'A' 'A']


In [29]:
df_submission = pd.DataFrame(columns=['id','target'])
predicted = pd.Series(predicted)
df_submission.head()

Unnamed: 0,id,target


In [30]:
# add calculated target values to csv and format for submission
df_submission['target'] = predicted
df_submission['id'] = df_submission.index
submission_file = projectpath + 'submission.csv'
df_submission.head()

Unnamed: 0,id,target
0,0,H
1,1,B
2,2,A
3,3,B
4,4,B


In [31]:
df_submission.to_csv(submission_file, index=False, columns=['id','target'])