In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
pd.set_option('display.max_columns', None)

In [2]:
tr_features = pd.read_csv('data/train_features.csv')
tr_labels = pd.read_csv('data/train_labels.csv')

te_features = pd.read_csv('data/test_features.csv')
te_labels = pd.read_csv('data/test_labels.csv')

In [3]:
def print_results(results):
    print(f'BEST PARAMS: {results.best_params_}\n')

    means = sorted(results.cv_results_['mean_test_score'], reverse=True)
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f'{round(mean,3)} (+/- {round(std * 2, 3)}) for {params}')

In [5]:
dt = DecisionTreeClassifier()
dtscores = cross_val_score(dt, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(dtscores)

dt.fit(tr_features, tr_labels.values.ravel())

[0.30096169 0.31420464 0.30900205 0.31152452 0.30269589]


In [16]:
dtparams = {
    'criterion': ['gini','entropy','log_loss'],
    'splitter': ['best','random'],
    'max_depth': [5,50,100,None],
    'max_leaf_nodes': [5,50,500,None],
    'min_samples_split': [1,2,3],
    'min_samples_leaf': [1,2,3],
    'max_features': [None,'sqrt','log2']
}

cv = GridSearchCV(dt, dtparams, cv=5, n_jobs=-1)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'criterion': 'entropy', 'max_depth': 50, 'max_features': None, 'max_leaf_nodes': 50, 'min_samples_leaf': 1, 'min_samples_split': 1, 'splitter': 'best'}

0.367 (+/- 0.009) for {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 1, 'splitter': 'best'}
0.367 (+/- 0.006) for {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 1, 'splitter': 'random'}
0.367 (+/- 0.009) for {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
0.367 (+/- 0.013) for {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
0.367 (+/- 0.009) for {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 3,

In [6]:
# best params
dt0 = DecisionTreeClassifier(criterion='entropy', max_depth=50, max_features=None, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=1, splitter='best')
dt0scores = cross_val_score(dt0, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(dt0scores)
dt0.fit(tr_features, tr_labels.values.ravel())

dt1 = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=5, min_samples_split=1)
dt1scores = cross_val_score(dt1, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(dt1scores)
dt1.fit(tr_features, tr_labels.values.ravel())

# dt2 = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=5)
# dt2scores = cross_val_score(dt2, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
# print(dt2scores)
# dt2.fit(tr_features, tr_labels.values.ravel())

dt2 = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=50)
dt2scores = cross_val_score(dt2, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(dt2scores)
dt2.fit(tr_features, tr_labels.values.ravel())

for mdl in [dt, dt0, dt1, dt2]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 8)
    precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
    recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
    f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
    print(f'MAX DEPTH: {mdl.max_depth} / MAX LEAF NODES: {mdl.max_leaf_nodes} / A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

[0.36528457 0.37364023 0.372379   0.36418099 0.35708655]
[0.34337064 0.35409112 0.34794261 0.34794261 0.34195176]
[0.35945136 0.37159073 0.36039729 0.36355037 0.36686111]
MAX DEPTH: None / MAX LEAF NODES: None / A: 0.31012738 / P: 0.31088201 / R: 0.31012738 / F1: 0.31044495
MAX DEPTH: 50 / MAX LEAF NODES: 50 / A: 0.38289822 / P: 0.36930244 / R: 0.38289822 / F1: 0.35612188
MAX DEPTH: 5 / MAX LEAF NODES: 5 / A: 0.35187287 / P: 0.25495474 / R: 0.35187287 / F1: 0.28493377
MAX DEPTH: None / MAX LEAF NODES: 50 / A: 0.3723042 / P: 0.36190958 / R: 0.3723042 / F1: 0.36016537


  _warn_prf(average, modifier, msg_start, len(result))
