In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
def _read_indexed(csv_path):
    """Read ``csv_path`` and promote its 'Unnamed: 0' column to the row index.

    NOTE(review): 'Unnamed: 0' is presumably the index that was written out
    when the splits were saved with ``DataFrame.to_csv`` -- confirm upstream.
    """
    return pd.read_csv(csv_path).set_index('Unnamed: 0')


# Feature matrices and label frames for the pre-made train/test split.
X_train = _read_indexed('agg_Xtrain.csv')
X_test = _read_indexed('agg_Xtest.csv')
y_train = _read_indexed('agg_ytrain.csv')
y_test = _read_indexed('agg_ytest.csv')

In [5]:
#hyperparameter tuning
# Exhaustive search: 2 criteria x 2 splitters x 10 depths x 5 leaf caps = 200
# candidates, each evaluated with 10-fold CV, i.e. 2,000 fits before the refit.

# Fixed seed: decision trees permute features at every split (and
# splitter='random' draws random thresholds), so without it two runs of the
# same search can pick different "best" parameters.
SEED = 42

# NOTE(review): KFold neither shuffles nor stratifies -- confirm the rows of
# X_train are not ordered by class, otherwise folds may be unrepresentative.
Kcv = KFold(n_splits=10)
params = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
          'max_depth': list(range(1, 11)),
          'max_leaf_nodes': [5, 10, 15, 20, 25]}
dec_tree = DecisionTreeClassifier(random_state=SEED)
# n_jobs=-1 runs the candidate fits on all available cores instead of serially.
# scoring is left at its default, so candidates are ranked by the estimator's
# own score() (mean accuracy for classifiers).
grid_search_cv = GridSearchCV(dec_tree, params, cv=Kcv, n_jobs=-1)

In [6]:
# Run the cross-validated search on the training split; fit() returns the
# search object itself, so the cell's displayed value is the winning
# parameter combination.
grid_search_cv.fit(X_train, y_train).best_params_

KeyboardInterrupt: 

In [None]:
%%time
dec_tree.set_params(**grid_search_cv.best_params_)
dec_tree.fit(X_train,y_train)
pred = dec_tree.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted')
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')

print(f"precision: {precision}, recall: {recall}, f1score: {f1}")

In [None]:
#plot result

# Confusion matrix as raw counts, and normalized over the true labels so that
# each row sums to 1.
matrix = confusion_matrix(y_test, pred)
matrix_normalized = confusion_matrix(y_test, pred, normalize='true')

# Raw counts are integers, so annotate them with 'd' rather than '.2f'
# (which rendered every count as e.g. "12.00").
fig1, ax1 = plt.subplots(figsize=(10,10))
sns.heatmap(matrix, annot=True, fmt='d', ax=ax1)
ax1.set(title='Confusion matrix without normalization',
        ylabel='Actual', xlabel='Predicted')

# Normalized cells are fractions in [0, 1]; two decimals are appropriate here.
fig2, ax2 = plt.subplots(figsize=(10,10))
sns.heatmap(matrix_normalized, annot=True, fmt='.2f', ax=ax2)
ax2.set(title='Confusion matrix with normalization',
        ylabel='Actual', xlabel='Predicted')

plt.show()

In [None]:
from sklearn import tree

# Draw the fitted tree on an explicit, wide axes so the node boxes stay legible.
tree_fig, tree_ax = plt.subplots(figsize=(30,15))

tree.plot_tree(dec_tree,
               # Pass a plain list: some scikit-learn releases validate
               # feature_names strictly and reject a pandas Index.
               feature_names=list(X_train.columns),
               filled=True,
               fontsize=11,
               ax=tree_ax)
plt.show()

In [None]:
# Importance of each feature as reported by the fitted tree
# (one value per column the model was trained on).
importance = dec_tree.feature_importances_

fig, ax = plt.subplots(figsize=(10,10))
# Label the bars with the columns the model was fit on (X_train) rather than
# X_test, so labels are guaranteed to line up with feature_importances_.
ax.bar(X_train.columns, importance)
ax.set(title='Decision tree feature importances',
       xlabel='Feature', ylabel='Importance')
plt.xticks(rotation=40, ha='right')
plt.show()

In [None]:
# Per-class precision/recall/F1 on both splits; comparing the two reports
# shows how far training performance is from held-out performance.
for heading, features, targets in (
    ('training data', X_train, y_train),
    ('test data', X_test, y_test),
):
    print(heading)
    print(classification_report(targets, dec_tree.predict(features)))