In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import export_text

This example uses the [Universal Bank](https://www.kaggle.com/sriharipramod/bank-loan-classification) data set and follows the classification-tree example code from chapter 9 of [Data Mining for Business Analytics](https://www.dataminingbook.com/book/python-edition).

> The data include customer demographic information (age, income, etc.), the customer's relationship with the bank (mortgage, securities account, etc.), and the customer response to the last personal loan campaign (Personal Loan). Among these 5000 customers, only 480 (= 9.6%) accepted the personal loan that was offered to them in the earlier campaign.
[Source](https://www.kaggle.com/itsmesunil/campaign-for-selling-personal-loans)

1. Train a decision tree classifier, print the tree and evaluate its accuracy.
2. Prune the tree by changing its hyperparameters, then evaluate the accuracy of the new tree.
3. Using [grid search](https://scikit-learn.org/stable/modules/grid_search.html), perform a systematic tuning of the decision tree hyperparameters.

In [2]:
# Read the Universal Bank customer records; the path is relative to the
# notebook's working directory.
source_csv = 'data/UniversalBank.csv'
data = pd.read_csv(filepath_or_buffer=source_csv)

# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [3]:
# Drop the ID and ZIP Code columns, plus any 'Unnamed: N' column. The preview of
# `data` shows an 'Unnamed: 0' column (apparently a row index written into the
# CSV); left in place it would be handed to the tree as a predictor.
index_artifacts = [col for col in data.columns if str(col).startswith('Unnamed')]
bank_df = data.drop(columns=['ID', 'ZIP Code'] + index_artifacts)

# Separate the predictors from the target (the 'Personal Loan' response column).
X = bank_df.drop(columns=['Personal Loan'])
y = bank_df['Personal Loan']

# Hold out 40% of the rows for validation; the seed keeps the split repeatable.
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# Fully grown tree (no depth or split limits). A fixed random_state makes the
# fit repeatable: without it, equally good candidate splits may be resolved
# differently from run to run.
dtree = DecisionTreeClassifier(random_state=1)
dtree.fit(train_X, train_y)

# Text rendering of the fitted decision rules, labelled with the column names.
print(export_text(dtree, feature_names=list(X.columns)))

|--- Income <= 110.50
|   |--- CCAvg <= 2.95
|   |   |--- Income <= 106.50
|   |   |   |--- class: 0
|   |   |--- Income >  106.50
|   |   |   |--- Family <= 3.50
|   |   |   |   |--- class: 0
|   |   |   |--- Family >  3.50
|   |   |   |   |--- Age <= 38.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Age >  38.00
|   |   |   |   |   |--- class: 1
|   |--- CCAvg >  2.95
|   |   |--- CD Account <= 0.50
|   |   |   |--- Income <= 92.50
|   |   |   |   |--- Age <= 26.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- Age >  26.50
|   |   |   |   |   |--- CCAvg <= 3.55
|   |   |   |   |   |   |--- CCAvg <= 3.35
|   |   |   |   |   |   |   |--- CCAvg <= 3.05
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- CCAvg >  3.05
|   |   |   |   |   |   |   |   |--- CCAvg <= 3.15
|   |   |   |   |   |   |   |   |   |--- Mortgage <= 89.00
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |--- Mortgage >  89.00
|   

In [4]:
# Predict once per partition and reuse the labels for both metrics, rather than
# re-running the tree for every metric call.
train_pred = dtree.predict(train_X)
valid_pred = dtree.predict(valid_X)

# Confusion matrices: rows are actual classes, columns are predicted classes.
print(confusion_matrix(train_y, train_pred))
print(confusion_matrix(valid_y, valid_pred))

# (training accuracy, validation accuracy), shown as the cell's output.
accuracy_score(train_y, train_pred), accuracy_score(valid_y, valid_pred)

[[2713    0]
 [   0  287]]
[[1794   13]
 [  25  168]]


(1.0, 0.981)

In [5]:
# Constrain tree growth: cap the depth at 30, require at least 20 samples before
# a node may be split, and only split when impurity drops by at least 0.01.
dtree = DecisionTreeClassifier(
    max_depth=30,
    min_samples_split=20,
    min_impurity_decrease=0.01,
    random_state=1,  # fixed seed so equally good splits resolve the same way on every run
)
dtree.fit(train_X, train_y)

# Text rendering of the pruned tree's decision rules.
print(export_text(dtree, feature_names=list(X.columns)))

|--- Income <= 110.50
|   |--- class: 0
|--- Income >  110.50
|   |--- Education <= 1.50
|   |   |--- Family <= 2.50
|   |   |   |--- class: 0
|   |   |--- Family >  2.50
|   |   |   |--- class: 1
|   |--- Education >  1.50
|   |   |--- Income <= 116.50
|   |   |   |--- class: 0
|   |   |--- Income >  116.50
|   |   |   |--- class: 1



In [6]:
# Predict once per partition and reuse the labels for both metrics, rather than
# re-running the tree for every metric call.
train_pred = dtree.predict(train_X)
valid_pred = dtree.predict(valid_X)

# Confusion matrices: rows are actual classes, columns are predicted classes.
print(confusion_matrix(train_y, train_pred))
print(confusion_matrix(valid_y, valid_pred))

# (training accuracy, validation accuracy), shown as the cell's output.
accuracy_score(train_y, train_pred), accuracy_score(valid_y, valid_pred)

[[2711    2]
 [  51  236]]
[[1804    3]
 [  43  150]]


(0.9823333333333333, 0.977)

In [7]:
# Start with an initial guess for parameters
param_grid = {
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [20, 40, 60, 80, 100],
    'min_impurity_decrease': [0, 0.0005, 0.001, 0.005, 0.01],
}

# Exhaustive search over all 4 * 5 * 5 = 100 combinations, each scored with
# 5-fold cross-validation on the training partition, using all available cores.
# The base estimator gets a fixed random_state so the scores, and therefore the
# selected parameters, are repeatable across runs.
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Score: ', gridSearch.best_score_)
print('Parameters: ', gridSearch.best_params_)

# Keep the winning configuration; with GridSearchCV's default refit=True this
# estimator has been refit on the whole of train_X.
dtree = gridSearch.best_estimator_

Score:  0.988
Parameters:  {'max_depth': 10, 'min_impurity_decrease': 0.001, 'min_samples_split': 20}


In [8]:
# Predict once per partition and reuse the labels for both metrics, rather than
# re-running the tree for every metric call.
train_pred = dtree.predict(train_X)
valid_pred = dtree.predict(valid_X)

# Confusion matrices: rows are actual classes, columns are predicted classes.
print(confusion_matrix(train_y, train_pred))
print(confusion_matrix(valid_y, valid_pred))

# (training accuracy, validation accuracy), shown as the cell's output.
accuracy_score(train_y, train_pred), accuracy_score(valid_y, valid_pred)

[[2703   10]
 [  20  267]]
[[1793   14]
 [  21  172]]


(0.99, 0.9825)

In [9]:
# Render the decision rules of the estimator selected by the grid search,
# labelling each split with its predictor column name.
feature_labels = list(X.columns)
tree_rules = export_text(dtree, feature_names=feature_labels)
print(tree_rules)

|--- Income <= 110.50
|   |--- CCAvg <= 2.95
|   |   |--- class: 0
|   |--- CCAvg >  2.95
|   |   |--- CD Account <= 0.50
|   |   |   |--- Income <= 92.50
|   |   |   |   |--- class: 0
|   |   |   |--- Income >  92.50
|   |   |   |   |--- Education <= 1.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Education >  1.50
|   |   |   |   |   |--- class: 1
|   |   |--- CD Account >  0.50
|   |   |   |--- class: 1
|--- Income >  110.50
|   |--- Education <= 1.50
|   |   |--- Family <= 2.50
|   |   |   |--- class: 0
|   |   |--- Family >  2.50
|   |   |   |--- class: 1
|   |--- Education >  1.50
|   |   |--- Income <= 116.50
|   |   |   |--- CCAvg <= 3.50
|   |   |   |   |--- class: 0
|   |   |   |--- CCAvg >  3.50
|   |   |   |   |--- class: 1
|   |   |--- Income >  116.50
|   |   |   |--- class: 1

