# Decision Trees in Depth

#### Loading Libraries

In [118]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Model Metrics
from sklearn.metrics import mean_squared_error, accuracy_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor
from xgboost import XGBClassifier

# Warnings
import warnings

## Exploring Decision Trees

#### First Decision Tree Model - Loading Data

In [20]:
df_census = pd.read_csv('census_cleaned.csv')
df_census.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Declaring Predictor and target columns
X = df_census.iloc[:, :-1]
y = df_census.iloc[:,-1]

In [22]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

#### Building a Tree Model

In [23]:
clf = DecisionTreeClassifier(random_state=2)

In [24]:
# Fitting the model
clf.fit(X_train, y_train)

In [25]:
# Making Predictions
y_pred = clf.predict(X_test)

In [26]:
accuracy_score(y_pred, y_test)

0.8131679154894976

### Tuning Decision Tree Hyper-parameters

#### Decision Tree Regressor

In [76]:
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')
df_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [77]:
X_bikes = df_bikes.iloc[:, :-1] 
y_bikes = df_bikes.iloc[:, -1]

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

In [79]:
# Initialize the Tree
reg = DecisionTreeRegressor(random_state=2)

In [80]:
# Setting scoring procedure
scores = cross_val_score(reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)

In [81]:
# Computing root mean squared error
rmse = np.sqrt(-scores)
print('RMSE mean: %0.2f' % (rmse.mean()))

RMSE mean: 1233.36


In [82]:
# Checking errors on Training set
reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

In [83]:
reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)

In [84]:
reg_rmse

0.0

### Hyperparameter in General

In [85]:
params = {'max_depth':[None, 2, 3, 4, 6, 8, 10, 20]}

In [86]:
# Regressor Initialization
reg = DecisionTreeRegressor(random_state=2)

In [87]:
# Setting Grid Search
grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

In [88]:
# Fitting the model
grid_reg.fit(X_train, y_train)

In [89]:
best_params = grid_reg.best_params_
print("Best Params: ", best_params)

Best Params:  {'max_depth': 6}


In [90]:
# Checking Training score
best_score = np.sqrt(-grid_reg.best_score_)
print("Training score: {:.3f}".format(best_score))

Training score: 951.398


In [91]:
best_model = grid_reg.best_estimator_
y_pred = best_model.predict(X_test)

In [92]:
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test Score: {:.3f}'.format(rmse_test))

Test Score: 864.670


#### min_samples_leaf

In [93]:
# Best Parameter Function
def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid_reg.fit(X_train, y_train)
    # On Params
    best_params = grid_reg.best_params_
    print("Best params: ", best_params)
    # On Scores
    best_score - np.sqrt(-grid_reg.best_score_)
    print("Training Score: {:.3f}".format(best_score))
    # Adjusting Prediction
    y_pred = grid_reg.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_pred)**0.5
    print('Test Score: {:.3f}'.format(rmse_test))

In [94]:
X_train.shape

(548, 12)

In [95]:
grid_search(params={'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30]})

Best params:  {'min_samples_leaf': 8}
Training Score: 951.398
Test Score: 855.620


In [96]:
grid_search(params={'max_depth': [None, 2, 3, 4, 6, 8, 10, 20], 'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30]})

Best params:  {'max_depth': 6, 'min_samples_leaf': 2}
Training Score: 951.398
Test Score: 913.000


In [97]:
# Re-Evaluating
grid_search(params={'max_depth': [6, 7, 8, 9, 10], 'min_samples_leaf': [3, 6, 7, 8, 9]})

Best params:  {'max_depth': 9, 'min_samples_leaf': 7}
Training Score: 951.398
Test Score: 878.538


## Predicting Heart Disease

#### Loading Data

In [148]:
df_heart = pd.read_csv('heart_disease.csv')
df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [149]:
# Splitting Data Procedure
X = df_heart.iloc[:, :-1]
y = df_heart.iloc[:, -1]

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

#### Decision Tree Classifier

In [151]:
# Baseline Model
model = DecisionTreeClassifier(random_state=2)

In [152]:
scores = cross_val_score(model, X, y, cv=5)

In [153]:
print('Accuracy: ', np.round(scores, 2))
print('Accuracy mean: %.02f' % scores.mean())

Accuracy:  [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


In [154]:
# Setting RandomizedSearch CLF Function
def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):
    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, cv=5, n_jobs=-1, random_state=2)
    rand_clf.fit(X_train, y_train)
    # About Model
    best_model = rand_clf.best_estimator_
    best_score = rand_clf.best_score_
    print("Training score: {:.3f}".format(best_score))
    # About Prediction
    y_pred = best_model.predict(X_test)
    # About Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print('Test score: {:.3f}'.format(accuracy))
    return best_model

#### Choosing Hyperparameters

In [155]:
# # It's reporting some warnings. Therefore, I've decided to adjust it 
# randomized_search_clf(params={'criterion': ['entropy', 'gini'],
#                               'splitter': ['random', 'best'], 
#                               'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01],
#                               'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
#                               'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
#                               'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
#                               'max_features': ['auto', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
#                               'max_depth': [None, 2, 4, 6, 8],
#                               'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]})

In [156]:
# Adjusted Snippet
randomized_search_clf(params={
    'criterion': ['entropy', 'gini'],
    'splitter': ['random', 'best'],
    'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
    'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
    'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
    'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
    'max_features': [None, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
    'max_depth': [None, 2, 4, 6, 8]
})

Training score: 0.793
Test score: 0.803


In [157]:
DecisionTreeClassifier(class_weight=None, 
                       criterion='entropy',
                       max_depth=8, 
                       max_features=0.8, 
                       max_leaf_nodes=45,
                       min_impurity_decrease=0.0, 
                       #min_impurity_split=None, 
                       min_samples_leaf=0.04, 
                       min_samples_split=10, 
                       min_weight_fraction_leaf=0.05, 
                       #presort=False, 
                       random_state=2, 
                       splitter='best')

#### Narrowing The Range

In [158]:
randomized_search_clf(params={'max_depth': [None, 6, 7],
                              'max_features': [0.78],
                              'max_leaf_nodes': [45, None],
                              'min_samples_leaf': [1, 0.035, 0.04, 0.045, 0.05],
                              'min_samples_split': [2, 9, 10],
                              'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
                             }, runs=100)

Training score: 0.802
Test score: 0.868


In [159]:
DecisionTreeClassifier(class_weight=None,
                       criterion='gini',
                       max_depth=7, 
                       max_features=0.78,
                       max_leaf_nodes=45,
                       min_impurity_decrease=0.0, 
                       #min_impurity_split=None,
                       min_samples_leaf=0.045, 
                       min_samples_split=9,
                       min_weight_fraction_leaf=0.06,
                       #presort=False, 
                       random_state=2, 
                       splitter='best')

In [163]:
# Putting model into Cross-Validation
DecisionTreeClassifier(class_weight=None,
                       criterion='gini',
                       max_depth=7, 
                       max_features=0.78,
                       max_leaf_nodes=45,
                       min_impurity_decrease=0.0, 
                       #min_impurity_split=None,
                       min_samples_leaf=0.045, 
                       min_samples_split=9,
                       min_weight_fraction_leaf=0.06,
                       #presort=False, 
                       random_state=2, 
                       splitter='best')

In [164]:
scores = cross_val_score(model, X, y, cv=5)
print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


#### feature_importances_

In [165]:
best_clf = DecisionTreeClassifier(class_weight=None,
                       criterion='gini',
                       max_depth=9, 
                       max_features=0.8,
                       max_leaf_nodes=47,
                       min_impurity_decrease=0.0, 
                       #min_impurity_split=None,
                       min_samples_leaf=1, 
                       min_samples_split=8,
                       min_weight_fraction_leaf=0.05,
                       #presort=False, 
                       random_state=2, 
                       splitter='best')

In [166]:
best_clf.fit(X, y)

In [167]:
best_clf.feature_importances_

array([0.04830121, 0.04008887, 0.47546568, 0.        , 0.        ,
       0.        , 0.        , 0.00976578, 0.        , 0.02445397,
       0.02316427, 0.1774694 , 0.20129082])

In [169]:
feature_dict = dict(zip(X.columns, best_clf.feature_importances_))

In [170]:
import operator

In [171]:
sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True) [0:3]

[('cp', 0.47546567857183675),
 ('thal', 0.20129082387838435),
 ('ca', 0.1774694042213901)]