In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / fit-failure warnings raised by later cells.
warnings.simplefilter(action="ignore", category=Warning)

In [3]:
# Load the cleaned census data (path is relative to the working directory).
df_census = pd.read_csv(
    filepath_or_buffer="Data/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/Chapter02/census_cleaned.csv"
)

In [4]:
# Features are every column but the last; the last column is the target.
X, y = df_census.iloc[:, :-1], df_census.iloc[:, -1]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Seeded train/test split of the census features and target.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [8]:
# Decision tree classifier; random_state is fixed so the fit is repeatable.
clf = DecisionTreeClassifier(random_state=2)

In [9]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

In [10]:
# Predict labels for the test split.
y_pred = clf.predict(X_test)

In [11]:
# Test-set accuracy. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy happens to be symmetric, but the conventional order keeps this cell
# safe to copy for non-symmetric metrics.
accuracy_score(y_test, y_pred)

0.8131679154894976

In [12]:
# Load the cleaned bike-rentals data (path is relative to the working directory).
df_bikes = pd.read_csv(
    filepath_or_buffer="Data/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/Chapter01/bike_rentals_cleaned.csv"
)

In [13]:
# Features are every column but the last; the last column is the target.
X_bikes, y_bikes = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

In [15]:
# Decision tree regressor with a fixed seed, used for cross-validation below.
reg = DecisionTreeRegressor(random_state=2)

In [16]:
# 5-fold cross-validation; the scorer returns *negated* MSE per fold.
scores = cross_val_score(
    estimator=reg,
    X=X_bikes,
    y=y_bikes,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [17]:
# Flip the sign of the negated-MSE scores, then take the root to get per-fold RMSE.
rmse = np.sqrt(np.negative(scores))

In [18]:
# Report the mean of the per-fold RMSE values, rounded to 2 decimals.
print("RMSE : {}".format(round(rmse.mean(),2)))

RMSE : 1233.36


In [23]:
# Seeded train/test split of the bike-rentals data.
# This rebinds X_train / X_test / y_train / y_test, which held the census split until now.
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

In [28]:
# Fit an unconstrained tree and predict on the *training* rows themselves, so
# the error computed from y_pred measures how closely the tree fits its own
# training data, not how well it generalises.
# random_state is fixed (as for every other estimator in this notebook) so the
# fitted tree is reproducible across runs.
reg = DecisionTreeRegressor(random_state=2)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

In [29]:
from sklearn.metrics import mean_squared_error

In [30]:
# RMSE of the predictions against the training targets.
reg_mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
reg_rmse = reg_mse ** 0.5
reg_rmse

0.0

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Candidate tree depths for the grid search; None leaves the depth unlimited.
params = dict(max_depth=[None, 2, 3, 4, 6, 8, 10, 20])

In [33]:
# Seeded regressor wrapped in a 5-fold grid search over `params`,
# scored by negated MSE and run on all available cores.
reg = DecisionTreeRegressor(random_state=2)
grid_reg = GridSearchCV(
    estimator=reg,
    param_grid=params,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [34]:
# Run the grid search on the training split only.
grid_reg.fit(X_train, y_train)

In [35]:
# Parameter combination with the best mean cross-validated score.
grid_reg.best_params_

{'max_depth': 6}

In [36]:
# best_score_ is a negated MSE (scoring="neg_mean_squared_error"), so negate it
# and take the root to express it as an RMSE.
np.sqrt(-grid_reg.best_score_)

951.3984508554636

In [37]:
# Keep the estimator that the grid search refit with the best parameters.
best_model = grid_reg.best_estimator_

In [38]:
# RMSE of the tuned tree on the held-out test split.
y_pred = best_model.predict(X_test)
rmse_test = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
rmse_test

864.6696975011638

In [43]:
def grid_search(params, reg=None):
    """Grid-search a regressor over ``params`` and print its scores.

    The function reads the notebook-level names ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` at *call* time, so it always uses whichever
    split was assigned most recently.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    reg : estimator, optional
        Estimator to tune. When omitted, a new
        ``DecisionTreeRegressor(random_state=2)`` is created for this call.

    Returns
    -------
    None
        Results are printed: the best parameters, "Training Score" (square
        root of the negated best mean cross-validation score, i.e. an RMSE
        from 5-fold CV on the training split) and "Test Score" (RMSE on
        ``X_test``, rounded to 3 decimals).
    """
    if reg is None:
        # Build the default per call instead of sharing one instance that was
        # created when the function was defined.
        reg = DecisionTreeRegressor(random_state=2)

    grid_reg = GridSearchCV(reg, params, scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
    grid_reg.fit(X_train, y_train)

    print("Best Params : ", grid_reg.best_params_)
    # best_score_ is a negated MSE; negate and take the root to get an RMSE.
    print("Training Score : ", (-grid_reg.best_score_)**0.5)

    # predict() delegates to the estimator refit with the best parameters.
    y_pred = grid_reg.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_pred)**0.5
    print("Test Score : {}".format(round(rmse_test,3)))

In [44]:
# Tune min_samples_leaf on its own.
grid_search(params={"min_samples_leaf":[1,2,3,6,8,10,20,30]})

Best Params :  {'min_samples_leaf': 8}
Training Score :  896.0830084423535
Test Score : 855.62


In [45]:
# Tune min_samples_leaf and max_depth jointly (every combination of the two lists).
grid_search(params={"min_samples_leaf":[1,2,3,6,8,10,20,30],
                   'max_depth':[None, 2, 3, 4, 6, 8, 10, 20]})

Best Params :  {'max_depth': 6, 'min_samples_leaf': 2}
Training Score :  870.3962060281716
Test Score : 913.0


In [46]:
# Repeat the joint search over a narrower range of values.
grid_search(params={"min_samples_leaf":[3,5,7,9],
                   'max_depth':[6,7,8,9,10]})

Best Params :  {'max_depth': 9, 'min_samples_leaf': 7}
Training Score :  888.9047575735065
Test Score : 878.538


In [47]:
# Load the heart-disease data (path is relative to the working directory).
df_heart = pd.read_csv(
    filepath_or_buffer="Data/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/Chapter02/heart_disease.csv"
)

In [48]:
# Preview the first rows of the heart-disease frame.
df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [49]:
# Features are every column but the last; the last column is the target.
# This rebinds X and y, which held the census data until now.
X, y = df_heart.iloc[:, :-1], df_heart.iloc[:, -1]

In [50]:
# Seeded train/test split of the heart-disease data.
# This rebinds X_train / X_test / y_train / y_test; functions in this notebook that
# read those globals will use the heart-disease split from here on.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=2)

In [51]:
# Baseline classifier with default hyperparameters and a fixed seed.
model = DecisionTreeClassifier(random_state=2)

In [52]:
# 5-fold cross-validation of the baseline classifier on the full X, y.
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

In [53]:
# Per-fold cross-validation scores, rounded to 2 decimals.
print("Accuracy : ",np.round(scores, 2))

Accuracy :  [0.74 0.85 0.77 0.73 0.7 ]


In [54]:
# Mean of the per-fold scores, rounded to 2 decimals.
print("Mean Accuracy : {}".format(round(scores.mean(),2)))

Mean Accuracy : 0.76


In [55]:
from sklearn.model_selection import RandomizedSearchCV

In [61]:
def randomized_search_clf(params, runs=20, clf=None):
    """Randomized hyperparameter search for a classifier; prints scores.

    The function reads the notebook-level names ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` at *call* time, so it always uses whichever
    split was assigned most recently.

    Parameters
    ----------
    params : dict
        Parameter distributions/lists handed to ``RandomizedSearchCV``.
    runs : int, default 20
        Number of sampled parameter settings (``n_iter``).
    clf : estimator, optional
        Classifier to tune. When omitted, a new
        ``DecisionTreeClassifier(random_state=2)`` is created for this call.

    Returns
    -------
    estimator
        The best estimator found by the search (``best_estimator_``).
        "Training Score" (best mean 5-fold CV score on the training split)
        and "Test Score" (accuracy on ``X_test``) are printed, each rounded
        to 3 decimals.
    """
    if clf is None:
        # Build the default per call instead of sharing one instance that was
        # created when the function was defined.
        clf = DecisionTreeClassifier(random_state=2)

    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, cv=5, n_jobs=-1, random_state=2)
    rand_clf.fit(X_train, y_train)

    best_model = rand_clf.best_estimator_
    print("Training Score : {}".format(round(rand_clf.best_score_,3)))

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Score : {}".format(round(accuracy,3)))
    return best_model

In [62]:
# Broad randomized search over most tree hyperparameters.
# 'sqrt' stands in for the former 'auto' option of max_features: for classifier
# trees the two mean the same thing, and 'auto' is rejected by scikit-learn >= 1.3,
# which would make every sampled candidate using it fail to fit.
randomized_search_clf(params={'criterion':['entropy','gini'],
                             'splitter':['random','best'],
                             'min_samples_leaf':[2, 3, 4, 5, 6, 8, 10],
                             'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
                             'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
                             'max_features':['sqrt', 0.95, 0.9, 0.85, 0.8, 0.75, 0.7],
                             'max_depth': [None, 2 , 4, 6, 8],
                             'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]})

Training Score : 0.766
Test Score : 0.829


In [63]:
# Narrower randomized search with more sampled settings (runs=100).
# 'sqrt' stands in for the former 'auto' option of max_features: for classifier
# trees the two mean the same thing, and 'auto' is rejected by scikit-learn >= 1.3,
# which would make every sampled candidate using it fail to fit.
randomized_search_clf(params={'min_samples_split':[2, 9, 10],
                             'max_leaf_nodes':[45, None],
                             'max_features':['sqrt', 0.78],
                             'max_depth': [None, 6,7],
                             'min_weight_fraction_leaf': [0.0, 0.005, 0.06, 0.07],
                             'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05]}, runs=100)

Training Score : 0.802
Test Score : 0.868


In [64]:
# Classifier with hand-specified hyperparameters, evaluated by cross-validation below.
model = DecisionTreeClassifier(
    criterion='gini',
    splitter="best",
    max_depth=7,
    max_features=0.78,
    max_leaf_nodes=45,
    min_samples_split=9,
    min_samples_leaf=0.045,
    min_weight_fraction_leaf=0.06,
    min_impurity_decrease=0.0,
    class_weight=None,
    random_state=2,
)

In [65]:
# 5-fold cross-validation of the current `model` on the full X, y.
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

In [66]:
# Per-fold cross-validation scores, rounded to 2 decimals.
print("Accuracy : ",np.round(scores, 2))

Accuracy :  [0.82 0.9  0.8  0.8  0.78]


In [67]:
# Mean of the per-fold scores, rounded to 2 decimals.
print("Mean Accuracy : ",round(scores.mean(),2))

Mean Accuracy :  0.82


In [68]:
# Classifier whose feature importances are inspected in the following cells.
best_clf = DecisionTreeClassifier(
    criterion='gini',
    splitter="best",
    max_depth=9,
    max_features=0.8,
    max_leaf_nodes=47,
    min_samples_split=8,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.05,
    min_impurity_decrease=0.0,
    class_weight=None,
    random_state=2,
)

In [69]:
# Fit on the training split so feature_importances_ becomes available.
best_clf.fit(X_train, y_train)

In [70]:
# One importance value per feature column, in the column order of X_train.
best_clf.feature_importances_

array([0.08318454, 0.08653833, 0.41495275, 0.        , 0.0544383 ,
       0.        , 0.0441642 , 0.01893451, 0.        , 0.00676148,
       0.        , 0.17984376, 0.11118214])

In [71]:
# Map each feature column name to its importance in the fitted tree.
feature_dict = {
    column: importance
    for column, importance in zip(X_train.columns, best_clf.feature_importances_)
}

In [72]:
import operator

In [73]:
# The three (feature, importance) pairs with the largest importance, highest first.
sorted(feature_dict.items(), key=lambda pair: pair[1], reverse=True)[:3]

[('cp', 0.4149527488526222),
 ('ca', 0.17984376285950324),
 ('thal', 0.11118214082998887)]