In [None]:
# Load the bank.csv data set into a DataFrame
import pandas as pd

# Location of the input file, kept in one place so it is easy to repoint
DATA_PATH = 'C:/Users/D94945/Downloads/Jupyter/Trees/bank.csv'
data = pd.read_csv(DATA_PATH)
# Show the first rows as a quick check that the load worked
data.head()

In [None]:
# Separate the explanatory variables (every column except 'Y') from the target 'Y'
X = data.drop(columns='Y')
y = data['Y']

In [None]:
# One-hot encode the categorical columns.
# A single get_dummies call replaces six copy-pasted ones: every column listed
# in 'columns' is removed from X and its indicator columns are appended, in the
# order given here.
# Keys are the source columns, values are the prefixes of the generated columns
# (note that 'day_of_week' is shortened to the prefix 'day').
dummy_prefixes = {
    "job": "job",
    "marital": "marital",
    "education": "education",
    "contact": "contact",
    "month": "month",
    "day_of_week": "day",
}
X = pd.get_dummies(X,
                   columns=list(dummy_prefixes),
                   prefix=dummy_prefixes,
                   prefix_sep="_")

**Decision Trees**

In [None]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import train_test_split
from sklearn.model_selection import train_test_split
# Import accuracy_score
from sklearn.metrics import accuracy_score

# Single seed shared by the split and the models so runs are reproducible
SEED = 1

# Hold out 30% of the rows for testing; stratify on y so both parts keep the
# same class proportions.  The split is driven by SEED rather than a separate
# literal, so changing SEED changes every random step consistently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)
# Column labels of the training frame, used to label the tree plots
feature_names = X_train.columns
# Decision tree limited to a maximum depth of 10
dt = DecisionTreeClassifier(max_depth=10, random_state=SEED)

In [None]:
# Train the tree on the training split (fit returns the estimator itself),
# then label the held-out rows
y_pred = dt.fit(X_train, y_train).predict(X_test)
# Share of test rows whose predicted label matches the true label
accuracy_dt = accuracy_score(y_test, y_pred)
print('Accuracy of Decision Tree: {:.3f}'.format(accuracy_dt))

In [None]:
#Graph resulting tree
from matplotlib import pyplot as plt
from sklearn import tree

# Large canvas so the individual nodes stay readable
fig, ax = plt.subplots(figsize=(25, 20))
# feature_names is a pandas Index; some scikit-learn releases only accept a
# plain list for this argument, so convert it explicitly.
_ = tree.plot_tree(dt,
                   feature_names=list(feature_names),
                   filled=True,
                   ax=ax)
# Saved relative to the current working directory
fig.savefig("decision_tree.png")

In [None]:
# Instantiate a second tree 'dt2' with the Gini criterion and max_depth left at
# its default; seeded with the shared SEED instead of a separate literal
dt2 = DecisionTreeClassifier(criterion='gini', random_state=SEED)
# Fit dt2 to the training set
dt2.fit(X_train, y_train)
# Predict test-set labels
y2_pred = dt2.predict(X_test)
# Evaluate test-set accuracy
accuracy_dt2 = accuracy_score(y_test, y2_pred)
print('Accuracy of Decision Tree 2: {:.3f}'.format(accuracy_dt2))

**Decision Tree Tuned**

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Fresh, unfitted tree for the search (note: this rebinds the name 'dt')
dt = DecisionTreeClassifier(random_state=SEED)
# Hyperparameter grid to search over
params_dt = dict(
    max_depth=[3, 4, 5, 6],
    min_samples_leaf=[0.04, 0.06, 0.08],
    max_features=[0.2, 0.4, 0.6, 0.8],
)
# 10-fold cross-validated grid search, scored on accuracy, using all processors
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
# Run the search on the training split
grid_dt.fit(X_train, y_train)

In [None]:
# Extract best hyperparameters from 'grid_dt'
best_hyperparams = grid_dt.best_params_
# Printed label uses the correct spelling "hyperparameters"
print('Best hyperparameters:\n', best_hyperparams)

In [None]:
# Extract best CV score from 'grid_dt'
best_CV_score = grid_dt.best_score_
# The format string must contain a placeholder; without one, str.format()
# silently ignores its argument and the score is never shown.
print('Best CV accuracy: {:.3f}'.format(best_CV_score))

In [None]:
# Best estimator found by the grid search 'grid_dt'
best_model_dt = grid_dt.best_estimator_
# Its accuracy on the held-out test split
test_acc_dt = best_model_dt.score(X_test, y_test)
print("Test set accuracy of best model: {:.3f}".format(test_acc_dt))

In [None]:
# Plot the tuned tree on its own large canvas
fig2, ax2 = plt.subplots(figsize=(25, 20))
# feature_names is a pandas Index; some scikit-learn releases only accept a
# plain list for this argument, so convert it explicitly.
_2 = tree.plot_tree(best_model_dt,
                    feature_names=list(feature_names),
                    filled=True,
                    ax=ax2)
# Saved relative to the current working directory
fig2.savefig("decision_tree_tuned.png")

**Bagging**

In [None]:
# Import models and utility functions
from sklearn.ensemble import BaggingClassifier
# Instantiate a BaggingClassifier 'bc' that bags 300 copies of the tree 'dt'.
# The base model is passed positionally: its keyword was renamed from
# 'base_estimator' to 'estimator' in newer scikit-learn releases, and the
# positional form works with either name.
# random_state makes the bootstrap samples, and therefore the scores, repeatable.
bc = BaggingClassifier(dt,
                       n_estimators=300,
                       oob_score=True,
                       n_jobs=-1,
                       random_state=SEED)
# Fit 'bc' to the training set
bc.fit(X_train, y_train)
# Predict test set labels
y_pred_bc = bc.predict(X_test)
# Evaluate and print test-set accuracy
accuracy_bc = accuracy_score(y_test, y_pred_bc)
print('Accuracy of Bagging Classifier: {:.3f}'.format(accuracy_bc))

In [None]:
# Out-of-bag accuracy recorded by 'bc' (available because oob_score=True was set)
oob_accuracy = bc.oob_score_
print('OOB accuracy: {:.3f}'.format(oob_accuracy))

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest 'rf' with 400 trees; min_samples_leaf is given as a fraction (0.12)
rf = RandomForestClassifier(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=SEED,
)
# Train on the training split (fit returns the estimator itself), then label
# the test split
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)

# Share of test rows predicted correctly
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print('Accuracy of Random Forest: {:.3f}'.format(accuracy_rf))

In [None]:
#feature importance
import pandas as pd
import matplotlib.pyplot as plt

# Importance of every feature, labelled with its column name
importances_rf = pd.Series(rf.feature_importances_, index=X.columns)
# Ascending order, so the most important features come last
sorted_importances_rf = importances_rf.sort_values()
# Keep the 15 most important features.  tail() preserves the ascending order
# (nlargest() would re-sort descending and discard it), so the horizontal bar
# chart draws the most important feature at the top.
top_importances_rf = sorted_importances_rf.tail(15)
ax_importances = top_importances_rf.plot(kind='barh', color='lightgreen')
# Title and axis label so the figure can be read on its own
ax_importances.set_title('Random Forest: 15 most important features')
ax_importances.set_xlabel('Feature importance')
plt.show()

**Random Forest Tuning**

In [None]:
# Base forest 'rf_t' handed to the grid search
rf_t = RandomForestClassifier(random_state=SEED)
# Hyperparameter grid to search over
params_rf = dict(
    n_estimators=[300, 400, 500],
    max_depth=[4, 6, 8],
    min_samples_leaf=[0.1, 0.2],
)
# 3-fold cross-validated grid search, scored on accuracy, with progress
# messages (verbose=1) and all processors in use
grid_rf = GridSearchCV(
    estimator=rf_t,
    param_grid=params_rf,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Fit 'grid_rf' to the training set: this runs the cross-validated search over
# every combination in 'params_rf'
grid_rf.fit(X_train, y_train)

In [None]:
# Extract best hyperparameters from 'grid_rf'
# (this rebinds 'best_hyperparams', which earlier held the decision-tree result)
best_hyperparams = grid_rf.best_params_
# Printed label uses the correct spelling "hyperparameters"
print('Best hyperparameters:\n', best_hyperparams)

In [None]:
# Best forest found by the grid search 'grid_rf'
best_model = grid_rf.best_estimator_
# Label the held-out rows with it
y_pred_rf_t = best_model.predict(X_test)
# Share of test rows predicted correctly
accuracy_rf_t = accuracy_score(y_test, y_pred_rf_t)
print('Accuracy of Random Forest Tuned: {:.3f}'.format(accuracy_rf_t))

**Boosting**

In [None]:
# Import models and utility functions
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

# Instantiate a depth-1 classification tree 'dt_b' to use as the weak learner
dt_b = DecisionTreeClassifier(max_depth=1, random_state=SEED)
# Instantiate an AdaBoost classifier 'adb_clf' with 100 boosting rounds.
# The weak learner is passed positionally: its keyword was renamed from
# 'base_estimator' to 'estimator' in newer scikit-learn releases, and the
# positional form works with either name.
# random_state keeps the boosting run repeatable.
adb_clf = AdaBoostClassifier(dt_b, n_estimators=100, random_state=SEED)
# Fit 'adb_clf' to the training set
adb_clf.fit(X_train, y_train)
# Probabilities for the class in column 1 of predict_proba (adb_clf.classes_[1]),
# which is treated as the positive class
y_pred_proba = adb_clf.predict_proba(X_test)[:, 1]

# Evaluate test-set roc_auc_score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)
# Print adb_clf_roc_auc_score
print('ROC AUC score: {:.2f}'.format(adb_clf_roc_auc_score))


**Logistic**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Keep the predictions instead of discarding them, and report accuracy the same
# way as the other models (for a classifier, score() is this same accuracy)
y_pred_lr = lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print('Accuracy of Logistic Regression: {:.3f}'.format(accuracy_lr))