In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import BaggingClassifier

In [16]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
df = pd.read_csv("data_cleaned.csv")
df.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [18]:
X = df.drop("Survived", axis = 1)
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.4)
print("X train and test sizes: {}, {}".format(X_train.shape, X_test.shape))
print("y train and test sizes: {}, {}".format(y_train.shape, y_test.shape))

X train and test sizes: (534, 24), (357, 24)
y train and test sizes: (534,), (357,)


## Basic Decision Tree

In [19]:

tree1 = DecisionTreeClassifier(random_state=42)
tree1.fit(X_train, y_train)


In [20]:
#accuracy scoring function
def score(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    print("train_score: ",model.score(X_train, y_train),end="\n")
    print("test_score: ",model.score(X_test, y_test))

In [21]:
score(tree1)

train_score:  0.9831460674157303
test_score:  0.7142857142857143


### Bagging classifier with Decision Tree using Stratified Kfold

In [22]:
X_array = df.drop("Survived", axis = 1).values
y_array = df["Survived"].values

In [23]:
bg = BaggingClassifier(n_estimators=10, 
                       base_estimator=tree1, 
                       random_state=42,
                       bootstrap=True)
cv_scores = cross_val_score(estimator=bg,
                            X=X_array,
                            y=y_array,
                            cv=5)
print("Bagging with Cross-Validation scores:", cv_scores)
print("Average CV score:", np.mean(cv_scores))

Bagging with Cross-Validation scores: [0.7877095  0.8258427  0.85955056 0.78651685 0.83146067]
Average CV score: 0.8182160567447117


### Score of Bagging with startified k fold , k = 5

In [None]:
bg.fit(X_train,y_train)
score(model = bg)

### Cross validation without stratify

In [24]:
cv_scores = cross_val_score(estimator=tree1,X=X_array,y=y_array,cv=5)
print("Average CV score:", np.mean(cv_scores))

Average CV score: 0.7777854497520558


### Hyper parameter tuning the model with bagging

In [26]:
params = {'base_estimator__max_depth':[2,3,4,5,6,7,8],
         'base_estimator__min_samples_split':range(2,20,2),
         'base_estimator__min_samples_leaf':range(2,5),
         'base_estimator__max_leaf_nodes':range(10,30,3)}
dt = DecisionTreeClassifier(random_state = 42)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator= bg, param_grid = params,cv = stratified_kfold,scoring="accuracy")
grid_search.fit(X_train,y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.8332921883265738
{'base_estimator__max_depth': 5, 'base_estimator__max_leaf_nodes': 10, 'base_estimator__min_samples_leaf': 4, 'base_estimator__min_samples_split': 10}


In [27]:
#'base_estimator__max_depth': 5, 
# 'base_estimator__max_leaf_nodes': 10, 
# 'base_estimator__min_samples_leaf': 4, 
# 'base_estimator__min_samples_split': 10
dt_tuned = DecisionTreeClassifier(max_depth=5,
                       min_samples_leaf=4,
                       min_samples_split=10,
                       max_leaf_nodes=10)
dt_tuned.fit(X_train, y_train)
score(model = dt_tuned)

train_score:  0.850187265917603
test_score:  0.7871148459383753
