# DT vs Bagged DT vs Random Forests vs Bagged NB vs Bagged KNN

In [149]:
# All imports
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

---
## Loading Dataset:

### Pima Diabetes Dataset:

In [150]:
# df=pd.read_csv('Datasets/pima-indians-diabetes.csv', header=None)

In [151]:
# df.columns = ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']

In [152]:
# features = df.drop('outcome', axis=1)
# outcome = df['outcome']

---

### Parkinson's Dataset:

In [153]:
parkinson = pd.read_csv('Datasets/parkinson.csv')
parkinson.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [154]:
features = parkinson.drop(['name', 'status'], axis=1)
outcome = parkinson['status']

---
### Splitting Dataset:

In [155]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, outcome, test_size=0.3, random_state=2)

---
## Comparing Performace of fully grown DT, Bagged DT, Random Forests, Bagged NB and Bagged KNN:

### Fully Grown DT:

In [156]:
dt = DecisionTreeClassifier(random_state=2).fit(X_train, y_train)

### Bagged DT:

In [157]:
def fit_dt_bagged(X, y):
    model = BaggingClassifier(base_estimator=dt, random_state=2)
    params = {'n_estimators':list(np.arange(1, 101))}
    grid = GridSearchCV(model, params, scoring='recall', cv=3)
    grid_fit = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid_fit.best_estimator_

model_dt = fit_dt_bagged(X_train, y_train)

# Produce the value for 'n_neighbors'
print(model_dt.get_params())

{'base_estimator__class_weight': None, 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': None, 'base_estimator__max_features': None, 'base_estimator__max_leaf_nodes': None, 'base_estimator__min_impurity_decrease': 0.0, 'base_estimator__min_impurity_split': None, 'base_estimator__min_samples_leaf': 1, 'base_estimator__min_samples_split': 2, 'base_estimator__min_weight_fraction_leaf': 0.0, 'base_estimator__presort': False, 'base_estimator__random_state': 2, 'base_estimator__splitter': 'best', 'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2,
            splitter='best'), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 91, 'n_jobs': Non



In [158]:
bagged_dt = BaggingClassifier(base_estimator=dt, random_state=2, n_estimators=91).fit(X_train, y_train)

### Random Forests:

In [159]:
def fit_rf(X, y):
    model = RandomForestClassifier(random_state=2)
    params = {'n_estimators':list(np.arange(1, 101))}
    grid = GridSearchCV(model, params, scoring='recall', cv=3)
    grid_fit = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid_fit.best_estimator_

model_rf = fit_rf(X_train, y_train)

# Produce the value for 'n_neighbors'
print(model_rf.get_params())

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 91, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}




In [160]:
rf = RandomForestClassifier(random_state=2, n_estimators=91).fit(X_train, y_train)

### Naive Bayes:

In [162]:
nb = GaussianNB().fit(X_train, y_train)

### Bagged NB:

In [163]:
def fit_nb_bagged(X, y):
    model = BaggingClassifier(base_estimator=nb, random_state=2)
    params = {'n_estimators': list(np.arange(1, 101))}
    grid = GridSearchCV(model, params, scoring='recall', cv=3)
    grid_fit = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid_fit.best_estimator_

model_nb_bagged = fit_nb_bagged(X_train, y_train)

# Produce the value for 'n_neighbors'
print(model_nb_bagged.get_params())

{'base_estimator__priors': None, 'base_estimator__var_smoothing': 1e-09, 'base_estimator': GaussianNB(priors=None, var_smoothing=1e-09), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 46, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}




In [164]:
bagged_nb = BaggingClassifier(base_estimator=nb, random_state=2, n_estimators=47).fit(X_train, y_train)

### KNN:

In [166]:
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)

### Bagged KNN:

In [169]:
def fit_knn_bagged(X, y):
    model = BaggingClassifier(base_estimator=knn, random_state=2)
    params = {'n_estimators': list(np.arange(1, 101))}
    grid = GridSearchCV(model, params, scoring='recall', cv=3)
    grid_fit = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid_fit.best_estimator_

model_knn_bagged = fit_knn_bagged(X_train, y_train)

# Produce the value for 'n_neighbors'
print(model_knn_bagged.get_params())

{'base_estimator__algorithm': 'auto', 'base_estimator__leaf_size': 30, 'base_estimator__metric': 'minkowski', 'base_estimator__metric_params': None, 'base_estimator__n_jobs': None, 'base_estimator__n_neighbors': 15, 'base_estimator__p': 2, 'base_estimator__weights': 'uniform', 'base_estimator': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=15, p=2,
           weights='uniform'), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 1, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}




In [168]:
bagged_knn = BaggingClassifier(base_estimator=knn, random_state=2, n_estimators=15).fit(X_train, y_train)

### Comparing the models using cross validation:

In [174]:
from sklearn.model_selection import KFold
accuracy = []
for index, model in enumerate([dt, bagged_dt, rf, bagged_nb, bagged_knn]): 
    print(f'Model {index+1}:')
#     kf = KFold(n_splits=10, shuffle=True, random_state=2)
    scores = cross_val_score(estimator=model,
                         X=features,
                         y=outcome,
                         cv=5,
                         n_jobs=1, scoring='recall')
    print(f'CV accuracy: {scores}')
    print(f'CV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')
    print(f'Bias Error: {(np.mean(1 - scores))}')
    print(f'Variance Error: {np.var(scores)}')
    print()
    accuracy.append(scores)

Model 1:
CV accuracy: [0.86666667 0.9        0.82758621 0.65517241 0.93103448]
CV accuracy: 0.836 +/- 0.097
Bias Error: 0.1639080459770115
Variance Error: 0.00936747258554631

Model 2:
CV accuracy: [0.93333333 0.9        0.86206897 0.89655172 1.        ]
CV accuracy: 0.918 +/- 0.047
Bias Error: 0.08160919540229886
Variance Error: 0.002174131325142028

Model 3:
CV accuracy: [0.9        0.96666667 0.89655172 0.93103448 1.        ]
CV accuracy: 0.939 +/- 0.040
Bias Error: 0.06114942528735632
Variance Error: 0.0015745276786893908

Model 4:
CV accuracy: [0.56666667 0.7        0.75862069 0.68965517 0.68965517]
CV accuracy: 0.681 +/- 0.063
Bias Error: 0.31908045977011495
Variance Error: 0.003921574844761527

Model 5:
CV accuracy: [1.         1.         1.         0.72413793 1.        ]
CV accuracy: 0.945 +/- 0.110
Bias Error: 0.05517241379310345
Variance Error: 0.012175980975029727



#### We can see that using bagged and random forest implementation the variance error reduces with a penalty increase on bias error!
---

In [175]:
least_bias_err = 1
least_variance_err = 1
for index, acc in enumerate(accuracy):
    if ((np.mean(1 - acc)) < least_bias_err):
        least_bias_err = (np.mean(1 - acc))
        index_bias = index
    if (np.var(acc) < least_variance_err):
        least_variance_err = np.var(acc)
        index_var = index
print(f'Model {index_bias + 1} has least bias: {least_bias_err}')
print(f'Model {index_var + 1} has least variance: {least_variance_err}')


Model 5 has least bias: 0.05517241379310345
Model 3 has least variance: 0.0015745276786893908


### Therefore for the above data Random forests (Model 3) has the least variance and a slightly lesser bias error as compared to the best.
---