---
# Boosting Classifiers Analysis:

In [21]:
# All imports
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

---
## Loading Dataset:

### Pima Diabetes Dataset:

In [2]:
# Load the Pima diabetes CSV. header=None makes pandas treat the first row as
# data; column names are assigned in the next cell.
df = pd.read_csv(
    'Datasets/pima-indians-diabetes.csv',
    header=None,
)

In [3]:
# Short names for the nine columns; 'outcome' is the column later used as the target.
df.columns = [
    'preg',
    'glu',
    'bp',
    'sft',
    'ins',
    'bmi',
    'dpf',
    'age',
    'outcome',
]

In [4]:
# Separate the target column from the predictor columns.
outcome = df['outcome']
features = df.drop(columns='outcome')

---

### Parkinson's Dataset:

In [5]:
# parkinson = pd.read_csv('Datasets/parkinson.csv')
# parkinson.head()

In [6]:
# features = parkinson.drop(['name', 'status'], axis=1)
# outcome = parkinson['status']

---
### Splitting Dataset:

In [7]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
# (train_test_split is already imported in the notebook's import cell, so the
# duplicate in-cell import is not needed.)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    outcome,
    test_size=0.3,
    random_state=2,
)

---
## Comparing Performance of fully grown DT, AdaBoost DT, Gradient Boosted DT, Random Forests, Boosted RF and Boosted NB:

### Fully Grown DT:

In [8]:
# Baseline decision tree: no depth or leaf limits are passed, so it uses the
# library defaults; random_state fixes tie-breaking between splits.
dt = DecisionTreeClassifier(random_state=2)
dt = dt.fit(X_train, y_train)

### Adaboost DT:

In [9]:
def fit_dt_adaboost(X, y):
    """Grid-search the number of boosting rounds for AdaBoost over the notebook's tree `dt`.

    Every `n_estimators` value from 1 to 100 is evaluated with 3-fold
    cross-validation and candidates are ranked by recall.

    Parameters
    ----------
    X : features handed to `GridSearchCV.fit`.
    y : labels handed to `GridSearchCV.fit`.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    search = GridSearchCV(
        AdaBoostClassifier(base_estimator=dt, random_state=2),
        {'n_estimators': list(np.arange(1, 101))},
        scoring='recall',
        cv=3,
    )
    # fit() returns the search object itself, so the winner can be read off directly.
    return search.fit(X, y).best_estimator_

# Tune on the training split only.
model_dt = fit_dt_adaboost(X=X_train, y=y_train)

# Print all hyper-parameters of the selected model, including the chosen 'n_estimators'.
print(model_dt.get_params(deep=True))

{'algorithm': 'SAMME.R', 'base_estimator__class_weight': None, 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': None, 'base_estimator__max_features': None, 'base_estimator__max_leaf_nodes': None, 'base_estimator__min_impurity_decrease': 0.0, 'base_estimator__min_impurity_split': None, 'base_estimator__min_samples_leaf': 1, 'base_estimator__min_samples_split': 2, 'base_estimator__min_weight_fraction_leaf': 0.0, 'base_estimator__presort': False, 'base_estimator__random_state': 2, 'base_estimator__splitter': 'best', 'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2,
            splitter='best'), 'learning_rate': 1.0, 'n_estimators': 1, 'random_state': 2}


In [25]:
# AdaBoost over the decision tree `dt`, fitted on the training split.
# NOTE(review): the grid-search output recorded above reports n_estimators=1,
# while 51 is hard-coded here; confirm which value is intended.
adaboost_dt = AdaBoostClassifier(
    base_estimator=dt,
    n_estimators=51,
    random_state=2,
).fit(X_train, y_train)

### Gradient Boost DT:

In [11]:
def fit_gbdt(X, y):
    """Grid-search the number of boosting stages for a gradient boosting classifier.

    Every `n_estimators` value from 1 to 100 is evaluated with 3-fold
    cross-validation and candidates are ranked by recall.

    Parameters
    ----------
    X : features handed to `GridSearchCV.fit`.
    y : labels handed to `GridSearchCV.fit`.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    search = GridSearchCV(
        GradientBoostingClassifier(random_state=2),
        {'n_estimators': list(np.arange(1, 101))},
        scoring='recall',
        cv=3,
    )
    # fit() returns the search object itself, so the winner can be read off directly.
    return search.fit(X, y).best_estimator_

# Tune on the training split only.
model_gbdt = fit_gbdt(X=X_train, y=y_train)

# Print all hyper-parameters of the selected model, including the chosen 'n_estimators'.
print(model_gbdt.get_params(deep=True))

{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 97, 'n_iter_no_change': None, 'presort': 'auto', 'random_state': 2, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [12]:
# Gradient boosting with n_estimators=97, the value reported by the grid-search
# output above, fitted on the training split.
gbdt = GradientBoostingClassifier(
    n_estimators=97,
    random_state=2,
).fit(X_train, y_train)

### Random Forests:

In [13]:
def fit_rf(X, y):
    """Grid-search the number of trees for a random forest classifier.

    Every `n_estimators` value from 1 to 100 is evaluated with 3-fold
    cross-validation and candidates are ranked by recall.

    Parameters
    ----------
    X : features handed to `GridSearchCV.fit`.
    y : labels handed to `GridSearchCV.fit`.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    search = GridSearchCV(
        RandomForestClassifier(random_state=2),
        {'n_estimators': list(np.arange(1, 101))},
        scoring='recall',
        cv=3,
    )
    # fit() returns the search object itself, so the winner can be read off directly.
    return search.fit(X, y).best_estimator_

# Tune on the training split only.
model_rf = fit_rf(X=X_train, y=y_train)

# Print all hyper-parameters of the selected model, including the chosen 'n_estimators'.
print(model_rf.get_params(deep=True))

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 51, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}


In [14]:
# Random forest with n_estimators=51, the value reported by the grid-search
# output above, fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=51,
    random_state=2,
).fit(X_train, y_train)

### Boosted RF:

In [15]:
def fit_rf_boosted(X, y):
    """Grid-search the number of boosting rounds for AdaBoost over the notebook's forest `rf`.

    Every `n_estimators` value from 1 to 100 (the AdaBoost setting, not the
    forest's own tree count) is evaluated with 3-fold cross-validation and
    candidates are ranked by recall.

    Parameters
    ----------
    X : features handed to `GridSearchCV.fit`.
    y : labels handed to `GridSearchCV.fit`.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    search = GridSearchCV(
        AdaBoostClassifier(base_estimator=rf, random_state=2),
        {'n_estimators': list(np.arange(1, 101))},
        scoring='recall',
        cv=3,
    )
    # fit() returns the search object itself, so the winner can be read off directly.
    return search.fit(X, y).best_estimator_

# Tune on the training split only.
model_rf_boosted = fit_rf_boosted(X=X_train, y=y_train)

# Print all hyper-parameters of the selected model, including the chosen 'n_estimators'.
print(model_rf_boosted.get_params(deep=True))

{'algorithm': 'SAMME.R', 'base_estimator__bootstrap': True, 'base_estimator__class_weight': None, 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': None, 'base_estimator__max_features': 'auto', 'base_estimator__max_leaf_nodes': None, 'base_estimator__min_impurity_decrease': 0.0, 'base_estimator__min_impurity_split': None, 'base_estimator__min_samples_leaf': 1, 'base_estimator__min_samples_split': 2, 'base_estimator__min_weight_fraction_leaf': 0.0, 'base_estimator__n_estimators': 51, 'base_estimator__n_jobs': None, 'base_estimator__oob_score': False, 'base_estimator__random_state': 2, 'base_estimator__verbose': 0, 'base_estimator__warm_start': False, 'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_e

In [16]:
# AdaBoost over the random forest `rf`. Unlike the other models this one is
# only constructed here, not fitted; it is later passed to cross_val_score.
rf_boosted = AdaBoostClassifier(
    base_estimator=rf,
    n_estimators=51,
    random_state=2,
)

### Naive Bayes:

In [17]:
# Gaussian Naive Bayes with default settings, fitted on the training split.
nb = GaussianNB()
nb = nb.fit(X_train, y_train)

### Boosted NB:

In [18]:
def fit_nb_boosted(X, y):
    """Grid-search the number of boosting rounds for AdaBoost over the notebook's Naive Bayes `nb`.

    Every `n_estimators` value from 1 to 100 is evaluated with 3-fold
    cross-validation and candidates are ranked by recall.

    Parameters
    ----------
    X : features handed to `GridSearchCV.fit`.
    y : labels handed to `GridSearchCV.fit`.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    search = GridSearchCV(
        AdaBoostClassifier(base_estimator=nb, random_state=2),
        {'n_estimators': list(np.arange(1, 101))},
        scoring='recall',
        cv=3,
    )
    # fit() returns the search object itself, so the winner can be read off directly.
    return search.fit(X, y).best_estimator_

# Tune on the training split only.
model_nb_boosted = fit_nb_boosted(X=X_train, y=y_train)

# Print all hyper-parameters of the selected model, including the chosen 'n_estimators'.
print(model_nb_boosted.get_params(deep=True))

{'algorithm': 'SAMME.R', 'base_estimator__priors': None, 'base_estimator__var_smoothing': 1e-09, 'base_estimator': GaussianNB(priors=None, var_smoothing=1e-09), 'learning_rate': 1.0, 'n_estimators': 2, 'random_state': 2}


In [22]:
# Boosted Naive Bayes: AdaBoost over the Gaussian NB model `nb`, fitted on the
# training split. The original cell built a BaggingClassifier here, which
# contradicts the "Boosted NB" section and the AdaBoost grid search above.
# NOTE(review): the grid-search output above reports n_estimators=2 for this
# model, while 51 is hard-coded here; confirm which value is intended.
boosted_nb = AdaBoostClassifier(
    base_estimator=nb,
    n_estimators=51,
    random_state=2,
).fit(X_train, y_train)

### Comparing the models using cross validation:

In [23]:
# KFold is only needed for the disabled shuffled 10-fold splitter noted below;
# the import is kept in case other cells rely on it.
from sklearn.model_selection import KFold

# Per-model arrays of fold scores. The list keeps the name `accuracy` because
# the next cell reads it, but scoring='recall' means every entry holds RECALL.
accuracy = []
for index, model in enumerate([adaboost_dt, rf, rf_boosted, gbdt]):
    print(f'Model {index+1}:')
    # Alternative splitter (disabled): KFold(n_splits=10, shuffle=True, random_state=2)
    scores = cross_val_score(
        estimator=model,
        X=features,
        y=outcome,
        cv=3,
        n_jobs=1,
        scoring='recall',
    )
    # Labels now name the metric that is actually computed (recall, not accuracy).
    print(f'CV recall per fold: {scores}')
    print(f'CV recall: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')
    # "Bias Error" is the mean shortfall from a perfect score of 1;
    # "Variance Error" is the variance of the fold scores.
    print(f'Bias Error: {(np.mean(1 - scores))}')
    print(f'Variance Error: {np.var(scores)}')
    print()
    accuracy.append(scores)

Model 1:
CV accuracy: [0.58888889 0.49438202 0.61797753]
CV accuracy: 0.567 +/- 0.053
Bias Error: 0.43291718684977115
Variance Error: 0.00278372730438734

Model 2:
CV accuracy: [0.64444444 0.56179775 0.52808989]
CV accuracy: 0.578 +/- 0.049
Bias Error: 0.4218893050353724
Variance Error: 0.0023894531890622946

Model 3:
CV accuracy: [0.65555556 0.53932584 0.56179775]
CV accuracy: 0.586 +/- 0.050
Bias Error: 0.41444028297960883
Variance Error: 0.0025338731634700624

Model 4:
CV accuracy: [0.65555556 0.53932584 0.59550562]
CV accuracy: 0.597 +/- 0.047
Bias Error: 0.40320432792342903
Variance Error: 0.002252389811667307



#### We can see that Gradient Boosting (Model 4) gives both the lowest bias error and the lowest variance error, while AdaBoost on the fully grown DT (Model 1) gives the highest of both!
---

In [24]:
# Find the models with the smallest "bias" (mean of 1 - score) and the smallest
# "variance" (variance of the fold scores) among the entries of `accuracy`.
bias_errors = [np.mean(1 - acc) for acc in accuracy]
variance_errors = [np.var(acc) for acc in accuracy]

if bias_errors:
    # np.argmin returns the first minimum, matching the strict `<` comparison of
    # a manual scan, and always yields an index (no unbound variable when no
    # model beats an initial sentinel value).
    index_bias = int(np.argmin(bias_errors))
    index_var = int(np.argmin(variance_errors))
    least_bias_err = bias_errors[index_bias]
    least_variance_err = variance_errors[index_var]
    print(f'Model {index_bias + 1} has least bias: {least_bias_err}')
    print(f'Model {index_var + 1} has least variance: {least_variance_err}')
else:
    # Nothing was cross-validated; report it instead of failing with a NameError.
    print('No cross-validation scores available to compare.')


Model 4 has least bias: 0.40320432792342903
Model 4 has least variance: 0.002252389811667307


### Therefore, for the above data, Gradient Boosting (Model 4) has both the least bias error and the least variance error.
---