# Ensemble Methods Applied

Agenda:
- Review code for Voting Classifier, Bagging Classifier, and Random Forest
- Practice finding optimal hyperparameter for  Random Forest with gridsearch


## Import and Prep Titanic dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier


In [2]:
# Read in data and split data to be used in the models
titanic = pd.read_csv('https://raw.githubusercontent.com/learn-co-students/nyc-ds-033020-lectures/master/Mod_3/decision_trees/cleaned_titanic.csv', index_col='PassengerId')



In [3]:
# Create matrix of features
X = titanic.drop('Survived', axis = 1) # grabs everything else but 'Survived'

# Create target variable
y = titanic['Survived'] # y is the column we're trying to predict

# Create a list of the features being used in the 
feature_cols = X.columns

In [4]:
# Use x and y variables to split the training data into train and test set then scale that data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)

## Fit a KNN model

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
knn = KNeighborsClassifier(n_neighbors=9)

In [7]:
knn.fit(X_train, y_train)

knn_preds = knn.predict(X_test)

knn_f1 = metrics.f1_score(y_test, knn_preds)


print(knn_f1)

0.7975460122699386


## Fit a Logistic Regression model 

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
lr = LogisticRegression(class_weight='balanced')

In [10]:
lr.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [11]:
lr_preds = lr.predict(X_test)

lr_f1 = metrics.f1_score(y_test, lr_preds)

print(lr_f1)

0.8066298342541436


## Fit a Decision Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
dtc = DecisionTreeClassifier(max_depth=5, class_weight='balanced')

dtc.fit(X_train, y_train)

dtc_preds  = dtc.predict(X_test)

dtc_f1 = metrics.f1_score(y_test, dtc_preds)

print(dtc_f1)

0.8047337278106509


## Combine three models using Voting Classifier

In [14]:
from sklearn.ensemble import VotingClassifier


For the estimators, we must provide a list of tuples. The first value in the tuple is is the name given to the model/estimator in the second value. SKlearn requires this because there is additional functionality where you can access information about the specific models, so you need to name the models to access them later.  

In [15]:
voting_clf = VotingClassifier(
                estimators=[('logreg', lr), ('knneighbors', knn), ('decisiontree', dtc)], 
                voting='hard')

voting_clf.fit(X_train, y_train)

vc_preds = voting_clf.predict(X_test)

vc_f1 = metrics.f1_score(y_test, vc_preds)

print(vc_f1)

0.8160919540229884


### Use a voting classifier with multiple Logistic regression models 

In [16]:

C_param_range = [0.001,0.01,0.1,0.5,1]
titles = ['lr_0_001', 'lr_0_01', 'lr_0_1', 'lr_0.5', 'lr_1']

params = dict(zip(titles, C_param_range)) 
models = {}

table = pd.DataFrame(columns = ['C_parameter','F1'])
table['C_parameter'] = C_param_range
j = 0

for k , v  in params.items():
    
    # Create model using different value for c  
    lr = LogisticRegression(penalty = 'l2', C = v, random_state = 1, class_weight='balanced')
    
    #save the model to a dictionary to use later in our voting classifiers
    models[k]= lr
    
    #the steps below this point are unnecessary in order to create a voting classifier, 
    #but it is easy to fit the model and see how performance changes for different levels of regularization
    lr.fit(X_train, y_train)
    
    # Predict using model
    y_preds = lr.predict(X_test)

    # Saving accuracy score in table
    table.iloc[j,1] = metrics.f1_score(y_test, y_preds)
    j += 1



In [17]:
#review performance for different levels of C
table


Unnamed: 0,C_parameter,F1
0,0.001,0.735135
1,0.01,0.751381
2,0.1,0.804469
3,0.5,0.80663
4,1.0,0.80663


In [21]:
#invesitgate the models D=dictionary
list(models.items())

[('lr_0_001',
  LogisticRegression(C=0.001, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('lr_0_01',
  LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('lr_0_1',
  LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=1, solver='lbfgs',

Now that we have programmatically created multiple logistic regression models, let's use them in an ensemble model

In [18]:
lr_voting = VotingClassifier(estimators=list(models.items()), 
                              voting='hard')

lr_voting.fit(X_train, y_train)

lrv_preds = lr_voting.predict(X_test)

lrv_f1 = metrics.f1_score(y_test, lrv_preds)

print(lrv_f1)

0.8044692737430168


Not so much better than our best log reg model before. because even though using different C params , it is the same obs so the results is not so much different from the indiv logistic regression models.

## Fit a Bagging Classifier for a Logistic Regression model. 

In [19]:
X_train.shape

(666, 9)

In [47]:
bc_lr = BaggingClassifier(
            base_estimator=LogisticRegression(random_state = 1, class_weight='balanced'), 
            n_estimators= 1000,
            max_samples= 0.8,
            max_features= 8,
            oob_score= True
                )

In [51]:
bc_knn = BaggingClassifier(
            base_estimator=KNeighborsClassifier(n_neighbors=9), 
            n_estimators= 1000,
            max_samples= 0.8,
            max_features= 8,
            oob_score= True
                )

In [53]:
bc_dtc = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(random_state = 1, max_depth = 5, class_weight='balanced'), 
            n_estimators= 1000,
            max_samples= 0.8,
            max_features= 8,
            oob_score= True
                )

In [55]:
voting_clf_bagged = VotingClassifier(
                estimators=[('bagged_logreg', bc_lr), ('bagged_knneighbors', bc_knn), ('bagged_decisiontree', bc_dtc)], 
                voting='hard')

In [56]:
voting_clf_bagged.fit(X_train, y_train)

VotingClassifier(estimators=[('bagged_logreg',
                              BaggingClassifier(base_estimator=LogisticRegression(class_weight='balanced',
                                                                                  random_state=1),
                                                max_features=8, max_samples=0.8,
                                                n_estimators=1000,
                                                oob_score=True)),
                             ('bagged_knneighbors',
                              BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=9),
                                                max_features=8, max_samples=0.8,
                                                n_estimators=1000,
                                                oob_score=True)),
                             ('bagged_decisiontree',
                              BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
   

In [59]:
voting_clf_bagged_preds = voting_clf_bagged.predict(X_test)

voting_clf_bagged_preds_f1 = metrics.f1_score(y_test, voting_clf_bagged_preds)

print(voting_clf_bagged_preds_f1)

0.8208092485549133


In [None]:
voting_clf_bagged()

In [48]:
bc_lr.fit(X_train, y_train)



BaggingClassifier(base_estimator=LogisticRegression(class_weight='balanced',
                                                    random_state=1),
                  max_features=8, max_samples=0.8, n_estimators=1000,
                  oob_score=True)

In [49]:
# Use the oob_score to get some idea of how the model performs on a validation set

bc_lr.oob_score_
# Could be an acuracy score and not a f1 score... no specification in param
#so uses the more common scoring method for the estimator

0.7792792792792793

In [50]:
# See how the model performs on the test set

bc_lr_preds = bc_lr.predict(X_test)

bc_lr_f1 = metrics.f1_score(y_test, bc_lr_preds)

print(bc_lr_f1)

0.8066298342541436


***What is the difference in the `VotingClassifier` algorithm and the `BaggingClassifier` algorithm?***

Your answer: the difference between the Voting Classifier (slightly different models on the same observations) and BaggingClassifier is a model on the base_estimator run n_estimators times with different samples set in max_samples and with random feature with each model set with the max_features 

**What is the difference between a BaggingClassifier that uses a decision tree as the base estimator and a Random Forest Classifier?**

A random forest classifier will take a sample of features at each node, where as a bagging classifier will take a sample of features at to use for the whole model. 

# Fitting a Random Forest Classifier

In [72]:
# Instantiate the classifier using 100 trees
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1, n_estimators=10000, max_depth=5, max_features=4)

In [73]:
#let's look at all the different default features
rfc

RandomForestClassifier(max_depth=5, max_features=4, n_estimators=10000,
                       random_state=1)

In [74]:
#fit the model to the training data
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, max_features=4, n_estimators=10000,
                       random_state=1)

In [75]:
#use the fitted model to predict on the test data
rfc_preds = rfc.predict(X_test)

rfc_f1 = metrics.f1_score(y_test, rfc_preds)

# checking accuracy on the test data
print('Test F1 score: ', rfc_f1)

Test F1 score:  0.763157894736842


***Increase the number of trees and see how the model performs***

### GridsearchCV with Random Forest

Let's use grid search to identify the best tuning parameters to use for a random forest model. 

In [78]:
from sklearn.model_selection import GridSearchCV

In [79]:
RandomForestClassifier()

RandomForestClassifier()

In [85]:
#create a dictionary of all the parameters you want to tune
param_grid = { 
    'n_estimators': [100,300,500,700,1000],
#     'min_samples_split' : range(2,7) ,
    'max_leaf_nodes' : range(18, 44, 2),
#     'min_impurity_split': range(2,5),
#     'criterion': ['gini', 'entropy'],
#     'min_samples_leaf': range(2,10),
#     'max_features': ['auto', 5,6, None, 0.08]
   
#     'criterion': ['gini', 'entropy'],
#     'max_depth': list(range(2,10)),
#     'max_features': list(range(3,7))
}

In [86]:
#create a grid search object and fit it to the data

grid_tree=GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [87]:
grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 65 candidates, totalling 325 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 325 out of 325 | elapsed:  3.9min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_leaf_nodes': range(18, 44, 2),
                         'n_estimators': [100, 300, 500, 700, 1000]},
             scoring='f1', verbose=1)

In [88]:
### Identify the best params 



# Single best score achieved across all params (min_samples_split)
print(grid_tree.best_score_)

# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_tree.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_tree.best_estimator_)
#Identify the best score during fitting with cross-validation


0.7546693073287798
{'max_leaf_nodes': 36, 'n_estimators': 300}
RandomForestClassifier(max_leaf_nodes=36, n_estimators=300)


In [89]:
#Predict the response for test dataset
y_pred = grid_tree.best_estimator_.predict(X_test)

# Model F1, how often is the classifier correct?
print("F1:",metrics.f1_score(y_test, y_pred))

F1: 0.7898089171974522
