<div style="position: relative; text-align: right;">
<img src="https://user-images.githubusercontent.com/7065401/98614301-dcf01780-22d6-11eb-9c8f-65ebfceac6f6.png" style="width: 130px; display: inline-block;"></img>

<img src="https://user-images.githubusercontent.com/7065401/98864025-08deda80-2448-11eb-9600-22aa17884cdf.png" style="height: 100%; max-height: inherit; position: absolute; top: 20%; left: 0px;"></img>
<br>

<h2 style="font-weight: bold;">
    Kristin Day
</h2>

<h3 style="color: #ef7d22; margin-top: 0.8em">
    Data Scientist
</h3>
<hr>
<br><br>

<p style="font-size: 80%; text-align: right; margin: 10px 0px;">
    yokristinday@gmail.com
</p>
<p style="font-size: 80%; text-align: right; margin: 10px 0px;">
    linkedin.com/in/kristin-day-300306a9
</p>

</div>

<br><br><br>

<div style="position: relative;">
<img src="https://user-images.githubusercontent.com/7065401/98728503-5ab82f80-2378-11eb-9c79-adeb308fc647.png"></img>

<h1 style="color: white; position: absolute; top:30%; left:10%;">
    Ensembles & Random Forests
</h1>

<h3 style="color: #ef7d22; font-weight: normal; position: absolute; top:43%; left:10%;">
    Kristin Day
</h3>
</div>

<div style="width: 100%; background-color: #222; text-align: center">
<br><br>

<h1 style="color: white; font-weight: bold;">
    Project
</h1>
    
<h3 style="color: #ef7d22; font-weight: normal;">
    Build an ensemble voting classifier
</h3>

<br><br> 
</div>

![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

In [1]:
# Import packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier,VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer,StandardScaler

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings the
# estimators fitted below may emit — consider narrowing it to specific
# warning categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the iris features and labels as arrays
X, y = load_iris(return_X_y=True)

# Hold out 20% of the rows for testing, keeping the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing: Normalizer first, then StandardScaler.
# Both transformers are fitted on the training split only and then applied
# unchanged to the test split.
norm = Normalizer().fit(X_train)
X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)

scaler = StandardScaler().fit(X_train_norm)
X_train_scaled = scaler.transform(X_train_norm)
X_test_scaled = scaler.transform(X_test_norm)

# Base classifiers, each fitted on the scaled training data
nbc = GaussianNB().fit(X_train_scaled, y_train)
mlpc = MLPClassifier(random_state=42).fit(X_train_scaled, y_train)
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train_scaled, y_train)

DecisionTreeClassifier(random_state=42)

In [3]:
# Voting ensemble over the three base classifiers (voting='soft')
eclf = VotingClassifier(
    [('nbc', nbc), ('mlpc', mlpc), ('dtc', dtc)],
    voting='soft',
    n_jobs=-1,
)
eclf.fit(X_train_scaled, y_train)

VotingClassifier(estimators=[('nbc', GaussianNB()),
                             ('mlpc', MLPClassifier(random_state=42)),
                             ('dtc', DecisionTreeClassifier(random_state=42))],
                 n_jobs=-1, voting='soft')

In [5]:
# Training accuracy of each base classifier, in the order (nbc, mlpc, dtc)
tuple(clf.score(X_train_scaled, y_train) for clf in (nbc, mlpc, dtc))

(0.975, 0.9583333333333334, 1.0)

In [6]:
# Test accuracy of each base classifier, in the order (nbc, mlpc, dtc)
tuple(clf.score(X_test_scaled, y_test) for clf in (nbc, mlpc, dtc))

(0.9666666666666667, 0.9666666666666667, 0.9666666666666667)

In [7]:
# Accuracy of the voting ensemble: (train, test)
(
    eclf.score(X_train_scaled, y_train),
    eclf.score(X_test_scaled, y_test),
)

(0.9833333333333333, 0.9666666666666667)

In [8]:
# Stacking by hand
# Meta-features for training: for every base classifier, the value in
# column 1 of its predict_proba output on the scaled training data.
# NOTE(review): only column 1 is kept, so if the target has more than two
# classes the remaining class probabilities are discarded — confirm this is
# intended.
# NOTE(review): the base classifiers were fitted on this same training data,
# so these are in-sample predictions.
df_stack_train = pd.DataFrame({
    'nbc': nbc.predict_proba(X_train_scaled)[:, 1],
    'mlpc': mlpc.predict_proba(X_train_scaled)[:, 1],
    'dtc': dtc.predict_proba(X_train_scaled)[:, 1],
    'target': y_train,
})

# Meta-learner: an MLPClassifier fitted on the three meta-feature columns
X_train_stack = df_stack_train[['nbc', 'mlpc', 'dtc']].copy()
mlpc_ens = MLPClassifier(random_state=42)
mlpc_ens.fit(X_train_stack, y_train)

# Training accuracy of the stacked model
mlpc_ens.score(X_train_stack, y_train)

0.8583333333333333

In [9]:
# Meta-features for the test set: column 1 of each base classifier's
# predict_proba output on the scaled test data
df_stack_test = pd.DataFrame({
    'nbc': nbc.predict_proba(X_test_scaled)[:, 1],
    'mlpc': mlpc.predict_proba(X_test_scaled)[:, 1],
    'dtc': dtc.predict_proba(X_test_scaled)[:, 1],
    'target': y_test,
})

# Keep only the meta-feature columns; the already-fitted mlpc_ens is reused,
# nothing is refitted here
X_test_stack = df_stack_test[['nbc', 'mlpc', 'dtc']].copy()

# Test accuracy of the stacked model
mlpc_ens.score(X_test_stack, y_test)

0.8666666666666667

In [10]:
# Stacking with the scikit-learn StackingClassifier
# The benefit of using the StackingClassifier is that it uses cross-validation
# on the base layers, which should help with overfitting
estimators = [
    ('nbc', GaussianNB()),
    ('mlpc', MLPClassifier(random_state=42)),
    ('dtc', DecisionTreeClassifier(random_state=42)),
]

# Fresh base estimators, an MLP as the final estimator, cv=3
stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=MLPClassifier(random_state=42),
    cv=3,
).fit(X_train_scaled, y_train)

# Accuracy of the stacked model: (train, test)
(
    stack_clf.score(X_train_scaled, y_train),
    stack_clf.score(X_test_scaled, y_test),
)

(0.9833333333333333, 0.9666666666666667)

<div style="position: relative;">
<img src="https://user-images.githubusercontent.com/7065401/98729912-57be3e80-237a-11eb-80e4-233ac344b391.png"></img>
</div>