# AdaBoost Practice Activity

## Import data

In [149]:
import random
import pandas as pd

#Set the seed for analysis
random.seed(10)

#Import data 
mydata = pd.read_csv("/Users/jaredmcmullen/Desktop/GSB-S545/data/penguins_size.csv")

#Drop any na values 
mydata = mydata.dropna()

#print dataset
mydata

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


## Get the data into a format for the model to digest (X, Y)

In [152]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

#Use label encoder to get in the model ingesting format
for label in mydata.columns:
    mydata[label] = LabelEncoder().fit(mydata[label]).transform(mydata[label])

#Set X and Y values for the models
Y = mydata['species']
X = mydata.drop(['species'],axis=1)

## Import relevant packages and define the models for analysis

In [81]:
#Select all the models you want to test
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

#Define the stacking model used later on
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', DecisionTreeClassifier()))
    level0.append(('rf', RandomForestClassifier()))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

#Define the models that are going to be iterated through later on 
def get_models():
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['rf'] = RandomForestClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    models['lr_boost'] = AdaBoostClassifier(base_estimator = LogisticRegression())
    models['cart_boost'] = AdaBoostClassifier(base_estimator = DecisionTreeClassifier())
    models['rf_boost'] = AdaBoostClassifier(base_estimator = RandomForestClassifier())

    models['stacking'] = get_stacking()
    return models

models = get_models()

## Create a method to cross validate and get accuracy measures

In [82]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

#Create a function to iterate through different models
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

## Go through all the models and run them and achieve an average score from the cross validation function above

In [80]:
#Supress warnings and set the seed
import warnings
warnings.filterwarnings("ignore")
random.seed(10)

#Iterate through the models and append the scores
from matplotlib import pyplot
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, Y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>stacking 0.994 (0.012)


## Choose the best model and continue to refine

In [88]:
#Define the stacking model used later on
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', AdaBoostClassifier(base_estimator = DecisionTreeClassifier(), algorithm='SAMME')))
    level0.append(('rf', AdaBoostClassifier(base_estimator = RandomForestClassifier(), algorithm='SAMME')))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

#Define the models that are going to be iterated through later on 
def get_models():
    models = dict()
    models['stacking'] = get_stacking()
    return models

models = get_models()

#Supress warnings and set the seed
import warnings
warnings.filterwarnings("ignore")
random.seed(10)

#Iterate through the models and append the scores
from matplotlib import pyplot
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, Y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>stacking 0.993 (0.015)


{'stacking': StackingClassifier(cv=5,
                    estimators=[('lr', LogisticRegression()),
                                ('knn', KNeighborsClassifier()),
                                ('cart',
                                 AdaBoostClassifier(algorithm='SAMME',
                                                    base_estimator=DecisionTreeClassifier())),
                                ('rf',
                                 AdaBoostClassifier(algorithm='SAMME',
                                                    base_estimator=RandomForestClassifier())),
                                ('svm', SVC()), ('bayes', GaussianNB())],
                    final_estimator=LogisticRegression())}

## Tune parameters - Num of Trees, Weak Learners, Learning Rate

In [140]:
#Define packages to import
from numpy import arange

#Define the models that are going to be iterated through later on 
def get_models():
    models = dict()
    
    #Explore number of trees
    n_trees = [10, 50, 100, 500, 1000, 5000]
    for n in n_trees:
        models[str(n)] = AdaBoostClassifier(base_estimator = RandomForestClassifier(), n_estimators=n, algorithm='SAMME')
    
    #Explore Week Learners
    for i in range(1,11):
    # define base model
        base = DecisionTreeClassifier(max_depth=i)
    # define ensemble model
        models[str(i)] = AdaBoostClassifier(base_estimator=base)
    
    #Explore learning rate
    for i in arange(0.1, 3, 0.1):
        key = '%.3f' % i
        models[key] = AdaBoostClassifier(learning_rate=i)

    models['stacking'] = get_stacking()
    return models

models = get_models()

#Supress warnings and set the seed
import warnings
warnings.filterwarnings("ignore")
random.seed(10)

#Iterate through the models and append the scores
from matplotlib import pyplot
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, Y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>10 0.973 (0.027)
>50 0.987 (0.020)
>100 0.984 (0.021)
>500 0.990 (0.016)
>1000 0.989 (0.018)
>5000 0.988 (0.017)
>1 0.814 (0.062)
>2 0.988 (0.018)
>3 0.984 (0.025)
>4 0.980 (0.024)
>5 0.965 (0.031)
>6 0.974 (0.031)
>7 0.967 (0.024)
>8 0.968 (0.026)
>9 0.965 (0.029)
>0.100 0.972 (0.032)
>0.200 0.966 (0.032)
>0.300 0.980 (0.022)
>0.400 0.983 (0.028)
>0.500 0.975 (0.028)
>0.600 0.984 (0.024)
>0.700 0.980 (0.023)
>0.800 0.984 (0.023)
>0.900 0.981 (0.025)
>1.000 0.813 (0.052)
>1.100 0.785 (0.046)
>1.200 0.794 (0.026)
>1.300 0.782 (0.058)
>1.400 0.803 (0.039)
>1.500 0.804 (0.066)
>1.600 0.856 (0.082)
>1.700 0.912 (0.078)
>1.800 0.934 (0.084)
>1.900 0.939 (0.082)
>2.000 0.929 (0.082)
>2.100 0.853 (0.097)
>2.200 0.805 (0.101)
>2.300 0.858 (0.111)
>2.400 0.892 (0.132)
>2.500 0.877 (0.112)
>2.600 0.927 (0.091)
>2.700 0.889 (0.119)
>2.800 0.901 (0.101)
>2.900 0.775 (0.149)
>stacking 0.993 (0.015)


## FINAL MODEL DEFINING AND BEST ACCURACY SCORE

In [148]:
#Start by plugging in all the values that we learned were better in the tuning above

#Define the stacking model used later on
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), algorithm='SAMME', learning_rate = .80)))
    level0.append(('rf', AdaBoostClassifier(base_estimator = RandomForestClassifier(), algorithm='SAMME', n_estimators = 1000 , learning_rate = .80)))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

#Define the models that are going to be iterated through later on 
def get_models():
    models = dict()
    models['stacking'] = get_stacking()
    return models

#Get the final models
models = get_models()

#Supress warnings and set the seed
import warnings
warnings.filterwarnings("ignore")
random.seed(10)

#Iterate through the models and append the scores
from matplotlib import pyplot
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, Y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>stacking 0.994 (0.014)
