# XGBoost Activity

In [54]:
import random
import pandas as pd

#Set the seed for analysis
random.seed(10)

#Import data 
mydata = pd.read_csv("/Users/jaredmcmullen/Desktop/GSB-S545/data/penguins_size.csv")

#Drop any na values 
mydata = mydata.dropna()

#print dataset
mydata

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


## Import XGB and get variables for models

In [19]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

for label in mydata.columns:
    mydata[label] = LabelEncoder().fit(mydata[label]).transform(mydata[label])
Y = mydata['species']
X = mydata.drop(['species'],axis=1)

## Import packages and get baseline models 

In [41]:
#Select all the models you want to test
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

#Define the stacking model used later on
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', DecisionTreeClassifier()))
    level0.append(('rf', RandomForestClassifier()))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

#Define the models that are going to be iterated through later on 
def get_models():
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['rf'] = RandomForestClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    models['lr_boost'] = AdaBoostClassifier(base_estimator = LogisticRegression())
    models['cart_boost'] = AdaBoostClassifier(base_estimator = DecisionTreeClassifier())
    models['rf_boost'] = AdaBoostClassifier(base_estimator = RandomForestClassifier())
    models['lr_xgboost'] = XGBClassifier(base_estimator = LogisticRegression())
    models['cart_xgboost'] = XGBClassifier(base_estimator = DecisionTreeClassifier())
    models['rf_xgboost'] = XGBClassifier(base_estimator = RandomForestClassifier())
    models['stacking'] = get_stacking()
    return models

models = get_models()

In [42]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

#Create a function to iterate through different models
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

## Run through all baseline models and find the best performers amongst classification models

In [44]:
#Supress warnings and set the seed
import warnings
warnings.filterwarnings("ignore")
random.seed(10)

#Iterate through the models and append the scores
from matplotlib import pyplot
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, Y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>lr 0.994 (0.014)
>knn 0.987 (0.020)
>cart 0.963 (0.039)
>rf 0.990 (0.018)
>svm 0.986 (0.017)
>bayes 0.925 (0.035)
>lr_boost 0.916 (0.065)
>cart_boost 0.967 (0.029)
>rf_boost 0.987 (0.021)
>lr_xgboost 0.986 (0.018)
>cart_xgboost 0.985 (0.019)
>rf_xgboost 0.987 (0.017)
>stacking 0.994 (0.012)


## Stacking is the best performer, so put all of the best scoring models in the stacking model

In [53]:
#Define the stacking model used later on
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart_xgboost', XGBClassifier(base_estimator = DecisionTreeClassifier())))
    level0.append(('rf', RandomForestClassifier()))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

#Define the models that are going to be iterated through later on 
def get_models():
    models = dict()
    models['stacking'] = get_stacking()
    return models

models = get_models()

#Iterate through the models and append the scores
from matplotlib import pyplot
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, Y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>stacking 0.995 (0.011)
