In [49]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [50]:
# Load the ESS practice data and discard every row that has a missing value.
data_url = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv"
)
df = pd.read_csv(data_url).dropna()

# Define outcome and predictors.
# Subtract 1 from 'partner' so that the outcome is coded 0 and 1.
y = df['partner'] - 1

# Predictors: every column except the outcome, the country code and the id.
excluded_columns = ['partner', 'cntry', 'idno']
X = df[[column for column in df.columns if column not in excluded_columns]]

# Make the categorical variable 'country' into dummies.
country_dummies = pd.get_dummies(df['cntry'])
X = pd.concat([X, country_dummies], axis=1)

# Create training and test sets by row position (no shuffling).
offset = int(len(X) * 0.8)

# The first 80% of the rows form the training set...
X_train, y_train = X.iloc[:offset], y.iloc[:offset]

# ...and the remaining 20% form the test set.
X_test, y_test = X.iloc[offset:], y.iloc[offset:]

From example

Training set accuracy: <br/>
Percent Type I errors: 0.04650845608292417<br/>
Percent Type II errors: 0.17607746863066012

Test set accuracy:<br/>
Percent Type I errors: 0.06257668711656442<br/>
Percent Type II errors: 0.18527607361963191

# Testing iteration changes

In [51]:
# 100 iterations
# NOTE(review): 'deviance' appears to have been renamed 'log_loss' in newer
# scikit-learn releases -- confirm against the installed version.
params = {'n_estimators': 100,
          'max_depth': 5,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_1 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_1 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_1 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_1 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [52]:
# 500 iterations
params = {'n_estimators': 500,
          'max_depth': 5,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_2 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_2 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_2 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_2 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [53]:
# 700 iterations
params = {'n_estimators': 700,
          'max_depth': 5,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_3 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_3 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_3 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_3 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [54]:
# 1000 iterations, changed max depth to 5
params = {'n_estimators': 1000,
          'max_depth': 5,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_4 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_4 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_4 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_4 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [55]:
# Summarize the four runs: every group lists the rates of runs 1-4 in order,
# one value per line, formatted to four decimals.
error_groups = [
    ('Training set accuracy:\nPercent Type I errors:',
     (train_tI_errors_1, train_tI_errors_2, train_tI_errors_3, train_tI_errors_4)),
    ('Percent Type II errors:',
     (train_tII_errors_1, train_tII_errors_2, train_tII_errors_3, train_tII_errors_4)),
    ('Test set accuracy:\nPercent Type I errors:',
     (test_tI_errors_1, test_tI_errors_2, test_tI_errors_3, test_tI_errors_4)),
    ('Percent Type II errors:',
     (test_tII_errors_1, test_tII_errors_2, test_tII_errors_3, test_tII_errors_4)),
]

report_sections = []
for heading, rates in error_groups:
    report_lines = [heading] + ['{:.4f}'.format(rate) for rate in rates]
    report_sections.append('\n'.join(report_lines))

# Groups are separated by one blank line.
print('\n\n'.join(report_sections))

Training set accuracy:
Percent Type I errors:
0.0261
0.0049
0.0012
0.0003

Percent Type II errors:
0.1495
0.0640
0.0381
0.0129

Test set accuracy:
Percent Type I errors:
0.1141
0.1184
0.1399
0.1233

Percent Type II errors:
0.1411
0.1552
0.1448
0.1534


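The more iterations, the better — judging by the training-set error rates (the test-set error rates above do not improve as the number of iterations increases).

Setting iterations to 1000.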

# Testing Max Depth

In [78]:
# 5 max_depth
params = {'n_estimators': 1000,
          'max_depth': 5,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_1 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_1 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_1 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_1 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [79]:
# 20 max_depth
params = {'n_estimators': 1000,
          'max_depth': 20,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_2 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_2 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_2 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_2 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [80]:
# 50 max_depth
params = {'n_estimators': 1000,
          'max_depth': 50,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_3 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_3 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_3 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_3 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [81]:
# 100 max_depth
params = {'n_estimators': 1000,
          'max_depth': 100,
          'loss': 'deviance',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
train_tI_errors_4 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_4 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_4 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_4 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [82]:
# Summarize the four runs: every group lists the rates of runs 1-4 in order,
# one value per line, formatted to four decimals.
error_groups = [
    ('Training set accuracy:\nPercent Type I errors:',
     (train_tI_errors_1, train_tI_errors_2, train_tI_errors_3, train_tI_errors_4)),
    ('Percent Type II errors:',
     (train_tII_errors_1, train_tII_errors_2, train_tII_errors_3, train_tII_errors_4)),
    ('Test set accuracy:\nPercent Type I errors:',
     (test_tI_errors_1, test_tI_errors_2, test_tI_errors_3, test_tI_errors_4)),
    ('Percent Type II errors:',
     (test_tII_errors_1, test_tII_errors_2, test_tII_errors_3, test_tII_errors_4)),
]

report_sections = []
for heading, rates in error_groups:
    report_lines = [heading] + ['{:.4f}'.format(rate) for rate in rates]
    report_sections.append('\n'.join(report_lines))

# Groups are separated by one blank line.
print('\n\n'.join(report_sections))

Training set accuracy:
Percent Type I errors:
0.0003
0.0000
0.0000
0.0000

Percent Type II errors:
0.0123
0.0000
0.0000
0.0000

Test set accuracy:
Percent Type I errors:
0.1411
0.1190
0.1595
0.1613

Percent Type II errors:
0.1405
0.1687
0.1644
0.1663


# Testing exponential

In [83]:
# Exponential loss, with 1000 iterations and a max depth of 5.
params = {'n_estimators': 1000,
          'max_depth': 5,
          'loss': 'exponential',
          # Fixed seed so that refitting with the same settings reproduces
          # the same error rates.
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables: actual outcome (rows) against predicted outcome (columns);
# margins=True adds the 'All' row/column holding the totals.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# Type I error rate: actual 0 predicted as 1, as a share of all observations.
# Type II error rate: actual 1 predicted as 0, as a share of all observations.
# NOTE: the results are stored under the `_2` names, so they overwrite the
# values computed by the max_depth=20 run earlier in the notebook.
train_tI_errors_2 = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors_2 = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors_2 = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors_2 = table_test.loc[1.0,0.0]/table_test.loc['All','All']

In [85]:
# Report the single run: one rate per group, formatted to four decimals.
error_groups = [
    ('Training set accuracy:\nPercent Type I errors:', train_tI_errors_2),
    ('Percent Type II errors:', train_tII_errors_2),
    ('Test set accuracy:\nPercent Type I errors:', test_tI_errors_2),
    ('Percent Type II errors:', test_tII_errors_2),
]

report_sections = []
for heading, rate in error_groups:
    report_sections.append(heading + '\n' + '{:.4f}'.format(rate))

# Groups are separated by one blank line.
print('\n\n'.join(report_sections))

Training set accuracy:
Percent Type I errors:
0.0008

Percent Type II errors:
0.0153

Test set accuracy:
Percent Type I errors:
0.1067

Percent Type II errors:
0.1638


Exponential loss is slightly better than deviance loss.

Compared to the beginning example, the training set has very few errors, and the test set's Type II error rate has improved (its Type I error rate, however, is higher than in the example).

From example

Training set accuracy: <br/>
Percent Type I errors: 0.04650845608292417<br/>
Percent Type II errors: 0.17607746863066012

Test set accuracy:<br/>
Percent Type I errors: 0.06257668711656442<br/>
Percent Type II errors: 0.18527607361963191