In [1]:
import pandas
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Load and split the data

In [2]:
# Pima Indians Diabetes data, fetched straight from the UCI repository.
# Because explicit column names are passed, pandas treats the first line of
# the file as data rather than as a header row.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
dataframe = pandas.read_csv(url, names=names)

# Drop down to the underlying NumPy array: the first eight columns are used
# as features and the ninth ('class') as the target.
array = dataframe.values
X, y = array[:, 0:8], array[:, 8]

In [3]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)

## Tuning RandomForest

In [4]:
# Seed the forest so that repeated runs of this cell pick the same
# hyper-parameters and report the same scores. Without a seed the bootstrap
# samples and feature sub-sampling differ on every run. The seed value
# matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=1)

# Candidate settings: forest size x maximum tree depth (3 x 2 = 6 combinations).
param_grid_rf = [{'n_estimators': [10, 20, 50], 'max_depth': [5, 10]}]

# 10-fold cross-validated accuracy, computed on the training split only.
# refit=True retrains the best configuration on all of X_train afterwards,
# which is what best_estimator_ exposes.
gs_rf = GridSearchCV(estimator=rf,
                     param_grid=param_grid_rf,
                     scoring='accuracy',
                     cv=10,
                     refit=True)

# fit() returns the search object itself, so no reassignment is needed.
gs_rf.fit(X_train, y_train)

# Mean cross-validated accuracy of the winning setting, and the setting itself.
print(gs_rf.best_score_)
print(gs_rf.best_params_)

0.781758957655
{'max_depth': 5, 'n_estimators': 10}


## Tuning Gradient Boosting Trees

In [5]:
# Seed the booster so that repeated runs of this cell pick the same
# hyper-parameters and report the same scores. The seed value matches the
# one used for the train/test split.
gbt = GradientBoostingClassifier(random_state=1)

# Candidate settings: number of boosting stages x maximum tree depth x
# learning rate (3 x 2 x 2 = 12 combinations).
param_grid_gbt = [{'n_estimators': [10, 20, 50], 'max_depth': [5, 10], 'learning_rate': [0.1, 0.01]}]

# 10-fold cross-validated accuracy, computed on the training split only.
# refit=True retrains the best configuration on all of X_train afterwards,
# which is what best_estimator_ exposes.
gs_gbt = GridSearchCV(estimator=gbt,
                      param_grid=param_grid_gbt,
                      scoring='accuracy',
                      cv=10,
                      refit=True)

# fit() returns the search object itself, so no reassignment is needed.
gs_gbt.fit(X_train, y_train)

# Mean cross-validated accuracy of the winning setting, and the setting itself.
print(gs_gbt.best_score_)
print(gs_gbt.best_params_)

0.755700325733
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 20}


In [6]:
# Keep the winning estimator from each grid search for the final evaluation.
best_rf, best_gbt = gs_rf.best_estimator_, gs_gbt.best_estimator_

## Obtain test set performance

In [7]:
# Predict with both tuned models on the training split and on the held-out
# test split.
y_pred_rf_train = best_rf.predict(X_train)
y_pred_gbt_train = best_gbt.predict(X_train)

y_pred_rf_test = best_rf.predict(X_test)
y_pred_gbt_test = best_gbt.predict(X_test)

# NOTE: the section banners say "Error", but every figure printed below is
# an accuracy score, so higher is better.
print("Training Error")
print('Random Forest Train Accuracy: %.3f'
      % accuracy_score(y_train, y_pred_rf_train))
print('Gradient Boosting Trees Train Accuracy: %.3f'
      % accuracy_score(y_train, y_pred_gbt_train))
print("---------------")
print("Testing Error")
print('Random Forest Test Accuracy: %.3f'
      % accuracy_score(y_test, y_pred_rf_test))
print('Gradient Boosting Trees Test Accuracy: %.3f'
      % accuracy_score(y_test, y_pred_gbt_test))

Training Error
Random Forest Train Accuracy: 0.845
Gradient Boosting Trees Train Accuracy: 0.930
---------------
Testing Error
Random Forest Test Accuracy: 0.753
Gradient Boosting Trees Test Accuracy: 0.727
