## 02 Gradient boosting classifier

In [None]:
# import libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Gradient boosting classifier with default parameters

In [None]:
# Baseline: gradient boosting with the library's default hyper-parameters,
# scored on a single hold-out split.
from sklearn.metrics import accuracy_score

# target = the 'reinfection' column; features = every other column
y = rt_data['reinfection']
X = rt_data.drop(columns=['reinfection'])

# hold out 33% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# build the classifier and train it in one step (fit() returns the estimator)
gb = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# predict labels for the held-out rows
y_pred = gb.predict(X_test)

# report hold-out accuracy
default_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with default parameters : {0:0.4f}'.format(default_accuracy))

### Nested cross validation

In [None]:
# Nested cross-validation of the gradient boosting classifier.
#   outer loop (10 folds): estimates generalisation accuracy
#   inner loop (3 folds):  grid search for hyper-parameters on each outer
#                          training fold
# NOTE(review): KFold, np, mean and std are not imported in this section;
# presumably they are imported earlier in the notebook - verify.

# declare feature vector and target variable
X = rt_data.drop(['reinfection'], axis=1)
y = rt_data['reinfection']

# outer cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=0)

# inner cross-validation procedure; it is identical for every outer fold,
# so it is built once instead of once per iteration
cv_inner = KFold(n_splits=3, shuffle=True, random_state=0)

# base model; GridSearchCV fits clones of it, so one instance can be reused
gb = GradientBoostingClassifier(random_state=0)

# hyper-parameter search space (also identical for every outer fold)
# NOTE(review): criterion 'mse' was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of 'squared_error'; update this entry if the installed
# version is >= 1.0.
space = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [5, 10, 15, 20],
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 0.75, 1],
    'max_features': ['sqrt', 'log2', 5, 10, 15, 20],
    'criterion': ['friedman_mse', 'mse'],
}

# accuracy of the tuned model on each outer test fold
outer_results = list()

# pooled true labels / predictions over all outer test folds
y_test_all = np.array([])
yhat_all = np.array([])

for train_ix, test_ix in cv_outer.split(X):

    # split data; KFold yields *positional* indices, so both X and y must be
    # sliced with .iloc. A plain y[train_ix] is a label lookup on an
    # integer index and picks the wrong rows (or raises KeyError) whenever
    # rt_data does not carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # define and execute the search on the outer training fold only
    search = GridSearchCV(gb, space, scoring='accuracy', cv=cv_inner, refit=True)
    result = search.fit(X_train, y_train)

    # best configuration, refit on the whole outer training fold
    best_model = result.best_estimator_

    # evaluate the model on the held-out outer fold
    yhat = best_model.predict(X_test)
    acc = accuracy_score(y_test, yhat)

    # store the accuracy result
    outer_results.append(acc)

    # report progress
    print('>acc=%.4f, est=%.4f, cfg=%s' % (acc, result.best_score_, result.best_params_))

    # accumulate out-of-fold labels and predictions for the pooled report
    y_test_all = np.append(y_test_all, y_test)
    yhat_all = np.append(yhat_all, yhat)

# classification report and confusion matrix over all out-of-fold predictions;
# printed once, after every sample has been predicted exactly once, rather
# than on partial data after each fold
print(classification_report(y_test_all, yhat_all, digits=4))
print(confusion_matrix(y_test_all, yhat_all))

# print summarized estimated performance of the model
print('Accuracy: %.4f (%.4f)' % (mean(outer_results), std(outer_results)))

In [None]:
# Final hyper-parameter search: run the same 3-fold grid search used inside
# the nested cross-validation once more and print the selected model.
# NOTE(review): KFold is not imported in this section; presumably it is
# imported earlier in the notebook - verify.

# declare feature vector and target variable
X = rt_data.drop(['reinfection'], axis=1)
y = rt_data['reinfection']

# configure the cross-validation procedure used by the grid search
cv_inner = KFold(n_splits=3, shuffle=True, random_state=0)

# define the model
gbc = GradientBoostingClassifier(random_state=0)

# define search space
# NOTE(review): criterion 'mse' was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of 'squared_error'; update this entry if the installed
# version is >= 1.0.
space = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [5, 10, 15, 20],
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 0.75, 1],
    'max_features': ['sqrt', 'log2', 5, 10, 15, 20],
    'criterion': ['friedman_mse', 'mse'],
}

# define search
search = GridSearchCV(gbc, space, scoring='accuracy', cv=cv_inner, refit=True)

# execute the search on the X / y declared in this cell. The previous version
# fitted on X_train / y_train, which this cell never defines: it silently
# reused whatever an earlier cell left behind (the last outer fold of the
# nested CV loop, or the hold-out split), so the result depended on the order
# in which cells had been run.
result = search.fit(X, y)

# best performing configuration, refit on all of the data passed to fit()
best_model = result.best_estimator_

# Get parameters of the best model
print(best_model)