Proceso para entrenar un modelo gradient boosting y elección de hiperparametros

In [31]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import ShuffleSplit
from yellowbrick.classifier import ClassificationReport
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
path_X_train = "../pickle/X_train.pickle"
with open(path_X_train, "rb") as data:
    X_train = pickle.load(data)


path_y_train = "../pickle/y_train.pickle"
with open(path_y_train, "rb") as data:
    y_train = pickle.load(data)


path_X_test = "../pickle/X_test.pickle"
with open(path_X_test, "rb") as data:
    X_test = pickle.load(data)


path_y_test = "../pickle/y_test.pickle"
with open(path_y_test, "rb") as data:
    y_test = pickle.load(data)

In [3]:
# Extracción parametros para ver cuáles se pueden modificar

gb_params = GradientBoostingClassifier(random_state = 8)
print(gb_params.get_params())

{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'auto', 'random_state': 8, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [4]:
# Elección de algunos parametros para determinar con cuáles puede funcionar mejor el modelo
# n_estimators
n_estimators = [200, 800]

# max_features
max_features = ["auto", "sqrt"]

# max_depth
max_depth = [10, 40]
max_depth.append(None)

# min_samples_split
min_samples_split = [10, 30, 50]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# learning rate
learning_rate = [0.1, 0.5]

# subsample
subsample = [0.5, 1.0]

# random grid
random_grid = {"n_estimators": n_estimators,
               "max_features": max_features,
               "max_depth": max_depth,
               "min_samples_split": min_samples_split,
               "min_samples_leaf": min_samples_leaf,
               "learning_rate": learning_rate,
               "subsample": subsample}

In [5]:
# Modelo base

gbc = GradientBoostingClassifier(random_state = 8)


random_search = RandomizedSearchCV(estimator = gbc,
                                   param_distributions = random_grid,
                                   n_iter = 50,
                                   scoring = "accuracy",
                                   cv = 3, 
                                   verbose = 1, 
                                   random_state = 8)

# Fit
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 21.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                     

In [6]:
print("Los mejores parametros son:")
print(random_search.best_params_, "\n")
print("Mean accuracy = :", random_search.best_score_)

Los mejores parametros son:
{'subsample': 1.0, 'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.1} 

Mean accuracy = : 0.6969581749049429


In [7]:
max_depth = [5, 10, 15]
max_features = ["sqrt"]
min_samples_leaf = [1]
min_samples_split = [10, 20]
n_estimators = [200]
learning_rate = [0.1, 0.5]
subsample = [0.5]

param_grid = {
    "max_depth": max_depth,
    "max_features": max_features,
    "min_samples_leaf": min_samples_leaf,
    "min_samples_split": min_samples_split,
    "n_estimators": n_estimators,
    "learning_rate": learning_rate,
    "subsample": subsample

}

# modelo base
gbc = GradientBoostingClassifier(random_state=8)

# Cross Validation
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)


grid_search = GridSearchCV(estimator = gbc, 
                           param_grid = param_grid,
                           scoring = "accuracy",
                           cv = cv_sets,
                           verbose = 1)


grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   22.1s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_sa...
                                                  tol=0.0001,
                                                  validation_fraction=0.1,
                                                  verbose=0, warm_st

In [8]:
print("Los mejores parametros son: ")
print(grid_search.best_params_, "\n")
print("Mean accuracy = :", grid_search.best_score_)

Los mejores parametros son: 
{'learning_rate': 0.1, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 200, 'subsample': 0.5} 

Mean accuracy = : 0.6824116743471582


In [9]:
best_gbc = random_search.best_estimator_

In [10]:
best_gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=10,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=10,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=8, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [11]:
y_pred = best_gbc.predict(X_test)

In [26]:
# Training accuracy
print("Training accuracy =", accuracy_score(y_train, best_gbc.predict(X_train)))

Training accuracy = 0.9821292775665399


In [25]:
# Test accuracy
print("Test accuracy =", accuracy_score(y_test, y_pred))

Test accuracy =  0.7036474164133738


In [29]:
report = classification_report(y_test, y_pred, output_dict=True)

In [33]:
df = pd.DataFrame(report).transpose()

In [34]:
df

Unnamed: 0,f1-score,precision,recall,support
0,0.749679,0.715686,0.787062,371.0
1,0.636872,0.684,0.595819,287.0
accuracy,0.703647,0.703647,0.703647,0.703647
macro avg,0.693275,0.699843,0.69144,658.0
weighted avg,0.700476,0.701866,0.703647,658.0


In [36]:
base_model = GradientBoostingClassifier(random_state = 8)
base_model.fit(X_train, y_train)
accuracy_score(y_test, base_model.predict(X_test))

0.7127659574468085

In [37]:
best_gbc.fit(X_train, y_train)
accuracy_score(y_test, best_gbc.predict(X_test))

0.7036474164133738

In [38]:
with open("../Modelos/best_gbc.pickle", "wb") as output:
    pickle.dump(base_model, output)