In [None]:
# Proceso para entrenar un modelo gradient boosting y elección de hiperparametros

In [1]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import ShuffleSplit

In [5]:
path_X_train = "../pickle/X_train.pickle"
with open(path_X_train, "rb") as data:
    X_train = pickle.load(data)


path_y_train = "../pickle/y_train.pickle"
with open(path_y_train, "rb") as data:
    y_train = pickle.load(data)


path_X_test = "../pickle/X_test.pickle"
with open(path_X_test, "rb") as data:
    X_test = pickle.load(data)


path_y_test = "../pickle/y_test.pickle"
with open(path_y_test, "rb") as data:
    y_test = pickle.load(data)

In [6]:
# Extracción parametros para ver cuáles se pueden modificar

gb_params = GradientBoostingClassifier(random_state = 8)
print(gb_params.get_params())

{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'auto', 'random_state': 8, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [7]:
# Elección de algunos parametros para determinar con cuáles puede funcionar mejor el modelo
# n_estimators
n_estimators = [200, 800]

# max_features
max_features = ['auto', 'sqrt']

# max_depth
max_depth = [10, 40]
max_depth.append(None)

# min_samples_split
min_samples_split = [10, 30, 50]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# learning rate
learning_rate = [.1, .5]

# subsample
subsample = [.5, 1.]

# random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate,
               'subsample': subsample}

print(random_grid)

{'n_estimators': [200, 800], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 40, None], 'min_samples_split': [10, 30, 50], 'min_samples_leaf': [1, 2, 4], 'learning_rate': [0.1, 0.5], 'subsample': [0.5, 1.0]}


In [8]:
# Modelo base
gbc = GradientBoostingClassifier(random_state=8)


random_search = RandomizedSearchCV(estimator=gbc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [7]:
print("Los mejores parametros son:")
print(random_search.best_params_)
print("\n")
print("La mean accuracy con esos parametros es: ")
print(random_search.best_score_)

The best hyperparameters from Random Search are:
{'subsample': 1.0, 'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.1}

The mean accuracy of a model with these hyperparameters is:
0.6969581749049429


In [9]:
max_depth = [5, 10, 15]
max_features = ['sqrt']
min_samples_leaf = [1]
min_samples_split = [10, 20]
n_estimators = [200]
learning_rate = [.1, .5]
subsample = [.5]

param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'subsample': subsample

}

# Create a base model
gbc = GradientBoostingClassifier(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=gbc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   21.2s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_sa...
                                                  tol=0.0001,
                                                  validation_fraction=0.1,
                                                  verbose=0, warm_st

In [10]:
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'learning_rate': 0.1, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 200, 'subsample': 0.5}

The mean accuracy of a model with these hyperparameters is:
0.6824116743471582


In [12]:
best_gbc = random_search.best_estimator_

In [13]:
best_gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=10,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=10,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=8, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [14]:
y_pred = best_gbc.predict(X_test)

In [15]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(y_train, best_gbc.predict(X_train)))

The training accuracy is: 
0.9821292775665399


In [16]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test, y_pred))

The test accuracy is: 
0.7036474164133738


In [17]:
# Classification report
print("Classification report")
print(classification_report(y_test,y_pred))

Classification report
              precision    recall  f1-score   support

           0       0.72      0.79      0.75       371
           1       0.68      0.60      0.64       287

    accuracy                           0.70       658
   macro avg       0.70      0.69      0.69       658
weighted avg       0.70      0.70      0.70       658



In [18]:
base_model = GradientBoostingClassifier(random_state = 8)
base_model.fit(X_train, y_train)
accuracy_score(y_test, base_model.predict(X_test))

0.7127659574468085

In [19]:
best_gbc.fit(X_train, y_train)
accuracy_score(y_test, best_gbc.predict(X_test))

0.7036474164133738

In [20]:
with open("models/best_gbc.pickle", "wb") as output:
    pickle.dump(base_model, output)