Random Forest

In [19]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import pandas as pd

In [2]:

def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Paths of the pickled train/test objects.
path_X_train = "../Pickle/X_train.pickle"
path_y_train = "../Pickle/y_train.pickle"
path_X_test = "../Pickle/X_test.pickle"
path_y_test = "../Pickle/y_test.pickle"

# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
X_train = _read_pickle(path_X_train)
y_train = _read_pickle(path_y_train)
X_test = _read_pickle(path_X_test)
y_test = _read_pickle(path_y_test)

In [3]:
# Inspect the estimator's hyperparameters to see which ones can be tuned.
rf_params = RandomForestClassifier(random_state=8)
default_hyperparams = rf_params.get_params()
print(default_hyperparams)

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 'warn', 'n_jobs': None, 'oob_score': False, 'random_state': 8, 'verbose': 0, 'warm_start': False}


In [4]:
# Candidate hyperparameter values to try, in order to find out which ones
# make the model perform best.

# n_estimators: five evenly spaced integers from 200 to 1000
n_estimators = np.linspace(start=200, stop=1000, num=5).astype(int).tolist()

# max_features
max_features = ['auto', 'sqrt']

# max_depth: five evenly spaced integers from 20 to 100, plus None
max_depth = np.linspace(20, 100, num=5).astype(int).tolist() + [None]

# min_samples_split
min_samples_split = [2, 5, 10]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# bootstrap
bootstrap = [True, False]

# Assemble the search space, one entry per hyperparameter
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [20, 40, 60, 80, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [5]:
# Base model
rfc = RandomForestClassifier(n_jobs=-1, random_state=8)

# Randomized search settings: 50 sampled candidates, cv=3, scored by accuracy.
search_options = dict(
    param_distributions=random_grid,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=8,
)
random_search = RandomizedSearchCV(estimator=rfc, **search_options)

random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  8.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=-1, 

In [7]:
# Report the best parameter set found by the randomized search and its score.
rs_best_params = random_search.best_params_
rs_best_score = random_search.best_score_
print("Los mejores parametros son:")
print(rs_best_params, "\n")
print("Mean accuracy = :", rs_best_score)

Los mejores parametros son:
{'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False} 

Mean accuracy = : 0.7190114068441065


In [8]:
# Search space for the exhaustive search: only min_samples_leaf and
# min_samples_split vary (3 x 3 = 9 candidates); everything else is pinned.
# NOTE(review): the pinned values (max_depth=None, max_features="auto",
# n_estimators=200) differ from the best parameters reported by the
# randomized search in this notebook -- confirm this is intentional.
n_estimators = [200]
min_samples_split = [5, 10, 15]
min_samples_leaf = [1, 2, 4]
max_features = ["auto"]
max_depth = [None]
bootstrap = [False]

param_grid = dict(
    bootstrap=bootstrap,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    n_estimators=n_estimators,
)

# Base model
rfc = RandomForestClassifier(random_state=8)

# Cross validation: 3 shuffled splits, each with test_size=.33
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

grid_search = GridSearchCV(
    rfc,
    param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   28.5s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_...
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=8,
                                              verbose=0, warm_start=False),
  

In [9]:
# Report the best parameter set found by the grid search and its score.
gs_best_params = grid_search.best_params_
gs_best_score = grid_search.best_score_
print("Los mejores parametros son: ")
print(gs_best_params, "\n")
print("Mean accuracy = :", gs_best_score)

Los mejores parametros son: 
{'bootstrap': False, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200} 

Mean accuracy = : 0.7043010752688172


In [10]:
# Keep the randomized search's best estimator as the final model
# (in the recorded run it scored higher than the grid search's best).
best_rfc = random_search.best_estimator_

In [11]:
# The randomized search was created with scikit-learn's default refit=True,
# so best_estimator_ has already been retrained on all of (X_train, y_train).
# Fitting it again would rebuild the same forest (random_state is fixed) at
# the cost of another full training run, so just display the fitted model.
best_rfc

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=20, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [14]:
# Predictions of the selected model on X_test; reused by the evaluation cells below.
y_pred = best_rfc.predict(X_test)

In [15]:
# Training accuracy: score the selected model on the data it was fitted on
train_predictions = best_rfc.predict(X_train)
print("Training accuracy =", accuracy_score(y_train, train_predictions))

Training accuracy = 0.8973384030418251


In [16]:
# Accuracy of the stored test-set predictions
test_accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy =", test_accuracy)

Test accuracy = 0.7203647416413373


In [17]:
# Classification report for the test predictions, requested as a dict
# (output_dict=True) so it can be turned into a DataFrame afterwards.
report = classification_report(y_test, y_pred, output_dict=True)

In [20]:
# Build a frame from the report dict, then flip it so each report entry becomes a row
report_frame = pd.DataFrame(report)
df = report_frame.T

In [21]:
# Display the classification report table
df

Unnamed: 0,f1-score,precision,recall,support
0,0.771712,0.714943,0.838275,371.0
1,0.639216,0.730942,0.567944,287.0
accuracy,0.720365,0.720365,0.720365,0.720365
macro avg,0.705464,0.722942,0.70311,658.0
weighted avg,0.713921,0.721921,0.720365,658.0


In [22]:
# Baseline for comparison: a forest with no tuned hyperparameters (only the
# seed is set), fitted on the training data and scored on the test data.
base_model = RandomForestClassifier(random_state=8).fit(X_train, y_train)
base_predictions = base_model.predict(X_test)
accuracy_score(y_test, base_predictions)



0.6990881458966566

In [23]:
# Test accuracy of the tuned model, shown next to the baseline's score.
# best_rfc is already fitted on (X_train, y_train) by this point, and its
# fixed random_state means a refit would reproduce the same model, so the
# previous extra fit() call only repeated an expensive training run.
accuracy_score(y_test, best_rfc.predict(X_test))

0.7203647416413373

In [24]:
# Persist the selected model to disk as a pickle file
model_path = "../Modelos/best_rfc.pickle"
with open(model_path, "wb") as model_file:
    pickle.dump(best_rfc, model_file)