Regression

In [2]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import pandas as pd

In [4]:
path_X_train = "../Pickle/X_train.pickle"
with open(path_X_train, "rb") as data:
    X_train = pickle.load(data)

path_y_train = "../Pickle/y_train.pickle"
with open(path_y_train, "rb") as data:
    y_train = pickle.load(data)

path_X_test = "../Pickle/X_test.pickle"
with open(path_X_test, "rb") as data:
    X_test = pickle.load(data)

path_y_test = "../Pickle/y_test.pickle"
with open(path_y_test, 'rb') as data:
    y_test = pickle.load(data)

In [5]:
# Extracción parametros para ver cuáles se pueden modificar

lr_params = LogisticRegression(random_state = 8)

print(lr_params.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': 8, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [6]:
# Parametros aleatorios para determinar con cuáles puede funcionar mejor el modelo
# C
C = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)]

# multi_class
multi_class = ['multinomial']

# solver
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
 
# class_weight
class_weight = ['balanced', None]

# penalty
penalty = ['l2']

# Create the random grid
random_grid = {'C': C,
               'multi_class': multi_class,
               'solver': solver,
               'class_weight': class_weight,
               'penalty': penalty}

print(random_grid)

{'C': [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0], 'multi_class': ['multinomial'], 'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'], 'class_weight': ['balanced', None], 'penalty': ['l2']}


In [7]:
# Modelo base
lrc = LogisticRegression(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=lrc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    7.7s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='warn', n_jobs=None,
                                                penalty='l2', random_state=8,
                                                solver='warn', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='warn', n_iter=50, n_jobs=None,
                   param_distributions={'C': [0.1, 0.2, 0.30000000000000004,
                                              0.4, 0.5, 0.6, 0.7000000000000001,
                                              0.8, 0.9, 1.0],
                                        'class_weight':

In [8]:
print("Los mejores parametros son:")
print(random_search.best_params_, "\n")
print("Mean accuracy = :", random_search.best_score_)


Los mejores parametros son:
{'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'multinomial', 'class_weight': None, 'C': 0.5} 

Mean accuracy = : 0.7167300380228137


In [9]:
# Create the parameter grid based on the results of random search 
C = [float(x) for x in np.linspace(start = 0.6, stop = 1, num = 10)]
multi_class = ["multinomial"]
solver = ["lbfgs"]
class_weight = [None]
penalty = ["l2"]

param_grid = {"C": C,
               "multi_class": multi_class,
               "solver": solver,
               "class_weight": class_weight,
               "penalty": penalty}


# Modelo base
lrc = LogisticRegression(random_state=8)


# Cross validation
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)


grid_search = GridSearchCV(estimator=lrc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)


grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.4s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=8, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_st...
             param_grid={'C': [0.6, 0.6444444444444444, 0.6888888888888889,
                               0.7333333333333333, 0.7777777777777778,
                               0.8222222222222222, 0.8666666666666667,
                               0.9111111111111111, 0.9555555555555555, 1.0],
            

In [10]:
print("Los mejores parametros son: ")
print(grid_search.best_params_, "\n")
print("Mean accuracy = :", grid_search.best_score_)

Los mejores parametros son: 
{'C': 0.6, 'class_weight': None, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'} 

Mean accuracy = : 0.6966205837173579


In [11]:
best_lrc = random_search.best_estimator_

In [12]:
best_lrc.fit(X_train, y_train)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=8, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
y_pred = best_lrc.predict(X_test)

In [14]:
# Training accuracy
print("Training accuracy =", accuracy_score(y_train, best_lrc.predict(X_train)))

Training accuracy = 0.7311787072243346


In [15]:
# Test accuracy
print("Test accuracy =", accuracy_score(y_test, y_pred))

Test accuracy = 0.7203647416413373


In [16]:
report = classification_report(y_test, y_pred, output_dict=True)

In [17]:
df = pd.DataFrame(report).transpose()

In [18]:
df

Unnamed: 0,f1-score,precision,recall,support
0,0.770574,0.716937,0.832884,371.0
1,0.642023,0.726872,0.574913,287.0
accuracy,0.720365,0.720365,0.720365,0.720365
macro avg,0.706298,0.721905,0.703898,658.0
weighted avg,0.714504,0.721271,0.720365,658.0


In [19]:
base_model = LogisticRegression(random_state = 8)
base_model.fit(X_train, y_train)
accuracy_score(y_test, base_model.predict(X_test))



0.7203647416413373

In [20]:
best_lrc.fit(X_train, y_train)
accuracy_score(y_test, best_lrc.predict(X_test))

0.7203647416413373

In [22]:
with open("../Modelos/best_lrc.pickle", "wb") as output:
    pickle.dump(best_lrc, output)