https://www.cienciadedatos.net/documentos/py17-regresion-logistica-python.html

In [1]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
#from matplotlib import style
#import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score
#from sklearn.metrics import plot_confusion_matrix
import statsmodels.api as sm
#import statsmodels.formula.api as smf
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge

# Configuración matplotlib
# ==============================================================================
#plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
#plt.rcParams['savefig.bbox'] = "tight"
#style.use('ggplot') or plt.style.use('ggplot')

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
datos = pd.read_csv('rutUnicos_final.csv', sep=";")

In [3]:
print("Número de observaciones por clase")
print(datos['Gana_Gratuidad_final'].value_counts())
print("")

print("Porcentaje de observaciones por clase")
print(100 * datos['Gana_Gratuidad_final'].value_counts(normalize=True))

Número de observaciones por clase
0    1348934
1     654483
Name: Gana_Gratuidad_final, dtype: int64

Porcentaje de observaciones por clase
0    67.331664
1    32.668336
Name: Gana_Gratuidad_final, dtype: float64


In [4]:
# Reemplazo variables categoricas, utilizaremos LabelEncoder
#========================================================================
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, Normalizer
Columnas_Categoricas=['Mensualidad_Establecimiento_final', 'Region_Establecimiento_final','Segunda_Institucion_ES_final']
df_prep=datos
LabelsEnc = LabelEncoder()
for i in Columnas_Categoricas:
  df_prep[i] = LabelsEnc.fit_transform(datos[i])
datos=df_prep

In [7]:
# División de los datos en train y test PARA GRATUIDAD
# ==============================================================================
X = datos[['Anio_Nacimiento_final','Anios_Duracion_Carrera_Origen_final','Cantidad_asignaciones_final','Quintil_Final','Tipo_Carrera_Origen_cod']]
y = datos['Gana_Gratuidad_final']
X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y.values.reshape(-1,1),
                                        train_size   = 0.8, #regla 80-20%
                                        random_state = 1234,
                                        shuffle      = True
                                    )

In [8]:
# Creación del modelo utilizando matrices como en scikitlearn
# ==============================================================================
# A la matriz de predictores se le tiene que añadir una columna de 1s para el intercept del modelo
X_train = sm.add_constant(X_train, prepend=True)
modelo = sm.Logit(endog=y_train, exog=X_train)
modelo = modelo.fit()
print(modelo.summary2())

Optimization terminated successfully.
         Current function value: 0.340347
         Iterations 7
                                   Results: Logit
Model:                    Logit                  Pseudo R-squared:       0.461       
Dependent Variable:       y                      AIC:                    1090983.8496
Date:                     2021-07-27 15:50       BIC:                    1091057.5729
No. Observations:         1602733                Log-Likelihood:         -5.4549e+05 
Df Model:                 5                      LL-Null:                -1.0129e+06 
Df Residuals:             1602727                LLR p-value:            0.0000      
Converged:                1.0000                 Scale:                  1.0000      
No. Iterations:           7.0000                                                     
-------------------------------------------------------------------------------------
                                     Coef.  Std.Err.     z     P>|z|   [0.

In [9]:
#modelo = modelo.fit()
print(modelo.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:              1602733
Model:                          Logit   Df Residuals:                  1602727
Method:                           MLE   Df Model:                            5
Date:                Tue, 27 Jul 2021   Pseudo R-squ.:                  0.4614
Time:                        15:50:07   Log-Likelihood:            -5.4549e+05
converged:                       True   LL-Null:                   -1.0129e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
const                                  -4.4134      0.013   -347.342      0.000      -4.438      -4.389
Anio_Nacimiento_final                   0.0022   6.75e-0

In [8]:
# Accuracy de test del modelo 
# ==============================================================================
X_test = sm.add_constant(X_test, prepend=True)
predicciones = modelo.predict(exog = X_test)
clasificacion = np.where(predicciones<0.5, 0, 1)
accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = clasificacion,
            normalize = True
           )
print(f"El accuracy de test es: {100*accuracy}%")

El accuracy de test es: 83.50595481726248%


In [9]:
# Error de test del modelo 
# ==============================================================================
rmse_ols = mean_squared_error(
            y_true  = y_test,
            y_pred  = clasificacion,
            squared = False
           )
print("")
print(f"El error (rmse) de test es: {rmse_ols}")
#El error (rmse) de test es: 0.5075303204590079


El error (rmse) de test es: 0.40612861488372765


In [10]:
modelo2 = LogisticRegression(fit_intercept=True, penalty='l2', tol=1e-5, C=0.8, solver='lbfgs', max_iter=60,warm_start=True)
#modelo = LogisticRegression()

In [11]:
modelo2.fit(X_train, y_train)
preds_train = modelo2.predict(X_train)
preds_test = modelo2.predict(X_test)
#lr.coef_
print('accuracy in train:', accuracy_score(preds_train, y_train))
print('accuracy in test:', accuracy_score(preds_test, y_test))

accuracy in train: 0.8206975210468618
accuracy in test: 0.8217473120963154


In [12]:
# Predicciones test
# ==============================================================================
predicciones = modelo2.predict(X=X_test)
predicciones = predicciones.flatten()
predicciones[:10]

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 0])

In [13]:
# Creación y entrenamiento del modelo (con búsqueda por CV del valor óptimo alpha)
# ==============================================================================
# Por defecto RidgeCV utiliza el mean squared error
modelo2 = RidgeCV(
            alphas          = np.logspace(-10, 2, 200),
            fit_intercept   = True,
            normalize       = True,
            store_cv_values = True
         )

_ = modelo2.fit(X = X_train, y = y_train)

In [14]:
print('Classification report of our model\n')
print(classification_report(preds_test, y_test))

Classification report of our model

              precision    recall  f1-score   support

           0       0.88      0.86      0.87    275810
           1       0.70      0.74      0.72    124874

    accuracy                           0.82    400684
   macro avg       0.79      0.80      0.79    400684
weighted avg       0.82      0.82      0.82    400684



In [15]:
# Error de test del modelo 
# ==============================================================================
rmse_ols = mean_squared_error(
            y_true  = y_test,
            y_pred  = predicciones,
            squared = False
           )
print("")
print(f"El error (rmse) de test es: {rmse_ols}")


El error (rmse) de test es: 0.4221998198764238


In [16]:
coefs = pd.DataFrame(modelo2.coef_, columns=X_train.columns).T
coefs

Unnamed: 0,0
const,0.0
Anio_Nacimiento_final,0.000248
Anios_Duracion_Carrera_Origen_final,0.116762
Cantidad_asignaciones_final,0.124548
Quintil_Final,-0.090732
Tipo_Carrera_Origen_cod,-0.257085


In [17]:
np.exp(coefs[0])

const                                  1.000000
Anio_Nacimiento_final                  1.000248
Anios_Duracion_Carrera_Origen_final    1.123852
Cantidad_asignaciones_final            1.132637
Quintil_Final                          0.913262
Tipo_Carrera_Origen_cod                0.773303
Name: 0, dtype: float64

In [18]:
modelo2.intercept_

array([-0.16353548])

# Calculo para Cambio de carrera

In [19]:
datos = pd.read_csv('DatosGuardados.csv', sep=",")

In [20]:
print("Número de observaciones por clase")
print(datos['Cambia_IES_final'].value_counts())
print("")

print("Porcentaje de observaciones por clase")
print(100 * datos['Cambia_IES_final'].value_counts(normalize=True))

Número de observaciones por clase
0    559563
1     94571
Name: Cambia_IES_final, dtype: int64

Porcentaje de observaciones por clase
0    85.542565
1    14.457435
Name: Cambia_IES_final, dtype: float64


In [21]:
# División de los datos en train y test PARA GRATUIDAD
# ==============================================================================
X = datos[['Anios_Estudiando_Carrera_Origen_final','Anios_Estudiando_ES_final','Cantidad_Postulaciones_final']]
y = datos['Cambia_IES_final']
X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y.values.reshape(-1,1),
                                        train_size   = 0.8, #regla 80-20%
                                        random_state = 1234,
                                        shuffle      = True
                                    )

In [22]:
# Creación del modelo utilizando matrices como en scikitlearn
# ==============================================================================
# A la matriz de predictores se le tiene que añadir una columna de 1s para el intercept del modelo
X_train = sm.add_constant(X_train, prepend=True)
modelo = sm.Logit(endog=y_train, exog=X_train)
modelo = modelo.fit()
print(modelo.summary2())

Optimization terminated successfully.
         Current function value: 0.299974
         Iterations 7
                                    Results: Logit
Model:                     Logit                   Pseudo R-squared:        0.274      
Dependent Variable:        y                       AIC:                     313965.4180
Date:                      2021-07-26 20:12        BIC:                     314010.0897
No. Observations:          523307                  Log-Likelihood:          -1.5698e+05
Df Model:                  3                       LL-Null:                 -2.1632e+05
Df Residuals:              523303                  LLR p-value:             0.0000     
Converged:                 1.0000                  Scale:                   1.0000     
No. Iterations:            7.0000                                                      
---------------------------------------------------------------------------------------
                                       Coef.  Std.Err. 

In [23]:
#modelo = modelo.fit()
print(modelo.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               523307
Model:                          Logit   Df Residuals:                   523303
Method:                           MLE   Df Model:                            3
Date:                Mon, 26 Jul 2021   Pseudo R-squ.:                  0.2743
Time:                        20:12:58   Log-Likelihood:            -1.5698e+05
converged:                       True   LL-Null:                   -2.1632e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
const                                    -4.2541      0.015   -292.139      0.000      -4.283      -4.226
Anios_Estudiando_Carrera_Origen_final    -0.9676  

In [24]:
# Accuracy de test del modelo 
# ==============================================================================
X_test = sm.add_constant(X_test, prepend=True)
predicciones = modelo.predict(exog = X_test)
clasificacion = np.where(predicciones<0.5, 0, 1)
accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = clasificacion,
            normalize = True
           )
print(f"El accuracy de test es: {100*accuracy}%")

El accuracy de test es: 86.4546309248091%


In [25]:
# Error de test del modelo 
# ==============================================================================
rmse_ols = mean_squared_error(
            y_true  = y_test,
            y_pred  = clasificacion,
            squared = False
           )
print("")
print(f"El error (rmse) de test es: {rmse_ols}")
#El error (rmse) de test es: 0.5075303204590079


El error (rmse) de test es: 0.36804033848466805


In [26]:
modelo2 = LogisticRegression(fit_intercept=True, penalty='l2', tol=1e-5, C=0.8, solver='lbfgs', max_iter=60,warm_start=True)
#modelo = LogisticRegression()

In [27]:
modelo2.fit(X_train, y_train)
preds_train = modelo2.predict(X_train)
preds_test = modelo2.predict(X_test)
#lr.coef_
print('accuracy in train:', accuracy_score(preds_train, y_train))
print('accuracy in test:', accuracy_score(preds_test, y_test))

accuracy in train: 0.8636574706625365
accuracy in test: 0.864546309248091


In [28]:
# Predicciones test
# ==============================================================================
predicciones = modelo2.predict(X=X_test)
predicciones = predicciones.flatten()
predicciones[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [29]:
# Creación y entrenamiento del modelo (con búsqueda por CV del valor óptimo alpha)
# ==============================================================================
# Por defecto RidgeCV utiliza el mean squared error
modelo2 = RidgeCV(
            alphas          = np.logspace(-10, 2, 200),
            fit_intercept   = True,
            normalize       = True,
            store_cv_values = True
         )

_ = modelo2.fit(X = X_train, y = y_train)

In [30]:
print('Classification report of our model\n')
print(classification_report(preds_test, y_test))

Classification report of our model

              precision    recall  f1-score   support

           0       0.96      0.89      0.92    119791
           1       0.32      0.55      0.41     11036

    accuracy                           0.86    130827
   macro avg       0.64      0.72      0.67    130827
weighted avg       0.90      0.86      0.88    130827



In [31]:
# Error de test del modelo 
# ==============================================================================
rmse_ols = mean_squared_error(
            y_true  = y_test,
            y_pred  = predicciones,
            squared = False
           )
print("")
print(f"El error (rmse) de test es: {rmse_ols}")


El error (rmse) de test es: 0.36804033848466805


In [32]:
coefs = pd.DataFrame(modelo2.coef_, columns=X_train.columns).T
coefs

Unnamed: 0,0
const,0.0
Anios_Estudiando_Carrera_Origen_final,-0.147709
Anios_Estudiando_ES_final,0.176975
Cantidad_Postulaciones_final,0.052679


In [33]:
np.exp(coefs[0])

const                                    1.000000
Anios_Estudiando_Carrera_Origen_final    0.862682
Anios_Estudiando_ES_final                1.193601
Cantidad_Postulaciones_final             1.054092
Name: 0, dtype: float64

In [34]:
modelo2.intercept_

array([-0.12810989])

In [35]:
#!pip install session_info
import session_info
session_info.show()