In [172]:
import pandas as pd
import datetime
import warnings
import os
import statsmodels.formula.api as smf
import numpy as np
import seaborn as sns

# https://www.andrewvillazon.com/logistic-regression-python-statsmodels/#examining-fit-results

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('float_format', '{:f}'.format)

In [173]:
data_train = pd.read_parquet('../data/processed/train.gzip')
data_sub = pd.read_parquet('../data/processed/test.gzip')

list(data_train.columns)

['ID',
 'TARGET',
 'CO_TIPO_SEXO',
 'EDAD',
 'NO_DEPARTAMENTO',
 'ANIO_BANCARIZACION',
 'MES_BANCARIZACION',
 'CANT_EMP_NEG_max',
 'CANT_EMP_NEG_last',
 'CANT_EMP_NEG_min',
 'CANT_EMP_CONS_max',
 'CANT_EMP_CONS_last',
 'CANT_EMP_CONS_min',
 'CANT_EMP_HIPOT_max',
 'CANT_EMP_HIPOT_last',
 'CANT_EMP_HIPOT_min',
 'SALDO_MED_EMP_mean',
 'SALDO_MED_EMP_median',
 'SALDO_MED_EMP_last',
 'SALDO_MED_EMP_min',
 'SALDO_MED_EMP_max',
 'SALDO_MED_EMP_first',
 'SALDO_PEQ_EMP_mean',
 'SALDO_PEQ_EMP_median',
 'SALDO_PEQ_EMP_last',
 'SALDO_PEQ_EMP_min',
 'SALDO_PEQ_EMP_max',
 'SALDO_PEQ_EMP_first',
 'SALDO_MIC_EMP_mean',
 'SALDO_MIC_EMP_median',
 'SALDO_MIC_EMP_last',
 'SALDO_MIC_EMP_min',
 'SALDO_MIC_EMP_max',
 'SALDO_MIC_EMP_first',
 'SALDO_CONS_REV_mean',
 'SALDO_CONS_REV_median',
 'SALDO_CONS_REV_last',
 'SALDO_CONS_REV_min',
 'SALDO_CONS_REV_max',
 'SALDO_CONS_REV_first',
 'SALDO_CONS_NO_REV_mean',
 'SALDO_CONS_NO_REV_median',
 'SALDO_CONS_NO_REV_last',
 'SALDO_CONS_NO_REV_min',
 'SALDO_CONS_NO_REV

In [174]:
data_sub_id = data_sub[['ID']]

categorical_feats = ['CO_TIPO_SEXO', 'NO_DEPARTAMENTO', 'ANIO_BANCARIZACION', 'MES_BANCARIZACION']
for col in categorical_feats:
    data_train[col] = data_train[col].astype('category')
    data_sub[col] = data_sub[col].astype('category')

TARGET_VAR = 'TARGET'
N_FOLDS = 5
N_JOBS = os.cpu_count() // 2.5

In [175]:
exclude_substr_list = ['CANT_', '_max', 'VAR_', 'DIFF_', 'min', 'max', 'first', 'last', 'median', 'MED_EMP', 'PEQ_EMP', 'MIC_EMP', 'REV', 'HIPOT', 'VENCIDO', 'DOLA_NEG', 'EDAD']  # 'TOTAL', 

data_train =  data_train.select_dtypes(include=['int64', 'float64'])
data_train =  data_train[[col for col in data_train.columns if all([sub not in col for sub in exclude_substr_list])]]

data_sub = data_sub.select_dtypes(include=['int64', 'float64'])
data_sub = data_sub[[col for col in data_sub.columns if all([sub not in col for sub in exclude_substr_list])]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_train.drop(columns=[TARGET_VAR]), data_train[TARGET_VAR], test_size=0.3, random_state=42)

df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# display(df_train)
# display(df_test)
list(df_train.columns)

['SALDO_DOLA_CONS_mean',
 'MAX_LINEA_DISP_U6M_mean',
 'SALDO_EMP_TOTAL_mean',
 'TARGET']

In [176]:
list(data_train.columns)

['TARGET',
 'SALDO_DOLA_CONS_mean',
 'MAX_LINEA_DISP_U6M_mean',
 'SALDO_EMP_TOTAL_mean']

In [177]:
independents = ' + '.join([col for col in data_train.columns if col != TARGET_VAR])

log_reg = smf.logit(f"TARGET ~ {independents}", data=df_train).fit()

print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.383715
         Iterations 9


                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:                63828
Model:                          Logit   Df Residuals:                    63824
Method:                           MLE   Df Model:                            3
Date:                Fri, 17 Nov 2023   Pseudo R-squ.:                 0.09171
Time:                        17:53:25   Log-Likelihood:                -24492.
converged:                       True   LL-Null:                       -26965.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                  -2.0971      0.014   -153.634      0.000      -2.124      -2.070
SALDO_DOLA_CONS_mean     3.924e-05   1.62e-05      2.423      0.015     7.5e-06     7.1e-05


In [178]:
# do classification report
from sklearn.metrics import classification_report

y_pred = log_reg.predict(df_train.drop(columns=[TARGET_VAR]))
y_pred = (y_pred > 0.5).astype(int)
print(classification_report(df_train[TARGET_VAR], y_pred))


y_pred = log_reg.predict(df_test.drop(columns=[TARGET_VAR]))
y_pred = (y_pred > 0.5).astype(int)
print(classification_report(df_test[TARGET_VAR], y_pred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.92     54263
           1       0.67      0.13      0.21      9565

    accuracy                           0.86     63828
   macro avg       0.77      0.56      0.57     63828
weighted avg       0.84      0.86      0.82     63828

              precision    recall  f1-score   support

           0       0.87      0.99      0.92     23242
           1       0.65      0.13      0.22      4113

    accuracy                           0.86     27355
   macro avg       0.76      0.56      0.57     27355
weighted avg       0.83      0.86      0.82     27355



In [179]:
print(dir(log_reg))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_data_in_cache', '_get_endog_name', '_get_robustcov_results', '_get_wald_nonlinear', '_transform_predict_exog', '_use_t', 'aic', 'bic', 'bse', 'conf_int', 'converged', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'f_test', 'fittedvalues', 'get_distribution', 'get_influence', 'get_margeff', 'get_prediction', 'im_ratio', 'info_criteria', 'initialize', 'k_constant', 'llf', 'llnull', 'llr', 'llr_pvalue', 'load', 'method', 'mle_retvals', 'mle_settings', 'model', 'nobs', 'normalized_cov_params', 'params', 'pred_table', 'predict', 'prsquared', 'pvalues', 'remove_data', 'resid_dev', 'resid_generali

In [180]:
odds_ratios = pd.DataFrame(
    {
        "OR": log_reg.params,
        "Lower CI": log_reg.conf_int()[0],
        "Upper CI": log_reg.conf_int()[1],
    }
)
odds_ratios = np.exp(odds_ratios)

print(odds_ratios)

                              OR  Lower CI  Upper CI
Intercept               0.122815  0.119573  0.126145
SALDO_DOLA_CONS_mean    1.000039  1.000007  1.000071
MAX_LINEA_DISP_U6M_mean 1.000020  1.000020  1.000021
SALDO_EMP_TOTAL_mean    1.000003  1.000002  1.000003


In [181]:
print(log_reg.params)

Intercept                 -2.097074
SALDO_DOLA_CONS_mean       0.000039
MAX_LINEA_DISP_U6M_mean    0.000020
SALDO_EMP_TOTAL_mean       0.000003
dtype: float64
