In [44]:
import pandas as pd
import datetime
import warnings
import os
import statsmodels.formula.api as smf
import numpy as np
import seaborn as sns

# https://www.andrewvillazon.com/logistic-regression-python-statsmodels/#examining-fit-results

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('float_format', '{:f}'.format)

In [45]:
data_train = pd.read_parquet('../data/processed/train.gzip')
data_sub = pd.read_parquet('../data/processed/test.gzip')

data_train.columns = [col.replace('-', '_') for col in data_train.columns]
data_sub.columns = [col.replace('-', '_') for col in data_sub.columns]

list(data_train.columns)

['ID',
 'TARGET',
 'CO_TIPO_SEXO',
 'EDAD',
 'NO_DEPARTAMENTO',
 'ANIO_BANCARIZACION',
 'MES_BANCARIZACION',
 'CANT_EMP_NEG_max',
 'CANT_EMP_NEG_last',
 'CANT_EMP_NEG_min',
 'CANT_EMP_CONS_max',
 'CANT_EMP_CONS_last',
 'CANT_EMP_CONS_min',
 'CANT_EMP_HIPOT_max',
 'CANT_EMP_HIPOT_last',
 'CANT_EMP_HIPOT_min',
 'SALDO_MED_EMP_mean',
 'SALDO_MED_EMP_median',
 'SALDO_MED_EMP_last',
 'SALDO_MED_EMP_min',
 'SALDO_MED_EMP_max',
 'SALDO_MED_EMP_first',
 'SALDO_PEQ_EMP_mean',
 'SALDO_PEQ_EMP_median',
 'SALDO_PEQ_EMP_last',
 'SALDO_PEQ_EMP_min',
 'SALDO_PEQ_EMP_max',
 'SALDO_PEQ_EMP_first',
 'SALDO_MIC_EMP_mean',
 'SALDO_MIC_EMP_median',
 'SALDO_MIC_EMP_last',
 'SALDO_MIC_EMP_min',
 'SALDO_MIC_EMP_max',
 'SALDO_MIC_EMP_first',
 'SALDO_CONS_REV_mean',
 'SALDO_CONS_REV_median',
 'SALDO_CONS_REV_last',
 'SALDO_CONS_REV_min',
 'SALDO_CONS_REV_max',
 'SALDO_CONS_REV_first',
 'SALDO_CONS_NO_REV_mean',
 'SALDO_CONS_NO_REV_median',
 'SALDO_CONS_NO_REV_last',
 'SALDO_CONS_NO_REV_min',
 'SALDO_CONS_NO_REV

In [46]:
zona_departamento = {
    'Norte': ['LAMBAYEQUE', 'CAJAMARCA', 'PIURA', 'TUMBES', 'LA LIBERTAD', 'AMAZONAS', 'SAN MARTIN'],
    'Centro': ['JUNIN', 'ANCASH', 'HUANUCO', 'HUANCAVELICA', 'PASCO', 'AYACUCHO', 'UCAYALI'],
    'Sur': ['AREQUIPA', 'MOQUEGUA', 'TACNA', 'CUSCO', 'PUNO', 'MADRE DE DIOS', 'ICA', 'APURIMAC', 'LIMA', 'CALLAO']
}

departamento_a_region = { v: k for k, values in zona_departamento.items() for v in values}

def asignar_region(departamento):
    return departamento_a_region.get(departamento, 'DESCONOCIDO')


In [47]:
data_sub_id = data_sub[['ID']]

categorical_feats = ['CO_TIPO_SEXO', 'NO_DEPARTAMENTO']
for col in categorical_feats:
    data_train[col] = data_train[col].astype('category')
    data_sub[col] = data_sub[col].astype('category')

TARGET_VAR = 'TARGET'
N_FOLDS = 5
N_JOBS = os.cpu_count() // 2.5

In [48]:
exclude_substr_list = [
    'CANT_', 'VAR_', 'DIFF_', 
    'min', 'max', 'first', 'last', 'median', 
    'MED_EMP', 'PEQ_EMP', 'MIC_EMP', 
    'REV', 'HIPOT', 'VENCIDO', 'DOLA_NEG', 
    'EDAD', 'ANIO_BANCARIZACION', 'MES_BANCARIZACION'
    ]

data_train =  data_train.select_dtypes(include=['int64', 'float64', 'category'])
data_train =  data_train[[col for col in data_train.columns if all([sub not in col for sub in exclude_substr_list])]]

data_sub = data_sub.select_dtypes(include=['int64', 'float64', 'category'])
data_sub = data_sub[[col for col in data_sub.columns if all([sub not in col for sub in exclude_substr_list])]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_train.drop(columns=[TARGET_VAR]), data_train[TARGET_VAR], test_size=0.3, random_state=42)

df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# display(df_train)
# display(df_test)


list(df_train.columns)

['CO_TIPO_SEXO',
 'NO_DEPARTAMENTO',
 'SALDO_DOLA_CONS_mean',
 'MAX_LINEA_DISP_U6M_mean',
 'SALDO_EMP_TOTAL_mean',
 'TARGET']

In [49]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


df_train['REGION'] = df_train['NO_DEPARTAMENTO'].apply(asignar_region)
df_test['REGION'] = df_test['NO_DEPARTAMENTO'].apply(asignar_region)
data_sub['REGION'] = data_sub['NO_DEPARTAMENTO'].apply(asignar_region)

In [50]:
df_train['REGION'].value_counts()

DESCONOCIDO    32157
Norte          14617
Centro          9166
Sur             7888
Name: REGION, dtype: int64

In [51]:
df_train.head()

Unnamed: 0,CO_TIPO_SEXO,NO_DEPARTAMENTO,SALDO_DOLA_CONS_mean,MAX_LINEA_DISP_U6M_mean,SALDO_EMP_TOTAL_mean,TARGET,REGION
46914,2,LAMBAYEQUE,1.009599,8798.533333,1596.1068,0,Norte
14038,2,LAMBAYEQUE,0.0,0.0,104388.659156,0,Norte
19138,2,CAJAMARCA,0.0,0.0,0.0,0,Norte
3989,2,LAMBAYEQUE,0.0,5100.944444,0.0,0,Norte
18716,3,,0.0,4546.666667,0.0,0,DESCONOCIDO


In [52]:
independents = ' + '.join([
    col for col in df_train.columns if col != TARGET_VAR
]) # [col for col in data_train.columns if col != TARGET_VAR]

log_reg = smf.logit(f"TARGET ~ {independents}", data=df_train, method='bfgs').fit_regularized()

print(log_reg.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.37388972245731766
            Iterations: 209
            Function evaluations: 236
            Gradient evaluations: 209
                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:                63828
Model:                          Logit   Df Residuals:                    63794
Method:                           MLE   Df Model:                           33
Date:                Sat, 18 Nov 2023   Pseudo R-squ.:                  0.1150
Time:                        13:56:52   Log-Likelihood:                -23865.
converged:                       True   LL-Null:                       -26965.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------

In [53]:
# do classification report
from sklearn.metrics import classification_report

y_pred = log_reg.predict(df_train.drop(columns=[TARGET_VAR]))
y_pred = (y_pred > 0.2).astype(int)
print(classification_report(df_train[TARGET_VAR], y_pred))


y_pred = log_reg.predict(df_test.drop(columns=[TARGET_VAR]))
y_pred = (y_pred > 0.2).astype(int)
print(classification_report(df_test[TARGET_VAR], y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89     54263
           1       0.40      0.41      0.40      9565

    accuracy                           0.82     63828
   macro avg       0.65      0.65      0.65     63828
weighted avg       0.82      0.82      0.82     63828

              precision    recall  f1-score   support

           0       0.90      0.89      0.89     23242
           1       0.40      0.42      0.41      4113

    accuracy                           0.82     27355
   macro avg       0.65      0.65      0.65     27355
weighted avg       0.82      0.82      0.82     27355



In [54]:
print(dir(log_reg))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_data_in_cache', '_get_endog_name', '_get_robustcov_results', '_get_wald_nonlinear', '_transform_predict_exog', '_use_t', 'aic', 'bic', 'bse', 'conf_int', 'converged', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'f_test', 'fittedvalues', 'get_distribution', 'get_influence', 'get_margeff', 'get_prediction', 'im_ratio', 'info_criteria', 'initialize', 'k_constant', 'llf', 'llnull', 'llr', 'llr_pvalue', 'load', 'method', 'mle_retvals', 'mle_settings', 'model', 'nnz_params', 'nobs', 'normalized_cov_params', 'params', 'pred_table', 'predict', 'prsquared', 'pvalues', 'remove_data', 'resid_dev', '

In [55]:
odds_ratios = pd.DataFrame(
    {
        "OR": log_reg.params,
        "Lower CI": log_reg.conf_int()[0],
        "Upper CI": log_reg.conf_int()[1],
    }
)
odds_ratios = np.exp(odds_ratios)

print(odds_ratios)

                                       OR  Lower CI  Upper CI
Intercept                        0.095712       NaN       NaN
CO_TIPO_SEXO[T.2]                1.216985  1.153202  1.284296
CO_TIPO_SEXO[T.3]                0.613872  0.576042  0.654188
NO_DEPARTAMENTO[T.ANCASH]        1.454510       NaN       NaN
NO_DEPARTAMENTO[T.APURIMAC]      1.004178  0.000000       inf
NO_DEPARTAMENTO[T.AREQUIPA]      0.918071  0.000000       inf
NO_DEPARTAMENTO[T.AYACUCHO]      0.527220       NaN       NaN
NO_DEPARTAMENTO[T.CAJAMARCA]     1.326039  0.900463  1.952750
NO_DEPARTAMENTO[T.CALLAO]        1.031637  0.000000       inf
NO_DEPARTAMENTO[T.CUSCO]         0.734552  0.000000       inf
NO_DEPARTAMENTO[T.HUANCAVELICA]  1.199603       NaN       NaN
NO_DEPARTAMENTO[T.HUANUCO]       1.012692       NaN       NaN
NO_DEPARTAMENTO[T.ICA]           1.289610  0.000000       inf
NO_DEPARTAMENTO[T.JUNIN]         1.135406       NaN       NaN
NO_DEPARTAMENTO[T.LA LIBERTAD]   1.242486  0.829967  1.860039
NO_DEPAR

In [56]:
print(log_reg.params)

Intercept                          -2.346416
CO_TIPO_SEXO[T.2]                   0.196377
CO_TIPO_SEXO[T.3]                  -0.487968
NO_DEPARTAMENTO[T.ANCASH]           0.374669
NO_DEPARTAMENTO[T.APURIMAC]         0.004170
NO_DEPARTAMENTO[T.AREQUIPA]        -0.085480
NO_DEPARTAMENTO[T.AYACUCHO]        -0.640138
NO_DEPARTAMENTO[T.CAJAMARCA]        0.282196
NO_DEPARTAMENTO[T.CALLAO]           0.031146
NO_DEPARTAMENTO[T.CUSCO]           -0.308495
NO_DEPARTAMENTO[T.HUANCAVELICA]     0.181991
NO_DEPARTAMENTO[T.HUANUCO]          0.012612
NO_DEPARTAMENTO[T.ICA]              0.254340
NO_DEPARTAMENTO[T.JUNIN]            0.126990
NO_DEPARTAMENTO[T.LA LIBERTAD]      0.217114
NO_DEPARTAMENTO[T.LAMBAYEQUE]       1.285471
NO_DEPARTAMENTO[T.LIMA]            -0.196062
NO_DEPARTAMENTO[T.LORETO]           0.051772
NO_DEPARTAMENTO[T.MADRE DE DIOS]   -0.073784
NO_DEPARTAMENTO[T.MOQUEGUA]         0.529384
NO_DEPARTAMENTO[T.PASCO]           -0.426987
NO_DEPARTAMENTO[T.PIURA]            0.207776
NO_DEPARTA