In [1022]:
import pandas as pd
import datetime
import warnings
import os
import statsmodels.formula.api as smf
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# https://www.andrewvillazon.com/logistic-regression-python-statsmodels/#examining-fit-results

# Notebook-wide configuration: silence every warning category and widen the
# pandas display so wide frames and long float values print in full.
warnings.filterwarnings('ignore')

for option_name, option_value in (
    ('display.max_columns', None),   # never truncate columns
    ('display.max_rows', 100),       # show up to 100 rows
    ('float_format', '{:f}'.format), # fixed-point notation instead of scientific
):
    pd.set_option(option_name, option_value)

In [1023]:
# Load the processed train / submission frames and replace '-' with '_' in
# every column name (the names are later interpolated into a model formula,
# where '-' would be read as an operator).
data_train = pd.read_parquet('../data/processed/train.gzip').rename(
    columns=lambda name: name.replace('-', '_')
)
data_sub = pd.read_parquet('../data/processed/test.gzip').rename(
    columns=lambda name: name.replace('-', '_')
)

# list(data_train.columns)

In [1024]:
# Keep the submission identifiers aside; 'ID' is dropped from data_sub by the
# later column selection and is needed again when the result file is written.
data_sub_id = data_sub[['ID']]

# Columns stored as pandas 'category' dtype in both frames.
categorical_feats = ['CO_TIPO_SEXO', 'NO_DEPARTAMENTO', 'REGION']
for frame in (data_train, data_sub):
    for cat_col in categorical_feats:
        frame[cat_col] = frame[cat_col].astype('category')

# Name of the binary label column.
TARGET_VAR = 'TARGET'
# Number of cross-validation folds.
N_FOLDS = 5
# Worker count: roughly 40% of the available cores.
# os.cpu_count() may return None, and floor-dividing by 2.5 yields a float
# (possibly 0.0), so fall back to one core and coerce to an int >= 1.
N_JOBS = max(1, int((os.cpu_count() or 1) // 2.5))

In [1025]:
# Substrings that would exclude a column when the (currently commented-out)
# substring filter below is enabled. Every entry is disabled, so the list is empty.
exclude_substr_list = [
    # 'CANT_', 'VAR_', 'DIFF_', 
    # 'min', 'max', 'first', 'last', 'median', 
    # 'MED_EMP', 'PEQ_EMP', 'MIC_EMP', 
    # 'REV', 'HIPOT', 'VENCIDO', 'DOLA', 
    # 'ANIO_BANCARIZACION', 'MES_BANCARIZACION', 'NO_DEPARTAMENTO', 'SALDO_EMP_TOTAL_mean'
    ]

# Hand-picked model inputs plus the label. REGION is kept because the
# modelling cell splits the data by it.
selected_cols = [
    'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean',
    'DIFF_SALDO_EMP_TOTAL_MA3_mean',
    'SALDO_EMP_TOTAL_min',
    'SALDO_EMP_TOTAL_max',
    'SALDO_DOLA_CONS_mean',
    'CANT_EMP_NEG_min',
    'SALDO_CONS_REV_mean',
    # 'SALDO_HIPOT_mean',
    'SALDO_VENCIDO_mean',
    'DIFF_SALDO_EMP_TOTAL_MA8_mean', 
    'MAX_LINEA_DISP_U6M_max', 
    'CO_TIPO_SEXO',
    'MESES_HASTA_ACTUAL',
    'EDAD',
    'REGION', TARGET_VAR
]

# Restrict to numeric / categorical dtypes, then to the selected columns.
data_train = data_train.select_dtypes(include=['int64', 'float64', 'category'])
# data_train =  data_train[[col for col in data_train.columns if all([sub not in col for sub in exclude_substr_list])]]
data_train = data_train[selected_cols]

data_sub = data_sub.select_dtypes(include=['int64', 'float64', 'category'])
# data_sub = data_sub[[col for col in data_sub.columns if all([sub not in col for sub in exclude_substr_list])]]
# NOTE(review): the submission frame may not carry the label column on a fresh
# kernel (confirm against test.gzip). Every feature is still required, but the
# label is selected only when it exists, so this cell does not raise KeyError.
data_sub = data_sub[
    [col for col in selected_cols if col != TARGET_VAR or col in data_sub.columns]
]

print(data_train.dtypes)

DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean     float64
DIFF_SALDO_EMP_TOTAL_MA3_mean           float64
SALDO_EMP_TOTAL_min                     float64
SALDO_EMP_TOTAL_max                     float64
SALDO_DOLA_CONS_mean                    float64
CANT_EMP_NEG_min                          int64
SALDO_CONS_REV_mean                     float64
SALDO_VENCIDO_mean                      float64
DIFF_SALDO_EMP_TOTAL_MA8_mean           float64
MAX_LINEA_DISP_U6M_max                  float64
CO_TIPO_SEXO                           category
MESES_HASTA_ACTUAL                        int64
EDAD                                    float64
REGION                                 category
TARGET                                    int64
dtype: object


In [1026]:
# Variance of every numeric column. numeric_only=True skips the 'category'
# columns explicitly; pandas >= 2.0 no longer drops them silently and would
# raise TypeError otherwise.
variances = data_train.var(numeric_only=True)
print(variances)

DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean     20579795.458481
DIFF_SALDO_EMP_TOTAL_MA3_mean           15828319.113778
SALDO_EMP_TOTAL_min                   2993197231.396115
SALDO_EMP_TOTAL_max                   5406019892.664605
SALDO_DOLA_CONS_mean                   131356914.312455
CANT_EMP_NEG_min                               0.537532
SALDO_CONS_REV_mean                     45676551.088203
SALDO_VENCIDO_mean                     159413210.498562
DIFF_SALDO_EMP_TOTAL_MA8_mean           15315674.636819
MAX_LINEA_DISP_U6M_max                1412262998.576963
MESES_HASTA_ACTUAL                          3543.849054
EDAD                                         150.927726
TARGET                                         0.127506
dtype: float64


In [1027]:
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import KNNImputer
# from sklearn.impute import IterativeImputer
# from sklearn.preprocessing import LabelEncoder
# from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


# df_train['REGION'] = df_train['NO_DEPARTAMENTO'].apply(asignar_region)
# df_test['REGION'] = df_test['NO_DEPARTAMENTO'].apply(asignar_region)
# data_sub['REGION'] = data_sub['NO_DEPARTAMENTO'].apply(asignar_region)

In [1028]:
# for region in data_train['REGION'].unique():
#     print(region)
#     print(data_train[data_train['REGION'] == region]['TARGET'].value_counts(normalize=True))

In [1050]:
import patsy  # NOTE(review): not referenced directly in this cell; kept so existing imports are unchanged

# Probability cut-off above which an observation is labelled positive.
FACTOR = 0.25

# Fitted logit results keyed by region.
models = {}

# Working copy of the training frame that receives the thresholded 'PRED' column.
data_copy = data_train.copy()
# Marks the rows held out from fitting so they can be reported separately.
is_holdout = pd.Series(False, index=data_copy.index)

for region in data_train['REGION'].unique():
    print(f'\t\t{region.upper()}')
    sub_df = data_train[data_train['REGION'] == region].drop(columns=['REGION'])
    X_train, X_test, y_train, y_test = train_test_split(
        sub_df.drop(columns=[TARGET_VAR]),
        sub_df[TARGET_VAR],
        test_size=0.2,
        random_state=42,
        stratify=sub_df[TARGET_VAR],
    )

    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    is_holdout.loc[df_test.index] = True

    # Right-hand side of the formula: every column except the label.
    independents = ' + '.join(
        col for col in df_train.columns if col != TARGET_VAR
    )
    print(independents)

    # NOTE(review): fit_regularized() is called with its default arguments —
    # confirm the intended penalty weight.
    log_reg = smf.logit(f"{TARGET_VAR} ~ {independents}", data=df_train).fit_regularized()
    models[region] = log_reg
    print(log_reg.summary())

    # Score every row of the region. Predicting on df_train alone and assigning
    # to the whole region left the hold-out rows as NaN, which the threshold
    # then silently turned into 0.
    region_mask = data_copy['REGION'] == region
    region_prob = log_reg.predict(sub_df.drop(columns=[TARGET_VAR]))
    data_copy.loc[region_mask, 'PRED'] = (region_prob > FACTOR).astype(int)

    # Score the submission rows of the same region. errors='ignore' lets this
    # run whether or not the label column already exists in data_sub.
    sub_mask = data_sub['REGION'] == region
    if sub_mask.any():
        sub_prob = log_reg.predict(
            data_sub[sub_mask].drop(columns=[TARGET_VAR], errors='ignore')
        )
        data_sub.loc[sub_mask, 'TARGET'] = (sub_prob > FACTOR).astype(int)

# Report fit rows and hold-out rows separately so the hold-out quality is not
# hidden behind the in-sample predictions.
print('FIT ROWS')
print(classification_report(data_copy.loc[~is_holdout, TARGET_VAR], data_copy.loc[~is_holdout, 'PRED']))
print('HOLD-OUT ROWS')
print(classification_report(data_copy.loc[is_holdout, TARGET_VAR], data_copy.loc[is_holdout, 'PRED']))
# print(classification_report(data_train[TARGET_VAR], y_pred))
    # print(log_reg.summary())
    # y_pred = log_reg.predict(df_train.drop(columns=[TARGET_VAR]))
    # y_pred = (y_pred > 0.115).astype(int)
    # print(classification_report(df_train[TARGET_VAR], y_pred))

    # y_pred = log_reg.predict(df_test.drop(columns=[TARGET_VAR]))
    # y_pred = (y_pred > 0.115).astype(int)
    # print(classification_report(df_test[TARGET_VAR], y_pred))


		CENTRO
DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean + DIFF_SALDO_EMP_TOTAL_MA3_mean + SALDO_EMP_TOTAL_min + SALDO_EMP_TOTAL_max + SALDO_DOLA_CONS_mean + CANT_EMP_NEG_min + SALDO_CONS_REV_mean + SALDO_VENCIDO_mean + DIFF_SALDO_EMP_TOTAL_MA8_mean + MAX_LINEA_DISP_U6M_max + CO_TIPO_SEXO + MESES_HASTA_ACTUAL + EDAD
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3365793581745955
            Iterations: 55
            Function evaluations: 120
            Gradient evaluations: 55
                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:                22371
Model:                          Logit   Df Residuals:                    22356
Method:                           MLE   Df Model:                           14
Date:                Sat, 18 Nov 2023   Pseudo R-squ.:                  0.1347
Time:                        21:08:43   Log-Likelihood:                -7529.6
converged

In [1055]:
# Assemble the submission (ID next to the predicted label), print the predicted
# class shares as a sanity check, and write the result file.
submission_parts = [data_sub_id, data_sub['TARGET']]
submission = pd.concat(submission_parts, axis=1)

predicted_share = submission['TARGET'].value_counts(normalize=True)
print(predicted_share)

submission.to_csv('../data/results/logit.csv', index=False)

0.000000   0.873497
1.000000   0.126503
Name: TARGET, dtype: float64


In [1052]:
# odds_ratios = pd.DataFrame(
#     {
#         "OR": log_reg.params,
#         "Lower CI": log_reg.conf_int()[0],
#         "Upper CI": log_reg.conf_int()[1],
#     }
# )
# odds_ratios = np.exp(odds_ratios)

# print(odds_ratios)

In [1053]:
# print(log_reg.params)