In [1]:
import json
import os

import joblib
import numpy as np
import pandas as pd

from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

from core_ds4a_project import datasets

%load_ext autoreload
%autoreload 1
%aimport core_ds4a_project, core_ds4a_project.datasets

Environment variables:

In [2]:
# Read the local `envvars` file before looking anything up in the environment.
load_dotenv('envvars')

# ROOT_DATA_PATH is the base directory. RAW_DATA_PATH / CLEAN_DATA_PATH use their
# own variable when it is set and non-empty, otherwise `<root>/raw` and `<root>/clean`.
# NOTE(review): when ROOT_DATA_PATH is unset as well, the fallbacks become the
# literal strings 'None/raw' and 'None/clean'.
ROOT_DATA_PATH = os.getenv('ROOT_DATA_PATH')
RAW_DATA_PATH = os.getenv('RAW_DATA_PATH') or f'{ROOT_DATA_PATH}/raw'
CLEAN_DATA_PATH = os.getenv('CLEAN_DATA_PATH') or f'{ROOT_DATA_PATH}/clean'

Reading data:

In [3]:
# Unpack the three objects returned by the project loader; the cells below use
# them as DataFrames (cartera, clientes, colocacion).
cartera_df, clientes_df, colocacion_df = datasets.read_joining_datasets(
    dir_path=RAW_DATA_PATH,
)

# Disabled alternative: start from a previously exported modeling table instead
# of rebuilding it from the raw files.
# model_df = pd.read_csv(os.path.join(CLEAN_DATA_PATH, 'model_defaulting.csv'))
# model_df['DEFAULT'] = model_df['DEFAULT'].fillna(False).astype(int)
# model_df['DEFAULT'].value_counts(dropna=False)

Sorting the CARTERA dataset and flagging the first and last record of each OBLIGACION:

In [4]:
# Sort by FECHA_CIERRE, then OBLIGACION, so that "first" and "last" below refer
# to this ordering. (The cell previously repeated these three statements twice;
# sorting an already sorted frame and recomputing the masks changed nothing.)
cartera_df = cartera_df.sort_values(['FECHA_CIERRE', 'OBLIGACION'], ascending=True)

# Boolean masks, True on exactly one row per OBLIGACION:
#   ind_first -> first row of each OBLIGACION in the sorted frame
#   ind_last  -> last row of each OBLIGACION in the sorted frame
# `ind_first` is not used by the cells visible here; it is kept in case later
# cells rely on it.
ind_first = ~cartera_df.duplicated(subset='OBLIGACION', keep='first')
ind_last = ~cartera_df.duplicated(subset='OBLIGACION', keep='last')

Composing dataframe for modeling:

In [5]:
# Keep only the rows selected by `ind_last` (one per OBLIGACION) as an
# independent copy, so later changes do not touch `cartera_df`.
cartera_last_df = cartera_df.loc[ind_last].copy()
cartera_last_df.shape

(38641, 32)

In [6]:
# Client-level features used as model inputs.
covariates = ['ESTADO_CIVIL_COD', 'TIPO_VIVIENDA', 'MUNICIPIO_CLIENTE', 'TIPO_UBICACION_COD', 'MUJER_CABEZA',
              'RESPONSABLE_DE_HOGAR', 'GENERO_COD', 'ESTRATO', 'NIVEL_ESTUDIOS_COD']

# Build the modeling table in one chain:
#   1. join the last CARTERA row of each obligation with COLOCACION on OBLIGACION
#      (CLIENTE and VALOR_CUOTA are dropped from COLOCACION first -- presumably
#      because CARTERA already carries them; TODO confirm);
#   2. left-join the client attributes on CLIENTE;
#   3. turn DEFAULT into 0/1, treating missing values as "no default";
#   4. keep only the target and the covariates.
# The trailing .copy() makes `model_df` an independent frame, so the in-place
# column assignment done in a later cell does not raise SettingWithCopyWarning.
model_df = (
    cartera_last_df
    .merge(
        colocacion_df.drop(columns=['CLIENTE', 'VALOR_CUOTA']),
        on='OBLIGACION',
    )
    .merge(clientes_df, on='CLIENTE', how='left')
    .assign(DEFAULT=lambda df: df['DEFAULT'].fillna(False).astype(int))
    .loc[:, ['DEFAULT', *covariates]]
    .copy()
)
model_df.shape

(38641, 10)

In [7]:
# Class balance of the target; dropna=False would also surface any missing value.
model_df.loc[:, 'DEFAULT'].value_counts(dropna=False)

0    37678
1      963
Name: DEFAULT, dtype: int64

In [8]:
# Cast every covariate to str so each column holds a single, sortable type.
# NOTE(review): missing values become ordinary strings through this cast
# (e.g. 'nan'), so they end up as a level of their own -- confirm this is intended.
model_df[covariates] = model_df[covariates].astype(str)

# One sorted list of distinct levels per covariate, in `covariates` order;
# passed to OrdinalEncoder(categories=...) when the model is built.
categories = [sorted(levels.unique()) for _, levels in model_df[covariates].items()]

Building model:

In [9]:
# Split the rows 70/30 with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    model_df[covariates],
    model_df['DEFAULT'],
    test_size=0.3,
    random_state=123
)

# Ordinal-encode the string covariates using the precomputed level lists, then
# feed the codes to a random forest (seeded for repeatable fits).
pipe = make_pipeline(
    OrdinalEncoder(categories=categories),
    RandomForestClassifier(n_estimators=100, max_depth=20, random_state=321),
)
# NOTE(review): the pipeline is fitted on ALL rows, i.e. X_test is part of the
# training data. The "other samples" accuracy and the confusion matrix below are
# therefore NOT held-out estimates (data leakage); they only describe how well
# the model reproduces rows it has already seen.
pipe.fit(model_df[covariates], model_df['DEFAULT'])

# Both scores are in-sample because of the full-data fit above.
print(f"Accuracy in training set: {pipe.score(X_train, y_train)}")
print(f"Accuracy in the other samples: {pipe.score(X_test, y_test)}")

# With sklearn's confusion_matrix, rows are the true class and columns the predicted class.
y_test_predict = pipe.predict(X_test)
confusion_matrix(y_test, y_test_predict)

Accuracy in training set: 0.9825125702454895
Accuracy in the other samples: 0.9817993616837747


array([[11287,     6],
       [  205,    95]], dtype=int64)

In [10]:
# Persist the currently fitted pipeline under the relative name 'pipe_defaulting'
# (resolved against the notebook's working directory).
joblib.dump(value=pipe, filename='pipe_defaulting')

['pipe_defaulting']

In [11]:
# Same 70/30 partition as the earlier model cell (identical arguments and seed).
X_train, X_test, y_train, y_test = train_test_split(
    model_df[covariates], model_df['DEFAULT'], test_size=0.3, random_state=123,
)

# Same encoder + random-forest pipeline, rebuilt from scratch.
pipe = make_pipeline(
    OrdinalEncoder(categories=categories),
    RandomForestClassifier(n_estimators=100, max_depth=20, random_state=321),
)
# pipe.fit(model_df[covariates], model_df['DEFAULT'])  # same results as previous
pipe.fit(X_train, y_train)  # confusion matrix changes drastically

# Predict once per partition and score the predictions; for a classifier
# pipeline this is the same accuracy that `pipe.score` reports.
y_train_predict = pipe.predict(X_train)
y_test_predict = pipe.predict(X_test)

print(f"Accuracy in training set: {accuracy_score(y_train, y_train_predict)}")
print(f"Accuracy in the other samples: {accuracy_score(y_test, y_test_predict)}")

# The model never saw X_test here, so this matrix is a genuine hold-out result.
confusion_matrix(y_test, y_test_predict)


Accuracy in training set: 0.9832519964507542
Accuracy in the other samples: 0.9712757698611231


array([[11254,    39],
       [  294,     6]], dtype=int64)