### Librerías y datos

In [1]:
# Librerias
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
from importlib import reload

# Libreria con las funciones personalizadas
import creditScoring_toolKit
reload(creditScoring_toolKit)
from creditScoring_toolKit import *

In [2]:
# Load the raw loan dataset from the project's Data folder
datos = pd.read_csv('Data/loan_data.csv')

### Preparación de la base

In [3]:
# Replace missing values: -1 for numeric variables and "Sin informacion" for
# every other (categorical / text) variable.
#
# The dtype test uses pandas' own helper rather than `dtype in (int, float)`:
# that comparison only matches the default int/float dtypes, so columns stored
# as e.g. float32 or nullable Int64 would be treated as categorical and have
# their gaps filled with a string.
for variable in datos.columns:
    if pd.api.types.is_numeric_dtype(datos[variable]):
        datos[variable] = datos[variable].fillna(-1)
    else:
        datos[variable] = datos[variable].fillna('Sin informacion')

### Creamos base train y test

##### En caso de aún no haber creado las bases train y test

In [4]:
# Split the dataset into train (70%) and test (remaining rows).
# random_state is fixed so the same split is obtained on every run.
train, test = train_test_split(datos, train_size=0.7,random_state=123)

In [5]:
# Persist train and test so the sampling does not have to be repeated later.
# index=False keeps the DataFrame index out of the CSV files.
train.to_csv('Data/train.csv',index=False)
test.to_csv('Data/test.csv',index=False)

##### En caso de ya haber creado las bases train y test

In [6]:
# Load the previously saved train / test splits from CSV.
# NOTE: this rebinds `train` and `test`, replacing any in-memory split
# produced earlier in the session.
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

### Cálculo del information value y WOEs de las variables

In [7]:
# Classify the variables of `train` into numeric and categorical groups.
# The ID and target columns are passed as the second argument; per the original
# note they are left out of the classification (behaviour lives in
# creditScoring_toolKit - TODO confirm there).
numericas, categoricas = var_numericas_categoricas(train,['SK_ID_CURR','TARGET'])

##### Variables categoricas

In [8]:
# Compute IV and WOE tables for every categorical variable.
#
# Per-variable results are collected in lists and concatenated once at the end;
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration.

# Per-variable partial results
iv_partes = []
woe_partes = []

# RuntimeWarnings raised while calculate_iv_cat runs are silenced
# (presumably numeric warnings from sparse categories - TODO confirm).
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    for variable in categoricas:
        iv, woe = calculate_iv_cat(base=train,variable=variable,target='TARGET')
        iv_partes.append(iv)
        woe_partes.append(woe)

if iv_partes:
    iv_categoricas = pd.concat(iv_partes)
    woe_categoricas = pd.concat(woe_partes)
    # The loop helpers only exist when the loop ran at least once, so they are
    # removed here rather than unconditionally (which would raise NameError
    # for an empty variable list).
    del iv, woe, variable
else:
    # No categorical variables: keep the empty placeholders
    iv_categoricas = pd.DataFrame()
    woe_categoricas = pd.DataFrame()

# Remove the temporary accumulators
del iv_partes, woe_partes

##### Variables numericas

In [9]:
# Compute IV and WOE tables for every numeric variable.
#
# Per-variable results are collected in lists and concatenated once at the end;
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration.

# Per-variable partial results
iv_partes = []
woe_partes = []

for variable in numericas:
    # Bins computed by optimal_binning for this variable against the target
    categorias = optimal_binning(train,variable,'TARGET')

    iv, woe = calculate_iv_num(train,variable,'TARGET',categorias)
    iv_partes.append(iv)
    woe_partes.append(woe)

if iv_partes:
    iv_numericas = pd.concat(iv_partes)
    woe_numericas = pd.concat(woe_partes)
    # The loop helpers only exist when the loop ran at least once, so they are
    # removed here rather than unconditionally (which would raise NameError
    # for an empty variable list).
    del iv, woe, variable
else:
    # No numeric variables: keep the empty placeholders
    iv_numericas = pd.DataFrame()
    woe_numericas = pd.DataFrame()

# Remove the temporary accumulators
del iv_partes, woe_partes

##### Unimos resultados de numericas y categoricas

In [10]:
# Stack the numeric and categorical results into one table per metric,
# releasing the per-type tables as soon as they have been merged.

# Information value
resultados_iv = pd.concat(objs=[iv_numericas, iv_categoricas])
del iv_numericas, iv_categoricas

# Weight of evidence
resultados_woe = pd.concat(objs=[woe_numericas, woe_categoricas])
del woe_numericas, woe_categoricas

### Definimos bases X_train, X_test, y_train, y_test

In [13]:
# Training split: label first, then the predictors (ID and target removed)
y_train = train['TARGET']
X_train = train.drop(columns=['SK_ID_CURR','TARGET'])

# Test split: same treatment
y_test = test['TARGET']
X_test = test.drop(columns=['SK_ID_CURR','TARGET'])

### Base WOEs

In [14]:
# List the predictor columns, one per line.
# NOTE(review): the original comment said this cell creates the WOE base, but
# the code only prints the column names of X_train; no WOE transformation is
# applied here.
for variable in X_train.columns:
    print(variable)

NAME_CONTRACT_TYPE
CODE_GENDER
FLAG_OWN_CAR
FLAG_OWN_REALTY
CNT_CHILDREN
AMT_INCOME_TOTAL
AMT_CREDIT
AMT_ANNUITY
AMT_GOODS_PRICE
NAME_TYPE_SUITE
NAME_INCOME_TYPE
NAME_EDUCATION_TYPE
NAME_FAMILY_STATUS
NAME_HOUSING_TYPE
REGION_POPULATION_RELATIVE
DAYS_BIRTH
DAYS_EMPLOYED
DAYS_REGISTRATION
DAYS_ID_PUBLISH
OWN_CAR_AGE
FLAG_MOBIL
FLAG_EMP_PHONE
FLAG_WORK_PHONE
FLAG_CONT_MOBILE
FLAG_PHONE
FLAG_EMAIL
OCCUPATION_TYPE
CNT_FAM_MEMBERS
REGION_RATING_CLIENT
REGION_RATING_CLIENT_W_CITY
WEEKDAY_APPR_PROCESS_START
HOUR_APPR_PROCESS_START
REG_REGION_NOT_LIVE_REGION
REG_REGION_NOT_WORK_REGION
LIVE_REGION_NOT_WORK_REGION
REG_CITY_NOT_LIVE_CITY
REG_CITY_NOT_WORK_CITY
LIVE_CITY_NOT_WORK_CITY
ORGANIZATION_TYPE
EXT_SOURCE_1
EXT_SOURCE_2
EXT_SOURCE_3
APARTMENTS_AVG
BASEMENTAREA_AVG
YEARS_BEGINEXPLUATATION_AVG
YEARS_BUILD_AVG
COMMONAREA_AVG
ELEVATORS_AVG
ENTRANCES_AVG
FLOORSMAX_AVG
FLOORSMIN_AVG
LANDAREA_AVG
LIVINGAPARTMENTS_AVG
LIVINGAREA_AVG
NONLIVINGAPARTMENTS_AVG
NONLIVINGAREA_AVG
APARTMENTS_MODE
BASEMEN

### Modelo logístico

In [15]:
# Build the modelling inputs: predictor matrices and target vectors.

# Predictors: every column except the ID and the target
X_train, X_test = (
    train.drop(columns=['SK_ID_CURR','TARGET']),
    test.drop(columns=['SK_ID_CURR','TARGET']),
)

# Targets
y_train, y_test = train['TARGET'], test['TARGET']

In [None]:
# Save the X_train, X_test, y_train and y_test sets as CSV files.
# index=False keeps the pandas index out of the written files.
X_train.to_csv('Data/X_train.csv',index=False)
X_test.to_csv('Data/X_test.csv',index=False)
y_train.to_csv('Data/y_train.csv',index=False)
y_test.to_csv('Data/y_test.csv',index=False)