In [None]:
# Nativos
import random as rn
import os
import sys
import gc

#calculo
import numpy as np
import pandas as pd
import scipy
from scipy import stats


#grafico
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from IPython.display import display_html
%matplotlib inline
sns.set(style="whitegrid")

# Output / display behaviour
import warnings
# NOTE(review): this silences every warning category, so deprecation notices
# raised by pandas / seaborn / scipy will not show up anywhere in this notebook.
warnings.filterwarnings("ignore")
# warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

gc.collect()

# Put the parent of the current working directory on sys.path (presumably so the
# commented-out project modules below could be imported — confirm).
BASE_DIR = os.path.dirname(os.getcwd())
if BASE_DIR not in sys.path: sys.path.append(BASE_DIR)

#from utils import *
#from graphs import *

# Single seed shared by Python's `random` and NumPy's global RNG.
SEED = 29082013
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# here only reaches child processes, not the already-running kernel.
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
rn.seed(SEED)

#subfolder = "data"
# os.listdir(subfolder)

## >> Utilitarios

In [None]:
def null_verificator(data):
    """Summarise the missing values of ``data`` column by column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        When at least one value is missing: one row per column of ``data`` with
        ``Nulos`` (has any null), ``Cantidad`` (null count), ``Tipo Col``
        (dtype) and ``Porcentaje`` (null share in percent, 2 decimals).
        Otherwise a single-row frame indexed by ``'msje'`` holding the
        "no nulls" message.
    """
    null_mask = data.isnull()

    # Guard clause: nothing missing anywhere, return the message frame.
    if not null_mask.any().any():
        return pd.DataFrame.from_dict({'msje':"DATA LIMPIA DE NULOS"}, orient='index')

    n_rows = data.shape[0]

    def _as_percentage(count):
        # A zero count short-circuits to 0 instead of going through the division.
        return np.round(count * 100 / n_rows if count else 0, 2)

    summary = pd.DataFrame(
        pd.concat([null_mask.any(), null_mask.sum(), data.dtypes], axis=1)
    )
    summary.columns = ['Nulos', 'Cantidad', 'Tipo Col']
    summary['Porcentaje'] = summary['Cantidad'].apply(_as_percentage)
    return summary

In [None]:
def display_horizontal(*args, percent_sep=5):
    """Render several tables side by side in the notebook output.

    Parameters
    ----------
    *args : pd.DataFrame or any object accepted by ``pd.DataFrame``
        Tables to show, left to right.
    percent_sep : int or float, default 5
        Right padding, in percent, inserted after each table.
    """
    inline_open_tag = '<table style="display:inline;padding-right:{}%"'.format(percent_sep)

    html_parts = []
    for table in args:
        df = table if isinstance(table, pd.DataFrame) else pd.DataFrame(table)
        # Rewrite only the first opening tag. Replacing every "table" substring
        # would also turn "</table>" into invalid markup and corrupt any cell
        # whose text happens to contain the word "table".
        html_parts.append(df.to_html().replace('<table', inline_open_tag, 1))

    display_html(''.join(html_parts), raw=True)

In [None]:
def compare_var_train_test(train, test, col_, **kwargs):
    """Overlay the train and test distributions of one column (two panels).

    NOTE: a later cell redefines ``compare_var_train_test`` with a four-panel
    layout; when the notebook runs top to bottom that later definition is the
    one in effect.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames that both contain ``col_``.
    col_ : str
        Column to compare.
    **kwargs
        Accepted for call compatibility; not used.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))

    # Counts. histplot is the axes-level function; displot is figure-level and
    # cannot draw onto an existing Axes passed through `ax`.
    sns.histplot(train[col_], bins=20, ax=axes[0])
    sns.histplot(test[col_], bins=20, ax=axes[0])

    # Density. `fill` replaces the deprecated `shade` argument.
    sns.kdeplot(train[col_], fill=True, ax=axes[1])
    sns.kdeplot(test[col_], fill=True, ax=axes[1])


In [None]:
def compare_var_train_test(train, test, col_, **kwargs):
    """Compare the train and test distributions of one column in four panels.

    Panels, left to right: overlaid histograms, overlaid KDEs, train boxplot,
    test boxplot. Train is drawn in blue, test in red.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames that both contain ``col_``.
    col_ : str
        Column to compare.
    **kwargs
        Accepted for call compatibility; not used.
    """
    fig, axes_ = plt.subplots(nrows=1, ncols=4, figsize=(20, 4))

    # Counts
    sns.histplot(train[col_], bins=20, ax=axes_[0], color='b')
    sns.histplot(test[col_], bins=20, ax=axes_[0], color='r')

    # Density. `fill` replaces the deprecated `shade` argument.
    sns.kdeplot(train[col_], fill=True, ax=axes_[1], color='b')
    sns.kdeplot(test[col_], fill=True, ax=axes_[1], color='r')

    # Boxplots. These use the `col_` parameter; the previous code read a global
    # named `col`, which only worked when the caller's loop variable had that name.
    sns.boxplot(y=train[col_], ax=axes_[2], color='b')
    sns.boxplot(y=test[col_], ax=axes_[3], color='r')

## >> Descripción de campos

## >> Load data

In [None]:
# Keyword arguments shared by every pd.read_excel call in this notebook:
# read through openpyxl and keep the `id` column as text.
dicc_load = dict(
    engine='openpyxl',
    dtype=dict(id=str),
)

In [None]:
train = pd.read_excel('score_train.xlsx',  **dicc_load)
display(train.head())

test = pd.read_excel('score_test.xlsx',  **dicc_load)
display(test.head())

In [None]:
train.shape, test.shape

## >> Análisis de nulos

In [None]:
display_horizontal(
    null_verificator(train), null_verificator(test)
)

Conclusión: Data sin nulos

## >> Revisión de Balanceo

In [None]:
train['Incumplimiento'].value_counts(dropna=False, normalize=True)

## >> Análisis Descriptivo

#### >>>> Cuantitativos

In [None]:
display_horizontal(
    train.describe().T, test.describe().T
)

### >> Corrección de valores negativos en Valor_prestamo

In [None]:
# Floor negative loan amounts at 0. `clip` is the vectorised equivalent of the
# element-wise `0 if v < 0 else v` and leaves non-negative values untouched.
train['Valor_prestamo'] = train['Valor_prestamo'].clip(lower=0)
test['Valor_prestamo'] = test['Valor_prestamo'].clip(lower=0)

In [None]:
# Columns reported by describe() (the numeric ones) whose set of distinct
# values is not exactly {0, 1}; 0/1 indicator columns are therefore left out.
cols_num = [col for col in train.describe().columns if set(train[col]) != {0, 1}]
print(cols_num)

In [None]:
for col in cols_num:
    compare_var_train_test(train, test, col)

#### >>>> Cualitativos

In [None]:
display_horizontal(
    train.describe(include=[bool, object]).T, test.describe(include=[bool, object]).T
)

## >> Gráfico QQplot

In [None]:
from statsmodels.graphics.gofplots import qqplot

for col in cols_num:
    print("/"*50, col)
    fig = qqplot(train[col] , line='s')
    fig2 = qqplot(test[col] , line='s')
    plt.show()
    print("/"*100)

## >> Pruebas de Normalidad

In [None]:
def test_normalidad(data, col, method='kolmogorov', alpha=0.05, tipo='train'):
    metodos_validos = ['shapiro', 'kolmogorov', 'agostino']
    
    if method not in metodos_validos:
        return 'Ingrese un método válido'
    
    if method == 'shapiro':
        stat, p = stats.shapiro(data[col])
    elif method == 'kolmogorov':
        ks = stats.kstest(data[col], 'norm')
        p = ks.pvalue
        stat = ks.statistic
    elif method == 'agostino':
        stat, p = stats.normaltest(data[col])
    
    if p > alpha:
        msg = 'NORMALIDAD - No hay evidencia para rechzar la hipotesis nula'
    else:
        msg = 'NO GAUSSIANO - Se rechaza la hipotesis nula - NO NORMAL'
    
    return pd.DataFrame({
        'Nombre de variable':  col,
        'Tipo de data': tipo,
        'Tamaño de la muestra': 'train: {}, test: {} '.format(train.shape[0], test.shape[0]),
        'valor del estadístico': stat,
        'p-valor': p,
        'Resultado': msg
    }, index=[0])

#### >>>> Shapiro 

In [None]:
pd.concat( 
    [test_normalidad(train, col, method='shapiro', tipo='train') for col in cols_num] + [test_normalidad(test, col, method='shapiro', tipo='test') for col in cols_num] , axis=0, ignore_index=True
).sort_values(by=['Nombre de variable'], ascending=False)

#### >>>>  Kolmogorov Smirnov

In [None]:
pd.concat( 
    [test_normalidad(train, col, method='kolmogorov', tipo='train') for col in cols_num] + [test_normalidad(test, col, method='kolmogorov', tipo='test') for col in cols_num] , axis=0, ignore_index=True
).sort_values(by=['Nombre de variable'], ascending=False)

#### >>>> Agostino

In [None]:
pd.concat( 
    [test_normalidad(train, col, method='agostino', tipo='train') for col in cols_num] + [test_normalidad(test, col, method='agostino', tipo='test') for col in cols_num] , axis=0, ignore_index=True
).sort_values(by=['Nombre de variable'], ascending=False)

## >> Pruebas de Homocedasticidad

In [None]:
def test_homocedasticidad(train, test, col, method='bartlett', alpha=0.05):
    metodos_validos = ['levene', 'bartlett', 'fligner']
    
    if method not in metodos_validos:
        return 'Ingrese un método válido'
    
    if method == 'levene':
        prueba = stats.levene(train[col], test[col], center='median')
    elif method == 'bartlett':
        prueba = stats.bartlett(train[col], test[col])
    elif method == 'fligner':
        prueba = stats.fligner(train[col], test[col], center='median')

    p = prueba.pvalue
    stat = prueba.statistic
    
    if p > alpha:
        msg = 'HOMOCEDASTICIDAD - No hay evidencia para rechzar la hipotesis nula'
    else:
        msg = 'HETEROCEDASTICIDAD - Se rechaza la hipotesis nula - NO NORMAL'
    
    return pd.DataFrame({
        'Nombre de variable':  col,
        'Tamaño de la muestra': 'train: {}, test: {} '.format(train.shape[0], test.shape[0]),
        'valor del estadístico': stat,
        'p-valor': p,
        'Resultado': msg
    }, index=[0])

#### >>>> Bartlett

In [None]:
pd.concat( 
    [test_homocedasticidad(train, test, col, method='bartlett') for col in cols_num], axis=0, ignore_index=True
)

#### >>>> Levene

In [None]:
pd.concat( 
    [test_homocedasticidad(train, test, col, method='levene') for col in cols_num], axis=0, ignore_index=True
)

#### >>>> Fligner

In [None]:
pd.concat( 
    [test_homocedasticidad(train, test, col, method='fligner') for col in cols_num], axis=0, ignore_index=True
)

## >> Análisis de Correlación

In [None]:
train.corr(method='spearman').style.background_gradient(
    cmap='coolwarm', axis=None
).set_precision(3)

In [None]:
test.corr(method='spearman').style.background_gradient(
    cmap='coolwarm', axis=None
).set_precision(3)

## >> Inspección de Gráficos Bivariados

In [None]:
def graph_numeric(data, col_init, col_out, **kwargs):
    """Plot a numeric feature against the outcome column in three panels.

    Panels, left to right: line plot of ``col_out`` over ``col_init``,
    horizontal violin plot, and one KDE of ``col_init`` per distinct value of
    ``col_out``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both columns.
    col_init : str
        Numeric feature column.
    col_out : str
        Outcome column whose distinct values split the KDE panel.
    **kwargs
        Accepted for call compatibility; not used.
    """
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

    sns.lineplot(x=col_init, y=col_out, data=data, ax=axes[0])
    sns.violinplot(x=col_init, y=col_out, data=data, ax=axes[1], orient='h')

    legend_list = []

    for opt in data[col_out].unique():
        # .loc with a boolean mask avoids chained indexing; `fill` replaces the
        # deprecated `shade` argument.
        sns.kdeplot(data.loc[data[col_out] == opt, col_init], ax=axes[2], fill=True)
        legend_list.append(opt)

    # Attach the legend to the KDE panel explicitly rather than relying on
    # pyplot's notion of the "current" axes.
    axes[2].legend(legend_list)

In [None]:
def graph_categoryc(data, col_init, col_out, **kwargs):
    """Plot a categorical feature against the outcome column in three panels.

    Panels, left to right: count plot of ``col_init`` split by ``col_out``,
    heatmap of the ``col_out`` x ``col_init`` contingency table expressed as a
    share of all rows, and bar plot of ``col_out`` per ``col_init`` category.
    ``**kwargs`` is accepted but not used.
    """
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
    ax_counts, ax_heat, ax_bars = axes

    counts_plot = sns.countplot(x=col_init, hue=col_out, data=data, ax=ax_counts)
    plt.setp(counts_plot.get_xticklabels(), rotation=90)

    # Contingency table normalised by the grand total of observations.
    contingency = pd.crosstab(data[col_out], data[col_init])
    grand_total = contingency.to_numpy().sum()
    sns.heatmap(
        contingency / grand_total,
        annot=True, ax=ax_heat, center=0, cmap="YlGnBu", fmt='.0%'
    )

    bars_plot = sns.barplot(x=col_init, y=col_out, data=data, ax=ax_bars)
    plt.setp(bars_plot.get_xticklabels(), rotation=90)

In [None]:
graph_numeric(train, 'Tiempo_empleo', 'Incumplimiento')

In [None]:
graph_numeric(train[train['Saldo_cuenta'] < 1000], 'Saldo_cuenta', 'Incumplimiento')

In [None]:
graph_numeric(train, 'Valor_prestamo', 'Incumplimiento')

In [None]:
graph_numeric(train, 'Cuentas_otros', 'Incumplimiento')

In [None]:
graph_numeric(train, 'Autocontrol', 'Incumplimiento')

In [None]:
graph_numeric(train, 'Impulsividad', 'Incumplimiento')

In [None]:
graph_numeric(train, 'Confianza', 'Incumplimiento')

In [None]:
import itertools

def div_simple(x, y):
    """Return ``x / y``, or 0 when either operand equals 0."""
    return x / y if (x != 0 and y != 0) else 0

def _ratio_or_zero(numerator, denominator):
    """Column-wise ``numerator / denominator``, forced to 0 where either operand is 0."""
    either_zero = (numerator == 0) | (denominator == 0)
    return (numerator / denominator).mask(either_zero, 0)


# Pairwise interaction features: for every pair of base columns add their
# product, plus that product divided by Cuentas_otros / Tiempo_empleo whenever
# the divisor is not already one of the two factors. Everything is computed
# column-wise; the earlier row-wise `apply` indexed each row positionally
# (`row[0]`), which pandas deprecates for label-indexed Series.
for a, b in itertools.combinations(
        ['Cuentas_otros', 'Valor_prestamo', 'Saldo_cuenta', 'Tiempo_empleo']
        + ['Autocontrol', 'Impulsividad', 'Confianza'], 2):
    new_col = '{}_x_{}'.format(a, b).lower()

    for frame in (train, test):
        frame[new_col] = frame[a] * frame[b]

        # Same per-frame column order as before: cuentas_otros ratio first,
        # then the tiempo_empleo ratio.
        for divisor in ('Cuentas_otros', 'Tiempo_empleo'):
            if divisor.lower() not in new_col:
                ratio_col = '{}_entre_{}'.format(new_col, divisor.lower())
                frame[ratio_col] = _ratio_or_zero(frame[new_col], frame[divisor])

    graph_numeric(train, new_col, 'Incumplimiento')

In [None]:
view_graph = False

In [None]:
train.describe().T

## Nuevamente correlación

In [None]:
matriz_corr = train.corr(method='spearman')

matriz_corr.style.background_gradient(
    cmap='coolwarm', axis=None
).set_precision(5)

In [None]:
pd.DataFrame(matriz_corr['Incumplimiento'].abs().sort_values(ascending=False)).style.background_gradient(
    cmap='coolwarm', axis=None
).set_precision(5)

## ESCALAMIENTO

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
X_train = train.drop(['id', 'Incumplimiento'], axis=1)
columnas_train = X_train.columns
X_test = test.drop(['id', 'Incumplimiento'], axis=1)
columnas_test = X_test.columns

all(columnas_train == columnas_test)

In [None]:
# Index.equals compares elements and order and simply returns False when the
# lengths differ, whereas `==` between indexes of different length raises.
assert columnas_train.equals(columnas_test), "train and test feature columns differ"

In [None]:
# Fit the min-max scaling on the training features and transform them in one step.
# NOTE(review): this fit runs before the train/validation split performed later
# in the notebook, so rows that end up in the validation set contribute to the
# fitted min/max — confirm that this is acceptable.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
import joblib

joblib.dump(scaler, 'scaler.pkl')
load_scaler = joblib.load('scaler.pkl')    #--> para cargarlo en un ambiente externo

X_test = load_scaler.transform(X_test)

In [None]:
X_train = pd.DataFrame(X_train, columns=columnas_train)
X_test = pd.DataFrame(X_test, columns=columnas_test)

In [None]:
X_train.corr(method='spearman').style.background_gradient(
    cmap='coolwarm', axis=None
).set_precision(5)

In [None]:
X_test.corr(method='spearman').style.background_gradient(
    cmap='coolwarm', axis=None
).set_precision(5)

### Separación en TRAIN y VALIDACIÓN

In [None]:
X_test.head()

In [None]:
X_train.head()

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train = train['Incumplimiento'].copy()
del train
del test

In [None]:
y_train.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.85, random_state=SEED, stratify=y_train)

In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

### SAVE

In [None]:
X_train.to_csv('X_train.csv', index=False)
X_valid.to_csv('X_valid.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

In [None]:
y_train.to_csv('y_train.csv', index=False)
y_valid.to_csv('y_valid.csv', index=False)

In [None]:
# Write features and target together without mutating X_train / X_valid, so the
# feature-only frames stay intact and the earlier save cell can be re-run safely.
# NOTE(review): the column is written as 'Imcumplimiento' (with an "m"), unlike
# the source column 'Incumplimiento'; kept as-is in case downstream readers
# depend on it — confirm and rename if not.
X_train.assign(Imcumplimiento=y_train).to_csv('X_train_r.csv', index=False)
X_valid.assign(Imcumplimiento=y_valid).to_csv('X_valid_r.csv', index=False)

### Información de versionamiento

In [None]:
!pip install sinfo

In [None]:
from sinfo import sinfo
sinfo()