In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from src.loader import select_data_c02
from os import path

In [87]:
# CONECTAR A BIGQUERY

ganancia_acierto = 780000
costo_estimulo = 20000

meses ='202012,202101,202102,202103,202104,202105,202106,202108'

In [88]:
from google.cloud import bigquery, bigquery_storage
import src.config as config
import polars as pl

client = bigquery.Client(project=config.BQ_PROJECT)
bqstorage_client = bigquery_storage.BigQueryReadClient()

query = f"""
    SELECT
        a.*
    FROM `{config.BQ_PROJECT}.{config.BQ_DATASET}.c02_ipc` AS a
    WHERE a.foto_mes IN ({meses})
"""

# Ejecutar la query y traer resultados como ArrowTable (más eficiente)
job = client.query(query)

# Uso Storage API para traer Arrow más rápido
arrow_table = job.result().to_arrow(bqstorage_client=bqstorage_client)


# Convertir ArrowTable → Polars DataFrame
df_pl = pl.from_arrow(arrow_table)

In [90]:

mes_train = [202012, 202101,202102,202103,202104,202105,202106]
mes_score = 202108
data = df_pl.to_pandas()


In [91]:
train_data = data[data['foto_mes'].isin(mes_train)]
score_data = data[data['foto_mes'] == mes_score]

train_null_percentage = train_data.isnull().mean() * 100
score_null_percentage = score_data.isnull().mean() * 100

comparison_df = pd.DataFrame({'Train Null Percentage': train_null_percentage, 'Score Null Percentage': score_null_percentage})
comparison_df['diff'] = (comparison_df['Score Null Percentage'] - comparison_df['Train Null Percentage']).abs()

comparison_df_sorted = comparison_df.sort_values('diff', ascending=False)

comparison_df_sorted


Unnamed: 0,Train Null Percentage,Score Null Percentage,diff
clase_ternaria,0.0000,100.000000,100.000000
Master_cconsumos,59.7084,57.971023,1.737376
Master_cadelantosefectivo,59.7084,57.971023,1.737376
Master_mconsumospesos,59.7084,57.971023,1.737376
Master_mconsumosdolares,59.7084,57.971023,1.737376
...,...,...,...
ctarjeta_master_debitos_automaticos,0.0000,0.000000,0.000000
cpagodeservicios,0.0000,0.000000,0.000000
cpagomiscuentas,0.0000,0.000000,0.000000
ccajeros_propios_descuentos,0.0000,0.000000,0.000000


In [92]:

train_zero_percentage = (train_data == 0).mean() * 100
score_zero_percentage = (score_data == 0).mean() * 100

comparison_df_zero = pd.DataFrame({'Train Zero Percentage': train_zero_percentage, 'Score Zero Percentage': score_zero_percentage})

comparison_df_zero['diff_zero_percentage'] = (comparison_df_zero['Score Zero Percentage'] - comparison_df_zero['Train Zero Percentage']).abs()
diff_zero_percentage_sorted = comparison_df_zero.sort_values('diff_zero_percentage',ascending=False)
diff_zero_percentage_sorted


Unnamed: 0,Train Zero Percentage,Score Zero Percentage,diff_zero_percentage
Master_fultimo_cierre,9.944130,0.000000,9.944130
Visa_fultimo_cierre,9.923769,0.000000,9.923769
Visa_mpagado,73.111346,68.961668,4.149679
mtransferencias_emitidas,40.874520,36.992028,3.882493
ctransferencias_emitidas,40.874345,36.992028,3.882317
...,...,...,...
cmobile_app_trx,0.000000,0.000000,0.000000
tmobile_app,0.000000,0.000000,0.000000
Master_mlimitecompra,0.000000,0.000000,0.000000
foto_mes,0.000000,0.000000,0.000000


In [45]:

def psi_orig(expected, actual, buckets=10):

    def psi_formula(expected_prop, actual_prop):
        result = (actual_prop - expected_prop) * np.log(actual_prop / expected_prop)
        return result

    expected_not_null = expected.dropna()
    actual_not_null = actual.dropna()

    bin_edges = pd.qcut(expected_not_null, q=buckets, duplicates='drop').unique()
    bin_edges2 = [edge.left for edge in bin_edges] + [edge.right for edge in bin_edges]
    breakpoints = sorted(list(set(bin_edges2)))

    expected_counts, _ = np.histogram(expected_not_null, bins=breakpoints)
    actual_counts, _ = np.histogram(actual_not_null, bins=breakpoints)

    expected_prop = expected_counts / len(expected_not_null)
    actual_prop = actual_counts / len(actual_not_null)

    psi_not_null = psi_formula(expected_prop, actual_prop).sum()

    psi_null = 0

    if expected.isnull().sum() > 0 and actual.isnull().sum() > 0 :
      expected_null_percentage = expected.isnull().mean()
      actual_null_percentage = actual.isnull().mean()
      psi_null = psi_formula(expected_null_percentage, actual_null_percentage)

    return psi_not_null + psi_null



buckets=10
column = 'mprestamos_personales'
expected = train_data[column]
actual = score_data[column]

def psi_formula(expected_prop, actual_prop):
    result = (actual_prop - expected_prop) * np.log(actual_prop / expected_prop)
    return result

expected_not_null = expected.dropna()
actual_not_null = actual.dropna()

bin_edges = pd.qcut(expected_not_null, q=buckets, duplicates='drop').unique()
bin_edges2 = [edge.left for edge in bin_edges] + [edge.right for edge in bin_edges]
breakpoints = sorted(list(set(bin_edges2)))

expected_counts, _ = np.histogram(expected_not_null, bins=breakpoints)
actual_counts, _ = np.histogram(actual_not_null, bins=breakpoints)

expected_prop = expected_counts / len(expected_not_null)
actual_prop = actual_counts / len(actual_not_null)

psi_not_null = psi_formula(expected_prop, actual_prop).sum()

psi_null = 0

psi_value = psi(expected, actual)



In [93]:
import numpy as np
import pandas as pd

def psi(expected, actual, buckets=10):
    def psi_formula(expected_prop, actual_prop):
        # epsilon para evitar log(0)
        eps = 1e-6
        expected_prop = np.where(expected_prop == 0, eps, expected_prop)
        actual_prop   = np.where(actual_prop == 0, eps, actual_prop)
        return (actual_prop - expected_prop) * np.log(actual_prop / expected_prop)

    # Quitamos nulos sólo para el cálculo de bins
    expected_not_null = expected.dropna()
    actual_not_null   = actual.dropna()

    # Si no hay datos suficientes, devolvemos 0 para no romper
    if len(expected_not_null) == 0 or len(actual_not_null) == 0:
        return 0.0

    # qcut con retbins=True: nos da los bins explícitos
    try:
        _, breakpoints = pd.qcut(
            expected_not_null,
            q=buckets,
            retbins=True,
            duplicates='drop'
        )
    except ValueError:
        # Por si hay muy pocos valores distintos y qcut no puede armar bins
        return 0.0

    # Nos aseguramos de que los breakpoints sean únicos y ordenados
    breakpoints = np.unique(breakpoints)

    # Si hay menos de 2 puntos de corte, no se puede armar histograma
    if len(breakpoints) < 2:
        return 0.0

    # Conteos en cada bin
    expected_counts, _ = np.histogram(expected_not_null, bins=breakpoints)
    actual_counts, _   = np.histogram(actual_not_null,  bins=breakpoints)

    expected_prop = expected_counts / len(expected_not_null)
    actual_prop   = actual_counts   / len(actual_not_null)

    psi_not_null = psi_formula(expected_prop, actual_prop).sum()

    # ---- componente de nulos ----
    psi_null = 0.0
    if expected.isnull().any() or actual.isnull().any():
        expected_null_percentage = expected.isnull().mean()
        actual_null_percentage   = actual.isnull().mean()
        psi_null = psi_formula(
            np.array([expected_null_percentage]),
            np.array([actual_null_percentage])
        ).sum()

    return float(psi_not_null + psi_null)


In [78]:
def calcular_psi(train_data, score_data):
    psi_results = []
    for column in train_data.columns:
        if column not in [
            'foto_mes', 'clase_ternaria',
            'ctarjeta_master_descuentos','mtarjeta_master_descuentos',
            'mtarjeta_visa_descuentos','ctarjeta_visa_descuentos',
            'ccajeros_propios_descuentos','mcajeros_propios_descuentos'
        ]:
            train_variable  = train_data[column]
            score_variable  = score_data[column]
            psi_value       = psi(train_variable, score_variable)
            psi_results.append({'feature': column, 'psi': psi_value})

    psi_df = pd.DataFrame(psi_results)
    psi_df = psi_df.sort_values('psi', ascending=False)
    psi_df = psi_df[psi_df.psi > 0.1]

    return psi_df

psi_mes = {}
for mes in train_data.foto_mes.unique():
    d_train = train_data.loc[train_data.foto_mes == mes]
    psi_mes[mes] = calcular_psi(d_train, score_data)


In [94]:
def calcular_psi(train_data, score_data):
    psi_results = []
    print(f' Calculo PSI para {train_data.foto_mes.unique()}')
    for column in train_data.columns:
      if column not in ['foto_mes', 'clase_ternaria', 'ctarjeta_master_descuentos','mtarjeta_master_descuentos','mtarjeta_visa_descuentos','ctarjeta_visa_descuentos','ccajeros_propios_descuentos','mcajeros_propios_descuentos']:
        #print(column)
        if train_data[column].dtype.kind not in "iufc":
            print(f'columna {column} no es un int')
            continue

        train_variable = train_data[column]
        score_variable = score_data[column]
        psi_value = psi(train_variable, score_variable)
        psi_results.append({'feature': column, 'psi': psi_value})

    psi_df = pd.DataFrame(psi_results)
    psi_df = psi_df.sort_values('psi', ascending=False)
    psi_df = psi_df[psi_df.psi > 0.1]

    return psi_df

In [95]:
psi_mes = {}
for mes in train_data.foto_mes.unique():
    print(mes)
    d_train = train_data.loc[train_data.foto_mes == mes]
    psi_mes[mes] = calcular_psi(d_train, score_data)
    print(psi_mes[mes])


202106
 Calculo PSI para [202106]
columna tmobile_app no es un int
columna cmobile_app_trx no es un int
columna Master_Finiciomora no es un int
columna Visa_Finiciomora no es un int
                         feature       psi
73             q_producto_master  2.534624
74               q_producto_visa  0.549280
28                  cpayroll_trx  0.229340
138           Visa_mlimitecompra  0.222768
118  Master_mfinanciacion_limite  0.217175
124         Master_mlimitecompra  0.211481
132    Visa_mfinanciacion_limite  0.207960
141                 Visa_mpagado  0.196783
98                      mpayroll  0.157448
202105
 Calculo PSI para [202105]
columna tmobile_app no es un int
columna cmobile_app_trx no es un int
columna Master_Finiciomora no es un int
columna Visa_Finiciomora no es un int
                         feature       psi
118  Master_mfinanciacion_limite  0.172096
124         Master_mlimitecompra  0.167521
138           Visa_mlimitecompra  0.144594
132    Visa_mfinanciacion_limite  

In [96]:
array_df = []
for k,v in psi_mes.items():
    v['mes'] = k
    array_df.append(v)
array_df = pd.concat(array_df)

In [97]:
pd.pivot_table(array_df, index='feature', columns='mes', values='psi')

mes,202012,202101,202102,202103,202104,202105,202106
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Master_Fvencimiento,0.617444,0.542862,0.269267,0.1811,0.100316,,
Master_fultimo_cierre,,,11.431339,,,,
Master_mfinanciacion_limite,0.368448,0.364834,0.360032,0.325557,0.166555,0.172096,0.217175
Master_mlimitecompra,0.379874,0.376173,0.366448,0.335004,0.175404,0.167521,0.211481
Master_mpagado,0.392256,,,,,,
Visa_fultimo_cierre,,,10.629256,,,,
Visa_mfinanciacion_limite,0.251878,0.259053,0.257343,0.242606,0.134728,0.13639,0.20796
Visa_mlimitecompra,0.26263,0.270656,0.260215,0.24818,0.135581,0.144594,0.222768
Visa_mpagado,0.776761,,0.126883,,,,0.196783
cpayroll_trx,0.735267,,,,,,0.22934


In [None]:
['Master_Finiciomora',
'Master_Fvencimiento',
'Master_fultimo_cierre',
'Visa_Finiciomora',
'Visa_fultimo_cierre',
'Visa_mpagado',
'cpayroll_trx',
'mcaja_ahorro_adicional',
'mcomisiones',
'mcomisiones_mantenimiento',
'mcomisiones_otras',
'mpayroll',
'mrentabilidad',
'mtransferencias_recibidas']


In [None]:
['Master_Fvencimiento',
'Master_fultimo_cierre',
'Master_mfinanciacion_limite',
'Master_mlimitecompra',
'Master_mpagado',
'Visa_fultimo_cierre',
'Visa_mfinanciacion_limite',
'Visa_mlimitecompra',
'Visa_mpagado',
'cpayroll_trx',
'mpayroll',
'q_producto_master',
'q_producto_visa']
