# Crear clase ternaria con SQL (basado en Z201)

A partir de competencia_01_crudo, crea la columna clase_ternaria que se usará como target

In [29]:
%%bash
pip install duckdb
pip install jupysql
pip install duckdb-engine



Configuracion la extensión de %sql para el notebook, lo que nos va a permitir usar lenguaje SQL directamente sobre una celda

In [30]:
import duckdb
import pandas as pd

#%load_ext sql
%reload_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Y finalmente cargamos el dataset en la base de datos

In [32]:
# dataset_path = '/home/aleb/DMEyF/2024/datos/' #
base_path =  '/content/drive/MyDrive/Data Science y similares/Maestría Data Mining Exactas/dmeyf/dmeyf2024/'
dataset_path=base_path+'datasets/'
dataset_file = 'competencia_01_crudo.csv'


In [33]:
%%sql
create or replace table competencia_01_crudo as
select
    *
from read_csv_auto("{{dataset_path + dataset_file}}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [34]:

total_registros = %sql select count(*) as cant from competencia_01_crudo
print(type(total_registros))
print ("El resultado del select es de tipo dataframe")

cant_registros = total_registros.loc[0, 'cant']
print (f"Hay {cant_registros} registros.")




<class 'pandas.core.frame.DataFrame'>
El resultado del select es de tipo dataframe
Hay 981946 registros.


In [35]:
%%sql
create or replace table competencia_01 as
with periodos as (
    select distinct foto_mes from competencia_01_crudo order by foto_mes
), clientes as (
    select distinct numero_de_cliente from competencia_01_crudo
), todo as (
    select numero_de_cliente, foto_mes from clientes cross join periodos order by numero_de_cliente, foto_mes
), clase_ternaria as (
    select
        c.*
        , if(c.numero_de_cliente is null, 0, 1) as mes_0
        , lead(mes_0, 1) over (partition by t.numero_de_cliente order by foto_mes) as mes_1
        , lead(mes_0, 2) over (partition by t.numero_de_cliente order by foto_mes) as mes_2
        , if(mes_2==1, 'CONTINUA', if(mes_1==1,'BAJA+2', if(mes_1==0,'BAJA+1', null))) as clase_ternaria -- Replazar null por la lógica que genera el target
    from todo t
    left join competencia_01_crudo c using (numero_de_cliente, foto_mes)
) select
  * EXCLUDE (mes_0, mes_1, mes_2)
from clase_ternaria
where mes_0 = 1

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


Revisamos que todo salga como esperamos

In [40]:
%sql select count(*) from competencia_01

Unnamed: 0,count_star()
0,981946


In [41]:
%%sql
select * from competencia_01 limit 10

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,Visa_fultimo_cierre,Visa_mpagado,Visa_mpagospesos,Visa_mpagosdolares,Visa_fechaalta,Visa_mconsumototal,Visa_cconsumos,Visa_cadelantosefectivo,Visa_mpagominimo,clase_ternaria
0,252847986,202101,1,0,0,60,287,4671.75,12167.83,4302.5,...,4,0.0,-118702.84,0.0,8743,73666.46,24,0,16586.22,CONTINUA
1,252847986,202102,1,0,0,60,288,4574.95,18902.09,2114.74,...,-3,0.0,-166968.46,0.0,8771,87594.45,35,0,17782.68,CONTINUA
2,252847986,202103,1,0,0,60,289,5899.86,25522.57,6568.34,...,1,0.0,-184995.61,0.0,8802,61237.4,25,0,15307.65,CONTINUA
3,252847986,202104,1,0,0,61,290,-2494.93,23583.41,-522.57,...,2,0.0,-133267.27,0.0,8832,46893.11,19,0,21114.0,CONTINUA
4,252847986,202105,1,0,0,61,291,-96.39,20999.56,3379.62,...,5,0.0,-140580.85,0.0,8863,69113.1,19,0,14592.12,BAJA+2
5,252847986,202106,1,0,0,61,292,-381.14,17236.93,2422.86,...,0,0.0,-145305.16,0.0,8893,89133.61,37,0,15155.16,
6,255766969,202101,1,0,0,66,213,9103.41,12002.82,14159.51,...,25,62335.41,-70535.97,0.0,9215,52299.1,22,0,3847.44,BAJA+2
7,255766969,202102,1,0,0,66,214,-2128.25,10130.07,-329.53,...,18,56980.48,-62335.41,0.0,9243,49541.09,22,0,3777.06,CONTINUA
8,255766969,202103,1,0,0,66,215,9691.23,19528.66,10125.11,...,21,57121.79,-56980.48,0.0,9274,49900.72,24,0,3577.65,CONTINUA
9,255766969,202104,1,0,0,66,216,1609.02,20456.49,1209.91,...,23,68480.25,-57121.79,0.0,9304,60096.15,20,0,4164.15,CONTINUA


Y vemos la cardinalidad de las clases por periodo

In [42]:
%%sql
PIVOT competencia_01
on clase_ternaria
USING count(numero_de_cliente)
GROUP BY foto_mes

Unnamed: 0,foto_mes,BAJA+1,BAJA+2,CONTINUA
0,202101,636,160387,1003
1,202102,785,1017,160844
2,202103,1020,981,161684
3,202104,982,1189,161919
4,202105,1192,163431,0
5,202106,0,0,0


Y finalmente almacenamos el nuevo dataset para ser usado a continuación

In [39]:
%%sql COPY competencia_01 TO '{{dataset_path}}competencia_01_prueba.csv' (FORMAT CSV, HEADER)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success
