# Título  Variables con RF
## Autor: Jose Chelquer
## Fecha de última modificación: 21/11/2024
## Descripción:
Agrega features corriendo RF con lgbm.
Agrega variables por cada hoja del lgbm, indicando si la observación está o no ahi.

Para entrenar y predecir, no usa las variables relacionadas con foto_mes


## Parámetros

< Descripción de cada uno de los parámetros que utiliza el job >


In [1]:
# vm o local?
vm=True
if vm:
  usar_gdrive=False
else:
  usar_gdrive=True      #se va a usar google dirve


In [2]:
semillas = [17,19,23,29,31]
ganancia_acierto=273000
costo_estimulo=7000

In [3]:
# datos de entrenamiento
if vm:
  meses_entrenamiento=[202010, 202011, 202012]
  submuestrear=False
else:
  meses_entrenamiento=[202101, 202102, 202103]
  submuestrear=False

grabar_importancias=False          # Se puede pedir que grabe las importancias de variables como resultado secundario
importancias_file='importancias_rf.csv.gz'


## Input

< Archivos de datos (parquet.gz) con sus paths que van a consumirse por el job>

In [4]:
# El script se adapta a archivos .parquet o .parquet.gz
if vm:
  dataset_path = '/home/jose/buckets/b1/datasets/'
  dataset_file='k2_aumentada.parquet.gz'
else:
  dataset_path='/content/drive/MyDrive/Data Science y similares/Maestría Data Mining Exactas/dmeyf/dmeyf2024/datasets/'
  dataset_file='k2_sample_parquet.gz'

## Output

< Archivos, bases de datos, modelos que va a generar el job>

In [5]:
# el script se adapta a datasets .parquet o .gz
if vm:
  output_file='k2_aumentada_conRF.parquet.gz'
else:
  output_file='k2_sample.conRF.parquet.gz'


## Procesos

### Paquetes necesarios

In [6]:
#%pip install optuna==3.6.1
%pip install optuna==4.0


Note: you may need to restart the kernel to use updated packages.


## Código del proceso

< Todo el código a partir de aquí debe poder ejecutarse sin necesidad de parametrizar nada>

Instalamos, cargamos y seteamos el entorno

In [7]:
#%pip install scikit-learn==1.3.2
#%pip install seaborn==0.13.1
#%pip install numpy==1.26.4
#%pip install matplotlib==3.7.1



## Gdrive?

In [8]:
if usar_gdrive:
  from google.colab import drive
  drive.mount('/content/drive')

## Librerías

In [9]:
!pip install --upgrade pip
!pip install --upgrade lightgbm
!pip install dask[dataframe]



In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import shutil

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample

from joblib import Parallel, delayed

import optuna
from optuna.storages import JournalStorage
from optuna.storages.journal import JournalFileBackend
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time
from sqlalchemy import create_engine

import lightgbm as lgb
from math import exp

  from .autonotebook import tqdm as notebook_tqdm


## Leer dataset

In [11]:
data_original = pd.read_parquet(os.path.join(dataset_path, dataset_file))

In [12]:
def contabilizar(df, descripción):
  print (f'\nContabilización de la base {descripción}\n')
  print (f'\nShape: {df.shape}\n')
  print (f"\ por mes: \n{df['foto_mes'].value_counts()}\n")
  print (f"\nFilas por mes y clase: \n{pd.crosstab(df['foto_mes'], df['clase_ternaria'])}\n")
contabilizar(data_original, 'Datos Leídos')


Contabilización de la base Datos Leídos


Shape: (375309, 1080)

\ por mes: 
foto_mes
202108    165442
202104     18363
202105     18352
202106     18271
202103     18169
202102     17886
202101     17481
202012     17307
202011     17097
202010     16921
202009     16771
202007     16629
202008     16620
Name: count, dtype: int64


Filas por mes y clase: 
clase_ternaria  BAJA+1  BAJA+2  CONTINUA
foto_mes                                
202007             627     542     15460
202008             544     472     15604
202009             474     564     15733
202010             565     488     15868
202011             490     646     15961
202012             649     634     16024
202101             635     785     16061
202102             785    1017     16084
202103            1020     981     16168
202104             982    1189     16192
202105            1189     911     16252
202106             908    1074     16289
202108          165442       0         0



  print (f"\ por mes: \n{df['foto_mes'].value_counts()}\n")


## Recodificar a clase binaria

In [13]:
data_original['clase_peso'] = 1.0

data_original.loc[data_original['clase_ternaria'] == 'BAJA+2', 'clase_peso'] = 1.00002
data_original.loc[data_original['clase_ternaria'] == 'BAJA+1', 'clase_peso'] = 1.00001

In [14]:
data_original['clase_binaria1'] = 0
data_original['clase_binaria2'] = 0
data_original['clase_binaria1'] = np.where(data_original['clase_ternaria'] == 'BAJA+2', 1, 0)
data_original['clase_binaria2'] = np.where(data_original['clase_ternaria'] == 'CONTINUA', 0, 1)

In [15]:
# Elige los meses y balancea
if meses_entrenamiento:
  data = data_original[data_original['foto_mes'].isin(meses_entrenamiento)]
else:
  data = data_original.copy()
# Elimina las variables asociadas a foto_mes
cols_fotos=[col for col in data.columns if 'foto_mes_lag' in col]
data=data.drop(columns=cols_fotos)

#Submuestrear para balancear
if submuestrear:
  # Separar las clases
  data_0 = data[data['clase_binaria2'] == 0]
  data_1 = data[data['clase_binaria2'] == 1]
  n_samples=min(len(data_0), len(data_1))

  # Hacer un submuestreo de la clase mayoritaria
  data_0 = resample(data_0,
                    replace=False,  # No reemplace
                    n_samples=n_samples,  # Igualar tamaño de la clase minoritaria
                    random_state=123)  # Para reproducibilidad
  data_1 = resample(data_1,
                    replace=False,  # No reemplace
                    n_samples=n_samples,  # Igualar tamaño de la clase minoritaria
                    random_state=123)  # Para reproducibilidad

  # Combinar clases balanceadas
  data_balanced = pd.concat([data_0, data_1])

  # Mezclar el DataFrame resultante
  data = data_balanced.sample(frac=1, random_state=123).reset_index(drop=True)

contabilizar (data, 'Datos para hacer RF')

y = data['clase_binaria2']
X=data.drop(['clase_ternaria','clase_peso', 'clase_binaria1','clase_binaria2'], axis=1)




Contabilización de la base Datos para hacer RF


Shape: (51325, 1083)

\ por mes: 
foto_mes
202012    17307
202011    17097
202010    16921
Name: count, dtype: int64


Filas por mes y clase: 
clase_ternaria  BAJA+1  BAJA+2  CONTINUA
foto_mes                                
202010             565     488     15868
202011             490     646     15961
202012             649     634     16024



## Función ganancia

In [16]:
def rf_gan_eval(y_pred, data):
    clase_ternaria = data.get_clase_ternaria()
    # Diferencia si eran BAJA+1 o BAJA+2
    ganancia = np.where(clase_ternaria == 'BAJA+2', ganancia_acierto, 0) - np.where(clase_ternaria !='BAJA+2', costo_estimulo, 0)
    #Ordena ganancia según los índices ordenados de y_pred de mayor a menor
    ganancia = ganancia[np.argsort(y_pred)[::-1]] #: desde todo : hasta todo :-1 step hacia atrás
    # Ganancias acumuladas so far
    ganancia = np.cumsum(ganancia)

    return 'gan_eval', np.max(ganancia) , True

def ganancia_prob(y_hat, y, prop=1, class_index=1, threshold=0.025):
  @np.vectorize
  def ganancia_row(predicted, actual, threshold=0.025):
    return  (predicted >= threshold) * (ganancia_acierto if actual == "BAJA+2" else -costo_estimulo)

  return ganancia_row(y_hat[:,class_index], y).sum() / prop



## Imputar NANs

In [17]:
print (f'Cant de Nans:{X.isnull().sum()}')
#imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
#if X.isnull().values.any():
#  X = imp_mean.fit_transform(X)


Cant de Nans:numero_de_cliente                     0
foto_mes                              0
active_quarter                        0
cliente_vip                           0
internet                              0
                                  ...  
Tarjeta_mpagosdolares_n10_lag2    48082
Tarjeta_mconsumototal_n10_lag1    32635
Tarjeta_mconsumototal_n10_lag2    48082
Tarjeta_mpagominimo_n10_lag1      32635
Tarjeta_mpagominimo_n10_lag2      48082
Length: 1079, dtype: int64


## Ajustar

In [18]:
# Cambio
# Fitear el modelo con X e y
model = lgb.LGBMClassifier(
    # parametros que se pueden cambiar
    num_iterations = 20,
    num_leaves  = 16,
    min_data_in_leaf = 1000,
    feature_fraction_bynode  = 0.2,

    # para que LightGBM emule Random Forest
    boosting_type = "rf",
    bagging_fraction = ( 1.0 - 1.0/exp(1.0) ),
    bagging_freq = 1,
    feature_fraction = 1,

    # genericos de LightGBM
    max_bin = 31,
    objective = "binary",
    first_metric_only = True,
    boost_from_average = True,
    feature_pre_filter = False,
    force_row_wise = True,
    verbosity = -100,
    max_depth = -1,
    min_gain_to_split = 0.0,
    min_sum_hessian_in_leaf = 0.001,
    lambda_l1 = 0.0,
    lambda_l2 = 0.0,

    pos_bagging_fraction = 1.0,
    neg_bagging_fraction = 1.0,
    is_unbalance = True,
    scale_pos_weight = 1.0,

    drop_rate = 0.1,
    max_drop = 50,
    skip_drop = 0.5,

    extra_trees = False
  )
print("Fiteando")
model.fit(X, y)
print("Fin del fit")



Fiteando




Fin del fit


In [19]:
features = data.drop(['clase_ternaria','clase_peso', 'clase_binaria1','clase_binaria2'], axis=1).columns
importances = model.feature_importances_
feat_importances = pd.DataFrame({'feature': features, 'importance': importances})
feat_importances = feat_importances.sort_values('importance', ascending=False)
print(feat_importances.head(20))

if grabar_importancias:
    if importancias_file.endswith('.gz'):
      feat_importances.to_parquet(os.path.join(dataset_path,  importancias_file), index=False, compression='gzip')
    else:
      feat_importances.to_parquet(os.path.join(dataset_path,  importancias_file), index=False)

                                      feature  importance
211           ratio_mcomisiones_mantenimiento          19
52                                   mpayroll          12
201                            ratio_mpayroll          10
212                   ratio_mcomisiones_otras          10
107                              ctrx_quarter          10
181                         ratio_mcomisiones          10
274                       mpasivos_margen_n10           9
51                               cpayroll_trx           9
180                ratio_mrentabilidad_annual           8
73                          ccomisiones_otras           7
155                    ctarjeta_transacciones           7
28                      mtarjeta_visa_consumo           7
280                        mcuentas_saldo_n10           5
461                             mpayroll_lag1           5
27                ctarjeta_visa_transacciones           5
188                ratio_mcaja_ahorro_dolares           5
25            

## Crear variables RF

In [20]:
# Ahora, con todos los datos
data = data_original.copy()
cols_fotos=[col for col in data.columns if 'foto_mes_lag' in col]
data=data.drop(columns=cols_fotos)
y_completo=data['clase_binaria2']
X_completo=data.drop(['clase_ternaria','clase_peso', 'clase_binaria1','clase_binaria2'], axis=1)


In [21]:
# Obtener las hojas para cada observación
leaf_indices = model.predict(X_completo, pred_leaf=True)
leaf_indices.shape

(375309, 20)

In [22]:
print(model.get_params())

{'boosting_type': 'rf', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 16, 'objective': 'binary', 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'num_iterations': 20, 'min_data_in_leaf': 1000, 'feature_fraction_bynode': 0.2, 'bagging_fraction': 0.6321205588285577, 'bagging_freq': 1, 'feature_fraction': 1, 'max_bin': 31, 'first_metric_only': True, 'boost_from_average': True, 'feature_pre_filter': False, 'force_row_wise': True, 'verbosity': -100, 'min_gain_to_split': 0.0, 'min_sum_hessian_in_leaf': 0.001, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'pos_bagging_fraction': 1.0, 'neg_bagging_fraction': 1.0, 'is_unbalance': True, 'scale_pos_weight': 1.0, 'drop_rate': 0.1, 'max_drop': 50, 'skip_drop': 0.5, 'extra_trees': False}


In [23]:
# Crear el OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)  # sparse_output=False para obtener un array denso
# Ajustar y transformar los índices de las hojas
leaf_dummies = encoder.fit_transform(leaf_indices)
print (f'Shape de leaf_dummies: {leaf_dummies.shape}')

# Obtener nombres de las columnas para las variables dummy
n_trees = leaf_indices.shape[1]
print (f"Cant de árboles: {n_trees}")
leaf_columns = [f'tree_{i}_leaf_{leaf}' for i in range(n_trees) for leaf in range(encoder.categories_[i].size)]

# Convertir a DataFrame
leaf_dummies_df = pd.DataFrame(leaf_dummies, columns=leaf_columns)
data_extended=data_original.copy()
num_var=0
for campo in leaf_dummies_df.columns:
  num_var+=1
  print (f'{num_var} Agregando campo: {campo}')
  data_extended[campo]=leaf_dummies_df[campo]
print (f'Nuevo shape: {data_extended.shape}')

Shape de leaf_dummies: (375309, 320)
Cant de árboles: 20
1 Agregando campo: tree_0_leaf_0
2 Agregando campo: tree_0_leaf_1
3 Agregando campo: tree_0_leaf_2
4 Agregando campo: tree_0_leaf_3
5 Agregando campo: tree_0_leaf_4
6 Agregando campo: tree_0_leaf_5
7 Agregando campo: tree_0_leaf_6
8 Agregando campo: tree_0_leaf_7
9 Agregando campo: tree_0_leaf_8
10 Agregando campo: tree_0_leaf_9
11 Agregando campo: tree_0_leaf_10
12 Agregando campo: tree_0_leaf_11
13 Agregando campo: tree_0_leaf_12
14 Agregando campo: tree_0_leaf_13
15 Agregando campo: tree_0_leaf_14
16 Agregando campo: tree_0_leaf_15
17 Agregando campo: tree_1_leaf_0
18 Agregando campo: tree_1_leaf_1
19 Agregando campo: tree_1_leaf_2
20 Agregando campo: tree_1_leaf_3
21 Agregando campo: tree_1_leaf_4
22 Agregando campo: tree_1_leaf_5
23 Agregando campo: tree_1_leaf_6
24 Agregando campo: tree_1_leaf_7
25 Agregando campo: tree_1_leaf_8
26 Agregando campo: tree_1_leaf_9
27 Agregando campo: tree_1_leaf_10
28 Agregando campo: tree_1_

  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

104 Agregando campo: tree_6_leaf_7
105 Agregando campo: tree_6_leaf_8
106 Agregando campo: tree_6_leaf_9
107 Agregando campo: tree_6_leaf_10
108 Agregando campo: tree_6_leaf_11
109 Agregando campo: tree_6_leaf_12
110 Agregando campo: tree_6_leaf_13
111 Agregando campo: tree_6_leaf_14
112 Agregando campo: tree_6_leaf_15
113 Agregando campo: tree_7_leaf_0
114 Agregando campo: tree_7_leaf_1
115 Agregando campo: tree_7_leaf_2
116 Agregando campo: tree_7_leaf_3
117 Agregando campo: tree_7_leaf_4
118 Agregando campo: tree_7_leaf_5
119 Agregando campo: tree_7_leaf_6
120 Agregando campo: tree_7_leaf_7
121 Agregando campo: tree_7_leaf_8
122 Agregando campo: tree_7_leaf_9
123 Agregando campo: tree_7_leaf_10
124 Agregando campo: tree_7_leaf_11
125 Agregando campo: tree_7_leaf_12


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

126 Agregando campo: tree_7_leaf_13
127 Agregando campo: tree_7_leaf_14
128 Agregando campo: tree_7_leaf_15
129 Agregando campo: tree_8_leaf_0
130 Agregando campo: tree_8_leaf_1
131 Agregando campo: tree_8_leaf_2
132 Agregando campo: tree_8_leaf_3
133 Agregando campo: tree_8_leaf_4
134 Agregando campo: tree_8_leaf_5
135 Agregando campo: tree_8_leaf_6
136 Agregando campo: tree_8_leaf_7
137 Agregando campo: tree_8_leaf_8
138 Agregando campo: tree_8_leaf_9
139 Agregando campo: tree_8_leaf_10
140 Agregando campo: tree_8_leaf_11
141 Agregando campo: tree_8_leaf_12
142 Agregando campo: tree_8_leaf_13
143 Agregando campo: tree_8_leaf_14
144 Agregando campo: tree_8_leaf_15
145 Agregando campo: tree_9_leaf_0
146 Agregando campo: tree_9_leaf_1
147 Agregando campo: tree_9_leaf_2
148 Agregando campo: tree_9_leaf_3


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

149 Agregando campo: tree_9_leaf_4
150 Agregando campo: tree_9_leaf_5
151 Agregando campo: tree_9_leaf_6
152 Agregando campo: tree_9_leaf_7
153 Agregando campo: tree_9_leaf_8
154 Agregando campo: tree_9_leaf_9
155 Agregando campo: tree_9_leaf_10
156 Agregando campo: tree_9_leaf_11
157 Agregando campo: tree_9_leaf_12
158 Agregando campo: tree_9_leaf_13
159 Agregando campo: tree_9_leaf_14
160 Agregando campo: tree_9_leaf_15
161 Agregando campo: tree_10_leaf_0
162 Agregando campo: tree_10_leaf_1
163 Agregando campo: tree_10_leaf_2
164 Agregando campo: tree_10_leaf_3
165 Agregando campo: tree_10_leaf_4
166 Agregando campo: tree_10_leaf_5
167 Agregando campo: tree_10_leaf_6
168 Agregando campo: tree_10_leaf_7
169 Agregando campo: tree_10_leaf_8
170 Agregando campo: tree_10_leaf_9
171 Agregando campo: tree_10_leaf_10


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

172 Agregando campo: tree_10_leaf_11
173 Agregando campo: tree_10_leaf_12
174 Agregando campo: tree_10_leaf_13
175 Agregando campo: tree_10_leaf_14
176 Agregando campo: tree_10_leaf_15
177 Agregando campo: tree_11_leaf_0
178 Agregando campo: tree_11_leaf_1
179 Agregando campo: tree_11_leaf_2
180 Agregando campo: tree_11_leaf_3
181 Agregando campo: tree_11_leaf_4
182 Agregando campo: tree_11_leaf_5
183 Agregando campo: tree_11_leaf_6
184 Agregando campo: tree_11_leaf_7
185 Agregando campo: tree_11_leaf_8
186 Agregando campo: tree_11_leaf_9
187 Agregando campo: tree_11_leaf_10
188 Agregando campo: tree_11_leaf_11
189 Agregando campo: tree_11_leaf_12
190 Agregando campo: tree_11_leaf_13
191 Agregando campo: tree_11_leaf_14
192 Agregando campo: tree_11_leaf_15
193 Agregando campo: tree_12_leaf_0


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

194 Agregando campo: tree_12_leaf_1
195 Agregando campo: tree_12_leaf_2
196 Agregando campo: tree_12_leaf_3
197 Agregando campo: tree_12_leaf_4
198 Agregando campo: tree_12_leaf_5
199 Agregando campo: tree_12_leaf_6
200 Agregando campo: tree_12_leaf_7
201 Agregando campo: tree_12_leaf_8
202 Agregando campo: tree_12_leaf_9
203 Agregando campo: tree_12_leaf_10
204 Agregando campo: tree_12_leaf_11
205 Agregando campo: tree_12_leaf_12
206 Agregando campo: tree_12_leaf_13
207 Agregando campo: tree_12_leaf_14
208 Agregando campo: tree_12_leaf_15
209 Agregando campo: tree_13_leaf_0
210 Agregando campo: tree_13_leaf_1
211 Agregando campo: tree_13_leaf_2
212 Agregando campo: tree_13_leaf_3
213 Agregando campo: tree_13_leaf_4
214 Agregando campo: tree_13_leaf_5
215 Agregando campo: tree_13_leaf_6
216 Agregando campo: tree_13_leaf_7


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

217 Agregando campo: tree_13_leaf_8
218 Agregando campo: tree_13_leaf_9
219 Agregando campo: tree_13_leaf_10
220 Agregando campo: tree_13_leaf_11
221 Agregando campo: tree_13_leaf_12
222 Agregando campo: tree_13_leaf_13
223 Agregando campo: tree_13_leaf_14
224 Agregando campo: tree_13_leaf_15
225 Agregando campo: tree_14_leaf_0
226 Agregando campo: tree_14_leaf_1
227 Agregando campo: tree_14_leaf_2
228 Agregando campo: tree_14_leaf_3
229 Agregando campo: tree_14_leaf_4
230 Agregando campo: tree_14_leaf_5
231 Agregando campo: tree_14_leaf_6
232 Agregando campo: tree_14_leaf_7
233 Agregando campo: tree_14_leaf_8
234 Agregando campo: tree_14_leaf_9
235 Agregando campo: tree_14_leaf_10
236 Agregando campo: tree_14_leaf_11
237 Agregando campo: tree_14_leaf_12
238 Agregando campo: tree_14_leaf_13
239 Agregando campo: tree_14_leaf_14


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

240 Agregando campo: tree_14_leaf_15
241 Agregando campo: tree_15_leaf_0
242 Agregando campo: tree_15_leaf_1
243 Agregando campo: tree_15_leaf_2
244 Agregando campo: tree_15_leaf_3
245 Agregando campo: tree_15_leaf_4
246 Agregando campo: tree_15_leaf_5
247 Agregando campo: tree_15_leaf_6
248 Agregando campo: tree_15_leaf_7
249 Agregando campo: tree_15_leaf_8
250 Agregando campo: tree_15_leaf_9
251 Agregando campo: tree_15_leaf_10
252 Agregando campo: tree_15_leaf_11
253 Agregando campo: tree_15_leaf_12
254 Agregando campo: tree_15_leaf_13
255 Agregando campo: tree_15_leaf_14
256 Agregando campo: tree_15_leaf_15
257 Agregando campo: tree_16_leaf_0
258 Agregando campo: tree_16_leaf_1
259 Agregando campo: tree_16_leaf_2
260 Agregando campo: tree_16_leaf_3
261 Agregando campo: tree_16_leaf_4


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

262 Agregando campo: tree_16_leaf_5
263 Agregando campo: tree_16_leaf_6
264 Agregando campo: tree_16_leaf_7
265 Agregando campo: tree_16_leaf_8
266 Agregando campo: tree_16_leaf_9
267 Agregando campo: tree_16_leaf_10
268 Agregando campo: tree_16_leaf_11
269 Agregando campo: tree_16_leaf_12
270 Agregando campo: tree_16_leaf_13
271 Agregando campo: tree_16_leaf_14
272 Agregando campo: tree_16_leaf_15
273 Agregando campo: tree_17_leaf_0
274 Agregando campo: tree_17_leaf_1
275 Agregando campo: tree_17_leaf_2
276 Agregando campo: tree_17_leaf_3
277 Agregando campo: tree_17_leaf_4
278 Agregando campo: tree_17_leaf_5
279 Agregando campo: tree_17_leaf_6
280 Agregando campo: tree_17_leaf_7
281 Agregando campo: tree_17_leaf_8
282 Agregando campo: tree_17_leaf_9
283 Agregando campo: tree_17_leaf_10
284 Agregando campo: tree_17_leaf_11


  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummie

285 Agregando campo: tree_17_leaf_12
286 Agregando campo: tree_17_leaf_13
287 Agregando campo: tree_17_leaf_14
288 Agregando campo: tree_17_leaf_15
289 Agregando campo: tree_18_leaf_0
290 Agregando campo: tree_18_leaf_1
291 Agregando campo: tree_18_leaf_2
292 Agregando campo: tree_18_leaf_3
293 Agregando campo: tree_18_leaf_4
294 Agregando campo: tree_18_leaf_5
295 Agregando campo: tree_18_leaf_6
296 Agregando campo: tree_18_leaf_7
297 Agregando campo: tree_18_leaf_8
298 Agregando campo: tree_18_leaf_9
299 Agregando campo: tree_18_leaf_10
300 Agregando campo: tree_18_leaf_11
301 Agregando campo: tree_18_leaf_12
302 Agregando campo: tree_18_leaf_13
303 Agregando campo: tree_18_leaf_14
304 Agregando campo: tree_18_leaf_15
305 Agregando campo: tree_19_leaf_0
306 Agregando campo: tree_19_leaf_1
307 Agregando campo: tree_19_leaf_2
308 Agregando campo: tree_19_leaf_3
309 Agregando campo: tree_19_leaf_4
310 Agregando campo: tree_19_leaf_5
311 Agregando campo: tree_19_leaf_6
312 Agregando camp

  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]
  data_extended[campo]=leaf_dummies_df[campo]


## Ver resultado

In [24]:
num_variables_originales=data_original.shape[1]
num_variables_tree = len([col for col in data_extended.columns if col.startswith('tree')])
num_variables_extended=data_extended.shape[1]
print (f'Cant de variables originales: {num_variables_originales}')
print (f'Cant de variables tree: {num_variables_tree}')
print (f'Cant de variables extended: {num_variables_extended}')
print(f'Archivo a grabar: {output_file}')

Cant de variables originales: 1083
Cant de variables tree: 320
Cant de variables extended: 1403
Archivo a grabar: k2_aumentada_conRF.parquet.gz


In [25]:
# Filtra las columnas que empiezan con 'tree'
tree_columns = [col for col in data_extended.columns if col.startswith('tree')]

# Calcula la suma para cada registro en estas columnas
suma_tree = data_extended[tree_columns].sum(axis=1)

# Muestra el resultado
print(suma_tree)

0         20.0
1         20.0
2         20.0
3         20.0
4         20.0
          ... 
375304    20.0
375305    20.0
375306    20.0
375307    20.0
375308    20.0
Length: 375309, dtype: float64


In [26]:
contabilizar(data_extended, 'Datos extendidos con RF')


Contabilización de la base Datos extendidos con RF


Shape: (375309, 1403)

\ por mes: 
foto_mes
202108    165442
202104     18363
202105     18352
202106     18271
202103     18169
202102     17886
202101     17481
202012     17307
202011     17097
202010     16921
202009     16771
202007     16629
202008     16620
Name: count, dtype: int64


Filas por mes y clase: 
clase_ternaria  BAJA+1  BAJA+2  CONTINUA
foto_mes                                
202007             627     542     15460
202008             544     472     15604
202009             474     564     15733
202010             565     488     15868
202011             490     646     15961
202012             649     634     16024
202101             635     785     16061
202102             785    1017     16084
202103            1020     981     16168
202104             982    1189     16192
202105            1189     911     16252
202106             908    1074     16289
202108          165442       0         0



## Grabar salida

In [27]:
data_original.dtypes

numero_de_cliente                 int64
foto_mes                          int64
active_quarter                    int64
cliente_vip                       int64
internet                          int64
                                 ...   
Tarjeta_mpagominimo_n10_lag1    float64
Tarjeta_mpagominimo_n10_lag2    float64
clase_peso                      float64
clase_binaria1                    int64
clase_binaria2                    int64
Length: 1083, dtype: object

In [28]:
data_extended.dtypes

numero_de_cliente      int64
foto_mes               int64
active_quarter         int64
cliente_vip            int64
internet               int64
                      ...   
tree_19_leaf_11      float64
tree_19_leaf_12      float64
tree_19_leaf_13      float64
tree_19_leaf_14      float64
tree_19_leaf_15      float64
Length: 1403, dtype: object

In [None]:
# Grabar el archivo
if output_file.endswith('.gz'):
    data_extended.to_parquet(os.path.join(dataset_path, output_file), index=False, compression='gzip')
else:
    data_extended.to_parquet(os.path.join(dataset_path, output_file), index=False)



In [None]:
print (output_file)