In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer, MeanMedianImputer, CategoricalImputer
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders.woe import WOEEncoder
from category_encoders import TargetEncoder

  from pandas.core import (


In [2]:
# Load the combined train/test dataset from a CSV file in the working directory.
df = pd.read_csv('df_trainTest.csv')

In [3]:
# Binary flags: 1 when INGRESONETOTARGET_PROMULT03M is at least the threshold, 0 otherwise.
# Missing values compare as False, so they are flagged 0.
income_reference = df['INGRESONETOTARGET_PROMULT03M']
for flag_name, threshold in (('flg_4k', 4000), ('flg_8k', 8000)):
    df[flag_name] = np.where(income_reference >= threshold, 1, 0)

In [4]:
# Display (rows, columns) after adding the two flag columns.
df.shape

(333632, 60)

In [5]:
# Peek at two random rows (no seed is set, so the rows differ between runs).
# NOTE(review): the rendered rows include COD_DNI — confirm this identifier may appear in shared outputs.
df.sample(2)

Unnamed: 0,PERIODO,COD_DNI,INGRESONETOTARGET_ULT01M,INGRESONETOTARGET_PROMULT03M,MESESCONINGRESOS,FLG_ATIPICO,FLG_INMUEBLE,NRO_INMUEBLES,SALDOPROPIEDAD_MAX,SALDOHIPOTECARIO_MAX,...,FLG_SOLOSIS,MARCAANALISIS,SEGMENTO,PESOVIVIENDA,PESOPROPIEDADES,PESOUNIVERSIDAD,PESOCARRERA,PESOEXPERIENCIAPROFESIONAL,flg_4k,flg_8k
179386,202409,41385735,3869.544397,3869.544397,3,0,1,4,321431.96,321431.96,...,0.0,trainTest,Alto,3879,5227,4462,6415,4845,0,0
234991,202405,27993721,2377.295,2377.295,3,0,0,0,0.0,0.0,...,0.0,trainTest,Bajo,3879,3475,2584,2584,2584,0,0


In [6]:
# Remove the column display limit so wide frames are rendered without truncation.
pd.set_option('display.max_columns', None)

In [7]:
# List every column name available in the loaded frame.
df.columns

Index(['PERIODO', 'COD_DNI', 'INGRESONETOTARGET_ULT01M',
       'INGRESONETOTARGET_PROMULT03M', 'MESESCONINGRESOS', 'FLG_ATIPICO',
       'FLG_INMUEBLE', 'NRO_INMUEBLES', 'SALDOPROPIEDAD_MAX',
       'SALDOHIPOTECARIO_MAX', 'ESTRATO', 'FLG_ESTRATO5', 'FLG_ESTRATO4',
       'FLG_ESTRATO3', 'FLG_ESTRATO2', 'FLG_ESTRATO1', 'FLG_LIMAMODERNA',
       'PROVINCIADISTRITO', 'GRUPO_DEPARTAMENTO', 'FLG_GRUPO_DEPARTAMENTO5',
       'FLG_GRUPO_DEPARTAMENTO4', 'FLG_GRUPO_DEPARTAMENTO3',
       'FLG_GRUPO_DEPARTAMENTO2', 'FLG_GRUPO_DEPARTAMENTO1', 'FLG_LIMACALLAO',
       'PRECIOPROMEDIOINMUEBLE', 'FLG_CONSULTADOSUNEDU',
       'FLG_REPRESENTANTELEGAL', 'FLG_SUNEDU', 'INSTITUCION_PREGRADO',
       'GRUPO_UNIVERSIDAD_PRESTIGIO', 'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO5',
       'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO4', 'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO3',
       'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO2', 'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO1',
       'FLG_MAESTRIA', 'FLG_ESPECIALIZACION',
       'EXPERIENCIAPROFESIONALENCOSE

In [8]:
# Column group named "llave".
# NOTE(review): presumably identifier/target/metadata columns; INGRESO_PLD, SEGMENTO and
# PRESEGMENTO are also listed in `num` / `cat` — confirm that overlap is intended.
llave = [
    'PERIODO',
    'COD_DNI',
    'INGRESONETOTARGET_ULT01M',
    'MESESCONINGRESOS',
    'FLG_ATIPICO',
    'INGRESONETOTARGET_PROMULT03M',
    'MARCAANALISIS',
    'INGRESO_PLD',
    'SEGMENTO',
    'PRESEGMENTO',
]

In [9]:
# Binary flag columns (all names start with FLG_), grouped by prefix family.
flgs = [
    # standalone flags
    'FLG_INMUEBLE',
    'FLG_LIMAMODERNA',
    'FLG_LIMACALLAO',
    # FLG_GRUPO_DEPARTAMENTO 5..1
    'FLG_GRUPO_DEPARTAMENTO5',
    'FLG_GRUPO_DEPARTAMENTO4',
    'FLG_GRUPO_DEPARTAMENTO3',
    'FLG_GRUPO_DEPARTAMENTO2',
    'FLG_GRUPO_DEPARTAMENTO1',
    # FLG_ESTRATO 5..1
    'FLG_ESTRATO5',
    'FLG_ESTRATO4',
    'FLG_ESTRATO3',
    'FLG_ESTRATO2',
    'FLG_ESTRATO1',
    # standalone flags
    'FLG_REPRESENTANTELEGAL',
    'FLG_SUNEDU',
    'FLG_MAESTRIA',
    'FLG_ESPECIALIZACION',
    # FLG_GRUPO_UNIVERSIDAD_PRESTIGIO 5..1
    'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO5',
    'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO4',
    'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO3',
    'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO2',
    'FLG_GRUPO_UNIVERSIDAD_PRESTIGIO1',
    # FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA 5..1
    'FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA5',
    'FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA4',
    'FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA3',
    'FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA2',
    'FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA1',
    'FLG_SOLOSIS',
]

In [10]:
# Numeric feature columns; used below for the null check of the numeric section.
num = [
    'NRO_INMUEBLES',
    'SALDOPROPIEDAD_MAX',
    'SALDOHIPOTECARIO_MAX',
    'PRECIOPROMEDIOINMUEBLE',
    'EXPERIENCIAPROFESIONALENCOSECHA',
    'INGRESOMICARRERA',
    'EDADENCOSECHA',
    'INGRESO_PLD',
    'PESOVIVIENDA',
    'PESOPROPIEDADES',
    'PESOUNIVERSIDAD',
    'PESOCARRERA',
    'PESOEXPERIENCIAPROFESIONAL',
]

In [11]:
# Categorical feature columns; these are the columns passed to the WoE and target encoders.
cat = [
    'ESTRATO',
    'PROVINCIADISTRITO',
    'GRUPO_DEPARTAMENTO',
    'INSTITUCION_PREGRADO',
    'GRUPO_UNIVERSIDAD_PRESTIGIO',
    'CARRERA_PREGRADOHOMOLOGADA',
    'GRUPO_CARRERA_PREGRADOHOMOLOGADA',
    'PRESEGMENTO',
    'SEGMENTO',
]

# 1. Dividir Train y Test

In [12]:
# Random 75/25 row split with a fixed seed so the partition is reproducible.
# NOTE(review): rows are split individually; if a COD_DNI appears in several PERIODO values it
# can land in both partitions — confirm whether a grouped split is required.
df_train, df_test = train_test_split(df, test_size=0.25, random_state=42)

In [13]:
# Display the (rows, columns) of each partition.
df_train.shape, df_test.shape

((250224, 60), (83408, 60))

# 2. Tratamiento de nulos

In [14]:
# Missing-value treatment, fitted on the training partition only.
# Step names are kept as-is because later cells look them up through `named_steps`.

# Fill missing balances with 0.
zero_fill = ArbitraryNumberImputer(
    arbitrary_number=0,
    variables=['SALDOPROPIEDAD_MAX', 'SALDOHIPOTECARIO_MAX'],
)
# Fill these two flags with 1 when missing.
one_fill = ArbitraryNumberImputer(
    arbitrary_number=1,
    variables=['FLG_LIMACALLAO', 'FLG_SOLOSIS'],
)
# Fill with the median computed on the data passed to fit().
median_fill = MeanMedianImputer(
    imputation_method="median",
    variables=['EXPERIENCIAPROFESIONALENCOSECHA', 'EDADENCOSECHA'],
)

pipe_nulos = Pipeline(
    [
        ("arbitrary_imputer", zero_fill),
        ("arbitrary_imputer2", one_fill),
        ("median_imputer", median_fill),
    ]
)
pipe_nulos.fit(df_train)

In [15]:
# Inspect the fill value stored per variable by the first (zero) imputer.
pipe_nulos.named_steps["arbitrary_imputer"].imputer_dict_

{'SALDOPROPIEDAD_MAX': 0, 'SALDOHIPOTECARIO_MAX': 0}

In [16]:
# Inspect the fill value stored per variable by the second (one) imputer.
pipe_nulos.named_steps["arbitrary_imputer2"].imputer_dict_

{'FLG_LIMACALLAO': 1, 'FLG_SOLOSIS': 1}

In [17]:
# Inspect the medians learned from the training partition.
pipe_nulos.named_steps["median_imputer"].imputer_dict_

{'EXPERIENCIAPROFESIONALENCOSECHA': 10.0, 'EDADENCOSECHA': 48.0}

In [18]:
# Apply the fitted imputers to both partitions (statistics come from the training fit).
df_train, df_test = [pipe_nulos.transform(partition) for partition in (df_train, df_test)]

In [19]:
# Check that imputation left the row and column counts unchanged.
df_train.shape, df_test.shape

((250224, 60), (83408, 60))

# 3. Categóricas

In [20]:
# Per-column summary of the categorical features on the training partition:
# missing count, missing ratio and number of distinct values.
cat_train = df_train[cat]
cat_missing = cat_train.isnull().sum()
null_values = pd.DataFrame(
    {
        'number_null_values': cat_missing,
        'ratio_null_values': cat_missing / len(cat_train),
        'distinct_values': cat_train.nunique(),
    }
)
null_values

Unnamed: 0,number_null_values,ratio_null_values,distinct_values
ESTRATO,0,0.0,5
PROVINCIADISTRITO,0,0.0,205
GRUPO_DEPARTAMENTO,0,0.0,5
INSTITUCION_PREGRADO,0,0.0,89
GRUPO_UNIVERSIDAD_PRESTIGIO,0,0.0,6
CARRERA_PREGRADOHOMOLOGADA,0,0.0,54
GRUPO_CARRERA_PREGRADOHOMOLOGADA,0,0.0,6
PRESEGMENTO,0,0.0,4
SEGMENTO,0,0.0,3


In [21]:
# Weight-of-Evidence encoder for the categorical features, fitted on the training partition
# against the binary target flg_4k.
# The columns come from the shared `cat` list defined earlier in the notebook instead of a
# second hard-coded copy, so the encoder and the feature list cannot drift apart.
pipe_woe_encoder_4k = Pipeline(
    [
        ("woe_encoder", WOEEncoder(cols=cat)),
    ]
)

pipe_woe_encoder_4k.fit(df_train[cat], df_train['flg_4k'])

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

In [22]:
# Encode the categorical columns with the flg_4k WoE mapping and tag them with '_woe4k'.
df_train_woe4k = pipe_woe_encoder_4k.transform(df_train[cat]).add_suffix('_woe4k')
df_test_woe4k = pipe_woe_encoder_4k.transform(df_test[cat]).add_suffix('_woe4k')

# Drop any earlier copy of the encoded columns before appending, so re-running this cell
# replaces them instead of leaving duplicated column names in the frames.
df_train = pd.concat(
    [df_train.drop(columns=df_train_woe4k.columns, errors='ignore'), df_train_woe4k],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=df_test_woe4k.columns, errors='ignore'), df_test_woe4k],
    axis=1,
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [23]:
# Expect 9 extra columns (one '_woe4k' column per categorical feature).
df_train.shape, df_test.shape

((250224, 69), (83408, 69))

In [24]:
# Weight-of-Evidence encoder for the categorical features, fitted on the training partition
# against the binary target flg_8k.
# The columns come from the shared `cat` list defined earlier in the notebook instead of a
# second hard-coded copy, so the encoder and the feature list cannot drift apart.
pipe_woe_encoder_8k = Pipeline(
    [
        ("woe_encoder", WOEEncoder(cols=cat)),
    ]
)

pipe_woe_encoder_8k.fit(df_train[cat], df_train['flg_8k'])

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

In [25]:
# Encode the categorical columns with the flg_8k WoE mapping and tag them with '_woe8k'.
df_train_woe8k = pipe_woe_encoder_8k.transform(df_train[cat]).add_suffix('_woe8k')
df_test_woe8k = pipe_woe_encoder_8k.transform(df_test[cat]).add_suffix('_woe8k')

# Drop any earlier copy of the encoded columns before appending, so re-running this cell
# replaces them instead of leaving duplicated column names in the frames.
df_train = pd.concat(
    [df_train.drop(columns=df_train_woe8k.columns, errors='ignore'), df_train_woe8k],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=df_test_woe8k.columns, errors='ignore'), df_test_woe8k],
    axis=1,
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [26]:
# Expect 9 more columns (one '_woe8k' column per categorical feature).
df_train.shape, df_test.shape

((250224, 78), (83408, 78))

In [27]:
# Target encoder for the categorical features, fitted on the training partition against the
# continuous column INGRESONETOTARGET_PROMULT03M.
# The columns come from the shared `cat` list defined earlier in the notebook instead of a
# second hard-coded copy, so the encoder and the feature list cannot drift apart.
pipe_target_encoder = Pipeline(
    [
        ("target_encoder", TargetEncoder(cols=cat)),
    ]
)

pipe_target_encoder.fit(df_train[cat], df_train['INGRESONETOTARGET_PROMULT03M'])

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

In [28]:
# Encode the categorical columns with the fitted target encoder and tag them with '_target'.
df_train_target = pipe_target_encoder.transform(df_train[cat]).add_suffix('_target')
df_test_target = pipe_target_encoder.transform(df_test[cat]).add_suffix('_target')

# Drop any earlier copy of the encoded columns before appending, so re-running this cell
# replaces them instead of leaving duplicated column names in the frames.
df_train = pd.concat(
    [df_train.drop(columns=df_train_target.columns, errors='ignore'), df_train_target],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=df_test_target.columns, errors='ignore'), df_test_target],
    axis=1,
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


# 4. Numéricas

In [29]:
# Per-column missing count and missing ratio of the numeric features on the training partition.
num_train = df_train[num]
num_missing = num_train.isnull().sum()
null_values = pd.DataFrame(
    {
        'number_null_values': num_missing,
        'ratio_null_values': num_missing / len(num_train),
    }
)
null_values

Unnamed: 0,number_null_values,ratio_null_values
NRO_INMUEBLES,0,0.0
SALDOPROPIEDAD_MAX,0,0.0
SALDOHIPOTECARIO_MAX,0,0.0
PRECIOPROMEDIOINMUEBLE,0,0.0
EXPERIENCIAPROFESIONALENCOSECHA,0,0.0
INGRESOMICARRERA,0,0.0
EDADENCOSECHA,0,0.0
INGRESO_PLD,0,0.0
PESOVIVIENDA,0,0.0
PESOPROPIEDADES,0,0.0


In [30]:
# Column count before the scaled copies are added.
df_train.shape, df_test.shape

((250224, 87), (83408, 87))

In [31]:
# Min-max scaling of the numeric features and the target-encoded categoricals.
# The column order below determines the order of the transformer's output columns.
minmax_columns = [
    'NRO_INMUEBLES',
    'SALDOPROPIEDAD_MAX',
    'SALDOHIPOTECARIO_MAX',
    'PRECIOPROMEDIOINMUEBLE',
    'EXPERIENCIAPROFESIONALENCOSECHA',
    'INGRESOMICARRERA',
    'EDADENCOSECHA',
    'INGRESO_PLD',
    'ESTRATO_target',
    'PROVINCIADISTRITO_target',
    'GRUPO_DEPARTAMENTO_target',
    'INSTITUCION_PREGRADO_target',
    'GRUPO_UNIVERSIDAD_PRESTIGIO_target',
    'CARRERA_PREGRADOHOMOLOGADA_target',
    'GRUPO_CARRERA_PREGRADOHOMOLOGADA_target',
    'PRESEGMENTO_target',
    'SEGMENTO_target',
    'PESOVIVIENDA',
    'PESOPROPIEDADES',
    'PESOUNIVERSIDAD',
    'PESOCARRERA',
    'PESOEXPERIENCIAPROFESIONAL',
]

# remainder='drop' keeps only the scaled columns; pandas output keeps names and index.
minmax_transformer = ColumnTransformer(
    transformers=[('normalized', MinMaxScaler(), minmax_columns)],
    remainder='drop',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

pipe_norm = Pipeline([("normalizer", minmax_transformer)])
pipe_norm.fit(df_train)

In [32]:
# Min-max scaled copies of the selected columns, tagged with '_normalized'.
df_train_norm = pipe_norm.transform(df_train).add_suffix('_normalized')
df_test_norm = pipe_norm.transform(df_test).add_suffix('_normalized')

In [33]:
# Append the '_normalized' columns. Any earlier copy is dropped first, so re-running this
# cell replaces the columns instead of duplicating their names.
df_train = pd.concat(
    [df_train.drop(columns=df_train_norm.columns, errors='ignore'), df_train_norm],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=df_test_norm.columns, errors='ignore'), df_test_norm],
    axis=1,
)

In [34]:
# Expect 22 extra columns (one '_normalized' column per scaled feature).
df_train.shape, df_test.shape

((250224, 109), (83408, 109))

In [35]:
# Standard scaling (StandardScaler) of the numeric features and the target-encoded categoricals.
# The column order below determines the order of the transformer's output columns.
zscore_columns = [
    'NRO_INMUEBLES',
    'SALDOPROPIEDAD_MAX',
    'SALDOHIPOTECARIO_MAX',
    'PRECIOPROMEDIOINMUEBLE',
    'EXPERIENCIAPROFESIONALENCOSECHA',
    'INGRESOMICARRERA',
    'EDADENCOSECHA',
    'INGRESO_PLD',
    'ESTRATO_target',
    'PROVINCIADISTRITO_target',
    'GRUPO_DEPARTAMENTO_target',
    'INSTITUCION_PREGRADO_target',
    'GRUPO_UNIVERSIDAD_PRESTIGIO_target',
    'CARRERA_PREGRADOHOMOLOGADA_target',
    'GRUPO_CARRERA_PREGRADOHOMOLOGADA_target',
    'PRESEGMENTO_target',
    'SEGMENTO_target',
    'PESOVIVIENDA',
    'PESOPROPIEDADES',
    'PESOUNIVERSIDAD',
    'PESOCARRERA',
    'PESOEXPERIENCIAPROFESIONAL',
]

# remainder='drop' keeps only the scaled columns; pandas output keeps names and index.
zscore_transformer = ColumnTransformer(
    transformers=[('standardized', StandardScaler(), zscore_columns)],
    remainder='drop',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

pipe_stand = Pipeline([("standardizer", zscore_transformer)])
pipe_stand.fit(df_train)

In [36]:
# Standardized copies of the selected columns, tagged with '_standardized'.
df_train_stand = pipe_stand.transform(df_train).add_suffix('_standardized')
df_test_stand = pipe_stand.transform(df_test).add_suffix('_standardized')

In [37]:
# Append the '_standardized' columns. Any earlier copy is dropped first, so re-running this
# cell replaces the columns instead of duplicating their names.
df_train = pd.concat(
    [df_train.drop(columns=df_train_stand.columns, errors='ignore'), df_train_stand],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=df_test_stand.columns, errors='ignore'), df_test_stand],
    axis=1,
)

# 5. Flags

In [38]:
# Per-column missing count and missing ratio of the flag features on the training partition.
flg_train = df_train[flgs]
flg_missing = flg_train.isnull().sum()
null_values = pd.DataFrame(
    {
        'number_null_values': flg_missing,
        'ratio_null_values': flg_missing / len(flg_train),
    }
)
null_values

Unnamed: 0,number_null_values,ratio_null_values
FLG_INMUEBLE,0,0.0
FLG_LIMAMODERNA,0,0.0
FLG_LIMACALLAO,0,0.0
FLG_GRUPO_DEPARTAMENTO5,0,0.0
FLG_GRUPO_DEPARTAMENTO4,0,0.0
FLG_GRUPO_DEPARTAMENTO3,0,0.0
FLG_GRUPO_DEPARTAMENTO2,0,0.0
FLG_GRUPO_DEPARTAMENTO1,0,0.0
FLG_ESTRATO5,0,0.0
FLG_ESTRATO4,0,0.0


In [39]:
# Peek at two random transformed training rows (no seed is set, so the rows differ between runs).
# NOTE(review): the rendered rows include COD_DNI — confirm this identifier may appear in shared outputs.
df_train.sample(2)

Unnamed: 0,PERIODO,COD_DNI,INGRESONETOTARGET_ULT01M,INGRESONETOTARGET_PROMULT03M,MESESCONINGRESOS,FLG_ATIPICO,FLG_INMUEBLE,NRO_INMUEBLES,SALDOPROPIEDAD_MAX,SALDOHIPOTECARIO_MAX,ESTRATO,FLG_ESTRATO5,FLG_ESTRATO4,FLG_ESTRATO3,FLG_ESTRATO2,FLG_ESTRATO1,FLG_LIMAMODERNA,PROVINCIADISTRITO,GRUPO_DEPARTAMENTO,FLG_GRUPO_DEPARTAMENTO5,FLG_GRUPO_DEPARTAMENTO4,FLG_GRUPO_DEPARTAMENTO3,FLG_GRUPO_DEPARTAMENTO2,FLG_GRUPO_DEPARTAMENTO1,FLG_LIMACALLAO,PRECIOPROMEDIOINMUEBLE,FLG_CONSULTADOSUNEDU,FLG_REPRESENTANTELEGAL,FLG_SUNEDU,INSTITUCION_PREGRADO,GRUPO_UNIVERSIDAD_PRESTIGIO,FLG_GRUPO_UNIVERSIDAD_PRESTIGIO5,FLG_GRUPO_UNIVERSIDAD_PRESTIGIO4,FLG_GRUPO_UNIVERSIDAD_PRESTIGIO3,FLG_GRUPO_UNIVERSIDAD_PRESTIGIO2,FLG_GRUPO_UNIVERSIDAD_PRESTIGIO1,FLG_MAESTRIA,FLG_ESPECIALIZACION,EXPERIENCIAPROFESIONALENCOSECHA,CARRERA_PREGRADOHOMOLOGADA,GRUPO_CARRERA_PREGRADOHOMOLOGADA,FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA5,FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA4,FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA3,FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA2,FLG_GRUPO_CARRERA_PREGRADOHOMOLOGADA1,INGRESOMICARRERA,EDADENCOSECHA,INGRESO_PLD,PRESEGMENTO,FLG_SOLOSIS,MARCAANALISIS,SEGMENTO,PESOVIVIENDA,PESOPROPIEDADES,PESOUNIVERSIDAD,PESOCARRERA,PESOEXPERIENCIAPROFESIONAL,flg_4k,flg_8k,ESTRATO_woe4k,PROVINCIADISTRITO_woe4k,GRUPO_DEPARTAMENTO_woe4k,INSTITUCION_PREGRADO_woe4k,GRUPO_UNIVERSIDAD_PRESTIGIO_woe4k,CARRERA_PREGRADOHOMOLOGADA_woe4k,GRUPO_CARRERA_PREGRADOHOMOLOGADA_woe4k,PRESEGMENTO_woe4k,SEGMENTO_woe4k,ESTRATO_woe8k,PROVINCIADISTRITO_woe8k,GRUPO_DEPARTAMENTO_woe8k,INSTITUCION_PREGRADO_woe8k,GRUPO_UNIVERSIDAD_PRESTIGIO_woe8k,CARRERA_PREGRADOHOMOLOGADA_woe8k,GRUPO_CARRERA_PREGRADOHOMOLOGADA_woe8k,PRESEGMENTO_woe8k,SEGMENTO_woe8k,ESTRATO_target,PROVINCIADISTRITO_target,GRUPO_DEPARTAMENTO_target,INSTITUCION_PREGRADO_target,GRUPO_UNIVERSIDAD_PRESTIGIO_target,CARRERA_PREGRADOHOMOLOGADA_target,GRUPO_CARRERA_PREGRADOHOMOLOGADA_target,PRESEGMENTO_target,SEGMENTO_target,NRO_INMUEBLES_normalized,SALDOPROPIEDAD_MAX_normalized,SALDOHIPOTECARIO
_MAX_normalized,PRECIOPROMEDIOINMUEBLE_normalized,EXPERIENCIAPROFESIONALENCOSECHA_normalized,INGRESOMICARRERA_normalized,EDADENCOSECHA_normalized,INGRESO_PLD_normalized,ESTRATO_target_normalized,PROVINCIADISTRITO_target_normalized,GRUPO_DEPARTAMENTO_target_normalized,INSTITUCION_PREGRADO_target_normalized,GRUPO_UNIVERSIDAD_PRESTIGIO_target_normalized,CARRERA_PREGRADOHOMOLOGADA_target_normalized,GRUPO_CARRERA_PREGRADOHOMOLOGADA_target_normalized,PRESEGMENTO_target_normalized,SEGMENTO_target_normalized,PESOVIVIENDA_normalized,PESOPROPIEDADES_normalized,PESOUNIVERSIDAD_normalized,PESOCARRERA_normalized,PESOEXPERIENCIAPROFESIONAL_normalized,NRO_INMUEBLES_standardized,SALDOPROPIEDAD_MAX_standardized,SALDOHIPOTECARIO_MAX_standardized,PRECIOPROMEDIOINMUEBLE_standardized,EXPERIENCIAPROFESIONALENCOSECHA_standardized,INGRESOMICARRERA_standardized,EDADENCOSECHA_standardized,INGRESO_PLD_standardized,ESTRATO_target_standardized,PROVINCIADISTRITO_target_standardized,GRUPO_DEPARTAMENTO_target_standardized,INSTITUCION_PREGRADO_target_standardized,GRUPO_UNIVERSIDAD_PRESTIGIO_target_standardized,CARRERA_PREGRADOHOMOLOGADA_target_standardized,GRUPO_CARRERA_PREGRADOHOMOLOGADA_target_standardized,PRESEGMENTO_target_standardized,SEGMENTO_target_standardized,PESOVIVIENDA_standardized,PESOPROPIEDADES_standardized,PESOUNIVERSIDAD_standardized,PESOCARRERA_standardized,PESOEXPERIENCIAPROFESIONAL_standardized
135767,202412,40914887,7603.625,7603.625,3,0,0,0,0.0,0.0,5,1,0,0,0,0,0,Lima-Los Olivos,5,1,0,0,0,0,1,1987.875,1,0,1,UNIVERSIDAD DE SAN MARTIN DE PORRES,5,1,0,0,0,0,0,0,17.0,NEGOCIOS INTERNACIONALES,3,0,0,1,0,0,2930.0,43,6740.839001,2,0.0,trainTest,Medio,3879,3475,5262,4080,4845,1,0,0.375942,0.340519,0.284794,1.167298,1.08018,-0.008905,0.487053,0.535819,0.35413,0.552885,0.025191,0.354431,0.870159,0.983449,-0.243415,0.079438,0.414799,-0.097017,4515.74976,4221.078403,4298.482888,5300.483869,5363.680242,3852.838486,4323.839572,4630.283303,4274.639484,0.0,0.0,0.0,0.298156,0.326923,0.367629,0.408163,0.204462,1.0,0.460255,1.0,0.477569,1.0,0.301694,0.595199,0.581859,0.478501,0.403252,0.0,1.0,0.542426,1.0,-0.182959,-0.293374,-0.273427,1.169077,0.532525,0.203201,-0.430501,1.253115,1.204433,0.310917,1.042905,1.265944,1.448299,-0.129004,0.305749,0.658339,0.254803,0.051088,-0.679143,1.422776,0.25932,1.080878
80887,202308,43326816,997.70759,2364.765878,3,1,1,1,82904.04,82904.04,5,1,0,0,0,0,0,Lima-Ate,5,1,0,0,0,0,1,1987.875,1,0,1,UNIVERSIDAD INCA GARCILASO DE LA VEGA ASOCIACI...,1,0,0,0,0,1,1,1,12.0,ENFERMERIA,2,0,0,0,1,0,3538.565217,36,3238.701891,2,0.0,trainTest,Medio,3879,3871,3864,5388,4032,0,0,0.375942,0.013765,0.284794,0.274886,-0.067522,1.196684,0.699444,0.535819,0.35413,0.552885,-0.04301,0.354431,-0.135339,-0.486911,0.81008,0.372012,0.414799,-0.097017,4515.74976,3792.458446,4298.482888,4207.730209,3908.582161,5278.128612,4705.172185,4630.283303,4274.639484,0.001383,0.014273,0.014273,0.298156,0.230769,0.443986,0.265306,0.080124,1.0,0.336083,1.0,0.276332,0.452247,0.677029,0.735594,0.581859,0.478501,0.403252,0.226027,0.477969,0.798746,0.640425,-0.00176,0.472263,0.492717,1.169077,0.053072,0.498297,-1.065213,-0.127097,1.204433,-0.307058,1.042905,0.19803,-0.102895,1.078553,0.671543,0.658339,0.254803,0.051088,0.022415,-0.052958,1.231271,0.176228


In [40]:
# Final (rows, columns) of both partitions after all derived columns were appended.
df_train.shape, df_test.shape

((250224, 131), (83408, 131))

# Trabajar variables oot y gris

In [41]:
# Load the two additional datasets (named "oot" and "gris") from CSV files in the working directory.
# They are only transformed with the pipelines fitted above; nothing is re-fitted on them.
df_oot = pd.read_csv('df_oot.csv')
df_gris = pd.read_csv('df_gris.csv')

In [42]:
# Same binary flags as for the train/test frame: 1 when INGRESONETOTARGET_PROMULT03M is at
# least 4000 / 8000, 0 otherwise (missing values compare as False and get 0).
for extra_frame in (df_oot, df_gris):
    extra_income = extra_frame['INGRESONETOTARGET_PROMULT03M']
    extra_frame['flg_4k'] = np.where(extra_income >= 4000, 1, 0)
    extra_frame['flg_8k'] = np.where(extra_income >= 8000, 1, 0)

In [43]:
# Impute missing values with the imputers fitted on the training partition.
df_oot, df_gris = [pipe_nulos.transform(extra_frame) for extra_frame in (df_oot, df_gris)]

In [44]:
# Encode the categorical columns with the flg_4k WoE mapping and tag them with '_woe4k'.
df_oot_woe4k = pipe_woe_encoder_4k.transform(df_oot[cat]).add_suffix('_woe4k')
df_gris_woe4k = pipe_woe_encoder_4k.transform(df_gris[cat]).add_suffix('_woe4k')

# Drop any earlier copy of the encoded columns before appending, so re-running this cell
# replaces them instead of leaving duplicated column names in the frames.
df_oot = pd.concat(
    [df_oot.drop(columns=df_oot_woe4k.columns, errors='ignore'), df_oot_woe4k],
    axis=1,
)
df_gris = pd.concat(
    [df_gris.drop(columns=df_gris_woe4k.columns, errors='ignore'), df_gris_woe4k],
    axis=1,
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [45]:
# Encode the categorical columns with the flg_8k WoE mapping and tag them with '_woe8k'.
df_oot_woe8k = pipe_woe_encoder_8k.transform(df_oot[cat]).add_suffix('_woe8k')
df_gris_woe8k = pipe_woe_encoder_8k.transform(df_gris[cat]).add_suffix('_woe8k')

# Drop any earlier copy of the encoded columns before appending, so re-running this cell
# replaces them instead of leaving duplicated column names in the frames.
df_oot = pd.concat(
    [df_oot.drop(columns=df_oot_woe8k.columns, errors='ignore'), df_oot_woe8k],
    axis=1,
)
df_gris = pd.concat(
    [df_gris.drop(columns=df_gris_woe8k.columns, errors='ignore'), df_gris_woe8k],
    axis=1,
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [46]:
# Encode the categorical columns with the fitted target encoder and tag them with '_target'.
df_oot_target = pipe_target_encoder.transform(df_oot[cat]).add_suffix('_target')
df_gris_target = pipe_target_encoder.transform(df_gris[cat]).add_suffix('_target')

# Drop any earlier copy of the encoded columns before appending, so re-running this cell
# replaces them instead of leaving duplicated column names in the frames.
df_oot = pd.concat(
    [df_oot.drop(columns=df_oot_target.columns, errors='ignore'), df_oot_target],
    axis=1,
)
df_gris = pd.concat(
    [df_gris.drop(columns=df_gris_target.columns, errors='ignore'), df_gris_target],
    axis=1,
)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [47]:
# Min-max scaled copies (scaler fitted on the training partition), tagged with '_normalized'.
df_oot_norm = pipe_norm.transform(df_oot).add_suffix('_normalized')
df_gris_norm = pipe_norm.transform(df_gris).add_suffix('_normalized')

In [48]:
# Append the '_normalized' columns. Any earlier copy is dropped first, so re-running this
# cell replaces the columns instead of duplicating their names.
df_oot = pd.concat(
    [df_oot.drop(columns=df_oot_norm.columns, errors='ignore'), df_oot_norm],
    axis=1,
)
df_gris = pd.concat(
    [df_gris.drop(columns=df_gris_norm.columns, errors='ignore'), df_gris_norm],
    axis=1,
)

In [49]:
# Standardized copies (scaler fitted on the training partition), tagged with '_standardized'.
df_oot_stand = pipe_stand.transform(df_oot).add_suffix('_standardized')
df_gris_stand = pipe_stand.transform(df_gris).add_suffix('_standardized')

In [50]:
# Append the '_standardized' columns. Any earlier copy is dropped first, so re-running this
# cell replaces the columns instead of duplicating their names.
df_oot = pd.concat(
    [df_oot.drop(columns=df_oot_stand.columns, errors='ignore'), df_oot_stand],
    axis=1,
)
df_gris = pd.concat(
    [df_gris.drop(columns=df_gris_stand.columns, errors='ignore'), df_gris_stand],
    axis=1,
)

In [51]:
# Both frames should end with the same number of columns as the train/test frames.
df_oot.shape, df_gris.shape

((229136, 131), (311515, 131))

In [52]:
# Write the four transformed frames to CSV in the working directory, without the index column.
for transformed_frame, csv_name in (
    (df_train, 'df_train_t.csv'),
    (df_test, 'df_test_t.csv'),
    (df_oot, 'df_oot_t.csv'),
    (df_gris, 'df_gris_t.csv'),
):
    transformed_frame.to_csv(csv_name, index=False)

# Guardamos Pipelines

In [53]:
import joblib

In [54]:
# Persist each fitted pipeline to a .joblib file in the working directory.
# Only the return value of the last call is echoed as the cell output.
joblib.dump(pipe_nulos, "pipe_nulos.joblib")
joblib.dump(pipe_woe_encoder_4k, "pipe_woe_encoder_4k.joblib")
joblib.dump(pipe_woe_encoder_8k, "pipe_woe_encoder_8k.joblib")
joblib.dump(pipe_target_encoder, "pipe_target_encoder.joblib")
joblib.dump(pipe_norm, "pipe_norm.joblib")
joblib.dump(pipe_stand, "pipe_stand.joblib")

['pipe_stand.joblib']