In [2]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

# Notebook-wide pandas display settings: show every column, print at most
# 100 rows, and render floats in fixed-point instead of scientific notation.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:f}'.format

# *utils*

In [3]:
def get_cols(df: pd.DataFrame) -> tuple:
    """Split the column names of ``df`` by dtype.

    Returns:
        A ``(numeric_columns, categorical_columns)`` tuple of lists. The first
        holds the columns whose dtype is a ``np.number``; the second holds
        every other column, in the frame's column order.
    """
    def _names(**dtype_filter) -> list:
        # Thin wrapper so both selections share exactly the same code path.
        return df.select_dtypes(**dtype_filter).columns.tolist()

    return _names(include=np.number), _names(exclude=np.number)

# Macro-region -> list of department names (upper case, no accents).
zona_departamento = dict(
    Norte=['LAMBAYEQUE', 'CAJAMARCA', 'PIURA', 'TUMBES', 'LA LIBERTAD', 'AMAZONAS', 'SAN MARTIN', 'LORETO'],
    Centro=['JUNIN', 'ANCASH', 'HUANUCO', 'HUANCAVELICA', 'PASCO', 'AYACUCHO', 'UCAYALI'],
    Sur=['AREQUIPA', 'MOQUEGUA', 'TACNA', 'CUSCO', 'PUNO', 'MADRE DE DIOS', 'ICA', 'APURIMAC', 'LIMA', 'CALLAO'],
)
# Reverse lookup: department name -> macro-region.
departamento_a_region = {
    nombre: zona
    for zona, nombres in zona_departamento.items()
    for nombre in nombres
}

def asignar_region(departamento):
    """Return the macro-region of ``departamento``.

    Any value that is not a key of ``departamento_a_region`` (unknown names,
    ``None``, NaN) maps to the sentinel ``'DESCONOCIDO'``.
    """
    try:
        return departamento_a_region[departamento]
    except KeyError:
        return 'DESCONOCIDO'

# *balances*

In [None]:
# Load the raw balances and keep PERIODO as text instead of a number.
df_balances = pd.read_csv('../data/raw/balances.csv').assign(
    PERIODO=lambda frame: frame['PERIODO'].astype(str)
)

"""
PERIODO - Periodo de extracción de los datos
ID - Identificador único

CANT_EMP_NEG - Cantidad de empresas de negocio
CANT_EMP_CONS - Cantidad de empresas de consumo
CANT_EMP_HIPOT - Cantidad de empresas hipotecario
    * last
    * variable para ver si cerro o abrio empresas en los ultimos 9 meses

SALDO_MED_EMP - Saldo en mediana empresa
SALDO_PEQ_EMP - Saldo en pequeña empresa
SALDO_MIC_EMP - Saldo en micro empresa
    * categorica para saber que tipo de empresa tiene
    * last
    * variacion promedio de los saldos en los saldos que tiene

SALDO_CONS_REV - Saldo Consumo revolvente
SALDO_CONS_NO_REV - Saldo Consumo NO revolvente
    
SALDO_HIPOT - Saldo en hipotecario
SALDO_VENCIDO - Saldo vencido

CANT_EMP_DOL_NEG - Cantidad de empresas de negocios en dólares
SALDO_DOLA_NEG - Saldo en dólares de negocio
CANT_EMP_DOL_CONS - Cantidad de empresas de consumo en dólares
SALDO_DOLA_CONS - Saldo en dólares de consumo
CANT_EMP_DOL_HIPOT - Cantidad de empresas de hipotecario en dólares
SALDO_DOLA_HIPOT - Saldo en dólares de hipotecario
MAX_LINEA_DISP_U6M - Línea Máxima disponible en los últimos 6 meses
"""

In [None]:
# Row-level totals and delinquency proxies. The callables run in order, so the
# later entries can read the columns created by the earlier ones.
# NOTE(review): a zero denominator in the PROXY_MOROSIDAD_* ratios yields
# inf/NaN here; nothing in this cell cleans those values.
df_balances = df_balances.assign(**{
    # Dollar-denominated totals across business, consumer and mortgage.
    'CANT_EMP_DOL': lambda d: d['CANT_EMP_DOL_NEG'] + d['CANT_EMP_DOL_CONS'] + d['CANT_EMP_DOL_HIPOT'],
    'SALDO_DOLA_TOTAL': lambda d: d['SALDO_DOLA_NEG'] + d['SALDO_DOLA_CONS'] + d['SALDO_DOLA_HIPOT'],
    # Medium + small + micro business balances plus the mortgage balance.
    'SALDO_EMP_TOTAL': lambda d: d['SALDO_MED_EMP'] + d['SALDO_PEQ_EMP'] + d['SALDO_MIC_EMP'] + d['SALDO_HIPOT'],
    # Same balances net of the overdue amount.
    'SALDO_EMP_TOTAL-VENCIDO': lambda d: d['SALDO_EMP_TOTAL'] - d['SALDO_VENCIDO'],
    'SALDO_REV_NO_REV-VENCIDO': lambda d: d['SALDO_CONS_REV'] + d['SALDO_CONS_NO_REV'] - d['SALDO_VENCIDO'],
    'SALDO_TOTAL-VENCIDO': lambda d: d['SALDO_EMP_TOTAL'] + d['SALDO_CONS_REV'] + d['SALDO_CONS_NO_REV'] - d['SALDO_VENCIDO'],
    # Overdue balance as a share of three different exposure bases.
    'PROXY_MOROSIDAD_1': lambda d: d['SALDO_VENCIDO'] / d['SALDO_EMP_TOTAL'],
    'PROXY_MOROSIDAD_2': lambda d: d['SALDO_VENCIDO'] / (d['SALDO_EMP_TOTAL'] + d['SALDO_CONS_REV'] + d['SALDO_CONS_NO_REV']),
    'PROXY_MOROSIDAD_3': lambda d: d['SALDO_VENCIDO'] / (d['SALDO_CONS_REV'] + d['SALDO_CONS_NO_REV']),
})

In [None]:
# For every balance series build, per customer ID, the period-over-period
# change (relative -> VAR_*, absolute -> DIFF_*) and its 3/6/8-row rolling mean.
# Changes are taken between consecutive rows of each ID in the frame's current
# row order. NOTE(review): this presumably expects rows sorted by PERIODO
# within each ID — confirm against the raw file.
for col in ['SALDO_EMP_TOTAL', 'SALDO_EMP_TOTAL-VENCIDO', 'SALDO_REV_NO_REV-VENCIDO', 'SALDO_TOTAL-VENCIDO', 'MAX_LINEA_DISP_U6M', 'SALDO_DOLA_TOTAL']:
    print(col)
    por_id = df_balances.groupby('ID')[col]
    for prefix, step in (
        ('VAR', lambda serie: serie.pct_change(periods=1)),
        ('DIFF', lambda serie: serie.diff(periods=1)),
    ):
        df_balances[f'{prefix}_{col}_1'] = por_id.transform(step)
        for window in (3, 6, 8):
            # Defaults bind the current step/window to this particular lambda.
            df_balances[f'{prefix}_{col}_MA{window}'] = por_id.transform(
                lambda serie, step=step, window=window: step(serie).rolling(window=window).mean()
            )
    
# Boolean marker per row: True whenever the source value differs from zero.
flag_sources = ['SALDO_PEQ_EMP', 'SALDO_MED_EMP', 'SALDO_MIC_EMP', 'CANT_EMP_NEG', 'CANT_EMP_CONS', 'CANT_EMP_HIPOT']
for col in flag_sources:
    print(col)
    df_balances[f'{col}_FLAG'] = df_balances[col].ne(0)

In [None]:
# Persist the per-period balances, including the engineered columns, as gzip-compressed parquet.
df_balances.to_parquet('../data/processed/balances.gzip', compression='gzip')

In [None]:
# Collapse the per-period balances into one row per customer ID.
# The aggregation spec is assembled group by group; insertion order fixes the
# order of the output columns.
count_stats = ['max', 'last', 'min']
balance_stats = ['mean', 'median', 'last', 'min', 'max', 'first']
central_stats = ['mean', 'median']
change_suffixes = ['1', 'MA3', 'MA6', 'MA8']

agg_spec = {}

for name in ['CANT_EMP_NEG', 'CANT_EMP_CONS', 'CANT_EMP_HIPOT']:
    agg_spec[name] = list(count_stats)

for name in [
    'SALDO_MED_EMP', 'SALDO_PEQ_EMP', 'SALDO_MIC_EMP',
    'SALDO_CONS_REV', 'SALDO_CONS_NO_REV', 'SALDO_HIPOT', 'SALDO_VENCIDO',
    'CANT_EMP_DOL_NEG', 'SALDO_DOLA_NEG', 'CANT_EMP_DOL_CONS', 'SALDO_DOLA_CONS',
    'CANT_EMP_DOL_HIPOT', 'SALDO_DOLA_HIPOT', 'MAX_LINEA_DISP_U6M',
]:
    agg_spec[name] = list(balance_stats)

agg_spec['SALDO_EMP_TOTAL'] = ['mean', 'last', 'max', 'min', 'median']

# Change features: each base contributes its _1 / _MA3 / _MA6 / _MA8 variants.
for base in [
    'VAR_SALDO_EMP_TOTAL',
    'VAR_SALDO_EMP_TOTAL-VENCIDO',
    'DIFF_SALDO_EMP_TOTAL',
    'DIFF_SALDO_EMP_TOTAL-VENCIDO',
    'DIFF_MAX_LINEA_DISP_U6M',
    'VAR_SALDO_REV_NO_REV-VENCIDO',
    'VAR_SALDO_TOTAL-VENCIDO',
    'VAR_MAX_LINEA_DISP_U6M',
    'VAR_SALDO_DOLA_TOTAL',
    'DIFF_SALDO_DOLA_TOTAL',
]:
    for suffix in change_suffixes:
        agg_spec[f'{base}_{suffix}'] = list(central_stats)

for name in [
    'SALDO_PEQ_EMP_FLAG', 'SALDO_MED_EMP_FLAG', 'SALDO_MIC_EMP_FLAG',
    'CANT_EMP_NEG_FLAG', 'CANT_EMP_CONS_FLAG', 'CANT_EMP_HIPOT_FLAG',
]:
    agg_spec[name] = ['any']

for name in ['PROXY_MOROSIDAD_1', 'PROXY_MOROSIDAD_2', 'PROXY_MOROSIDAD_3']:
    agg_spec[name] = list(central_stats)

# Infinite aggregates become NaN, and every NaN is then replaced by 0.
df_balances_unirow = (
    df_balances
    .groupby('ID')
    .agg(agg_spec)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names.
df_balances_unirow.columns = ['_'.join(pair) for pair in df_balances_unirow.columns.values]
df_balances_unirow = df_balances_unirow.reset_index()

# *customers*

In [5]:
# Load the customers and split PER_BANCARIZACION into integer year and month.
# The period is first reduced to its digits so that both a compact 'YYYYMM'
# value and a separated one such as 'YYYY-MM' yield the month from digits 5-6.
# Slicing the raw text with [5:7] would keep only the last digit of a compact
# 'YYYYMM' value, turning October-December into 0-2.
# NOTE(review): the raw format of PER_BANCARIZACION is not visible here —
# confirm it is year followed by month.
df_customers = (
    pd.read_csv('../data/raw/customers.csv')
    .assign(
        PER_BANCARIZACION=lambda frame: (
            frame['PER_BANCARIZACION'].astype(str).str.replace(r'\D', '', regex=True)
        )
    )
    .assign(
        ANIO_BANCARIZACION=lambda frame: frame['PER_BANCARIZACION'].str[:4].astype(int),
        MES_BANCARIZACION=lambda frame: frame['PER_BANCARIZACION'].str[4:6].astype(int),
    )
    .drop(columns=['PER_BANCARIZACION'])
    .astype({'CO_TIPO_SEXO': 'category'})
)

df_customers


Unnamed: 0,ID,CO_TIPO_SEXO,EDAD,NO_DEPARTAMENTO,NO_PROVINCIA,DE_CIIU,ANIO_BANCARIZACION,MES_BANCARIZACION
0,8bd775237f2f4c7cae87c255e9e71d62,1,48.000000,CAJAMARCA,CONTUMAZA,OTRAS ACTIVIDADES NO CLASIFICAD.EN OTRA PARTE\r\n,2013,7
1,2409b242893e4ea4b7e6af1292030666,3,,,,OTRAS ACTIVIDADES NO CLASIFICAD.EN OTRA PARTE\r\n,2017,9
2,65edb7eb9d144b2184ef76ec2c35cdf8,1,38.000000,,,OTRAS ACTIVIDADES NO CLASIFICAD.EN OTRA PARTE\r\n,2008,2
3,2fef711096234036807b01a5a697a81c,1,61.000000,TACNA,TACNA,"VTA. MIN. ALIMENTOS, BEBIDAS, TABACO.",2006,1
4,796443da390d409ba3e7b775fde576cd,3,29.000000,LAMBAYEQUE,CHICLAYO,OTRAS ACTIVID.DE TIPO SERVICIO NCP,2014,6
...,...,...,...,...,...,...,...,...
151967,54a73b9c515f46d19c0182e3f44f44fe,3,25.000000,,,VTA. MIN. PROD. FARMAC. Y ART. TOCADOR.,2021,8
151968,1df55d6b6af544b58312d68241a10e4f,3,32.000000,LAMBAYEQUE,CHICLAYO,TRANSPORTE DE CARGA POR CARRETERA.,2018,9
151969,5dd4a95d877d4474a277173c6fe4d227,1,62.000000,,,TRANSPORTE DE CARGA POR CARRETERA.,2006,1
151970,cbad53becdb042cfb98a992566ec0e94,2,51.000000,,,OTROS TIPOS DE VENTA AL POR MENOR.,2013,2


In [7]:
# Numeric vs. non-numeric customer columns; the identifier and the free-text
# activity description are removed from the list that gets one-hot encoded.
nc, cc = get_cols(df_customers)
for excluded in ('ID', 'DE_CIIU'):
    cc.remove(excluded)

In [8]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## *Imputing `NO_PROVINCIA`*

In [None]:
# One-hot encode the columns in `cc`, pass the columns in `nc` through
# unchanged, then fill the remaining NaNs with a 5-neighbour KNN imputer.
one_hot = OneHotEncoder(sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot, cc),
        ('num', 'passthrough', nc),
    ]
)

knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', knn_imputer),
])

# Raw ndarray from the pipeline, then the same data as a labelled frame whose
# column names come from the fitted ColumnTransformer.
df_customer_imputed = pipeline.fit_transform(df_customers)
df_costumer_imputed = pd.DataFrame(
    data=df_customer_imputed,
    columns=preprocessor.get_feature_names_out(),
)


In [None]:
# Cache the one-hot encoded, KNN-imputed customer matrix as gzip-compressed parquet.
df_costumer_imputed.to_parquet("../data/raw/costumer_imputed_one_hot.gzip", compression='gzip')

In [None]:
# Reverse the one-hot encoding of NO_DEPARTAMENTO: for each row take the
# encoded column holding the largest value and strip the encoder prefix.
df_customers_imputed = pd.read_parquet('../data/raw/costumer_imputed_one_hot.gzip')

nodpto_cols = [name for name in df_customers_imputed.columns if 'cat__NO_DEPARTAMENTO' in name]

strongest_column = df_customers_imputed[nodpto_cols].idxmax(axis=1)
df_customers_imputed['departamento'] = strongest_column.str.replace('cat__NO_DEPARTAMENTO_', '')
df_customers_imputed['departamento'].value_counts()

In [None]:
# Compare the age distribution before and after imputation. The imputed
# series is the `num__EDAD` column of the KNN pipeline output, so the legend
# names KNN rather than CatBoost.
sns.kdeplot(df_customers['EDAD'], label='original', linewidth=2, color='blue', alpha=0.5)
sns.kdeplot(df_customers_imputed['num__EDAD'], label='knn_imputed', linewidth=2, color='red', alpha=0.5)
plt.legend()
plt.show()

In [None]:
# Replace out-of-range months with January. Each comparison needs its own
# parentheses: `|` binds tighter than `<`, so `x < 1 | (x > 12)` is parsed as
# `x < (1 | (x > 12))` and the upper bound would never be checked.
mes_fuera_de_rango = (df_customers['MES_BANCARIZACION'] < 1) | (df_customers['MES_BANCARIZACION'] > 12)
df_customers.loc[mes_fuera_de_rango, 'MES_BANCARIZACION'] = 1

# Temporary first-of-month timestamp built from the year and zero-padded month.
df_customers['FECHA_BANCARIZACION'] = pd.to_datetime(
    df_customers['ANIO_BANCARIZACION'].astype(str) + df_customers['MES_BANCARIZACION'].astype(str).str.zfill(2),
    format='%Y%m',
)
df_customers['CO_TIPO_SEXO'] = df_customers['CO_TIPO_SEXO'].astype('category')
# Turn the literal text 'nan' into a real missing value before the category cast.
df_customers['NO_DEPARTAMENTO'] = df_customers['NO_DEPARTAMENTO'].replace('nan', np.nan)
df_customers['NO_DEPARTAMENTO'] = df_customers['NO_DEPARTAMENTO'].astype('category')

# Elapsed time in whole 30-day blocks, measured from the moment the cell runs,
# so the value changes with the execution date.
df_customers['MESES_HASTA_ACTUAL'] = (datetime.now() - df_customers['FECHA_BANCARIZACION']) // pd.Timedelta(days=30)      # astype('<m8[M]')
df_customers = df_customers.drop(columns=['FECHA_BANCARIZACION'])   # 'ANIO_BANCARIZACION', 'MES_BANCARIZACION',
# Overwrite EDAD with the KNN-imputed values (assignment aligns on index labels).
df_customers['EDAD'] = df_customers_imputed['num__EDAD']


In [None]:
# Derive the macro-region from the department through the asignar_region lookup.
df_customers['REGION'] = df_customers['NO_DEPARTAMENTO'].apply(asignar_region)

In [None]:
# Copy the imputed numeric columns back. The frame's column names come from
# ColumnTransformer.get_feature_names_out(), so the passthrough columns carry
# the 'num__' prefix; the bare names would raise KeyError.
for columna in ('EDAD', 'ANIO_BANCARIZACION', 'MES_BANCARIZACION'):
    df_customers[columna] = df_costumer_imputed[f'num__{columna}']

In [None]:
# Final dtypes: the three label columns become categoricals and the
# year/month columns become integers.
df_customers = df_customers.astype({
    'CO_TIPO_SEXO': 'category',
    'NO_DEPARTAMENTO': 'category',
    'NO_PROVINCIA': 'category',
    'MES_BANCARIZACION': 'int',
    'ANIO_BANCARIZACION': 'int',
})
df_customers.dtypes

In [None]:
# Drop the customers whose province is PURUS.
# NOTE(review): the reason for excluding this province is not visible here — confirm.
# The boolean filter keeps the original index labels, so the index is no longer
# contiguous once any row is removed.
df_customers = df_customers[df_customers['NO_PROVINCIA'] != 'PURUS']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

In [None]:
# Split the customers by whether NO_PROVINCIA is known; the known rows are
# used to fit a classifier that predicts the province.
df_conocidos = df_customers.dropna(subset=['NO_PROVINCIA'])
df_desconocidos = df_customers[df_customers['NO_PROVINCIA'].isnull()]

# Features are every column except the identifier and the target.
# NOTE(review): this keeps text columns such as DE_CIIU and REGION, which are
# not listed in cat_features below — confirm CatBoost accepts them as-is.
x = df_conocidos.drop(columns=['ID','NO_PROVINCIA'], axis=1)
y = df_conocidos['NO_PROVINCIA']

# 60/40 split with a fixed seed; the 40% part is used as the evaluation set below.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)

cat_features = ['NO_DEPARTAMENTO', 'CO_TIPO_SEXO']
# NOTE(review): task_type='GPU' presumably requires a GPU on the host — confirm.
modelo = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', random_state=42, cat_features=cat_features, task_type='GPU', devices='0:1')
# Early stopping is evaluated on (X_test, y_test) with a patience of 10 rounds.
modelo.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10, verbose=False)

In [None]:
# Predict the province for the rows where it is missing, using the same
# feature columns as in training (everything except ID and the target).
df_desconocidos = df_customers[df_customers['NO_PROVINCIA'].isnull()]
predicciones = modelo.predict(df_desconocidos.drop(['NO_PROVINCIA', 'ID'], axis=1))

In [None]:
# Work on a copy so df_customers keeps its missing NO_PROVINCIA values.
df_customers_imputed = df_customers.copy()
sin_provincia = df_customers_imputed['NO_PROVINCIA'].isnull()
# Flatten before assigning: a multiclass predict() may return an (n, 1) array,
# while a single-column .loc assignment expects n values. np.ravel leaves a
# 1-D array unchanged.
df_customers_imputed.loc[sin_provincia, 'NO_PROVINCIA'] = np.ravel(predicciones)

In [None]:
# Save next to the other processed artefacts; the merge section reads this
# file back from '../data/processed/', not from the working directory.
df_customers_imputed.to_parquet('../data/processed/customers_imputed_no_provincia.gzip', compression='gzip')

# *Merging balances with customers*

In [None]:
# Attach the per-customer balance aggregates; customers without balances are kept.
df_join = pd.merge(df_customers, df_balances_unirow, how='left', on='ID')
# Back to plain text, with the literal 'nan' restored to a real missing value.
departamento_txt = df_join['NO_DEPARTAMENTO'].astype(str)
df_join['NO_DEPARTAMENTO'] = departamento_txt.replace('nan', np.nan)

## *Imputing `NO_DEPARTAMENTO`*

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode NO_DEPARTAMENTO as ordinal codes so it can go through the numeric KNN imputer.
encoder = OrdinalEncoder()
df_encoded = pd.DataFrame(encoder.fit_transform(df_join[['NO_DEPARTAMENTO']]), columns=['NO_DEPARTAMENTO'])

# Rebinds nc/cc to the column split of the merged frame.
nc, cc = get_cols(df_join)
df_combined = pd.concat([df_join[nc], df_encoded], axis=1)

# NOTE(review): the imputer treats the ordinal codes as ordinary numbers, so an
# imputed department is presumably a blend of neighbour codes rather than a
# vote — confirm this is acceptable for a nominal column.
imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df_combined), columns=df_combined.columns)

# Map the codes back to department names and persist only EDAD and NO_DEPARTAMENTO.
df_imputed['NO_DEPARTAMENTO'] = encoder.inverse_transform(df_imputed[['NO_DEPARTAMENTO']])
df_imputed[['EDAD', 'NO_DEPARTAMENTO']].to_parquet('../data/processed/customers_departamento_imputed.gzip', compression='gzip')
# Release the full imputed matrix; only the saved file is used afterwards.
del df_imputed

In [None]:
# Reload the imputed EDAD / NO_DEPARTAMENTO columns saved by the imputation cell.
df_dep = pd.read_parquet('../data/processed/customers_departamento_imputed.gzip')

In [None]:
# Replace the department with its imputed version (the assignment aligns on
# index labels) and recompute the macro-region from it.
df_join['NO_DEPARTAMENTO'] = df_dep['NO_DEPARTAMENTO']
df_join['REGION'] = df_join['NO_DEPARTAMENTO'].apply(asignar_region)

In [None]:
# Load the customer table whose NO_PROVINCIA column has been filled in.
df_no_provincia = pd.read_parquet('../data/processed/customers_imputed_no_provincia.gzip')

In [None]:
# Align on the customer ID instead of on index labels: df_join comes out of a
# merge and the saved table may carry a different, non-contiguous index, so a
# plain column assignment can pair provinces with the wrong customers.
# NOTE(review): assumes one row per ID; duplicate IDs keep their first row.
provincia_por_id = df_no_provincia.drop_duplicates(subset='ID').set_index('ID')['NO_PROVINCIA']
df_join['NO_PROVINCIA'] = df_join['ID'].map(provincia_por_id)

In [None]:
# Enrich the raw train/test rows with the customer + balance features; a left
# join keeps every train/test row even when its ID has no match in df_join.
df_train = pd.read_csv('../data/raw/train.csv').merge(df_join, how='left', on='ID')
df_test = pd.read_csv('../data/raw/test.csv').merge(df_join, how='left', on='ID')

# *Pivoting balances*

In [None]:
# Wide per-period features: one row per ID and one column per (feature, PERIODO).
# The source is the per-period frame df_balances: the per-ID aggregate
# df_balances_unirow has no PERIODO column, so it cannot be pivoted by period.
# ID and PERIODO are the pivot keys and are therefore kept out of the values.
selected_cols = [
    col for col in df_balances.columns
    if (
        all(sub in col for sub in ['DIFF', '_1'])
        # or all(sub in col for sub in ['VAR', '_1'])
        or any(col.startswith(sub) for sub in ['SALDO', 'MAX_LINEA_DISP_U6M'])
    )
    and 'FLAG' not in col
]
print(selected_cols)

diff_mensual_pivoted = (
    df_balances
    [['ID', 'PERIODO'] + selected_cols]
    # Rows with no feature value at all contribute nothing to the pivot.
    .dropna(how='all', subset=selected_cols)
    .pivot_table(index=['ID'], columns=['PERIODO'], values=selected_cols)
    .reset_index()
)
# Flatten (feature, period) into '<feature>_<period>'; the ID key becomes 'ID_'
# because its second level is empty, and is renamed back below.
diff_mensual_pivoted.columns = ['_'.join(col).strip() for col in diff_mensual_pivoted.columns.values]
# diff_mensual_pivoted = diff_mensual_pivoted.replace([np.inf, -np.inf], np.nan)
diff_mensual_pivoted = diff_mensual_pivoted.rename(columns={'ID_':'ID'})

In [None]:
# Add the pivoted per-period features to both splits; IDs without a pivot row keep NaN.
df_train = df_train.merge(diff_mensual_pivoted, on='ID', how='left')
df_test = df_test.merge(diff_mensual_pivoted, on='ID', how='left')

# *Saving train test data*

In [None]:
# The training set is saved without any row that contains a missing value;
# the test set is saved as-is, missing values included.
df_train.dropna(how='any').to_parquet('../data/processed/train.gzip', compression='gzip')
df_test.to_parquet('../data/processed/test.gzip', compression='gzip')