In [116]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent.parent))

import polars as pl
import pandas as pd
from src.utils.utils_fn import capture_variables, gather_variable_info

import warnings
warnings.simplefilter('ignore')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [92]:
# Absolute path of the project root: two levels above the working directory
root_path = Path.cwd().resolve().parent.parent

# Create the directory if it does not exist. Since root_path is an ancestor of
# the current working directory it normally exists already.
root_path.mkdir(parents=True, exist_ok=True)

# Read the processed dataset and index the rows by product id in one chain
data: pd.DataFrame = pd.read_parquet(
    path=str(root_path / 'data/processed/data_processed.parquet'),
).set_index('product_id')

# Reproducible 5-row preview of the loaded frame
data.sample(5, random_state=10)

Unnamed: 0_level_0,condition,state,city,local_pickup,free_shipping,shipping_mode,listing_type,buying_mode,attribute_group_id,attribute_group,...,status,accepts_mercadopago,currency,automatic_relist,title,stock_quantity,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mla5501620002,used,capital federal,nuñez,True,False,not_specified,bronze,buy_it_now,,,...,active,True,ars,False,timbre inahalambrico,1,1,0.0,0.000833,1440.0
mla2357269269,used,buenos aires,avellaneda,True,False,not_specified,bronze,buy_it_now,dflt,otros,...,active,True,ars,False,lote de 2 cinturones. 1 nuevo con etiqueta.mic...,8,8,0.0,695.485278,1440.0
mla4505955642,used,buenos aires,acassuso,True,False,me2,bronze,buy_it_now,,,...,active,True,ars,False,revista instituto de historia del derecho rica...,3,3,0.0,0.000833,1440.0
mla7853937105,used,capital federal,retiro,True,False,not_specified,free,buy_it_now,,,...,active,True,ars,False,susan sontag - la enfermedad y sus metaforas -...,1,1,0.0,0.000833,1440.0
mla7813601724,new,capital federal,almagro,True,False,not_specified,silver,buy_it_now,,,...,active,True,ars,False,vendas cambric marca vendsur de 10cm x 3mt en ...,7,7,2010.0,0.000556,1440.0


In [119]:
# Train / validation / test split
# ===================================================================================================================
from sklearn.model_selection import train_test_split

# Seed shared by both splits so the partition is reproducible
SEED = 25

# Separate the features from the target. Selecting the column by name always
# yields a Series (a boolean column mask followed by .squeeze() would collapse
# to a scalar on a single-row frame) and fails loudly if 'condition' is missing.
X = data.drop(columns='condition')
y = data['condition']

# First split: 60% for training, 40% held out for validation + test.
# Stratified on the target so class proportions are preserved.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=SEED,
    stratify=y,
)

# Second split: halve the held-out 40% into 20% validation and 20% test
# (percentages are relative to the full dataset).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SEED,
    stratify=y_temp,
)

| Subconjunto   | Proporción | Descripción                                                 |
|---------------|------------|-------------------------------------------------------------|
| Entrenamiento | 60%        | Se usa para entrenar el modelo.                             |
| Validación    | 20%        | Se usa para afinar hiperparámetros y evaluar durante el ajuste. |
| Prueba        | 20%        | Se usa para evaluar el rendimiento final del modelo.        |

In [94]:
from sklearn.preprocessing import LabelEncoder

# Target encoding.
# The encoder is fitted on the training labels only and then *applied* to the
# validation and test labels. Re-fitting on each split would learn a separate
# class-to-integer mapping per split, which is only consistent by accident and
# would silently hide a label that is absent from one of the splits.
le = LabelEncoder()
le.fit(y_train)

# Keep the original product_id index (and the Series name) so that every y_*
# stays row-aligned with its X_* counterpart; a bare pd.Series(ndarray) would
# replace the index with 0..n-1.
# NOTE: running this cell twice without re-running the split refits the encoder
# on already-encoded integers, losing the original class names in le.classes_.
y_train = pd.Series(le.transform(y_train), index=y_train.index, name=y_train.name)
y_val = pd.Series(le.transform(y_val), index=y_val.index, name=y_val.name)
y_test = pd.Series(le.transform(y_test), index=y_test.index, name=y_test.name)

In [120]:
# Split the training features into variable-type groups (continuous,
# categorical, discrete, temporal) with the project helper; it also prints a
# short count summary for each group.
continuous, categoricals, discretes, temporaries = capture_variables(data=X_train)

		Tipos de variables
Hay 5 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 15 variables categóricas


In [121]:
# Collect per-group variable information for the training features.
# NOTE(review): the two thresholds presumably bound the missing-value fraction
# and the unique-value count used to bucket variables -- confirm in
# src.utils.utils_fn.
variable_groups = dict(
    continuous=continuous,
    categoricals=categoricals,
    discretes=discretes,
)
info = gather_variable_info(
    X=X_train,
    **variable_groups,
    missing_threshold=0.05,
    cardinality_threshold=5,
)

{
    "continuous_more_than_threshold": [],
    "continuous_less_than_threshold": [],
    "categoricals_more_than_threshold": [
        "attribute_group_id",
        "attribute_group",
        "attribute_id"
    ],
    "categoricals_less_than_threshold": [
        "state",
        "city"
    ],
    "categoricals_high_cardinality": [
        "state",
        "city",
        "listing_type",
        "attribute_group_id",
        "attribute_group",
        "attribute_id",
        "title"
    ],
    "categoricals_low_cardinality": [
        "local_pickup",
        "free_shipping",
        "shipping_mode",
        "buying_mode",
        "status",
        "accepts_mercadopago",
        "currency",
        "automatic_relist"
    ],
    "discretes_more_than_threshold": [],
    "discretes_less_than_threshold": [],
    "discretes_high_cardinality": [],
    "discretes_low_cardinality": []
}


In [123]:
from sklearn.pipeline import Pipeline
from feature_engine.preprocessing import MatchCategories, MatchVariables
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropFeatures,
)

high_cardinality = ['title', 'city']

# Columns removed up front: categoricals flagged as exceeding the missing-value
# threshold plus the hand-picked high-cardinality ones.
features_to_drop = info['categoricals_more_than_threshold'] + high_cardinality

# Everything else is screened for (quasi-)constant values.
remaining_features = [
    name for name in continuous + categoricals if name not in features_to_drop
]

# Preprocessing steps, applied in order
steps = [
    ('drop-features', DropFeatures(features_to_drop=features_to_drop)),
    # tol=0.95 -> also treat a column as constant when a single value
    # dominates 95% of the rows
    ('constant-features', DropConstantFeatures(
        variables=remaining_features,
        missing_values='ignore',
        tol=0.95,
    )),
    ('match-features', MatchVariables(missing_values='ignore')),
    # NOTE(review): `variables=continuous` assumes none of the continuous
    # columns was removed by an earlier step -- confirm if the data changes.
    ('correlated-features', DropCorrelatedFeatures(
        variables=continuous,
        missing_values='ignore',
        method='pearson',
        threshold=0.8,
    )),
    ('match-categories', MatchCategories(missing_values='ignore')),
]
pipe = Pipeline(steps)

# Learn the transformations from the training split only, then apply the
# fitted pipeline to all three splits.
pipe.fit(X_train)
X_train, X_val, X_test = (
    pipe.transform(split) for split in (X_train, X_val, X_test)
)

In [124]:
# Recompute the variable-type groups on X_train, which by this point holds the
# pipeline-transformed training features, so the lists reflect the columns
# that are still present.
continuous, categoricals, discretes, temporaries = capture_variables(data=X_train)

		Tipos de variables
Hay 4 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 4 variables categóricas


In [125]:
# Refresh the variable information using the regrouped variable lists.
# NOTE(review): the two thresholds presumably bound the missing-value fraction
# and the unique-value count used to bucket variables -- confirm in
# src.utils.utils_fn.
variable_groups = dict(
    continuous=continuous,
    categoricals=categoricals,
    discretes=discretes,
)
info = gather_variable_info(
    X=X_train,
    **variable_groups,
    missing_threshold=0.05,
    cardinality_threshold=5,
)

{
    "continuous_more_than_threshold": [],
    "continuous_less_than_threshold": [],
    "categoricals_more_than_threshold": [],
    "categoricals_less_than_threshold": [
        "state"
    ],
    "categoricals_high_cardinality": [
        "state",
        "listing_type"
    ],
    "categoricals_low_cardinality": [
        "local_pickup",
        "shipping_mode"
    ],
    "discretes_more_than_threshold": [],
    "discretes_less_than_threshold": [],
    "discretes_high_cardinality": [],
    "discretes_low_cardinality": []
}
