In [14]:
import warnings
import sys
sys.path.append('../src/utils')

# Core
from utils_functions import *
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.2f}'.format
warnings.simplefilter('ignore')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Lectura de los datos
data = pd.read_csv('../data/interim/data_preprocessed.csv')
data.sample(5, random_state=777)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
11613,52,private,174767,some-college,10,married-civ-spouse,exec-managerial,husband,white,male,0,0,45,united-states,>50k
48350,29,private,85572,bachelors,13,never-married,exec-managerial,other-relative,white,female,0,0,40,united-states,<=50k
19694,18,private,338632,11th,7,never-married,other-service,own-child,white,male,0,0,16,united-states,<=50k
6106,19,private,375114,hs-grad,9,never-married,craft-repair,not-in-family,white,female,0,0,40,united-states,<=50k
2340,27,,253873,some-college,10,divorced,,not-in-family,white,female,0,0,25,united-states,<=50k


In [16]:
from sklearn.model_selection import train_test_split

# Separamos los features y el target
X = data.loc[:, data.columns != 'income']
y = data.loc[:, data.columns == 'income'].squeeze()

# Dividir el conjunto original en 70% entrenamiento y 30% para pruebas y validación
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

# Luego, dividir el 30% restante en 20% para validación y 10% para pruebas
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=123, stratify=y_temp)

print(f'70% Train set: {X_train.shape, y_train.shape}')
print(f'20% Validation set: {X_val.shape, y_val.shape}')
print(f'10% Test set: {X_test.shape, y_test.shape}')

70% Train set: ((34169, 14), (34169,))
20% Validation set: ((9762, 14), (9762,))
10% Test set: ((4882, 14), (4882,))


In [17]:
# Función para capturar los tipos de variables
continuous, categoricals, discretes, temporaries = capture_variables(data=X)

		Tipos de variables
Hay 6 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 8 variables categóricas


In [18]:
# === Variables Continuas ===
# Capturemos las variables con alto porcentaje de datos faltantes (más del 5%)
continuous_more_than_5perc = [var for var in continuous if X[var].isnull().mean() > 0.05]
print(f'Variables continuas por encima del 5% de datos faltantes:\n{continuous_more_than_5perc}\n')

# Capturemos las variables con menor porcentaje de datos faltantes (menos del 5%)
continuous_less_than_5perc = [var for var in continuous if X[var].isnull().sum() > 0 and X[var].isnull().mean() <= 0.05]
print(f'Variables continuas por debajo del 5% de datos faltantes:\n{continuous_less_than_5perc}\n')

# === Variables Categóricas ===
# Capturemos las variables con alto porcentaje de datos faltantes (más del 5%)
categoricals_more_than_5perc = [var for var in categoricals if X[var].isnull().mean() > 0.05]
print(f'Variables categóricas por encima del 5% de datos faltantes:\n{categoricals_more_than_5perc}\n')

# Capturemos las variables con menor porcentaje de datos faltantes (menos del 5%)
categoricals_less_than_5perc = [var for var in categoricals if X[var].isnull().sum() > 0 and X[var].isnull().mean() <= 0.05]
print(f'Variables categóricas por debajo del 5% de datos faltantes:\n{categoricals_less_than_5perc}\n')

Variables continuas por encima del 5% de datos faltantes:
[]

Variables continuas por debajo del 5% de datos faltantes:
[]

Variables categóricas por encima del 5% de datos faltantes:
['workclass', 'occupation']

Variables categóricas por debajo del 5% de datos faltantes:
['native-country']



In [19]:
# Variables categóricas con alta cardinalidad y baja cardinalidad
# Por medio del EDA definimos 2 etiquetas en la alta cardinalidad y 5 en la baja cardinalidad de las variables categóricas
categoricals_high_cardinality = [var for var in X[categoricals] if X[var].nunique() > 8]
categoricals_low_cardinality = [var for var in categoricals if var not in categoricals_high_cardinality]
print(f'Variables categórcias con alta cardinalidad: {categoricals_high_cardinality}')
print(f'Variables categórcias con baja cardinalidad: {categoricals_low_cardinality}')

Variables categórcias con alta cardinalidad: ['education', 'occupation', 'native-country']
Variables categórcias con baja cardinalidad: ['workclass', 'marital-status', 'relationship', 'race', 'sex']


In [20]:
from sklearn.preprocessing import LabelEncoder

# Codificación del target
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.fit_transform(y_val)
y_test = le.fit_transform(y_test)

# Pesos de las clases
unique_values, counts = np.unique(y_train, return_counts=True)
percentages = (counts / y_train.shape[0]) * 100
class_weight = dict(zip(sorted(unique_values, reverse=True), percentages))
class_weight

{1: 76.06309812988381, 0: 23.936901870116188}

In [21]:
# Pipeline
from sklearn.pipeline import Pipeline

# Selección de variables
from feature_engine.selection import DropConstantFeatures
from feature_engine.selection import DropDuplicateFeatures
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Ingeniería de variables
from feature_engine.imputation import RandomSampleImputer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.encoding import OrdinalEncoder


pipe = Pipeline([
    
    # === FILTRO BÁSICO ===
    # === Cuasi-constantes ===
    ('constant', DropConstantFeatures(tol=0.998, missing_values='ignore')),
    
    # === Duplicados ===
    ('duplicated', DropDuplicateFeatures(missing_values='ignore')),
    
    # === Correlacionados ===
    ('correlation', DropCorrelatedFeatures(method='pearson', threshold=0.8, missing_values='ignore')),
    
    # === IMPUTACIÓN ===
    # === Categóricas ===
    ('imputer_missing_categoricals_more_than_5perc', RandomSampleImputer(variables=categoricals_more_than_5perc, random_state=42)),
    ('imputer_missing_categoricals_less_than_5perc', RandomSampleImputer(variables=categoricals_less_than_5perc, random_state=42)),
    
    # === ETIQUETAS RARAS ===
    # === Categóricas ===
    ('rare_label_cat_high_cardinality', RareLabelEncoder(tol=0.05, n_categories=8, 
                                                         variables=categoricals_high_cardinality)),
    ('rare_label_cat_low_cardinality', RareLabelEncoder(tol=0.05, n_categories=8,
                                                        variables=categoricals_low_cardinality)),
    
    # === DISCRETIZACIÓN ===
    # === Discretizador ===
    ('discretiser', EqualFrequencyDiscretiser(variables=continuous, return_object=True)),
    
    # === CODIFICACIÓN ===
    ('encoder', OrdinalEncoder(encoding_method='ordered', variables=continuous+categoricals)), # Relación monotónica
    
    # === FEATURES ===
    ('features_selector', SelectFromModel(RandomForestClassifier(n_estimators=20, random_state=89, 
                                                                 class_weight=class_weight)))
])

In [22]:
# 1. Ajustemos el Pipeline con los datos de entrenamiento
pipe.fit(X_train, y_train)

# 2. Hacemos una transformación: trasladando los cambios del train a los otros conjuntos de datos
X_train = pd.DataFrame(pipe.transform(X_train), columns=pipe.get_feature_names_out(), index=X_train.index)
X_val = pd.DataFrame(pipe.transform(X_val), columns=pipe.get_feature_names_out(), index=X_val.index)
X_test = pd.DataFrame(pipe.transform(X_test), columns=pipe.get_feature_names_out(), index=X_test.index)

# Resultados de las transformaciones en la Pipeline
print(f"""\tResultados de las transformaciones
• Features constantes y cuasi-constantes: {pipe.named_steps['constant'].features_to_drop_}
• Features duplicados: {pipe.named_steps['duplicated'].features_to_drop_}
• Features correlacionados: {pipe.named_steps['correlation'].features_to_drop_}
• Features no seleccionados: {[feature for feature, selected in zip(X_train.columns, pipe.named_steps['features_selector'].get_support()) if selected]}""")

	Resultados de las transformaciones
• Features constantes y cuasi-constantes: []
• Features duplicados: set()
• Features correlacionados: set()
• Features no seleccionados: ['age', 'education-num', 'occupation', 'relationship', 'hours-per-week']


In [None]:
# Guardar el objeto Pipeline en un archivo
import joblib

del X_train['fnlwgt']
del X_val['fnlwgt']
del X_test['fnlwgt']

joblib.dump(pipe, '../models/pipe.pkl')
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_val.to_csv('../data/processed/X_val.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)

pd.Series(y_train).to_csv('../data/processed/y_train.csv', index=False)
pd.Series(y_val).to_csv('../data/processed/y_val.csv', index=False)
pd.Series(y_test).to_csv('../data/processed/y_test.csv', index=False)

---
---