In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Toy dataset: each feature column (Age, Income, Gender) contains exactly one
# missing value, and 'Target' is the binary label.
data = {
    'Age': [25, 30, np.nan, 22, 35],
    'Income': [50000, np.nan, 60000, 75000, 80000],
    'Gender': ['Male', 'Female', 'Male', np.nan, 'Female'],
    'Target': [1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# Label vector (y) and feature matrix (x = every column except 'Target').
y = df['Target']
x = df.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

#### Pipeline para imputar, estandarizar (StandardScaler) y codificar

In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Split the training columns by dtype. Only int64/float64 columns count as
# numeric and only object columns as categorical; any column of another dtype
# is not listed here and is therefore discarded by ColumnTransformer, whose
# default is remainder='drop'.
categorical_features = x_train.select_dtypes(include=['object']).columns
numeric_features = x_train.select_dtypes(include=['float64', 'int64']).columns

# Numeric branch: replace missing values with the column mean, then
# standardize with StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its branch; the output has the numeric
# columns first, followed by the one-hot columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Wrap the preprocessor in a single-step pipeline.
pipeline = Pipeline([('preprocessor', preprocessor)])

#### Pipeline para solo imputar y codificar (sin escalar)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Split the training columns by dtype. Only int64/float64 columns count as
# numeric and only object columns as categorical; any column of another dtype
# is not listed here and is therefore discarded by ColumnTransformer, whose
# default is remainder='drop'.
categorical_features = x_train.select_dtypes(include=['object']).columns
numeric_features = x_train.select_dtypes(include=['int64', 'float64']).columns

# Numeric branch: mean imputation only -- this variant does not scale.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its branch; the output has the numeric
# columns first, followed by the one-hot columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Wrap the preprocessor in a single-step pipeline.
pipeline = Pipeline([('preprocessor', preprocessor)])

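Ejecuta solo una de las dos celdas anteriores (con o sin escalado): ambas definen las mismas variables (`preprocessor`, `pipeline`), por lo que se usará la última que se haya ejecutado.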

In [3]:
# How the pipeline is used: learn the preprocessing parameters from the
# training split and transform it in the same call...
x_train_prep = pipeline.fit_transform(X=x_train)
# ...then apply the already-fitted pipeline to the test split without
# refitting, so nothing is learned from the test data.
x_test_prep = pipeline.transform(X=x_test)

#### Para convertir el resultado en un DataFrame después de aplicar el pipeline

In [4]:
# x_train_prep is the array produced by pipeline.fit_transform(x_train).
#
# Column labels are read from the fitted pipeline instead of being rebuilt with
# pd.get_dummies(x_train). get_dummies knows nothing about the pipeline: it
# only lines up with the pipeline output by coincidence of column order and
# dtype handling, and breaks (or mislabels columns) as soon as the two disagree.
feature_names = [
    # ColumnTransformer prefixes every name with '<transformer>__'
    # ('num__Age', 'cat__Gender_Male', ...); keep only the part after it.
    name.split('__', 1)[-1]
    for name in pipeline.get_feature_names_out()
]

# ColumnTransformer can return a scipy sparse matrix when the one-hot block
# makes the result sparse enough; densify it so pandas can build the frame.
x_train_dense = x_train_prep.toarray() if hasattr(x_train_prep, 'toarray') else x_train_prep

# NOTE: this rebinds x_train to the transformed frame. Re-run the train/test
# split cell before fitting the pipeline again, otherwise the pipeline would
# be fitted on already-transformed data.
x_train = pd.DataFrame(x_train_dense, columns=feature_names, index=x_train.index)

In [6]:
# Display x_train (the bare last expression of a cell is rendered as output).
x_train

Unnamed: 0,Age,Income,Gender_Female,Gender_Male
4,1.59285,1.153113,1.0,0.0
2,0.0,-0.524142,0.0,1.0
0,-0.48478,-1.36277,0.0,1.0
3,-1.108069,0.733799,0.0,1.0
