# Preprocesamiento de Datos - Ejemplo Práctico

En esta demostración se preprocesará un conjunto de datos de población estadounidense. Los datos utilizados son un subconjunto modificado de [este set de datos](https://archive.ics.uci.edu/ml/datasets/Adult) y se encuentran en el archivo `census.csv`.

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [2]:
# Importemos los datos:
data = pd.read_csv('census.csv', header = 0)

In [3]:
# Veamos el dataset:
data

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,Male,40.0,1.0,<=50K
1,50.0,Self-emp-not-inc,Bachelors,White,Male,13.0,1.0,<=50K
2,38.0,Private,High-school,White,Male,40.0,1.0,<=50K
3,53.0,Private,Some-high-school,Black,Male,40.0,1.0,<=50K
4,28.0,Private,Bachelors,Black,Female,40.0,0.0,<=50K
...,...,...,...,...,...,...,...,...
41711,33.0,Private,Bachelors,White,Male,40.0,1.0,<=50K
41712,39.0,Private,Bachelors,White,Female,36.0,1.0,<=50K
41713,38.0,Private,Bachelors,White,Male,50.0,1.0,<=50K
41714,44.0,Private,Bachelors,Asian-Pac-Islander,Male,40.0,1.0,<=50K


In [4]:
# Descripción de las columnas:
data.describe(include='all')

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
count,41617.0,41705,41702,41700,41701,41631.0,41701.0,41716
unique,,7,10,5,2,,,2
top,,Private,High-school,White,Male,,,<=50K
freq,,30962,14969,35670,28191,,,31813
mean,38.476608,,,,,40.74322,0.895062,
std,13.365972,,,,,12.000085,0.306477,
min,17.0,,,,,1.0,0.0,
25%,28.0,,,,,40.0,1.0,
50%,37.0,,,,,40.0,1.0,
75%,47.0,,,,,45.0,1.0,


In [5]:
data['USA_born'] = data['USA_born'].astype('category')

In [7]:
data['education'].unique()

array(['Bachelors', 'High-school', 'Some-high-school', 'Masters',
       'Some-college', 'Middle-school', 'Doctorate', 'Some-middle-school',
       'Preschool', 'Elementary-school', nan], dtype=object)

In [8]:
# Verifiquemos si hay datos faltantes:
data.isna().sum()

age               99
workclass         11
education         14
race              16
sex               15
hours_per_week    85
USA_born          15
label              0
dtype: int64

In [9]:
# Descartemos las columnas que tengan más del 20% de datos faltantes:

# Paso 1: Calcular eñ número total de filas
total_rows = len(data)

# Paso 2: Contar el número de valores faltantes en cada columna
missing_counts = data.isna().sum()

# Paso 3: Calcular el porcentaje de datos faltantes en cada columna
missing_percentage = (missing_counts / total_rows) * 100

# Paso 4: Filtrar las columnas con más del 20% de datos faltantes
columns_to_keep = missing_percentage[missing_percentage <= 20].index
data_filtered = data[columns_to_keep]

#Solo por facilidad
data = data_filtered.copy()

In [10]:
columns_to_keep

Index(['age', 'workclass', 'education', 'race', 'sex', 'hours_per_week',
       'USA_born', 'label'],
      dtype='object')

## Pipeline de sklearn

- Crearemos un Pipeline con las siguientes etapas:

1. Identificar los tipos de columnas
2.

In [11]:
# Pipeline Paso 1: Identificar los tipos de columnas
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

print(numerical_cols)
print(categorical_cols)

Index(['age', 'hours_per_week'], dtype='object')
Index(['workclass', 'education', 'race', 'sex', 'USA_born', 'label'], dtype='object')


In [12]:
# Pipeline Paso 2: Defina las etapas del pre-procesamiento
# Para datos numéricos
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # or median, etc.
    ('scaler', StandardScaler())
])

# Para datos categóricos
categorical_transformer = ColumnTransformer(
    transformers=[
        ('onehotwk', OneHotEncoder(), ['workclass','race']),  # Apply OneHotEncoder to 'workclass'
        ('ordinal', OrdinalEncoder(), ['education', 'sex', 'label'])  # Apply OrdinalEncoder to these columns
    ])

In [13]:
#Pipeline Paso 3: Crear el ColumnTransformer
# Step 3: Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [14]:
#Pipeline Paso 4: Crear el Pipeline de procesamiento de datos
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [15]:
# Paso 5: Transformar los datos
pipeline.fit(data)
transformed_data = pipeline.transform(data)

In [16]:
transformed_data

array([[ 0.03920555, -0.06199852,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.86317916, -2.31430539,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.03570114, -0.06199852,  0.        , ...,  3.        ,
         1.        ,  0.        ],
       ...,
       [-0.03570114,  0.77218921,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.41373901, -0.06199852,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.26042121,  1.60637695,  0.        , ...,  0.        ,
         1.        ,  1.        ]])

In [17]:
# Veamos el dataset resultante:
data


Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,Male,40.0,1.0,<=50K
1,50.0,Self-emp-not-inc,Bachelors,White,Male,13.0,1.0,<=50K
2,38.0,Private,High-school,White,Male,40.0,1.0,<=50K
3,53.0,Private,Some-high-school,Black,Male,40.0,1.0,<=50K
4,28.0,Private,Bachelors,Black,Female,40.0,0.0,<=50K
...,...,...,...,...,...,...,...,...
41711,33.0,Private,Bachelors,White,Male,40.0,1.0,<=50K
41712,39.0,Private,Bachelors,White,Female,36.0,1.0,<=50K
41713,38.0,Private,Bachelors,White,Male,50.0,1.0,<=50K
41714,44.0,Private,Bachelors,Asian-Pac-Islander,Male,40.0,1.0,<=50K


In [19]:
# Veamos la nueva descripción del dataset:
data.describe(include='all')

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
count,41617.0,41705,41702,41700,41701,41631.0,41701.0,41716
unique,,7,10,5,2,,2.0,2
top,,Private,High-school,White,Male,,1.0,<=50K
freq,,30962,14969,35670,28191,,37325.0,31813
mean,38.476608,,,,,40.74322,,
std,13.365972,,,,,12.000085,,
min,17.0,,,,,1.0,,
25%,28.0,,,,,40.0,,
50%,37.0,,,,,40.0,,
75%,47.0,,,,,45.0,,


In [20]:
# Verifiquemos el tipo de dato de cada columna:
data.dtypes

age                float64
workclass           object
education           object
race                object
sex                 object
hours_per_week     float64
USA_born          category
label               object
dtype: object

In [21]:
# Carguemos el dataset a un nuevo archivo:
data.to_csv('./census_processed1.csv', index=False)