## *02. Preprocesamiento de los datos*
El preprocesamiento de datos es una etapa crítica en la preparación de datos antes de su análisis o uso en aplicaciones de machine learning. Consiste en una serie de técnicas y transformaciones que se aplican a los datos brutos con el objetivo de mejorar su calidad, consistencia y relevancia para el análisis.

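En resumen, este notebook prepara los datos brutos para las etapas posteriores: lee el conjunto de datos y asigna los nombres de las columnas, normaliza dichos nombres, elimina los registros duplicados, uniformiza las etiquetas de las variables categóricas, convierte el marcador `?` en valores faltantes (`NaN`) y exporta el resultado. Con ello se mejora la calidad, consistencia y relevancia de los datos antes de su análisis o uso en aplicaciones de machine learning.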

In [1]:
# Standard library
import sys

# Extend the module search path so modules located in ../src/utils can be imported.
sys.path.append('../src/utils')

# Third-party libraries
import numpy as np
import pandas as pd

# Display settings: never truncate the column list, and render floats with
# thousands separators and two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)
%load_ext autoreload
%autoreload 2

In [2]:
# Load the raw records. The data file is read without a header row, so the
# column labels come from the first column of the companion names file.
names = pd.read_csv('../data/raw/col_names.txt').T.iloc[0].tolist()
data = pd.read_csv('../data/raw/adult.data', header=None)
data.columns = names

# Reproducible peek at five random rows.
data.sample(n=5, random_state=777)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
29305,32,Private,130040,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
4110,25,Private,186294,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,50,United-States,<=50K
12044,39,Private,237943,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,30,United-States,>50K
17027,78,Self-emp-inc,237294,HS-grad,9,Widowed,Sales,Not-in-family,White,Male,0,0,45,United-States,>50K
5821,53,Private,157069,Assoc-acdm,12,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [3]:
# Normalize the predictor names: cast each label to str, lower-case it and
# trim surrounding whitespace.
data = data.rename(
    columns={label: str(label).lower().strip() for label in data.columns}
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Remove exact duplicate rows (renumbering the index) and report the shape
# of the frame before and after.
print(f'Tamaño original: {data.shape}')
data = data.drop_duplicates(ignore_index=True)
print(f'Tamaño sin duplicados: {data.shape}')

Tamaño original: (32561, 15)
Tamaño sin duplicados: (32537, 15)


In [5]:
# Rewrite the entries pandas already treats as missing as np.nan.
# NOTE: placeholder strings such as '?' are not affected by this call; they
# are converted to NaN in a later cell, after the labels have been trimmed.
data = data.fillna(value=np.nan)

# Fraction of missing values per column, largest first.
data.isna().mean().sort_values(ascending=False)

age              0.00
workclass        0.00
fnlwgt           0.00
education        0.00
education-num    0.00
marital-status   0.00
occupation       0.00
relationship     0.00
race             0.00
sex              0.00
capital-gain     0.00
capital-loss     0.00
hours-per-week   0.00
native-country   0.00
income           0.00
dtype: float64

In [6]:
# Normalize the categorical predictors: trim surrounding whitespace and
# lower-case every label of the object/bool columns.
categoricals = list(data.select_dtypes(include=['object', 'bool']).columns)


def _normalize_label(value):
    """Return `value` as a stripped, lower-cased string.

    Missing values (NaN / None) are returned unchanged: str(np.nan) is 'nan',
    so stringifying them would turn a missing entry into an ordinary label
    that isnull() no longer reports.
    """
    if pd.isna(value):
        return value
    return str(value).strip().lower()


# Series.map is applied column by column; DataFrame.applymap is deprecated
# since pandas 2.1, while this form works on old and new versions alike.
data[categoricals] = data[categoricals].apply(
    lambda column: column.map(_normalize_label)
)
data[categoricals].sample(5, random_state=42)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
3643,state-gov,assoc-voc,married-civ-spouse,craft-repair,husband,white,male,united-states,<=50k
16036,federal-gov,bachelors,never-married,exec-managerial,not-in-family,white,male,united-states,<=50k
9401,local-gov,some-college,married-civ-spouse,other-service,husband,asian-pac-islander,male,philippines,<=50k
17903,private,some-college,never-married,exec-managerial,not-in-family,white,male,united-states,<=50k
5198,federal-gov,bachelors,never-married,exec-managerial,not-in-family,white,male,united-states,>50k


In [7]:
# Check the result: print the distinct labels of every categorical column.
for column in categoricals:
    print(data[column].unique())

['state-gov' 'self-emp-not-inc' 'private' 'federal-gov' 'local-gov' '?'
 'self-emp-inc' 'without-pay' 'never-worked']
['bachelors' 'hs-grad' '11th' 'masters' '9th' 'some-college' 'assoc-acdm'
 'assoc-voc' '7th-8th' 'doctorate' 'prof-school' '5th-6th' '10th'
 '1st-4th' 'preschool' '12th']
['never-married' 'married-civ-spouse' 'divorced' 'married-spouse-absent'
 'separated' 'married-af-spouse' 'widowed']
['adm-clerical' 'exec-managerial' 'handlers-cleaners' 'prof-specialty'
 'other-service' 'sales' 'craft-repair' 'transport-moving'
 'farming-fishing' 'machine-op-inspct' 'tech-support' '?'
 'protective-serv' 'armed-forces' 'priv-house-serv']
['not-in-family' 'husband' 'wife' 'own-child' 'unmarried' 'other-relative']
['white' 'black' 'asian-pac-islander' 'amer-indian-eskimo' 'other']
['male' 'female']
['united-states' 'cuba' 'jamaica' 'india' '?' 'mexico' 'south'
 'puerto-rico' 'honduras' 'england' 'canada' 'germany' 'iran'
 'philippines' 'italy' 'poland' 'columbia' 'cambodia' 'thailand' '

In [8]:
# Repair the labels: the '?' placeholder becomes a proper missing value.
data[categoricals] = data[categoricals].replace(to_replace='?', value=np.nan)

# Check the result: print the distinct labels of every categorical column.
for column in categoricals:
    print(data[column].unique())

['state-gov' 'self-emp-not-inc' 'private' 'federal-gov' 'local-gov' nan
 'self-emp-inc' 'without-pay' 'never-worked']
['bachelors' 'hs-grad' '11th' 'masters' '9th' 'some-college' 'assoc-acdm'
 'assoc-voc' '7th-8th' 'doctorate' 'prof-school' '5th-6th' '10th'
 '1st-4th' 'preschool' '12th']
['never-married' 'married-civ-spouse' 'divorced' 'married-spouse-absent'
 'separated' 'married-af-spouse' 'widowed']
['adm-clerical' 'exec-managerial' 'handlers-cleaners' 'prof-specialty'
 'other-service' 'sales' 'craft-repair' 'transport-moving'
 'farming-fishing' 'machine-op-inspct' 'tech-support' nan
 'protective-serv' 'armed-forces' 'priv-house-serv']
['not-in-family' 'husband' 'wife' 'own-child' 'unmarried' 'other-relative']
['white' 'black' 'asian-pac-islander' 'amer-indian-eskimo' 'other']
['male' 'female']
['united-states' 'cuba' 'jamaica' 'india' nan 'mexico' 'south'
 'puerto-rico' 'honduras' 'england' 'canada' 'germany' 'iran'
 'philippines' 'italy' 'poland' 'columbia' 'cambodia' 'thailand' '

In [9]:
# Fraction of missing values per column, in descending order.
null_share = data.isna().mean()
null_share.sort_values(ascending=False)

occupation       0.06
workclass        0.06
native-country   0.02
age              0.00
fnlwgt           0.00
education        0.00
education-num    0.00
marital-status   0.00
relationship     0.00
race             0.00
sex              0.00
capital-gain     0.00
capital-loss     0.00
hours-per-week   0.00
income           0.00
dtype: float64

In [10]:
# Write the preprocessed frame to the interim data folder, leaving out the index.
data.to_csv(path_or_buf='../data/interim/data_preprocessed.csv', index=False)

---
---