## *02. Preprocesamiento de los datos*
El preprocesamiento de datos es una etapa crítica en la preparación de datos antes de su análisis o uso en aplicaciones de machine learning. Consiste en una serie de técnicas y transformaciones que se aplican a los datos brutos con el objetivo de mejorar su calidad, consistencia y relevancia para el análisis.

En resumen, en este notebook se cargan y unifican los conjuntos de entrenamiento y de prueba, se normalizan los nombres de las columnas, se eliminan los registros duplicados, se homogeneizan las etiquetas de las variables categóricas, se convierten los marcadores `?` en valores faltantes y se corrige la variable objetivo, antes de exportar el resultado para las etapas siguientes.

In [14]:
import sys
# Add the relative folder ../src/utils to the module search path.
sys.path.append('../src/utils')

# Libraries
import pandas as pd
import numpy as np
# Display floats with a thousands separator and two decimals.
pd.options.display.float_format = '{:,.2f}'.format
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Column names are kept in a separate text file
names = pd.read_csv('../data/raw/col_names.txt').T.iloc[0].tolist()

# Read both raw splits; neither file is parsed with a header row
data = pd.read_csv('../data/raw/adult.data', header=None)
data_test = pd.read_csv('../data/raw/adult.test', header=None)

# Give both frames the same column labels
for frame in (data, data_test):
    frame.columns = names

# Stack the two splits into a single dataframe with a fresh 0..n-1 index
data = pd.concat([data, data_test], axis=0, ignore_index=True)
data.sample(5, random_state=777)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
36155,39,Private,118286,Some-college,10,Married-civ-spouse,Sales,Husband,Black,Male,0,0,40,United-States,<=50K.
38493,59,State-gov,200732,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,50,Philippines,>50K.
38309,27,Private,103524,HS-grad,9,Never-married,Handlers-cleaners,Unmarried,White,Male,0,0,35,United-States,<=50K.
48315,33,Private,92462,Assoc-acdm,12,Never-married,Sales,Unmarried,Black,Male,0,0,32,United-States,<=50K.
47589,39,Private,134367,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Female,0,0,43,United-States,<=50K.


In [16]:
# Normalize the predictor names: lower-case them and trim surrounding whitespace
normalized_labels = {label: str(label).lower().strip() for label in data.columns}
data = data.rename(columns=normalized_labels)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [17]:
# Duplicates: report the shape before and after removing fully repeated rows.
# NOTE(review): this runs before the categorical values are stripped/lower-cased
# and before the trailing '.' of the test-split income labels is removed, so rows
# that differ only in those respects are not treated as duplicates here.
print(f'Tamaño original: {data.shape}')
data = data.drop_duplicates(ignore_index=True)
print(f'Tamaño sin duplicados: {data.shape}')

Tamaño original: (48842, 15)
Tamaño sin duplicados: (48813, 15)


In [18]:
# Missing values: share of NaN per column, highest first.
# No conversion step is performed here: filling NaN with NaN changes nothing.
# The dataset's '?' placeholders are turned into NaN further below, once the
# categorical labels have been stripped and lower-cased.
data.isnull().mean().sort_values(ascending=False)

age              0.00
workclass        0.00
fnlwgt           0.00
education        0.00
education-num    0.00
marital-status   0.00
occupation       0.00
relationship     0.00
race             0.00
sex              0.00
capital-gain     0.00
capital-loss     0.00
hours-per-week   0.00
native-country   0.00
income           0.00
dtype: float64

In [19]:
# Normalize the categorical predictors: trim whitespace and lower-case every label
categoricals = list(data.select_dtypes(include=['object', 'bool']).columns)

# Series.map is applied column by column instead of the deprecated
# DataFrame.applymap. na_action='ignore' keeps real missing values as NaN
# rather than turning them into the literal string 'nan'.
data[categoricals] = data[categoricals].apply(
    lambda column: column.map(lambda value: str(value).strip().lower(),
                              na_action='ignore')
)
data[categoricals].sample(5, random_state=42)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
14701,private,hs-grad,never-married,craft-repair,not-in-family,white,male,united-states,<=50k
15073,private,some-college,never-married,craft-repair,own-child,white,male,united-states,<=50k
17837,private,some-college,never-married,machine-op-inspct,not-in-family,white,female,united-states,<=50k
11732,private,bachelors,divorced,prof-specialty,own-child,white,female,united-states,<=50k
35372,private,hs-grad,never-married,other-service,own-child,white,male,united-states,<=50k.


In [20]:
# Validate the changes: print the distinct labels of each categorical column
for column_name in categoricals:
    distinct_labels = data[column_name].unique()
    print(distinct_labels, sep='\n')

['state-gov' 'self-emp-not-inc' 'private' 'federal-gov' 'local-gov' '?'
 'self-emp-inc' 'without-pay' 'never-worked']
['bachelors' 'hs-grad' '11th' 'masters' '9th' 'some-college' 'assoc-acdm'
 'assoc-voc' '7th-8th' 'doctorate' 'prof-school' '5th-6th' '10th'
 '1st-4th' 'preschool' '12th']
['never-married' 'married-civ-spouse' 'divorced' 'married-spouse-absent'
 'separated' 'married-af-spouse' 'widowed']
['adm-clerical' 'exec-managerial' 'handlers-cleaners' 'prof-specialty'
 'other-service' 'sales' 'craft-repair' 'transport-moving'
 'farming-fishing' 'machine-op-inspct' 'tech-support' '?'
 'protective-serv' 'armed-forces' 'priv-house-serv']
['not-in-family' 'husband' 'wife' 'own-child' 'unmarried' 'other-relative']
['white' 'black' 'asian-pac-islander' 'amer-indian-eskimo' 'other']
['male' 'female']
['united-states' 'cuba' 'jamaica' 'india' '?' 'mexico' 'south'
 'puerto-rico' 'honduras' 'england' 'canada' 'germany' 'iran'
 'philippines' 'italy' 'poland' 'columbia' 'cambodia' 'thailand' '

In [21]:
# Repair the labels: turn the '?' placeholder into a proper missing value
data[categoricals] = data[categoricals].replace(to_replace='?', value=np.nan)

# Validate the changes: print the distinct labels of each categorical column
for column_name in categoricals:
    distinct_labels = data[column_name].unique()
    print(distinct_labels, sep='\n')

['state-gov' 'self-emp-not-inc' 'private' 'federal-gov' 'local-gov' nan
 'self-emp-inc' 'without-pay' 'never-worked']
['bachelors' 'hs-grad' '11th' 'masters' '9th' 'some-college' 'assoc-acdm'
 'assoc-voc' '7th-8th' 'doctorate' 'prof-school' '5th-6th' '10th'
 '1st-4th' 'preschool' '12th']
['never-married' 'married-civ-spouse' 'divorced' 'married-spouse-absent'
 'separated' 'married-af-spouse' 'widowed']
['adm-clerical' 'exec-managerial' 'handlers-cleaners' 'prof-specialty'
 'other-service' 'sales' 'craft-repair' 'transport-moving'
 'farming-fishing' 'machine-op-inspct' 'tech-support' nan
 'protective-serv' 'armed-forces' 'priv-house-serv']
['not-in-family' 'husband' 'wife' 'own-child' 'unmarried' 'other-relative']
['white' 'black' 'asian-pac-islander' 'amer-indian-eskimo' 'other']
['male' 'female']
['united-states' 'cuba' 'jamaica' 'india' nan 'mexico' 'south'
 'puerto-rico' 'honduras' 'england' 'canada' 'germany' 'iran'
 'philippines' 'italy' 'poland' 'columbia' 'cambodia' 'thailand' '

In [22]:
# Missing values: fraction of NaN entries per column, largest first
missing_share = data.isnull().mean()
missing_share.sort_values(ascending=False)

occupation       0.06
workclass        0.06
native-country   0.02
age              0.00
fnlwgt           0.00
education        0.00
education-num    0.00
marital-status   0.00
relationship     0.00
race             0.00
sex              0.00
capital-gain     0.00
capital-loss     0.00
hours-per-week   0.00
income           0.00
dtype: float64

In [25]:
# Fix the target: some income labels carry a trailing '.', so map them onto
# the same two classes as the rest
income_fixes = {'>50k.': '>50k', '<=50k.': '<=50k'}
data['income'] = data['income'].replace(income_fixes)
data['income'].value_counts()

income
<=50k    37128
>50k     11685
Name: count, dtype: int64

In [None]:
# Export the results: write the preprocessed frame to CSV without the row index
data.to_csv(path_or_buf='../data/interim/data_preprocessed.csv', index=False)

---
---