In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import category_encoders as ce

# Load the raw census data from the CSV file in the working directory.
df = pd.read_csv("census.csv")

# Inspect the raw dataset: first rows, last rows and a per-column summary.
print("Primeras 5 filas:")
print(df.head(), "\n")

print("Últimas 5 filas:")
print(df.tail(), "\n")

print("Información general:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print()

# Count the missing values in each column.
print("Cantidad de datos faltantes por columna:")
print(df.isna().sum(), "\n")

# Keep only the rows that have fewer than three missing values, then
# renumber the index so it runs 0..n-1 again.
nan_per_row = df.isna().sum(axis=1)
df = df.loc[nan_per_row < 3].reset_index(drop=True)
print("Datos tras descartar filas con 3 o más NaN:")
print(df.isna().sum(), "\n")

# Fill the remaining gaps in 'age' and 'hours_per_week' with each column's
# own median.
numeric_cols = ['age', 'hours_per_week']
medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(medians)
print("Datos faltantes tras imputación de medianas:")
print(df[numeric_cols].isna().sum(), "\n")

# Drop every row that still has a NaN in any of the critical columns and
# renumber the index once more.
cols_criticas = [
    'age',
    'education',
    'sex',
    'hours_per_week',
    'USA_born',
    'label',
    'race',
    'workclass',
]
df = df.dropna(subset=cols_criticas)
df = df.reset_index(drop=True)

# One-hot encode 'workclass': fit the encoder on that single column and
# swap the original column for the encoder's indicator columns.
encoder_workclass = OneHotEncoder(sparse_output=False)
encoded_workclass = encoder_workclass.fit_transform(df[['workclass']])
workclass_columns = encoder_workclass.get_feature_names_out(['workclass'])
encoded_workclass_df = pd.DataFrame(encoded_workclass, columns=workclass_columns)
# NOTE: pd.concat(axis=1) pairs rows by index label and the encoded frame
# gets a default 0..n-1 index, so df must carry a 0..n-1 index here too.
df_without_workclass = df.drop(columns=['workclass'])
df = pd.concat([df_without_workclass, encoded_workclass_df], axis=1)

# Show the distinct 'education' values and how often each one occurs.
education_series = df['education']
print("Valores únicos en 'education':")
print(education_series.unique(), "\n")
print(education_series.value_counts(), "\n")

# Ordinal-encode 'education' with an explicit category order; the encoded
# values are stored in a new 'education_encoded' column and the original
# text column is kept alongside it.
education_order = [
    'Preschool',
    'Elementary-school',
    'Some-middle-school',
    'Middle-school',
    'Some-high-school',
    'High-school',
    'Some-college',
    'Bachelors',
    'Masters',
    'Doctorate',
]
edu_encoder = OrdinalEncoder(categories=[education_order])
education_codes = edu_encoder.fit_transform(df[['education']])
df['education_encoded'] = education_codes

print("Registros de 'education' con Ordinal encoding:")
print(df[['education', 'education_encoded']], "\n")

# One-hot encode 'race': fit the encoder on that single column and swap the
# original column for the encoder's indicator columns.
encoder_race = OneHotEncoder(sparse_output=False)
encoded_race = encoder_race.fit_transform(df[['race']])
race_columns = encoder_race.get_feature_names_out(['race'])
encoded_race_df = pd.DataFrame(encoded_race, columns=race_columns)
# NOTE: pd.concat(axis=1) pairs rows by index label and the encoded frame
# gets a default 0..n-1 index, so df must carry a 0..n-1 index here too.
df_without_race = df.drop(columns=['race'])
df = pd.concat([df_without_race, encoded_race_df], axis=1)
print("\nRegistros de 'race' con One-hot encoding:")
print(df)

# Binary-encode 'sex': the encoder is fitted on the whole frame but is
# configured to act on the 'sex' column only.
encoder_sex = ce.BinaryEncoder(cols=['sex'])
encoder_sex.fit(df)
df = encoder_sex.transform(df)
print("\nRegistros de 'sex' con Binary encoding:")
print(df.head(10))

# Binary-encode 'label' the same way, with its own encoder instance.
encoder_label = ce.BinaryEncoder(cols=['label'])
encoder_label.fit(df)
df = encoder_label.transform(df)
print("\nRegistros de 'label' con Binary encoding:")
print(df)

# Final review of the processed dataset before it is written to disk.
print("Primeras 5 filas del dataset final:")
print(df.head(), "\n")

print("Últimas 5 filas del dataset final:")
print(df.tail(), "\n")

print("Forma del dataset (filas, columnas):")
print(df.shape, "\n")

print("Estadísticas descriptivas (numéricas):")
print(df.describe(), "\n")

print("Información general de columnas:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print()

print("Conteo de valores nulos por columna:")
print(df.isna().sum())

# Save the processed dataset without the index column.
df.to_csv("census2.csv", index=False)
print("El archivo se guardo correctamente como census2")

Primeras 5 filas:
    age         workclass         education   race     sex  hours_per_week  \
0  39.0         State-gov         Bachelors  White    Male            40.0   
1  50.0  Self-emp-not-inc         Bachelors  White    Male            13.0   
2  38.0           Private       High-school  White    Male            40.0   
3  53.0           Private  Some-high-school  Black    Male            40.0   
4  28.0           Private         Bachelors  Black  Female            40.0   

   USA_born  label  
0       1.0  <=50K  
1       1.0  <=50K  
2       1.0  <=50K  
3       1.0  <=50K  
4       0.0  <=50K   

Últimas 5 filas:
        age     workclass  education                race     sex  \
41711  33.0       Private  Bachelors               White    Male   
41712  39.0       Private  Bachelors               White  Female   
41713  38.0       Private  Bachelors               White    Male   
41714  44.0       Private  Bachelors  Asian-Pac-Islander    Male   
41715  35.0  Self-emp-inc  B