Preprocess the data for the project. Need to Handle missing values, Standardize the numerical columns, then encode the categorical columns using the sklearn version of encoding

In [22]:
#Import block
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [23]:
#Import the data
adult_data = pd.read_csv('./data/project_adult.csv')
#adult_data.head()
validation_data = pd.read_csv('./data/project_validation_inputs.csv')
#validation_data.head()

# Training

In [24]:
#Separate features and labels and identify categorical and numerical features
X = adult_data.drop(columns=['income'], axis=1)
y = adult_data['income']
cat_columns = X.select_dtypes(include=['object']).columns
num_columns = X.select_dtypes(include=['int64', 'float64']).columns

In [25]:
#Create sklearn pipelines for preprocessing
"""
Pipeline allows you to sequentially apply a list of transformers to preprocess the data.
Here, we will create two pipelines: one for numerical features and one for categorical features.
"""

#Numerical pipeline
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),         #Impute missing values
    ("scaler", StandardScaler())                         #Standardize numerical features
])

#Categorical pipeline
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  #Impute missing values
    ("encoder", OneHotEncoder(handle_unknown="ignore"))    #One-hot encode categorical features
])

#Combine with ColumnTransformer to create one function
preprocessor = ColumnTransformer([
("num", num_pipeline, num_columns),
("cat", cat_pipeline, cat_columns)
])

In [26]:
#Execute the work
X_processed = preprocessor.fit_transform(X)

In [27]:
#Get feature names from ColumnTransformer
num_features = num_columns
cat_features = preprocessor.named_transformers_["cat"]["encoder"].get_feature_names_out(cat_columns)

#Combine them
all_features = list(num_features) + list(cat_features)

#Convert X_processed back into a DataFrame
X_df = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed,
                    columns=all_features)

#Attach target column back
X_df["income"] = y.values

X_df.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
0,-0.408756,0.080051,1.133702,-0.145715,-0.217998,0.77946,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
1,-0.188857,-0.981653,0.357049,-0.145715,4.457168,0.77946,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
2,1.423734,0.126197,-1.97291,-0.145715,-0.217998,-0.03151,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
3,-1.288351,-0.090935,0.357049,-0.145715,-0.217998,0.455072,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
4,-0.848554,0.856334,-0.031277,-0.145715,-0.217998,-0.03151,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K


In [28]:
#Save the processed data
X_df.to_csv('./data/processed_adult_data.csv', index=False)

# Validation

In [29]:
#Separate features and labels and identify categorical and numerical features
cat_columns = validation_data.select_dtypes(include=['object']).columns
num_columns = validation_data.select_dtypes(include=['int64', 'float64']).columns

In [30]:
#Create sklearn pipelines for preprocessing
"""
Pipeline allows you to sequentially apply a list of transformers to preprocess the data.
Here, we will create two pipelines: one for numerical features and one for categorical features.
"""

#Numerical pipeline
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),         #Impute missing values
    ("scaler", StandardScaler())                         #Standardize numerical features
])

#Categorical pipeline
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  #Impute missing values
    ("encoder", OneHotEncoder(handle_unknown="ignore"))    #One-hot encode categorical features
])

#Combine with ColumnTransformer to create one function
preprocessor = ColumnTransformer([
("num", num_pipeline, num_columns),
("cat", cat_pipeline, cat_columns)
])

In [31]:
#Execute the work
X2_processed = preprocessor.fit_transform(validation_data)

In [None]:
#Get feature names from ColumnTransformer
num_features = num_columns
cat_features = preprocessor.named_transformers_["cat"]["encoder"].get_feature_names_out(cat_columns)

#Combine them
all_features = list(num_features) + list(cat_features)

#Convert X_processed back into a DataFrame
X2_df = pd.DataFrame(X2_processed.toarray() if hasattr(X2_processed, "toarray") else X2_processed,
                    columns=all_features)

X2_df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,-0.851191,-0.277853,-0.031693,-0.147225,-0.211274,-0.212164,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.469374,-1.30409,-0.421896,-0.147225,-0.211274,-0.051009,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.704461,-0.036955,1.138915,-0.147225,-0.211274,1.157652,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.631097,0.001731,1.138915,-0.147225,-0.211274,-0.051009,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.704461,-0.004766,-0.031693,0.128973,-0.211274,0.754765,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
#Save the processed data
X2_df.to_csv('./data/processed_validation_inputs.csv', index=False)