In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Read the sample CSV into a DataFrame, then echo its first five rows
# (five is the default for DataFrame.head; it is spelled out here for clarity).
df = pd.read_csv("sample_data_100.csv")
print("Raw Data:\n", df.head(n=5))

Raw Data:
    Name   Age  Gender   Salary         City
0   Bob  48.0  Female  30055.0       Dallas
1   Tom  56.0  Female      NaN       Denver
2   Bob  57.0    Male  58699.0  Los Angeles
3  John  38.0    Male  97661.0       Boston
4  John  27.0    Male  74247.0      Chicago


In [3]:

# Column groups used to route each column to its own preprocessing branch.
# "Name" appears in the raw data preview but is intentionally in neither list.
numerical_features = [
    "Age",
    "Salary",
]
categorical_features = [
    "Gender",
    "City",
]

In [4]:
# Numeric branch: two sequential steps applied to the numeric columns.
#   1. "imputer" - replace missing values with the column mean.
#   2. "scaler"  - standardize each column (StandardScaler defaults).
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [5]:
# Categorical branch: two sequential steps applied to the categorical columns.
#   1. "imputer" - replace missing values with the most frequent category.
#   2. "encoder" - one-hot encode; handle_unknown="ignore" means a category not
#                  seen during fit is encoded as all zeros instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [6]:
# Route each column group through its own pipeline and concatenate the results.
# remainder="drop" is the ColumnTransformer default, written out so it is clear
# that any column not listed in either feature list is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_features),
        ("cat", cat_pipeline, categorical_features),
    ],
    remainder="drop",
)

In [7]:
# Fit every preprocessing step on the loaded frame and transform it in the
# same pass; the result holds the scaled numeric columns followed by the
# one-hot encoded categorical columns.
processed_data = preprocessor.fit_transform(X=df)