In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Toy dataset with deliberately missing entries (np.nan) in the Name, Age and
# Income columns so the imputation steps further down have something to fill.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', np.nan],
    Age=[25, np.nan, 35, 45, 29],
    Gender=['F', 'M', 'M', 'M', 'F'],
    Income=[50000, 60000, np.nan, 80000, 72000],
    City=['New York', 'Los Angeles', 'New York', 'Chicago', 'Chicago'],
)
df_raw = pd.DataFrame(data)

# Show the untouched frame before any preprocessing is applied.
print(" *Raw Data *:", df_raw, sep="\n")
# Columns routed to each preprocessing branch.
num_cols = ['Age', 'Income']
cat_cols = ['Gender', 'City']

# Numeric branch: fill missing values with the column mean, then scale.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present at fit time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each branch to its own column list and concatenate the results
# (numeric columns first, then the one-hot columns).
# sparse_threshold=0 forces a dense ndarray: OneHotEncoder produces sparse
# output by default, and with the default threshold (0.3) the combined result
# silently becomes a scipy sparse matrix once the encoded data is sparse
# enough, which breaks the later pd.DataFrame(processed_data, columns=...).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ],
    sparse_threshold=0,
)
# 'Name' is not listed in either column group, so it is removed before fitting.
df_features = df_raw.drop(columns=['Name'])

# Fit the imputers/scaler/encoder on the features and transform them in one go.
processed_data = preprocessor.fit_transform(df_features)

# Recover readable column names: the numeric columns keep their names, and the
# fitted encoder reports one name per generated one-hot column.
cat_features = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(cat_cols)
final_columns = num_cols + list(cat_features)

# Reuse the source index so every transformed row keeps the label of the row
# it came from; without it the frame gets a fresh 0..n-1 index, which
# misaligns with df_raw whenever the input index is not the default range.
df_processed = pd.DataFrame(
    processed_data,
    columns=final_columns,
    index=df_features.index,
)

print("\n *Transformed Data *:")
print(df_processed)

# Persist the transformed features; index=False writes only the feature columns.
df_processed.to_csv("processed_data.csv", index=False)
print("\n * Processed data saved to 'processed_data.csv *'")


Raw Data:
      Name   Age Gender   Income         City
0    Alice  25.0      F  50000.0     New York
1      Bob   NaN      M  60000.0  Los Angeles
2  Charlie  35.0      M      NaN     New York
3    David  45.0      M  80000.0      Chicago
4      NaN  29.0      F  72000.0      Chicago

 Transformed Data:
        Age    Income  Gender_F  Gender_M  City_Chicago  City_Los Angeles  \
0 -1.261511 -1.515535       1.0       0.0           0.0               0.0   
1  0.000000 -0.537770       0.0       1.0           0.0               1.0   
2  0.222620  0.000000       0.0       1.0           0.0               0.0   
3  1.706750  1.417758       0.0       1.0           1.0               0.0   
4 -0.667859  0.635547       1.0       0.0           1.0               0.0   

   City_New York  
0            1.0  
1            0.0  
2            1.0  
3            0.0  
4            0.0  

 Processed data saved to 'processed_data.csv'
