In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
def preprocess_data(file_path='data.csv'):
    """Load a CSV, split it into train/test sets and preprocess the features.

    The ``purchased`` column is used as the target: the value ``'Yes'`` is
    encoded as 1 and every other value (missing values included) as 0.
    All remaining columns are features:

    * numeric columns are mean-imputed and standardised;
    * object / category / string columns are imputed with the most frequent
      value and one-hot encoded (categories unseen during fitting are ignored);
    * any other column (e.g. bool or datetime) is passed through unchanged.

    The preprocessor is fitted on the training split only and then applied to
    the test split. Progress and intermediate results are printed to stdout.

    Parameters
    ----------
    file_path : str, default 'data.csv'
        Path handed to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X_train_processed_df, X_test_processed_df, y_train, y_test)``.
        The two feature frames keep the row index of their split; their
        columns are ordered numeric, one-hot encoded, passed-through.

    Raises
    ------
    KeyError
        If the CSV has no ``purchased`` column.
    """
    df = pd.read_csv(file_path)

    print("--- Original Raw Dataset ---")
    print(df.head())
    print("\nMissing values before processing:")
    print(df.isnull().sum())

    X = df.drop('purchased', axis=1)
    # Vectorised form of "1 if value == 'Yes' else 0"; NaN compares unequal and
    # therefore becomes 0, exactly like the element-wise version.
    y = (df['purchased'] == 'Yes').astype('int64')

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    # 'category' and 'string' are listed next to 'object' so that text-like
    # columns stored in those dtypes are encoded instead of passed through raw.
    categorical_features = X.select_dtypes(
        include=['object', 'category', 'string']
    ).columns.tolist()

    # Columns claimed by neither transformer are emitted by the ColumnTransformer
    # (remainder='passthrough') after the transformed ones, in their original
    # order; their names are needed to label the output correctly.
    handled_features = set(numerical_features) | set(categorical_features)
    passthrough_features = [col for col in X.columns if col not in handled_features]

    print(f"\nNumerical features: {numerical_features}")
    print(f"Categorical features: {categorical_features}")

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough',
        # Always return a dense array: with the default threshold a mostly-zero
        # one-hot block yields a SciPy sparse matrix, which the plain
        # pd.DataFrame constructor below cannot consume.
        sparse_threshold=0
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"\nDataset split into {X_train.shape[0]} training and {X_test.shape[0]} testing samples.")

    # Fit on the training split only so no test-set statistics leak into the
    # imputers, the scaler or the encoder.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    all_feature_names = list(numerical_features)
    if categorical_features:
        # The 'cat' pipeline is only fitted when it was given columns, so the
        # encoder is queried only in that case.
        onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
        all_feature_names += list(onehot_encoder.get_feature_names_out(categorical_features))
    all_feature_names += passthrough_features

    X_train_processed_df = pd.DataFrame(X_train_processed, columns=all_feature_names, index=X_train.index)
    X_test_processed_df = pd.DataFrame(X_test_processed, columns=all_feature_names, index=X_test.index)

    print("\n--- Processed Training Features (first 5 rows) ---")
    print(X_train_processed_df.head())

    print("\n--- Processed Testing Features ---")
    print(X_test_processed_df)

    print("\n--- Shape of Processed Datasets ---")
    print(f"Processed Training Features Shape: {X_train_processed_df.shape}")
    print(f"Processed Testing Features Shape:  {X_test_processed_df.shape}")
    print(f"Training Target Shape:             {y_train.shape}")
    print(f"Testing Target Shape:              {y_test.shape}")

    return X_train_processed_df, X_test_processed_df, y_train, y_test

In [None]:
# Entry point: run the preprocessing on the default dataset when this code is
# executed as the main module.
if __name__ == "__main__":
    preprocess_data(file_path="data.csv")