In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Custom LabelEncoder Transformer
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Column-wise label encoder usable as a step in a scikit-learn Pipeline.

    One ``LabelEncoder`` is fitted per input column and stored, in column
    order, in ``self.encoders``. ``transform`` replaces every value with the
    integer code assigned by the encoder of its column.
    """

    def __init__(self):
        # Filled by fit(): one fitted LabelEncoder per input column.
        self.encoders = []

    @staticmethod
    def _to_2d_array(X):
        """Return ``X`` as a 2-D NumPy array, rejecting any other dimensionality."""
        columns = np.asarray(X)  # also accepts DataFrames and nested lists
        if columns.ndim != 2:
            raise ValueError(
                "Expected 2D input of shape (n_samples, n_columns), "
                f"got {columns.ndim}D input."
            )
        return columns

    def fit(self, X, y=None):
        """Fit one LabelEncoder per column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Categorical data; each column is encoded independently.
        y : ignored
            Present only for Pipeline compatibility.

        Returns
        -------
        self
        """
        columns = self._to_2d_array(X)
        self.encoders = [
            LabelEncoder().fit(columns[:, idx]) for idx in range(columns.shape[1])
        ]
        return self

    def transform(self, X):
        """Encode every column of ``X`` with the encoder fitted for that column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Must have the same number of columns as the data passed to ``fit``.

        Returns
        -------
        numpy.ndarray of int64, shape (n_samples, n_columns)
            Integer label codes. ``X`` itself is never modified.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if its column count differs from the number of
            fitted encoders (including the not-yet-fitted case), or - raised by
            ``LabelEncoder.transform`` - if a value was not seen during ``fit``.
        """
        columns = self._to_2d_array(X)
        if columns.shape[1] != len(self.encoders):
            raise ValueError(
                f"X has {columns.shape[1]} column(s) but {len(self.encoders)} "
                "encoder(s) were fitted; call fit() on data with the same columns first."
            )

        # Write the codes into a dedicated integer array. Assigning them back
        # into a copy of X would cast them to X's dtype (e.g. turn 1 into '1'
        # for a string array) or leave the result as an object array.
        encoded = np.empty(columns.shape, dtype=np.int64)
        for idx, encoder in enumerate(self.encoders):
            encoded[:, idx] = encoder.transform(columns[:, idx])
        return encoded

def load_data(path):
    """Read the CSV located at ``path`` into a DataFrame using ``pd.read_csv`` defaults."""
    frame = pd.read_csv(path)
    return frame

def build_pipeline(numerical, categorical, scaling_method="standard", imputer_type="knn"):
    """Assemble the (unfitted) ColumnTransformer used to preprocess a dataset.

    Parameters
    ----------
    numerical : list
        Columns routed to the numeric branch (imputer followed by scaler).
        The branch is omitted when the list is empty.
    categorical : list
        Columns routed to the categorical branch (most-frequent imputer
        followed by ``LabelEncoderTransformer``). Omitted when empty.
    scaling_method : str
        ``"standard"`` selects ``StandardScaler``; any other value selects
        ``MinMaxScaler``.
    imputer_type : str
        ``"knn"`` selects ``KNNImputer``; any other value selects
        ``SimpleImputer(strategy='mean')``.

    Returns
    -------
    ColumnTransformer
        Transformer whose ``'num'`` branch (if any) precedes the ``'cat'``
        branch (if any).
    """
    if scaling_method == "standard":
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    branches = []

    # Numeric branch: fill missing values, then scale.
    if numerical:
        if imputer_type == "knn":
            fill_numeric = KNNImputer()
        else:
            fill_numeric = SimpleImputer(strategy='mean')
        numeric_steps = [('imputer', fill_numeric), ('scaler', scaler)]
        branches.append(('num', Pipeline(numeric_steps), numerical))

    # Categorical branch: fill missing values with the mode, then label-encode.
    if categorical:
        categorical_steps = [
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', LabelEncoderTransformer()),
        ]
        branches.append(('cat', Pipeline(categorical_steps), categorical))

    return ColumnTransformer(branches)

def run_preprocessing(csv_path, output_path="./Processed_datasets/mall_customer_segmentation/processed_data.csv", scaling_method="standard", imputer_type="knn"):
    """Load a CSV, impute/scale/encode its columns, save the result and return it.

    Steps: load the file, drop unusable columns, split the remaining columns
    into numerical (int64/float64) and categorical (object/category), fit and
    apply the pipeline from ``build_pipeline``, then write the result as CSV.
    Columns of any other dtype are not passed to either branch.

    Note: a new pipeline is built and fitted on every call, so the imputers,
    scaler and encoders are always learned from the file being processed.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the input CSV.
    output_path : str or path-like
        Destination CSV. Missing parent directories are created.
    scaling_method : str
        Forwarded to ``build_pipeline``.
    imputer_type : str
        Forwarded to ``build_pipeline``.

    Returns
    -------
    pandas.DataFrame
        Processed data: numerical columns first (float64), then categorical
        columns (int64 label codes), keeping the original column names.

    Raises
    ------
    ValueError
        If the dataset has no rows or no usable columns.
    """
    df = load_data(csv_path)
    print(f"✅ Loaded dataset with shape: {df.shape}")

    # Columns with no values at all are discarded by the imputers, which would
    # leave the transformed array narrower than the list of column names below.
    df = df.dropna(axis="columns", how="all")

    if df.empty:
        raise ValueError(
            f"Dataset loaded from {csv_path!r} has no rows or columns to preprocess."
        )

    # Drop high-uniqueness columns (likely ID columns). Float columns are kept
    # even when every value is distinct: that is normal for continuous
    # measurements and does not indicate an identifier.
    unique_ratio = df.nunique() / len(df)
    is_float = df.dtypes.map(pd.api.types.is_float_dtype)
    df = df.loc[:, (unique_ratio < 1) | is_float]

    # Identify numerical and categorical columns
    numerical = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"Numerical columns: {numerical}")
    print(f"Categorical columns: {categorical}")

    # Build preprocessing pipeline
    pipeline = build_pipeline(numerical, categorical, scaling_method, imputer_type)
    processed = pipeline.fit_transform(df)

    # Reconstruct column names: the transformer emits the 'num' branch first,
    # then 'cat', and label encoding keeps one output column per input column.
    feature_names = numerical + categorical
    processed_df = pd.DataFrame(processed, columns=feature_names)

    # Mixing scaled floats with label codes can yield an object (or all-float)
    # array; restore explicit numeric dtypes so downstream code gets real numbers.
    processed_df = processed_df.astype(
        {**dict.fromkeys(numerical, "float64"), **dict.fromkeys(categorical, "int64")}
    )

    # Save processed data, creating the target directory when needed.
    # Non-path targets (e.g. None or a buffer) are handed to to_csv untouched.
    if isinstance(output_path, (str, os.PathLike)):
        output_dir = os.path.dirname(os.fspath(output_path))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
    processed_df.to_csv(output_path, index=False)
    print(f"📁 Processed data saved to: {output_path}")

    return processed_df


In [6]:
# Preview the raw mall-customers file. Forward slashes work on every OS and
# avoid the invalid "\D" / "\m" / "\M" escape sequences of the backslash form.
pd.read_csv("../Datasets/mall_customer_segmentation/Mall_Customers.csv").head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [15]:
# Preprocess the mall-customers file (KNN imputation + standard scaling) and
# display the returned frame as the cell output.
run_preprocessing(
    "../Datasets/mall_customer_segmentation/Mall_Customers.csv",
    output_path="../Processed_datasets/processed_data.csv",
    imputer_type="knn",
    scaling_method="standard",
)

✅ Loaded dataset with shape: (200, 5)
Numerical columns: ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
Categorical columns: ['Gender']
📁 Processed data saved to: ../Processed_datasets/processed_data.csv


Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender
0,-1.424569,-1.738999,-0.434801,1
1,-1.281035,-1.738999,1.195704,1
2,-1.352802,-1.70083,-1.715913,0
3,-1.137502,-1.70083,1.040418,0
4,-0.563369,-1.66266,-0.39598,0
...,...,...,...,...
195,-0.276302,2.268791,1.118061,0
196,0.441365,2.497807,-0.861839,0
197,-0.491602,2.497807,0.923953,1
198,-0.491602,2.917671,-1.250054,1


In [16]:
# Preprocess the automobile-segmentation training split (KNN imputation +
# standard scaling) and display the returned frame as the cell output.
run_preprocessing(
    "../Datasets/utomobile_customer_segmentation/Train.csv",
    output_path="../Processed_datasets//utomobile_customer_segmentation/train_processed_data.csv",
    imputer_type="knn",
    scaling_method="standard",
)

✅ Loaded dataset with shape: (8068, 11)
Numerical columns: ['Age', 'Work_Experience', 'Family_Size']
Categorical columns: ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1', 'Segmentation']
📁 Processed data saved to: ../Processed_datasets//utomobile_customer_segmentation/train_processed_data.csv


Unnamed: 0,Age,Work_Experience,Family_Size,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1,Segmentation
0,-1.284623,-0.499892,0.757947,1,0,0,5,2,3,3
1,-0.327151,0.537954,0.096095,0,1,1,2,0,3,0
2,1.408268,-0.499892,-1.22761,0,1,1,2,2,5,1
3,1.408268,-0.805141,-0.565758,1,1,1,7,1,5,1
4,-0.207467,1.4537,2.081652,0,1,1,3,1,5,0
...,...,...,...,...,...,...,...,...,...,...
8063,-1.284623,-0.805141,2.743505,1,0,0,0,2,0,3
8064,-0.506677,0.110605,0.757947,1,0,0,4,2,3,3
8065,-0.626361,-0.499892,-1.22761,0,0,1,5,2,5,3
8066,-0.985413,-0.499892,0.757947,0,0,1,5,2,5,1


In [17]:
# Preprocess the automobile-segmentation test split with the same options as
# the training split and display the returned frame.
# NOTE(review): run_preprocessing builds and fits a new pipeline on whatever
# file it receives, so this split is imputed/scaled/encoded with statistics
# learned from Test.csv itself rather than from Train.csv - confirm that is
# intended before comparing the two processed files.
run_preprocessing(
    "../Datasets/utomobile_customer_segmentation/Test.csv",
    output_path="../Processed_datasets//utomobile_customer_segmentation/test_processed_data.csv",
    imputer_type="knn",
    scaling_method="standard",
)

✅ Loaded dataset with shape: (2627, 10)
Numerical columns: ['Age', 'Work_Experience', 'Family_Size']
Categorical columns: ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
📁 Processed data saved to: ../Processed_datasets//utomobile_customer_segmentation/test_processed_data.csv


Unnamed: 0,Age,Work_Experience,Family_Size,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1
0,-0.450948,-0.792867,-1.196222,0,1,1,2,2,5
1,-0.391999,1.696957,0.767498,1,1,1,5,0,5
2,1.494372,-0.792867,-1.196222,0,1,0,0,2,5
3,0.904881,2.630641,-0.541649,1,1,0,4,1,5
4,-1.453083,-0.04592,0.767498,0,0,0,8,2,5
...,...,...,...,...,...,...,...,...,...
2622,-0.863592,2.008185,0.767498,1,0,0,5,2,5
2623,-0.509897,-0.481639,-1.196222,0,0,1,1,2,5
2624,0.551187,0.51429,-0.541649,0,0,1,3,2,5
2625,0.197492,-0.481639,1.422071,1,1,1,4,1,3
