In [1]:
import pandas as pd
import numpy as np

from pipeline_data_preprocessing import DataPreprocessingPipeline

In [2]:
# Build a reproducible synthetic customer table, then deliberately corrupt it
# (missing values + inflated incomes) so the preprocessing pipeline has
# something to clean. The result is written to synthetic_customer_data.csv.
np.random.seed(42)  # seeds the global NumPy RNG used by every draw below
n = 200

MISSING_FRAC = 0.1   # share of rows blanked out in each corrupted column
OUTLIER_FRAC = 0.05  # share of rows whose income gets inflated
OUTLIER_FACTOR = 5   # multiplier applied to the inflated incomes

# NOTE: the order of the np.random calls determines the generated values;
# keep the dict entries in this order to reproduce the same dataset.
data = {
    'customer_id': [f"CUST{i:04d}" for i in range(n)],
    # Truncate to whole years, then store as float: NaN is injected into this
    # column below, and writing NaN into an integer column relies on silent
    # upcasting that recent pandas versions deprecate.
    'age': np.random.normal(35, 10, n).astype(int).astype(float),
    'income': np.random.normal(50000, 15000, n),
    'purchase_amount': np.random.exponential(300, n),
    'gender': np.random.choice(['Male', 'Female'], n),
    'membership': np.random.choice(['Silver', 'Gold', 'Platinum'], n),
    'signup_date': pd.date_range('2022-01-01', periods=n, freq='D'),
    'churned': np.random.choice([0, 1], n)
}

df = pd.DataFrame(data)

# Blank out a random subset of rows in each of these columns; each column gets
# its own independently sampled subset.
for col in ['age', 'income', 'gender']:
    df.loc[df.sample(frac=MISSING_FRAC).index, col] = np.nan

# Inflate a random subset of incomes to create outliers (rows that are already
# NaN stay NaN after the multiplication).
df.loc[df.sample(frac=OUTLIER_FRAC).index, 'income'] *= OUTLIER_FACTOR

df.to_csv("synthetic_customer_data.csv", index=False)

In [3]:
# Point the pipeline at the CSV written by the data-generation cell and show
# the per-column missing-value summary (count, percent, dtype) as the cell output.
csv_path = "synthetic_customer_data.csv"
dpp = DataPreprocessingPipeline(csv_path)
dpp.missing_data_summary()

  pd.to_datetime(df[col])
  pd.to_datetime(df[col])


Unnamed: 0,Total,Percent,Types
age,20,10.0,float64
income,20,10.0,float64
purchase_amount,0,0.0,float64
gender,20,10.0,object
membership,0,0.0,object
churned,0,0.0,int64
signup_date_year,0,0.0,int32
signup_date_month,0,0.0,int32
signup_date_day,0,0.0,int32
signup_date_dayofweek,0,0.0,int32


In [4]:
# Fill in missing values. The report captured at the end of this notebook logs
# this step as 'Imputed missing values using mean'.
# NOTE(review): that same report still counts 20 missing values after
# processing, and the final frame shows an empty `gender` entry, so the
# categorical gaps appear to be left untouched — confirm whether that is intended.
dpp.imputation_strategy()

In [5]:
# Treat outliers with the 'iqr' method (logged in the report as
# 'Handled outliers using iqr'). The recorded shapes keep 200 rows before and
# after, so outlying rows are evidently not dropped; the exact treatment is
# defined inside the pipeline — TODO confirm (clipping vs. replacement).
dpp.handle_outlier('iqr')

In [6]:
# Scale the features with the 'standard' method (logged in the report as
# 'Scaled features using standard').
# NOTE(review): in the final frame shown below, `churned` takes the values
# 1.083473 / -0.922958, i.e. the 0/1 column was scaled along with everything
# else. If `churned` is meant to be the prediction target, it should probably
# be excluded from scaling — verify against the pipeline.
dpp.handle_scaling('standard')

In [7]:
# Run the pipeline's feature selection. In the recorded report this step
# removed 'signup_date_year' (every generated signup date falls in 2022, so the
# column is presumably dropped as constant — confirm the actual criterion) and
# left 'feature_importances' empty.
dpp.feature_selection()

In [8]:
# Display the processed frame held by the pipeline (cell output). The recorded
# report gives its final shape as (200, 9).
dpp.data

Unnamed: 0,age,income,purchase_amount,churned,signup_date_month,signup_date_day,signup_date_dayofweek,gender,membership
0,0.007538,0.048940,-0.401530,1.083473,-1.477564,-1.628958,0.993808,Female,Gold
1,-0.094906,0.223402,-0.691393,-0.922958,-1.477564,-1.513183,1.490712,Male,Platinum
2,0.832894,0.672254,-1.041406,1.083473,-1.477564,-1.397407,-1.490712,,Gold
3,1.876668,0.647116,1.559835,1.083473,-1.477564,-1.281632,-0.993808,Male,Platinum
4,-0.210881,0.616539,1.125524,1.083473,-1.477564,-1.165857,-0.496904,Female,Gold
...,...,...,...,...,...,...,...,...,...
195,0.484969,-0.661776,-0.851336,1.083473,1.644050,-0.008104,0.496904,Female,Silver
196,-0.906731,-1.730872,-1.026965,1.083473,1.644050,0.107671,0.993808,Male,Platinum
197,0.253019,2.652473,0.690657,1.083473,1.644050,0.223446,1.490712,Female,Gold
198,0.137044,-0.356992,0.025125,-0.922958,1.644050,0.339221,-1.490712,Male,Silver


In [9]:
# Show the pipeline's run report: original/final shapes, missing-value counts
# before and after, detected column types, the list of applied transformations,
# removed features, and a timestamp (see the dict printed below).
dpp.get_report()

{'original_shape': (200, 8),
 'missing_values_before': 60,
 'column_types': {'numeric': ['age',
   'income',
   'purchase_amount',
   'churned',
   'signup_date_year',
   'signup_date_month',
   'signup_date_day',
   'signup_date_dayofweek'],
  'categorical': ['gender', 'membership'],
  'datetime': ['signup_date']},
 'transformations': ["Dropped likely ID columns: ['customer_id']",
  'Processed datetime column: signup_date',
  'Imputed missing values using mean',
  'Handled outliers using iqr',
  'Scaled features using standard',
  'Performed feature selection'],
 'removed_features': ['customer_id', 'signup_date_year'],
 'feature_importances': {},
 'final_shape': (200, 9),
 'missing_values_after': 20,
 'report_timestamp': '2025-05-08 17:01:39'}