In [1]:
import pandas as pd
import numpy as np

from pipeline_data_preprocessing import DataPreprocessingPipeline

In [2]:
# Build a reproducible synthetic customer table, inject missing values and
# income outliers, then write it to disk for the preprocessing pipeline.
np.random.seed(42)  # seeds NumPy's global RNG used by every np.random.* call below
n = 200

data = {
    'customer_id': [f"CUST{i:04d}" for i in range(n)],
    # Truncate to whole years, then store as float: NaN is written into this
    # column below, and an int64 column cannot hold NaN. Relying on the silent
    # int -> float upcast during `.loc` assignment is deprecated in
    # pandas >= 2.1, so the cast is made explicit here.
    'age': np.random.normal(35, 10, n).astype(int).astype(float),
    'income': np.random.normal(50000, 15000, n),
    'purchase_amount': np.random.exponential(300, n),
    'gender': np.random.choice(['Male', 'Female'], n),
    'membership': np.random.choice(['Silver', 'Gold', 'Platinum'], n),
    'signup_date': pd.date_range('2022-01-01', periods=n, freq='D'),
    'churned': np.random.choice([0, 1], n)
}

df = pd.DataFrame(data)

# Blank out a random 10% of rows in each of these columns (rows are drawn
# independently per column).
for col in ['age', 'income', 'gender']:
    df.loc[df.sample(frac=0.1).index, col] = np.nan

# Inflate a random 5% of incomes fivefold to create outliers; rows whose
# income is already NaN stay NaN.
df.loc[df.sample(frac=0.05).index, 'income'] *= 5

# The file name must match the path the pipeline is constructed with.
df.to_csv("synthetic_customer_data.csv", index=False)

In [3]:
# Point the pipeline at the CSV written by the data-generation cell, then
# show its per-column missing-value table (rendered as the cell's output).
csv_path = "synthetic_customer_data.csv"
dpp = DataPreprocessingPipeline(csv_path)
dpp.missing_data_summary()

  pd.to_datetime(df[col])
  pd.to_datetime(df[col])


Unnamed: 0,Total,Percent,Types
age,20,10.0,float64
income,20,10.0,float64
purchase_amount,0,0.0,float64
gender,20,10.0,object
membership,0,0.0,object
churned,0,0.0,int64
signup_date_year,0,0.0,int32
signup_date_month,0,0.0,int32
signup_date_day,0,0.0,int32
signup_date_dayofweek,0,0.0,int32


In [4]:
# Run the pipeline's imputation step with its default arguments.
# NOTE(review): presumably this fills the missing values in age / income /
# gender; the strategy used is not visible from this notebook — confirm.
dpp.imputation_strategy()

In [5]:
# Run the pipeline's outlier step, selecting the 'iqr' method.
# NOTE(review): presumably an interquartile-range rule; whether outliers are
# clipped or dropped is not visible from this notebook — confirm.
dpp.handle_outlier('iqr')

In [6]:
# Run the pipeline's scaling step, selecting the 'standard' method.
# NOTE(review): presumably z-score standardisation of the numeric columns —
# confirm which columns are affected.
dpp.handle_scaling('standard')

In [7]:
# Run the pipeline's feature-selection step with its default arguments.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: could not convert string to float: 'Female'", i.e. a
# string-valued gender entry reached a numeric conversion. Presumably the
# categorical columns ('gender', 'membership') must be encoded before this
# step — confirm which pipeline method (if any) does that and call it first.
dpp.feature_selection()

ValueError: could not convert string to float: 'Female'

In [None]:
# Display the pipeline's `data` attribute as the cell output.
# NOTE(review): presumably the processed DataFrame — confirm. This cell has
# no execution count in the saved notebook, so it was not run.
dpp.data

In [None]:
# Ask the pipeline for its report and display whatever it returns.
# NOTE(review): the report's contents are not visible here; this cell has no
# execution count in the saved notebook, so it was not run.
dpp.get_report()