In [1]:
import pandas as pd
import numpy as np

from pipeline_data_preprocessing import DataPreprocessor

In [2]:
# Seed NumPy's legacy global RNG so the synthetic values are the same on every run.
np.random.seed(42)

# Synthetic customer table, one entry per column.
# The random draws are evaluated top to bottom; keep this order to keep the values stable.
n_samples = 1000
sample_data = dict(
    customer_id=range(1, n_samples + 1),
    age=np.random.normal(35, 10, n_samples),
    income=np.random.normal(50000, 15000, n_samples),
    score=np.random.uniform(0, 100, n_samples),
    category=np.random.choice(['A', 'B', 'C', 'D'], n_samples),
    city=np.random.choice(['New York', 'London', 'Tokyo', 'Sydney', 'Paris'], n_samples),
    date_joined=pd.date_range('2020-01-01', periods=n_samples, freq='D'),
    target=np.random.choice([0, 1], n_samples),
)
sample_df = pd.DataFrame(sample_data)

# Deliberately damage a few row ranges: (first label, last label, column, value).
# .loc label slices include both endpoints, so 50..60 covers 11 rows.
injected_defects = [
    (50, 60, 'income', np.nan),
    (100, 110, 'age', np.nan),
    (200, 205, 'category', np.nan),
    (20, 25, 'income', 200000),  # outliers
]
for first_label, last_label, column, value in injected_defects:
    sample_df.loc[first_label:last_label, column] = value

print("=== Sample Dataset ===")
sample_df

=== Sample Dataset ===


Unnamed: 0,customer_id,age,income,score,category,city,date_joined,target
0,1,39.967142,70990.331549,40.710649,C,Paris,2020-01-01,0
1,2,33.617357,63869.505244,6.600984,C,Tokyo,2020-01-02,0
2,3,41.476885,50894.455549,34.882053,D,New York,2020-01-03,0
3,4,50.230299,40295.948334,11.099810,B,Tokyo,2020-01-04,0
4,5,32.658466,60473.349704,80.823521,D,Paris,2020-01-05,1
...,...,...,...,...,...,...,...,...
995,996,32.188997,66052.253575,34.264515,B,Tokyo,2022-09-22,1
996,997,52.976865,49602.181111,86.465892,D,Paris,2022-09-23,1
997,998,41.408429,36771.880232,15.492531,D,London,2022-09-24,0
998,999,29.288210,47553.995541,8.273684,A,New York,2022-09-25,1


In [9]:
# Sanity-check the synthetic frame: dimensions, per-column NaN counts, then a short preview.
frame_shape = sample_df.shape
missing_per_column = sample_df.isna().sum()
print(f"Shape: {frame_shape}")
print(f"Missing values:\n{missing_per_column}")
sample_df.head()

Shape: (1000, 8)
Missing values:
customer_id     0
age            11
income         11
score           0
category        6
city            0
date_joined     0
target          0
dtype: int64


Unnamed: 0,customer_id,age,income,score,category,city,date_joined,target
0,1,39.967142,70990.331549,40.710649,C,Paris,2020-01-01,0
1,2,33.617357,63869.505244,6.600984,C,Tokyo,2020-01-02,0
2,3,41.476885,50894.455549,34.882053,D,New York,2020-01-03,0
3,4,50.230299,40295.948334,11.09981,B,Tokyo,2020-01-04,0
4,5,32.658466,60473.349704,80.823521,D,Paris,2020-01-05,1


In [4]:
# Example 1: one-call pipeline (quick_preprocess) with a single target column.
# A copy of sample_df is handed to the preprocessor.
# NOTE(review): the recorded log for this cell reports "Numeric features: 0" and lists
# age/income/score among the removed ID columns - confirm that is the intended behaviour.

rule = "=" * 50
print("\n" + rule)
print("EXAMPLE 1: Quick preprocessing with target column")
print(rule)

preprocessor1 = DataPreprocessor(
    sample_df.copy(),
    target_column='target',
)
processed_data1 = preprocessor1.quick_preprocess()

final_columns = list(processed_data1.columns)
print(f"\nProcessed shape: {processed_data1.shape}")
print(f"Final columns: {final_columns}")


EXAMPLE 1: Quick preprocessing with target column
[19:04:58] Data loaded from DataFrame
[19:04:58] Dataset shape: (1000, 8)
[19:04:58] Numeric features: 0
[19:04:58] Categorical features: 2
[19:04:58] Target column(s): ['target']
[19:04:58] Missing values: 28
[19:04:58] Starting quick preprocessing pipeline...
[19:04:58] Removed ID columns: ['customer_id', 'age', 'income', 'score', 'date_joined']
[19:04:58] Shape changed from (1000, 8) to (1000, 3)
[19:04:58] Applied most_frequent imputation to categorical features
[19:04:58] No numeric features found for outlier handling
[19:04:58] Encoded categorical features using auto
[19:04:58] No numeric features found for scaling
[19:04:58] Quick preprocessing completed!

Processed shape: (1000, 8)
Final columns: ['target', 'category_B', 'category_C', 'category_D', 'city_New York', 'city_Paris', 'city_Sydney', 'city_Tokyo']


  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',


In [None]:
# Example 2: the same kind of pipeline, but with every stage called explicitly.
# Each stage is chained off the previous call's return value; get_processed_data()
# ends the chain and yields the object whose .shape / .isna() are reported below.

rule = "=" * 50
print("\n" + rule)
print("EXAMPLE 2: Step-by-step preprocessing")
print(rule)

preprocessor2 = DataPreprocessor(
    sample_df.copy(),
    target_column='target',
)
processed_data2 = (
    preprocessor2
    .clean_data()
    .handle_missing_values('median')
    .handle_outliers('iqr')
    .encode_categorical('onehot')
    .scale_features('standard')
    .get_processed_data()
)

remaining_missing = processed_data2.isna().sum().sum()
print(f"\nProcessed shape: {processed_data2.shape}")
print(f"Missing values after processing: {remaining_missing}")


EXAMPLE 2: Step-by-step preprocessing
[19:04:58] Data loaded from DataFrame
[19:04:58] Dataset shape: (1000, 8)
[19:04:58] Numeric features: 0
[19:04:58] Categorical features: 2
[19:04:58] Target column(s): ['target']
[19:04:58] Missing values: 28
[19:04:58] Removed ID columns: ['customer_id', 'age', 'income', 'score', 'date_joined']
[19:04:58] Shape changed from (1000, 8) to (1000, 3)
[19:04:58] Applied most_frequent imputation to categorical features
[19:04:58] No numeric features found for outlier handling
[19:04:58] Encoded categorical features using onehot
[19:04:58] No numeric features found for scaling

Processed shape: (1000, 8)
Missing values after processing: 0


  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',


In [6]:
# Example 3: pass target_column="all" and run the one-call pipeline.
# NOTE(review): the recorded log for this cell lists every column under "Target column(s)"
# and reports no numeric or categorical features, so "all" may mean "treat all columns
# as targets" rather than "process all columns" - confirm against DataPreprocessor.

rule = "=" * 50
print("\n" + rule)
print("EXAMPLE 3: Process all columns (no target exclusion)")
print(rule)

preprocessor3 = DataPreprocessor(
    sample_df.copy(),
    target_column="all",
)
processed_data3 = preprocessor3.quick_preprocess()

print(f"\nProcessed shape: {processed_data3.shape}")



EXAMPLE 3: Process all columns (no target exclusion)
[19:04:58] Data loaded from DataFrame
[19:04:58] Dataset shape: (1000, 8)
[19:04:58] Numeric features: 0
[19:04:58] Categorical features: 0
[19:04:58] Target column(s): ['customer_id', 'age', 'income', 'score', 'category', 'city', 'date_joined', 'target']
[19:04:58] Missing values: 28
[19:04:58] Starting quick preprocessing pipeline...
[19:04:58] No numeric features found for outlier handling
[19:04:58] No categorical features found
[19:04:58] No numeric features found for scaling
[19:04:58] Quick preprocessing completed!

Processed shape: (1000, 8)


In [None]:
# Example 4: two target columns plus a hand-picked strategy for every stage.
# NOTE(review): the recorded log for this cell shows clean_data() removing 'age' and
# 'income' as ID columns before imputation runs, so the per-column strategies for those
# two columns may have nothing to act on - confirm.

rule = "=" * 50
print("\n" + rule)
print("EXAMPLE 4: Custom preprocessing strategies")
print(rule)

# Imputation strategy chosen per column name.
imputation_by_column = {'age': 'median', 'income': 'mean', 'category': 'most_frequent'}

preprocessor4 = DataPreprocessor(
    sample_df.copy(),
    target_column=['target', 'score'],
)
processed_data4 = (
    preprocessor4
    .clean_data(remove_id_columns=True)
    .handle_missing_values(imputation_by_column)
    .handle_outliers('isolation_forest', contamination=0.05)
    .encode_categorical('auto', max_categories=5)
    .scale_features('robust')
    .remove_low_variance(threshold=0.01)
    .get_processed_data()
)

print(f"\nProcessed shape: {processed_data4.shape}")


EXAMPLE 4: Custom preprocessing strategies
[19:04:58] Data loaded from DataFrame
[19:04:58] Dataset shape: (1000, 8)
[19:04:58] Numeric features: 0
[19:04:58] Categorical features: 2
[19:04:58] Target column(s): ['target', 'score']
[19:04:58] Missing values: 28
[19:04:58] Removed ID columns: ['customer_id', 'age', 'income', 'date_joined']
[19:04:58] Shape changed from (1000, 8) to (1000, 4)
[19:04:58] Applied custom imputation strategies: {'age': 'median', 'income': 'mean', 'category': 'most_frequent'}
[19:04:58] No numeric features found for outlier handling
[19:04:58] Encoded categorical features using auto
[19:04:58] No numeric features found for scaling

Processed shape: (1000, 9)


  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',
  pd.to_datetime(sample, errors='raise',


In [8]:
# Show the summary of the Example 4 pipeline.
# Requires `preprocessor4` to already exist in the kernel (it is created in the Example 4 cell).
print("\n" + "="*50)
print("PREPROCESSING SUMMARY")
print("="*50)

# get_summary() is indexed by key below, so it is expected to return a mapping with
# at least the six keys read in this cell.
summary = preprocessor4.get_summary()
print(f"Original shape: {summary['original_shape']}")
print(f"Final shape: {summary['final_shape']}")
print(f"Processing steps: {summary['processing_steps']}")
print(f"Numeric features: {summary['numeric_features']}")
print(f"Categorical features: {summary['categorical_features']}")

# Plain string literal: nothing is interpolated here, so the f-prefix was dropped.
print("\nProcessing log (last 5 steps):")
# A [-5:] slice returns the whole log when it has fewer than five entries.
for step in summary['processing_log'][-5:]:
    print(f"  {step}")


PREPROCESSING SUMMARY
Original shape: (1000, 8)
Final shape: (1000, 9)
Processing steps: 12
Numeric features: 0
Categorical features: 7

Processing log (last 5 steps):
  [19:04:58] Shape changed from (1000, 8) to (1000, 4)
  [19:04:58] Applied custom imputation strategies: {'age': 'median', 'income': 'mean', 'category': 'most_frequent'}
  [19:04:58] No numeric features found for outlier handling
  [19:04:58] Encoded categorical features using auto
  [19:04:58] No numeric features found for scaling
