In [None]:
"""
clean_data_pipeline.py
A functional pipeline demonstrating data-cleaning steps.
"""

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# ---------------------------------------------------------
# Data Creation
# ---------------------------------------------------------
def create_raw_data() -> pd.DataFrame:
    """Build the small, deliberately messy customer table used by the demo.

    The table has eight rows and seven columns and contains the kinds of
    problems the cleaning steps are meant to address: a repeated
    ``CustomerID``, missing names/ages/incomes/dates, two different date
    formats plus an impossible date ("2024-02-30"), inconsistent city
    spellings with stray whitespace, and very large ``Age``/``Income`` values.

    Returns:
        A new ``pd.DataFrame`` with the columns ``CustomerID``, ``Name``,
        ``Age``, ``SignupDate``, ``City``, ``Income`` and ``Purchased``.
    """
    customer_ids = [1, 2, 2, 3, 4, 5, 6, 7]
    names = ["Alice", "Bob", "bob", "Charlie", None, "Eve", "Frank", "Grace"]
    ages = [25, 30, np.nan, 40, 200, 22, 28, None]
    signup_dates = [
        "2024-01-01", "01/15/2024", "01/15/2024", "2024-02-20",
        "2024-02-30", "2024-03-05", None, "2024-03-15",
    ]
    cities = ["NY", "nyc", "NYC", "New York", "LA", "Los Angeles", "LA ", " SF"]
    incomes = [50000, 60000, 60000, 80000, 1200000, 45000, None, 70000]
    purchased = ["Yes", "No", "no", "Yes", "Yes", "No", "Yes", "Yes"]

    # Dict insertion order fixes the column order of the resulting frame.
    columns = {
        "CustomerID": customer_ids,
        "Name": names,
        "Age": ages,
        "SignupDate": signup_dates,
        "City": cities,
        "Income": incomes,
        "Purchased": purchased,
    }
    return pd.DataFrame(columns)

In [None]:
# ---------------------------------------------------------
# Cleaning Functions
# ---------------------------------------------------------
def handle_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in the ``Age``, ``Income`` and ``Name`` columns.

    ``Age`` is filled with the column median, ``Income`` with the column mean
    and ``Name`` with the literal ``"Unknown"``. The statistics are computed
    from the non-missing values of the frame that is passed in.

    Args:
        df: Frame containing ``Age``, ``Income`` and ``Name`` columns.

    Returns:
        A new frame with the three columns filled; ``df`` itself is left
        untouched.
    """
    filled = df.copy()
    # Assign the result back instead of calling ``fillna(..., inplace=True)``
    # on a selected column: that chained in-place form is deprecated and does
    # not update the frame when pandas Copy-on-Write is active.
    filled["Age"] = filled["Age"].fillna(filled["Age"].median())
    filled["Income"] = filled["Income"].fillna(filled["Income"].mean())
    filled["Name"] = filled["Name"].fillna("Unknown")
    return filled

In [None]:
def correct_types(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``SignupDate`` column to a datetime dtype.

    Each value is parsed on its own so that a column mixing formats (for
    example ``"2024-01-01"`` and ``"01/15/2024"``) keeps every valid date.
    Parsing the whole column in one call lets recent pandas versions infer a
    single format from the first value and, with ``errors="coerce"``, turn the
    valid dates written in any other format into ``NaT``.

    Values that cannot be parsed (such as the impossible ``"2024-02-30"``) and
    missing values become ``NaT``.

    Args:
        df: Frame containing a ``SignupDate`` column.

    Returns:
        A new frame whose ``SignupDate`` column is datetime typed; ``df``
        itself is left untouched.
    """
    typed = df.copy()
    # Element-wise parsing is slower than one vectorised call, but it is the
    # form that handles mixed formats consistently across pandas versions.
    parsed = [pd.to_datetime(value, errors="coerce") for value in typed["SignupDate"]]
    # A second pass normalises the list (Timestamp / NaT / None) into a proper
    # datetime column, aligned on the original index.
    typed["SignupDate"] = pd.Series(
        pd.to_datetime(parsed, errors="coerce"), index=typed.index
    )
    return typed

In [None]:
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without rows that exactly repeat an earlier row.

    Rows are compared across every column, and the first occurrence of each
    repeated row is the one that is kept. The original index labels of the
    surviving rows are preserved.
    """
    deduplicated = df.drop_duplicates(keep="first")
    return deduplicated

In [None]:
def standardize_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the text columns ``Name``, ``City`` and ``Purchased``.

    * ``Name`` is stripped of surrounding whitespace and title-cased.
    * ``City`` is stripped, lower-cased and known aliases are replaced with a
      canonical name; cities without an alias are left in their stripped,
      lower-cased form.
    * ``Purchased`` is stripped, lower-cased and mapped to ``1`` ("yes") or
      ``0`` ("no"); any other value becomes NaN.

    Whitespace is stripped from all three columns so that a padded value such
    as ``"Yes "`` is still recognised instead of silently turning into NaN.

    Args:
        df: Frame containing string-valued ``Name``, ``City`` and
            ``Purchased`` columns.

    Returns:
        A new frame with the three columns normalised; ``df`` itself is left
        untouched.
    """
    city_aliases = {
        "ny": "New York", "nyc": "New York", "new york": "New York",
        "la": "Los Angeles", "los angeles": "Los Angeles", "sf": "San Francisco"
    }
    purchased_codes = {"yes": 1, "no": 0}

    cleaned = df.copy()
    cleaned["Name"] = cleaned["Name"].str.strip().str.title()
    cleaned["City"] = cleaned["City"].str.strip().str.lower().replace(city_aliases)
    cleaned["Purchased"] = (
        cleaned["Purchased"].str.strip().str.lower().map(purchased_codes)
    )
    return cleaned

In [None]:
def handle_outliers(
    df: pd.DataFrame,
    max_age: float = 100,
    income_quantile: float = 0.99,
) -> pd.DataFrame:
    """Tame extreme values in the ``Age`` and ``Income`` columns.

    Ages above ``max_age`` are replaced with the median of the remaining
    (plausible) ages, so the implausible values do not influence their own
    replacement. Incomes above the ``income_quantile`` quantile of the column
    are capped at that quantile. Missing values are left as they are.

    If every non-missing age exceeds ``max_age`` there is no plausible median
    and those ages become NaN.

    Args:
        df: Frame containing numeric ``Age`` and ``Income`` columns.
        max_age: Largest age treated as plausible (default ``100``).
        income_quantile: Quantile in ``[0, 1]`` used as the income ceiling
            (default ``0.99``).

    Returns:
        A new frame with the adjusted columns; ``df`` itself is left
        untouched.
    """
    capped = df.copy()

    # Replace unreasonable ages with the median of the plausible ones.
    implausible_age = capped["Age"] > max_age
    typical_age = capped.loc[~implausible_age, "Age"].median()
    capped.loc[implausible_age, "Age"] = typical_age

    # Cap extreme incomes at the requested quantile; NaN compares False and
    # therefore passes through unchanged.
    upper_cap = capped["Income"].quantile(income_quantile)
    capped["Income"] = np.where(capped["Income"] > upper_cap, upper_cap, capped["Income"])
    return capped

In [None]:
def engineer_features(df: pd.DataFrame, reference_date="2025-09-28") -> pd.DataFrame:
    """Add a ``DaysSinceSignup`` column measured from a reference date.

    The new column holds the whole number of days between ``reference_date``
    and each row's ``SignupDate``; rows whose ``SignupDate`` is ``NaT`` get a
    missing value.

    Args:
        df: Frame whose ``SignupDate`` column is already datetime typed.
        reference_date: Anything ``pd.Timestamp`` accepts. Defaults to the
            fixed date ``"2025-09-28"`` so repeated runs give the same
            numbers; pass another date to measure from a different day.

    Returns:
        A new frame with the extra column; ``df`` itself is left untouched.
    """
    featured = df.copy()
    today = pd.Timestamp(reference_date)
    featured["DaysSinceSignup"] = (today - featured["SignupDate"]).dt.days
    return featured

In [None]:
# ---------------------------------------------------------
# Preprocessing Pipeline for Modeling
# ---------------------------------------------------------
def build_preprocessor(numeric_features, categorical_features) -> ColumnTransformer:
    """Assemble the column-wise preprocessing step used before modeling.

    Args:
        numeric_features: Columns routed through a ``StandardScaler``.
        categorical_features: Columns routed through a ``OneHotEncoder``
            configured with ``handle_unknown="ignore"``.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``"num"`` and a ``"cat"``
        branch, each wrapping its transformer in a single-step ``Pipeline``.
    """
    scaling_steps = [("scaler", StandardScaler())]
    encoding_steps = [("onehot", OneHotEncoder(handle_unknown="ignore"))]

    branches = [
        ("num", Pipeline(scaling_steps), numeric_features),
        ("cat", Pipeline(encoding_steps), categorical_features),
    ]
    return ColumnTransformer(branches)

In [None]:
# ---------------------------------------------------------
# Main Cleaning Orchestrator
# ---------------------------------------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run every cleaning stage on ``df`` and return the cleaned frame.

    Stages, in order: fill missing values, parse dates, normalise strings,
    drop exact duplicate rows, tame outliers, add engineered features.

    Strings are normalised *before* duplicates are dropped so that rows which
    differ only in letter case or stray whitespace compare equal and can be
    removed.

    Args:
        df: Raw frame with the columns the individual stages expect.

    Returns:
        A new, cleaned frame. The pipeline starts from a copy, so the frame
        passed in is never modified no matter how a stage is implemented;
        this keeps the raw data available for inspection after cleaning.
    """
    return (
        df.copy()
        .pipe(handle_missing)
        .pipe(correct_types)
        .pipe(standardize_strings)
        .pipe(remove_duplicates)
        .pipe(handle_outliers)
        .pipe(engineer_features)
    )

In [None]:
# ---------------------------------------------------------
# Demo Execution
# ---------------------------------------------------------
def main():
    """Run the end-to-end demo: build the raw data, clean it, preprocess it.

    Prints the raw frame, the cleaned frame and the shape of the transformed
    training matrix. Returns nothing.
    """
    raw_customers = create_raw_data()
    print("\n==== RAW DATA ====")
    print(raw_customers)

    cleaned_customers = clean_data(raw_customers)
    print("\n==== CLEANED DATA ====")
    print(cleaned_customers)

    # Example: prepping for modeling. The target and the identifier-like
    # columns are removed from the feature matrix.
    target_column = "Purchased"
    non_feature_columns = [target_column, "CustomerID", "Name", "SignupDate"]
    features = cleaned_customers.drop(columns=non_feature_columns)
    target = cleaned_customers[target_column]

    preprocessor = build_preprocessor(
        numeric_features=["Age", "Income", "DaysSinceSignup"],
        categorical_features=["City"],
    )

    # Only the training features are needed for this demonstration; the
    # preprocessor is fitted on them alone.
    split_parts = train_test_split(features, target, test_size=0.2, random_state=42)
    features_train = split_parts[0]
    features_train_ready = preprocessor.fit_transform(features_train)

    print("\nTransformed training set shape:", features_train_ready.shape)

In [None]:
# Entry point: run the demo only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()