# 02 — Data Cleaning & Preprocessing

Build a leakage-safe preprocessing pipeline.

In [None]:
# If running on Colab and you keep data on Drive, you can mount Drive:
# from google.colab import drive
# drive.mount('/content/drive')

import pandas as pd
from pathlib import Path

# Project root is taken to be one level above the notebook's working directory.
PROJECT_ROOT = Path("..").resolve()
data_full = PROJECT_ROOT / "data" / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
data_sample = PROJECT_ROOT / "data" / "sample_telco.csv"

# Prefer the full dataset and fall back to the sample file. Fail early and list
# both candidate paths when neither exists: otherwise read_csv would report only
# the sample path, hiding the fact that the full dataset was looked for first.
DATA_PATH = next((path for path in (data_full, data_sample) if path.exists()), None)
if DATA_PATH is None:
    raise FileNotFoundError(
        f"No input data found. Looked for:\n  {data_full}\n  {data_sample}"
    )
df = pd.read_csv(DATA_PATH)

print("Loaded:", DATA_PATH)
print("Shape:", df.shape)
df.head()


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Derive the cleaned frame from the raw one without mutating it: drop the
# identifier column when present, coerce TotalCharges to numeric (entries that
# cannot be parsed become NaN), and encode the Churn target as 0/1.
df_clean = df.drop(columns=["customerID"], errors="ignore").assign(
    TotalCharges=lambda frame: pd.to_numeric(frame["TotalCharges"], errors="coerce"),
    Churn=lambda frame: frame["Churn"].map({"No": 0, "Yes": 1}).astype(int),
)

# Feature engineering (simple, adds value + helps BI insights)
# Average spend per month of tenure. A tenure of 0 is turned into NaN first so
# the division yields NaN instead of an infinite value.
nonzero_tenure = df_clean["tenure"].replace(0, np.nan)
df_clean["avg_monthly_spend"] = df_clean["TotalCharges"].div(nonzero_tenure)

# Bucket tenure into labelled ranges. Bin edges are right-inclusive, so the
# leading -1 lets a tenure of 0 fall into the first bucket.
tenure_edges = [-1, 6, 12, 24, 48, 72]
tenure_labels = ["0-6", "7-12", "13-24", "25-48", "49-72"]
df_clean["tenure_bucket"] = pd.cut(
    df_clean["tenure"], bins=tenure_edges, labels=tenure_labels
)

# Separate the encoded target from the feature matrix.
y = df_clean["Churn"]
X = df_clean.drop(columns="Churn")

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Assign every feature column to exactly one preprocessing branch.
# Numeric columns are picked with the generic "number" selector rather than the
# exact dtypes int64/float64, and every remaining column is treated as
# categorical. Listing exact dtypes would leave a column of any other dtype
# (e.g. int32, float32, or a pandas string dtype) in neither list, and because
# the ColumnTransformer below uses remainder="drop", such a column would
# silently vanish from the model's features.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = [col for col in X.columns if col not in numeric_cols]

# Numeric branch: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" keeps transform from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# The preprocessor is only constructed here, not fitted; fitting is left to
# whoever consumes it.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Numeric cols:", len(numeric_cols), "Categorical cols:", len(categorical_cols))


In [None]:
import joblib
from pathlib import Path

# Output folder sits next to the notebook's working directory; create it on
# first use and reuse it afterwards.
OUT = Path("..", "outputs")
OUT.mkdir(exist_ok=True)

# Persist the train/test partitions and the (unfitted) preprocessor so later
# notebooks can load them instead of recomputing.
artifacts = {
    "data_splits.joblib": (X_train, X_test, y_train, y_test),
    "preprocessor.joblib": preprocessor,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, OUT / filename)

print("Saved splits and preprocessor to outputs/")
