In [36]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


In [38]:
# Step 1: Load and clean base data
# ---------------------------------
# Read the raw CSV and clean it in a single chain (same order of operations as
# a step-by-step in-place version):
#   1. drop the customerID column
#   2. turn TotalCharges into numbers (single-space strings and anything else
#      unparseable become NaN)
#   3. relabel SeniorCitizen 1/0 as "Yes"/"No"
#   4. drop rows containing any NaN, then drop exact duplicate rows
#   5. encode the Churn target as 1 ("Yes") / 0 ("No")
df = (
    pd.read_csv("dataset/Telco-Customer-Churn.csv")
    .drop(columns="customerID")
    .assign(
        TotalCharges=lambda frame: pd.to_numeric(
            frame["TotalCharges"].replace(" ", np.nan), errors="coerce"
        ),
        SeniorCitizen=lambda frame: frame["SeniorCitizen"].map({1: "Yes", 0: "No"}),
    )
    .dropna()
    .drop_duplicates()
    .assign(Churn=lambda frame: frame["Churn"].map({"Yes": 1, "No": 0}))
)


In [39]:
# Step 2: Feature Engineering manually (outside pipeline)
# ---------------------------------
# Columns scanned when counting how many services a customer has.
service_cols = ["PhoneService", "MultipleLines", "InternetService", "OnlineSecurity",
                "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

# ServicesCount = number of service columns whose value is exactly "Yes".
# Vectorized comparison; same result as a row-wise apply, without the Python loop.
# NOTE(review): a column only contributes when it literally holds "Yes". If
# InternetService stores the connection type rather than Yes/No it never
# counts here — confirm that is intended.
df["ServicesCount"] = df[service_cols].eq("Yes").sum(axis=1)

# Bucket tenure into labelled ranges. pd.cut bins are right-closed, so without
# include_lowest=True a tenure of exactly 0 would fall outside the first bin
# (0, 6] and silently become NaN.
# Values above the last edge (72) still map to NaN.
df["TenureGroup"] = pd.cut(df["tenure"],
                           bins=[0, 6, 12, 24, 48, 72],
                           labels=["0-6 Mo", "6-12 Mo", "1-2 Yr", "2-4 Yr", "4-6 Yr"],
                           include_lowest=True)


In [40]:
# Step 3: Define preprocessing pipeline
# ---------------------------------
# Columns sent through the numeric branch.
numerical_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Columns sent through the categorical branch.
categorical_cols = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "TenureGroup",
]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array, dropping the first level of each feature.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")),
])

# Route each column list to its branch; numeric output comes first.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

# Single-step wrapper so the preprocessor is reachable by name.
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

In [41]:
# Step 4: Fit-transform the pipeline
# ---------------------------------
# Target vector: the Churn column.
y = df["Churn"]

# Feature frame: numeric columns first, then categorical ones.
X = df[[*numerical_cols, *categorical_cols]]

# Fit the preprocessing steps on X and transform it in one call.
X_transformed = pipeline.fit_transform(X)

In [44]:
# Step 5: Assemble final DataFrame
# ---------------------------------
# Reach into the fitted pipeline for the one-hot encoder and ask it for the
# names of the columns it produced.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
fitted_encoder = fitted_preprocessor.named_transformers_["cat"].named_steps["encoder"]
cat_feature_names = fitted_encoder.get_feature_names_out(categorical_cols)

# Output layout is the numeric columns followed by the encoded categorical ones.
final_feature_names = [*numerical_cols, *cat_feature_names]
df_processed = pd.DataFrame(X_transformed, columns=final_feature_names)

# Add back manual features, the target, and the readable TenureGroup label.
# `.values` assigns by position rather than by index label.
df_processed["ServicesCount"] = df["ServicesCount"].values
df_processed["Churn"] = y.values
df_processed["TenureGroup"] = df["TenureGroup"].values

# Optional: reorder columns for readability, placing 'TenureGroup' directly
# after 'tenure' (and therefore ahead of 'Churn').
cols = df_processed.columns.tolist()
cols_reordered = [name for name in cols if name != "TenureGroup"]
tenure_slot = cols_reordered.index("tenure") + 1
cols_reordered.insert(tenure_slot, "TenureGroup")
df_processed = df_processed[cols_reordered]

In [45]:
# Save to file
# Write the processed frame as CSV without the index column, then confirm.
output_path = "telecom_advanced_clean.csv"
df_processed.to_csv(output_path, index=False)
print("✅ Final cleaned file created: telecom_advanced_clean.csv (with unified TenureGroup)")

✅ Final cleaned file created: telecom_advanced_clean.csv (with unified TenureGroup)
