# 05 — Export Predictions for Power BI

Train a selected model and export churn probability + risk segments.

In [None]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Project root is the parent of the current working directory, made absolute.
PROJECT_ROOT = Path("..").resolve()

# Folder that receives every artifact written by this notebook.
OUT = Path("..") / "outputs"
OUT.mkdir(exist_ok=True)

# Prefer the full Telco CSV; use the bundled sample when the full file is absent.
data_dir = PROJECT_ROOT / "data"
data_full = data_dir / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
data_sample = data_dir / "sample_telco.csv"
if data_full.exists():
    DATA_PATH = data_full
else:
    DATA_PATH = data_sample

df = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH, " shape:", df.shape)

# Artifacts read back from the outputs folder: the four-way data split and
# the preprocessing step (presumably written by an earlier notebook — verify).
splits_path = OUT / "data_splits.joblib"
X_train, X_test, y_train, y_test = joblib.load(splits_path)
preprocessor = joblib.load(OUT / "preprocessor.joblib")

# Scoring model (default: Gradient Boosting), chained after the preprocessor
# and fitted on the training split. fit() returns the pipeline itself.
model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", model)]).fit(X_train, y_train)

# Identifier kept aside for the Power BI join; when the dataset has no
# customerID column, a 0..n-1 counter is used instead.
if "customerID" in df.columns:
    customer_id = df["customerID"]
else:
    customer_id = pd.Series(range(len(df)))

# Build the scoring frame on a copy so the raw df is left untouched.
# NOTE(review): these steps are meant to mirror the preprocessing notebook —
# keep them in sync with it.
df_model = df.copy()
df_model["TotalCharges"] = pd.to_numeric(df_model["TotalCharges"], errors="coerce")

# Remove the identifier and the target column whenever they are present.
for non_feature in ("customerID", "Churn"):
    if non_feature in df_model.columns:
        df_model = df_model.drop(columns=[non_feature])

# Zero tenure is turned into NaN first so the ratio never divides by zero.
tenure_nonzero = df_model["tenure"].replace(0, np.nan)
df_model["avg_monthly_spend"] = df_model["TotalCharges"] / tenure_nonzero

# Tenure bands; each interval is open on the left and closed on the right.
tenure_edges = [-1, 6, 12, 24, 48, 72]
tenure_labels = ["0-6", "7-12", "13-24", "25-48", "49-72"]
df_model["tenure_bucket"] = pd.cut(
    df_model["tenure"],
    bins=tenure_edges,
    labels=tenure_labels,
)

# The second predict_proba column is taken as the churn probability.
churn_proba = pipe.predict_proba(df_model)[:, 1]

# Risk segmentation
def risk_segment(p):
    """Map a churn probability to a risk label.

    Returns "High" when ``p >= 0.70``, "Medium" when ``p >= 0.40``,
    and "Low" for everything else (including values that fail both
    comparisons, such as NaN).
    """
    # Thresholds are checked from the highest floor downwards.
    for floor, label in ((0.70, "High"), (0.40, "Medium")):
        if p >= floor:
            return label
    return "Low"

# Assemble the scored table. The risk labels are built on customer_id's index
# so every column lines up row-for-row regardless of how df is indexed.
export_df = pd.DataFrame({
    "customerID": customer_id.astype(str),
    "churn_probability": churn_proba,
    "predicted_churn": (churn_proba >= 0.50).astype(int),
    "risk_segment": pd.Series(churn_proba, index=customer_id.index).apply(risk_segment),
})

# Add slicer columns (if present)
for col in ["tenure", "MonthlyCharges", "TotalCharges", "Contract", "PaymentMethod", "InternetService"]:
    if col in df.columns:
        export_df[col] = df[col]

# The raw TotalCharges column can hold non-numeric text; coerce it so the
# exported column is purely numeric (unparseable entries become empty cells)
# instead of a mixed text/number column in the CSV.
if "TotalCharges" in export_df.columns:
    export_df["TotalCharges"] = pd.to_numeric(export_df["TotalCharges"], errors="coerce")

export_path = OUT / "predictions_for_powerbi.csv"
export_df.to_csv(export_path, index=False)
print("Saved:", export_path)
export_df.head()


In [None]:
# Optional: persist the fitted preprocessing + model pipeline to the outputs folder.
model_path = OUT.joinpath("best_model_pipeline.joblib")
joblib.dump(pipe, filename=model_path)
print("Saved model pipeline:", model_path)
