In [10]:
# !pip install xgboost lightgbm "mlflow<3"

In [17]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

# 1. LOAD ENVIRONMENT & SETUP CONNECTION
# Adjust this path if your .env is in a different location
load_dotenv(dotenv_path="../.env") 

RENDER_URL = "postgresql://eas503_final_project_user:YTTEtGb7piQ9eBFoyxwesnWWhFYyq9Kc@dpg-d5256sv5r7bs739ugro0-a.ohio-postgres.render.com/eas503_final_project"
engine = create_engine(RENDER_URL)

# 2. SQL JOIN QUERY (Telco Version)
# This joins the 3 normalized tables back into a single dataframe
query = """
SELECT 
    c.*, 
    s.phoneservice, s.multiplelines, s.internetservice, 
    s.onlinesecurity, s.onlinebackup, s.deviceprotection, 
    s.techsupport, s.streamingtv, s.streamingmovies,
    k.tenure, k.contract, k.paperlessbilling, 
    k.paymentmethod, k.monthlycharges, k.totalcharges, k.churn
FROM customers AS c
JOIN services AS s ON c.customerid = s.customerid
JOIN contracts AS k ON c.customerid = k.customerid
ORDER BY c.customerid
"""

# 3. LOAD DATA INTO DATAFRAME
df = pd.read_sql(query, engine)

# 4. PRE-PROCESSING FOR CLASSIFICATION
# Map the target 'churn' to 1 (Yes) and 0 (No)
df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})

# 5. QUICK CHECK
print(f"✓ Data successfully loaded from Render. Shape: {df.shape}")
print(f"✓ Target 'churn' distribution:\n{df['churn'].value_counts()}")

df.head()

✓ Data successfully loaded from Render. Shape: (7043, 21)
✓ Target 'churn' distribution:
churn
0    5174
1    1869
Name: count, dtype: int64


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,...,techsupport,streamingtv,streamingmovies,tenure,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0002-ORFBO,Female,0,Yes,Yes,Yes,No,DSL,No,Yes,...,Yes,Yes,No,9,One year,Yes,Mailed check,65.6,593.3,0
1,0003-MKNFE,Male,0,No,No,Yes,Yes,DSL,No,No,...,No,No,Yes,9,Month-to-month,No,Mailed check,59.9,542.4,0
2,0004-TLHLJ,Male,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,4,Month-to-month,Yes,Electronic check,73.9,280.85,1
3,0011-IGKFF,Male,1,Yes,No,Yes,No,Fiber optic,No,Yes,...,No,Yes,Yes,13,Month-to-month,Yes,Electronic check,98.0,1237.85,1
4,0013-EXCHZ,Female,1,Yes,No,Yes,No,Fiber optic,No,No,...,Yes,Yes,No,3,Month-to-month,Yes,Mailed check,83.9,267.4,1


In [4]:
# !pip install "numpy<2.0"
# !pip install ipywidgets

In [18]:
from ydata_profiling import ProfileReport

# Generate the report from the joined dataframe (df)
profile = ProfileReport(df, title="Telco Customer Churn Profiling Report", explorative=True)

# Display the report directly in the notebook
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:00<00:00, 378.43it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
import os
import numpy as np
import pandas as pd
import time
from dotenv import load_dotenv

from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

import mlflow
from mlflow.models import infer_signature
import joblib

import sys
import importlib

# 1. Point to the folder where your pipeline lives (api or root)
api_path = os.path.abspath(os.path.join('..'))
if api_path not in sys.path:
    sys.path.append(api_path)

# 2. Import and reload the pipeline to ensure no old Housing columns are cached
import housing_pipeline
importlib.reload(housing_pipeline)
from housing_pipeline import build_preprocessing, make_estimator_for_name

print("✓ Pipeline successfully reloaded for Telco Churn.")

start_time = time.monotonic()

# =============================================================================
# STEP 1: Build Preprocessing
# =============================================================================
preprocessing = build_preprocessing()
print("✓ STEP 1: Preprocessing pipeline created.")

# =============================================================================
# STEP 2: Stratified Split (Telco Churn Version)
# =============================================================================
# We drop 'customerid' because it is just a unique identifier
df_clean = df.drop(columns=["customerid"], errors='ignore')

# Map 'churn' to 1 and 0 if not already done
if df_clean['churn'].dtype == object:
    df_clean['churn'] = df_clean['churn'].map({'Yes': 1, 'No': 0})

X = df_clean.drop("churn", axis=1)
y = df_clean["churn"]

# Stratify on 'churn' to handle the imbalance (~26% churn rate)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print(f"✓ STEP 2: Stratified split done. Train: {len(X_train)}, Test: {len(X_test)}")

# =============================================================================
# STEP 3: Define Classifiers (WITHOUT PCA)
# =============================================================================
models = {}
for name in ["ridge", "histgradientboosting", "xgboost", "lightgbm"]:
    est = make_estimator_for_name(name)
    models[name] = make_pipeline(preprocessing, est)

# =============================================================================
# STEP 4: Configure MLflow
# =============================================================================
load_dotenv(dotenv_path="../.env", override=True) 

mlflow.set_tracking_uri(os.getenv("DB_URL").split('@')[1].split('/')[0] if not os.getenv("MLFLOW_TRACKING_URI") else os.getenv("MLFLOW_TRACKING_URI"))
# Manually set if env doesn't have it
mlflow.set_experiment("telco_churn_baseline_experiments")

print("✓ STEP 4: MLflow configured.")

# =============================================================================
# STEP 5: Train & Log Baseline Models (NO PCA)
# =============================================================================
results = {}

for name, pipeline in models.items():
    print(f"\nTraining baseline classification: {name}")

    cv_scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    cv_f1 = cv_scores.mean()
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    
    print(f"{name} CV F1: {cv_f1:.4f} | Test F1: {test_f1:.4f}")

    results[name] = {"pipeline": pipeline, "test_f1": test_f1, "cv_f1": cv_f1}

    with mlflow.start_run(run_name=f"{name}_baseline"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.sklearn.log_model(pipeline, f"{name}_model")

# =============================================================================
# STEP 7: Train & Log PCA Versions
# =============================================================================
pca_results = {}

for name in models.keys():
    print(f"\nTraining PCA model: {name}")
    est = make_estimator_for_name(name)
    pca_pipeline = make_pipeline(preprocessing, PCA(n_components=0.95), est)

    cv_scores_pca = cross_val_score(pca_pipeline, X_train, y_train, cv=3, scoring="f1")
    cv_f1_pca = cv_scores_pca.mean()
    
    pca_pipeline.fit(X_train, y_train)
    test_f1_pca = f1_score(y_test, pca_pipeline.predict(X_test))

    model_key = f"{name}_with_pca"
    pca_results[model_key] = {"pipeline": pca_pipeline, "test_f1": test_f1_pca, "cv_f1": cv_f1_pca}
    
    with mlflow.start_run(run_name=model_key):
        mlflow.log_param("uses_pca", True)
        mlflow.log_metric("cv_f1", cv_f1_pca)
        mlflow.log_metric("test_f1", test_f1_pca)
        mlflow.sklearn.log_model(pca_pipeline, f"{model_key}_model")

# =============================================================================
# STEP 8: Choose GLOBAL Best
# =============================================================================
all_results = {**results, **pca_results}
global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
global_best_pipeline = all_results[global_best_name]["pipeline"]

print(f"\nGLOBAL BEST MODEL: {global_best_name} with Test F1: {all_results[global_best_name]['test_f1']:.4f}")

# Save the model
os.makedirs("../models", exist_ok=True)
joblib.dump(global_best_pipeline, "../models/global_best_model.pkl")

print(f"✓ Global best model saved successfully to ../models/global_best_model.pkl")

✓ Pipeline successfully reloaded for Telco Churn.
✓ STEP 1: Preprocessing pipeline created.
✓ STEP 2: Stratified split done. Train: 5634, Test: 1409


2025/12/18 15:08:43 INFO mlflow.tracking.fluent: Experiment with name 'telco_churn_baseline_experiments' does not exist. Creating a new experiment.


✓ STEP 4: MLflow configured.

Training baseline classification: ridge
ridge CV F1: 0.5693 | Test F1: 0.5491




🏃 View run ridge_baseline at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/89b54ed903de430c9cc312c73e5e6628
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training baseline classification: histgradientboosting
histgradientboosting CV F1: 0.5859 | Test F1: 0.5481




🏃 View run histgradientboosting_baseline at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/ae9b1061095f48298f3092f5eb86c4b2
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training baseline classification: xgboost
xgboost CV F1: 0.5560 | Test F1: 0.5481




🏃 View run xgboost_baseline at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/0e0fe7f305f64f9b80f047bbda3963da
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training baseline classification: lightgbm




lightgbm CV F1: 0.5759 | Test F1: 0.5570




🏃 View run lightgbm_baseline at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/df1e67f9698b4f01a06d4f7e22384069
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training PCA model: ridge




🏃 View run ridge_with_pca at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/dd4104c393d4460fa5daaf286a55bf28
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training PCA model: histgradientboosting




🏃 View run histgradientboosting_with_pca at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/9aa72ebcb3e9464bb1c56a03da8358e5
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training PCA model: xgboost




🏃 View run xgboost_with_pca at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/1ed54c3fd9654840b4502ea635b5033c
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

Training PCA model: lightgbm




🏃 View run lightgbm_with_pca at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2/runs/df318ce9c92d41af8af635d00e4c09fa
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/2

GLOBAL BEST MODEL: ridge_with_pca with Test F1: 0.5611
✓ Global best model saved successfully to ../models/global_best_model.pkl


In [16]:
import mlflow
import os

# --- CONFIGURE CREDENTIALS ---
# I am using the values from your screenshot and the token you pasted
os.environ["MLFLOW_TRACKING_USERNAME"] = "jinnn11"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "64db293764886cc32f80bd612624dc703cb8af1b" 
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/jinnn11/telco-churn-project.mlflow"

print(f"Testing connection to: {os.environ['MLFLOW_TRACKING_URI']} ...")

try:
    # 1. Set URI
    mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
    
    # 2. Create/Set a dummy experiment
    mlflow.set_experiment("test_connection")
    
    # 3. Try to log something
    with mlflow.start_run(run_name="connection_check"):
        mlflow.log_param("connection_status", "success")
        print("✅ SUCCESS! Connected to DagsHub.")
        print("Please go to your DagsHub repo -> Experiments tab. You should see a run called 'connection_check'.")

except Exception as e:
    print("\n❌ CONNECTION FAILED.")
    print(f"Error Message: {e}")
    print("\nIMPORTANT TROUBLESHOOTING:")
    print("The token '64db...' matches your S3 Key ID. This is usually WRONG for MLflow.")
    print("1. Go back to DagsHub -> Remote button.")
    print("2. Click the 'Experiments' tab (NOT Data/S3).")
    print("3. Copy the 'MLFLOW_TRACKING_PASSWORD' from there. It is usually a longer token.")

Testing connection to: https://dagshub.com/jinnn11/telco-churn-project.mlflow ...


2025/12/18 15:06:59 INFO mlflow.tracking.fluent: Experiment with name 'test_connection' does not exist. Creating a new experiment.


✅ SUCCESS! Connected to DagsHub.
Please go to your DagsHub repo -> Experiments tab. You should see a run called 'connection_check'.
🏃 View run connection_check at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/1/runs/e3152c7633e34f7480cac519c018af39
🧪 View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/1
