In [9]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

# 1. SETUP CONNECTION
# The connection string is read from the DB_URL environment variable (loaded
# from ../.env) instead of being embedded in the notebook, so the database
# password never ends up in version control or shared notebook files.
# NOTE(review): the password that was previously hard-coded in this cell has
# been exposed and should be rotated on the database side.
load_dotenv(dotenv_path="../.env") # Adjust path if needed
RENDER_URL = os.getenv("DB_URL")
if not RENDER_URL:
    # Fail early with an actionable message rather than an obscure
    # SQLAlchemy error caused by passing None to create_engine().
    raise RuntimeError(
        "DB_URL is not set. Define it in ../.env (or export it) before running this cell."
    )
engine = create_engine(RENDER_URL)

# 2. SQL JOIN QUERY
# Joins Customers, Services, and Contracts tables on customerid.
# ORDER BY keeps the row order stable between runs.
query = """
SELECT 
    c.*, 
    s.phoneservice, s.multiplelines, s.internetservice, 
    s.onlinesecurity, s.onlinebackup, s.deviceprotection, 
    s.techsupport, s.streamingtv, s.streamingmovies,
    k.tenure, k.contract, k.paperlessbilling, 
    k.paymentmethod, k.monthlycharges, k.totalcharges, k.churn
FROM customers AS c
JOIN services AS s ON c.customerid = s.customerid
JOIN contracts AS k ON c.customerid = k.customerid
ORDER BY c.customerid
"""

# 3. LOAD DATA
df = pd.read_sql(query, engine)

# 4. PRE-PROCESSING
# Map Churn to 1/0
df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})
# Series.map() turns any label other than 'Yes'/'No' into NaN; surface that
# immediately instead of letting it propagate into the target column.
if df['churn'].isna().any():
    raise ValueError("Unexpected values in 'churn': expected only 'Yes' or 'No'.")

print(f"Data loaded. Shape: {df.shape}")
print(f"Churn distribution:\n{df['churn'].value_counts()}")
df.head()

Data loaded. Shape: (7043, 21)
Churn distribution:
churn
0    5174
1    1869
Name: count, dtype: int64


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,...,techsupport,streamingtv,streamingmovies,tenure,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0002-ORFBO,Female,0,Yes,Yes,Yes,No,DSL,No,Yes,...,Yes,Yes,No,9,One year,Yes,Mailed check,65.6,593.3,0
1,0003-MKNFE,Male,0,No,No,Yes,Yes,DSL,No,No,...,No,No,Yes,9,Month-to-month,No,Mailed check,59.9,542.4,0
2,0004-TLHLJ,Male,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,4,Month-to-month,Yes,Electronic check,73.9,280.85,1
3,0011-IGKFF,Male,1,Yes,No,Yes,No,Fiber optic,No,Yes,...,No,Yes,Yes,13,Month-to-month,Yes,Electronic check,98.0,1237.85,1
4,0013-EXCHZ,Female,1,Yes,No,Yes,No,Fiber optic,No,No,...,Yes,Yes,No,3,Month-to-month,Yes,Mailed check,83.9,267.4,1


In [11]:
# Apply both TLS-related environment overrides in a single call.
# NOTE(review): these settings look like they switch off certificate
# verification for MLflow / curl-based clients — confirm this is acceptable
# outside of local experimentation.
os.environ.update({
    'MLFLOW_TRACKING_INSECURE_TLS': 'true',
    'CURL_CA_BUNDLE': '',
})

In [12]:
import mlflow
import os

# --- CONFIGURE CREDENTIALS ---
# The MLflow credentials and tracking URI are expected to already be in the
# process environment (e.g. loaded from ../.env by load_dotenv in an earlier
# cell). Re-assigning os.environ[X] = os.getenv(X) is a no-op when X is set and
# raises an opaque TypeError when it is not, so validate explicitly instead.
_required_mlflow_vars = (
    "MLFLOW_TRACKING_USERNAME",
    "MLFLOW_TRACKING_PASSWORD",
    "MLFLOW_TRACKING_URI",
)
_missing_mlflow_vars = [name for name in _required_mlflow_vars if not os.getenv(name)]
if _missing_mlflow_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_mlflow_vars)
        + ". Add them to ../.env and run load_dotenv() before this cell."
    )

print(f"Testing connection to: {os.environ['MLFLOW_TRACKING_URI']} ...")

try:
    # 1. Set URI
    mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
    
    # 2. Create/Set a dummy experiment
    mlflow.set_experiment("test_connection")
    
    # 3. Try to log something; a successful write proves both connectivity
    #    and authentication.
    with mlflow.start_run(run_name="connection_check"):
        mlflow.log_param("connection_status", "success")
        print("‚úÖ SUCCESS! Connected to DagsHub.")
        print("Please go to your DagsHub repo -> Experiments tab. You should see a run called 'connection_check'.")

except Exception as e:
    # Deliberately best-effort: this cell is only a diagnostic, so report the
    # failure and troubleshooting hints instead of aborting the notebook.
    print("\n‚ùå CONNECTION FAILED.")
    print(f"Error Message: {e}")
    print("\nIMPORTANT TROUBLESHOOTING:")
    print("Check that MLFLOW_TRACKING_PASSWORD is your DagsHub MLflow token, not the S3 Key ID.")
    print("1. Go back to DagsHub -> Remote button.")
    print("2. Click the 'Experiments' tab (NOT Data/S3).")
    print("3. Copy the 'MLFLOW_TRACKING_PASSWORD' from there. It is usually a longer token.")

Testing connection to: https://dagshub.com/jinnn11/telco-churn-project.mlflow ...




‚úÖ SUCCESS! Connected to DagsHub.
Please go to your DagsHub repo -> Experiments tab. You should see a run called 'connection_check'.




üèÉ View run connection_check at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/1/runs/7a5b64ddd54c49f58f32c49aa490e2de
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/1




In [None]:
# =============================================================================
# NOTEBOOK 03: FULL OPTUNA PIPELINE (TELCO CHURN)
# =============================================================================

import time
import os
import joblib
import optuna
import mlflow
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from optuna.samplers import TPESampler
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

# Import Classifiers
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sqlalchemy import create_engine
import sys
import importlib

# -----------------------------------------------------------------------------
# 1. SSL FIX & ENVIRONMENT SETUP
# -----------------------------------------------------------------------------
# Workaround for the "SSLCertVerificationError" seen with DagsHub.
# NOTE(review): these overrides appear to relax TLS certificate checking —
# confirm that is acceptable before using this outside local experiments.
os.environ.update({
    'MLFLOW_TRACKING_INSECURE_TLS': 'true',
    'CURL_CA_BUNDLE': '',
})

# override=True lets values from ../.env replace variables that are already set.
load_dotenv(dotenv_path="../.env", override=True)

# -----------------------------------------------------------------------------
# 2. PIPELINE RELOAD (CRITICAL FIX)
# -----------------------------------------------------------------------------
# housing_pipeline.py lives one directory up; make that folder importable.
sys.path.append(os.path.abspath(os.pardir))
import housing_pipeline

# Reload so edits made to the module after the kernel started are picked up,
# then bind the factory from the freshly reloaded module.
importlib.reload(housing_pipeline)
build_preprocessing = housing_pipeline.build_preprocessing

print("‚úì Pipeline reloaded successfully.")

# -----------------------------------------------------------------------------
# 3. DATA LOADING (FROM RENDER)
# -----------------------------------------------------------------------------
RENDER_URL = os.getenv("DB_URL")
if not RENDER_URL:
    # Fail early with an actionable message rather than an obscure
    # SQLAlchemy error caused by passing None to create_engine().
    raise RuntimeError(
        "DB_URL is not set. Define it in ../.env (or export it) before running this cell."
    )
engine = create_engine(RENDER_URL)

# Join Customers, Services and Contracts on customerid.
# ORDER BY is required for reproducibility: without it the database may return
# rows in any order, and the seeded train/test split below is positional, so a
# different row order would produce a different split on every run.
query = """
SELECT 
    c.*, 
    s.phoneservice, s.multiplelines, s.internetservice, 
    s.onlinesecurity, s.onlinebackup, s.deviceprotection, 
    s.techsupport, s.streamingtv, s.streamingmovies,
    k.tenure, k.contract, k.paperlessbilling, 
    k.paymentmethod, k.monthlycharges, k.totalcharges, k.churn
FROM customers AS c
JOIN services AS s ON c.customerid = s.customerid
JOIN contracts AS k ON c.customerid = k.customerid
ORDER BY c.customerid
"""

df = pd.read_sql(query, engine)

# Map Churn to 1/0
df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})
# Series.map() turns any label other than 'Yes'/'No' into NaN; stop here with a
# clear message instead of failing later inside the stratified split.
if df['churn'].isna().any():
    raise ValueError("Unexpected values in 'churn': expected only 'Yes' or 'No'.")

# Drop ID (an identifier, not a feature); errors='ignore' tolerates its absence.
df_clean = df.drop(columns=["customerid"], errors='ignore')

X = df_clean.drop("churn", axis=1)
y = df_clean["churn"]

# Stratified Split: 80/20, preserving the churn ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print(f"‚úì Data Loaded. Train Size: {len(X_train)}")

# -----------------------------------------------------------------------------
# 4. MLFLOW CONFIG
# -----------------------------------------------------------------------------
# Keep Optuna quiet: only warnings and above are emitted during the studies.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Point MLflow at the tracking server named in the environment, then select
# the experiment that all runs in this notebook are logged under.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_experiment("telco_churn_optuna_experiments")

# -----------------------------------------------------------------------------
# 5. DEFINE OPTUNA OBJECTIVES (MAXIMIZE F1)
# -----------------------------------------------------------------------------
# 'scale_pos_weight' is tuned for the XGBoost and LightGBM models to handle class imbalance
# (the Ridge and HistGradientBoosting objectives do not use it).

def objective_ridge(trial, preprocessing, X_train, y_train, use_pca=False):
    """Optuna objective for RidgeClassifier: mean 3-fold cross-validated F1.

    Args:
        trial: Optuna trial used to sample "ridge__alpha" (log scale,
            0.1-100.0) and, when `use_pca` is true, "pca__n_components"
            (0.8-0.99).
        preprocessing: Unfitted preprocessing transformer; it is cloned, so
            the caller's object is never fitted.
        X_train: Training features.
        y_train: Training labels.
        use_pca: When true, a PCA step is inserted between preprocessing and
            the classifier.

    Returns:
        The mean of the per-fold F1 scores.
    """
    classifier = RidgeClassifier(
        alpha=trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True),
        random_state=42,
    )

    # Assemble the steps in order; PCA is optional.
    steps = [clone(preprocessing)]
    if use_pca:
        pca_components = trial.suggest_float("pca__n_components", 0.8, 0.99)
        steps.append(PCA(n_components=pca_components))
    steps.append(classifier)

    fold_scores = cross_val_score(
        make_pipeline(*steps), X_train, y_train, cv=3, scoring="f1", n_jobs=-1
    )
    return fold_scores.mean()

def objective_hgb(trial, preprocessing, X_train, y_train, use_pca=False):
    """Optuna objective for HistGradientBoostingClassifier: mean 3-fold CV F1.

    Args:
        trial: Optuna trial used to sample "hgb__learning_rate" (0.01-0.2),
            "hgb__max_depth" (3-10) and, when `use_pca` is true,
            "pca__n_components" (0.8-0.99).
        preprocessing: Unfitted preprocessing transformer; it is cloned, so
            the caller's object is never fitted.
        X_train: Training features.
        y_train: Training labels.
        use_pca: When true, a PCA step is inserted between preprocessing and
            the classifier.

    Returns:
        The mean of the per-fold F1 scores.
    """
    # Sample in a fixed order: learning rate, then depth, then (optionally) PCA.
    sampled = {
        "learning_rate": trial.suggest_float("hgb__learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("hgb__max_depth", 3, 10),
    }
    classifier = HistGradientBoostingClassifier(random_state=42, **sampled)

    steps = [clone(preprocessing)]
    if use_pca:
        pca_components = trial.suggest_float("pca__n_components", 0.8, 0.99)
        steps.append(PCA(n_components=pca_components))
    steps.append(classifier)

    fold_scores = cross_val_score(
        make_pipeline(*steps), X_train, y_train, cv=3, scoring="f1", n_jobs=-1
    )
    return fold_scores.mean()

def objective_xgb(trial, preprocessing, X_train, y_train, use_pca=False):
    """Optuna objective for XGBClassifier: mean 3-fold cross-validated F1.

    Args:
        trial: Optuna trial used to sample "xgb__learning_rate" (0.01-0.3),
            "xgb__max_depth" (3-10), "xgb__scale_pos_weight" (1-4) and, when
            `use_pca` is true, "pca__n_components" (0.8-0.99).
        preprocessing: Unfitted preprocessing transformer; it is cloned, so
            the caller's object is never fitted.
        X_train: Training features.
        y_train: Training labels.
        use_pca: When true, a PCA step is inserted between preprocessing and
            the classifier.

    Returns:
        The mean of the per-fold F1 scores.
    """
    # Sample in a fixed order: learning rate, depth, class weight, then PCA.
    sampled = {
        "learning_rate": trial.suggest_float("xgb__learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("xgb__max_depth", 3, 10),
        "scale_pos_weight": trial.suggest_float("xgb__scale_pos_weight", 1, 4),
    }
    classifier = XGBClassifier(
        objective="binary:logistic", random_state=42, n_jobs=-1, **sampled
    )

    steps = [clone(preprocessing)]
    if use_pca:
        pca_components = trial.suggest_float("pca__n_components", 0.8, 0.99)
        steps.append(PCA(n_components=pca_components))
    steps.append(classifier)

    fold_scores = cross_val_score(
        make_pipeline(*steps), X_train, y_train, cv=3, scoring="f1", n_jobs=-1
    )
    return fold_scores.mean()

def objective_lgbm(trial, preprocessing, X_train, y_train, use_pca=False):
    """Optuna objective for LGBMClassifier: mean 3-fold cross-validated F1.

    Args:
        trial: Optuna trial used to sample "lgbm__learning_rate" (0.01-0.3),
            "lgbm__num_leaves" (20-100), "lgbm__scale_pos_weight" (1-4) and,
            when `use_pca` is true, "pca__n_components" (0.8-0.99).
        preprocessing: Unfitted preprocessing transformer; it is cloned, so
            the caller's object is never fitted.
        X_train: Training features.
        y_train: Training labels.
        use_pca: When true, a PCA step is inserted between preprocessing and
            the classifier.

    Returns:
        The mean of the per-fold F1 scores.
    """
    # Sample in a fixed order: learning rate, leaves, class weight, then PCA.
    sampled = {
        "learning_rate": trial.suggest_float("lgbm__learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("lgbm__num_leaves", 20, 100),
        "scale_pos_weight": trial.suggest_float("lgbm__scale_pos_weight", 1, 4),
    }
    classifier = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **sampled)

    steps = [clone(preprocessing)]
    if use_pca:
        pca_components = trial.suggest_float("pca__n_components", 0.8, 0.99)
        steps.append(PCA(n_components=pca_components))
    steps.append(classifier)

    fold_scores = cross_val_score(
        make_pipeline(*steps), X_train, y_train, cv=3, scoring="f1", n_jobs=-1
    )
    return fold_scores.mean()

# -----------------------------------------------------------------------------
# 6. RUN OPTIMIZATION LOOPS
# -----------------------------------------------------------------------------
# Maps a short model name to the Optuna objective that tunes it.
model_funcs = {
    "ridge": objective_ridge, "histgradientboosting": objective_hgb, 
    "xgboost": objective_xgb, "lightgbm": objective_lgbm
}

# Number of Optuna trials per (model, PCA) combination.
N_TRIALS = 10


def _build_best_model(name, p):
    """Instantiate the estimator for `name` from the best Optuna params `p`.

    Raises:
        ValueError: if `name` has no registered builder, so an unknown model
            can never silently reuse the estimator of a previous iteration.
    """
    if name == "ridge":
        return RidgeClassifier(alpha=p["ridge__alpha"], random_state=42)
    if name == "histgradientboosting":
        return HistGradientBoostingClassifier(
            learning_rate=p["hgb__learning_rate"],
            max_depth=p["hgb__max_depth"],
            random_state=42,
        )
    if name == "xgboost":
        return XGBClassifier(
            learning_rate=p["xgb__learning_rate"],
            max_depth=p["xgb__max_depth"],
            scale_pos_weight=p["xgb__scale_pos_weight"],
            random_state=42,
        )
    if name == "lightgbm":
        return LGBMClassifier(
            learning_rate=p["lgbm__learning_rate"],
            num_leaves=p["lgbm__num_leaves"],
            scale_pos_weight=p["lgbm__scale_pos_weight"],
            random_state=42,
            verbosity=-1,
        )
    raise ValueError(f"No model builder registered for '{name}'")


# Create preprocessing object once; every pipeline works on a clone of it.
preprocessing = build_preprocessing()
all_results = {}

for use_pca in [False, True]:
    pca_tag = "with_PCA" if use_pca else "no_PCA"
    
    for name, func in model_funcs.items():
        run_name = f"{name}_{pca_tag}_optuna"
        print(f"\n--- Optimizing {run_name} ---")
        
        # 1. OPTIMIZE (Maximize F1)
        # A fresh, identically seeded sampler per study keeps each search
        # reproducible independently of the others.
        study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
        study.optimize(lambda t: func(t, preprocessing, X_train, y_train, use_pca=use_pca), n_trials=N_TRIALS)
        
        # 2. REBUILD BEST PIPELINE
        p = study.best_params
        model = _build_best_model(name, p)

        if use_pca:
            final_pipe = make_pipeline(clone(preprocessing), PCA(n_components=p["pca__n_components"]), model)
        else:
            final_pipe = make_pipeline(clone(preprocessing), model)
            
        # Refit on the full training set, then score once on the held-out set.
        final_pipe.fit(X_train, y_train)
        test_f1 = f1_score(y_test, final_pipe.predict(X_test))
        
        print(f"DONE: {run_name} | Best CV F1: {study.best_value:.4f} | Test F1: {test_f1:.4f}")
        
        # Keep the CV score next to the test score: model selection below
        # uses the former, reporting uses the latter.
        all_results[run_name] = {
            "pipeline": final_pipe,
            "cv_f1": study.best_value,
            "test_f1": test_f1,
        }
        
        # 3. LOG TO MLFLOW
        with mlflow.start_run(run_name=run_name):
            mlflow.log_params(p)
            mlflow.log_metric("cv_f1", study.best_value)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.log_param("uses_pca", use_pca)
            mlflow.sklearn.log_model(final_pipe, "model")

# -----------------------------------------------------------------------------
# 7. SAVE GLOBAL BEST
# -----------------------------------------------------------------------------
if all_results:
    # Select the winner by cross-validated F1, NOT by test F1. Picking the
    # model with the highest test score would use the held-out set for model
    # selection and make the reported test F1 optimistically biased.
    global_best_key = max(all_results, key=lambda k: all_results[k]["cv_f1"])
    global_best_pipeline = all_results[global_best_key]["pipeline"]

    print("\n" + "="*50)
    print(f"PROJECT GLOBAL BEST: {global_best_key}")
    print(f"Selection CV F1: {all_results[global_best_key]['cv_f1']:.4f}")
    print(f"Final Test F1: {all_results[global_best_key]['test_f1']:.4f}")
    print("="*50)

    # Save to project root models folder
    os.makedirs("../models", exist_ok=True)
    joblib.dump(global_best_pipeline, "../models/global_best_model_optuna.pkl")
    print(f"‚úì Model saved to ../models/global_best_model_optuna.pkl")

‚úì Pipeline reloaded successfully.
‚úì Data Loaded. Train Size: 5634

--- Optimizing ridge_no_PCA_optuna ---




DONE: ridge_no_PCA_optuna | Best CV F1: 0.5789 | Test F1: 0.5635




üèÉ View run ridge_no_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/c28efbf29b4a44e0a4374a59c1a795f0
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3

--- Optimizing histgradientboosting_no_PCA_optuna ---




DONE: histgradientboosting_no_PCA_optuna | Best CV F1: 0.5953 | Test F1: 0.5599




üèÉ View run histgradientboosting_no_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/b3a73fb930904f91ade17d4af1238f5b
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3

--- Optimizing xgboost_no_PCA_optuna ---




DONE: xgboost_no_PCA_optuna | Best CV F1: 0.6345 | Test F1: 0.6265




üèÉ View run xgboost_no_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/6dffe5f274e3491d9e16f210ca2311f2
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3





--- Optimizing lightgbm_no_PCA_optuna ---




DONE: lightgbm_no_PCA_optuna | Best CV F1: 0.6218 | Test F1: 0.6217




üèÉ View run lightgbm_no_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/1281cc21e16e430c816acb7dea0be366
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3

--- Optimizing ridge_with_PCA_optuna ---




DONE: ridge_with_PCA_optuna | Best CV F1: 0.5911 | Test F1: 0.5753




üèÉ View run ridge_with_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/4ae520dce1db4f6c84b261c3d3362118
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3





--- Optimizing histgradientboosting_with_PCA_optuna ---
DONE: histgradientboosting_with_PCA_optuna | Best CV F1: 0.5768 | Test F1: 0.5858




üèÉ View run histgradientboosting_with_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/26244380151b4b99b886236baf5cd574
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3





--- Optimizing xgboost_with_PCA_optuna ---
DONE: xgboost_with_PCA_optuna | Best CV F1: 0.6206 | Test F1: 0.6167




üèÉ View run xgboost_with_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/a95c9cd11f21431a97f829b880a27163
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3





--- Optimizing lightgbm_with_PCA_optuna ---




DONE: lightgbm_with_PCA_optuna | Best CV F1: 0.6046 | Test F1: 0.6149




üèÉ View run lightgbm_with_PCA_optuna at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3/runs/4159eb1d25bb4408821111b85db36000
üß™ View experiment at: https://dagshub.com/jinnn11/telco-churn-project.mlflow/#/experiments/3

PROJECT GLOBAL BEST: xgboost_no_PCA_optuna
Final Test F1: 0.6265
‚úì Model saved to ../models/global_best_model_optuna.pkl


