In [44]:
# Import libraries
import sqlite3
import pandas as pd

from sklearn.model_selection import train_test_split

from src.preprocessor import build_preprocessor

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, classification_report, precision_recall_curve

import plotly.express as px

import joblib

### Read in the data from SQL view

In [45]:
conn = sqlite3.connect("../data/telco_churn.db")
df = pd.read_sql_query("SELECT * FROM vw_churn_training_dataset", conn)
conn.close()


### Data Preprocessing

In [46]:
TARGET = "churn_target"
DROP_COLS = ["customer_id", "snapshot_date"]

X = df.drop(columns=DROP_COLS + [TARGET])
y = df[TARGET].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

preprocessor = build_preprocessor(X_train)

X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)

print(X_train_transformed.shape)
print(X_test_transformed.shape)


(5634, 54)
(1409, 54)


#### Logistic Regression

In [31]:
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1
)

log_reg.fit(X_train_transformed, y_train)

y_pred_lr = log_reg.predict(X_test_transformed)
y_proba_lr = log_reg.predict_proba(X_test_transformed)[:, 1]

print("Logistic Regression AUC:", roc_auc_score(y_test, y_proba_lr))
print(classification_report(y_test, y_pred_lr))


'n_jobs' has no effect since 1.8 and will be removed in 1.10. You provided 'n_jobs=-1', please leave it unspecified.



Logistic Regression AUC: 0.8497016197783461
              precision    recall  f1-score   support

           0       0.90      0.74      0.81      1035
           1       0.52      0.79      0.62       374

    accuracy                           0.75      1409
   macro avg       0.71      0.76      0.72      1409
weighted avg       0.80      0.75      0.76      1409




lbfgs failed to converge after 1000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



#### RandomForest

In [47]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample"
)

rf.fit(X_train_transformed, y_train)

y_pred_rf = rf.predict(X_test_transformed)
y_proba_rf = rf.predict_proba(X_test_transformed)[:, 1]

print("Random Forest AUC:", roc_auc_score(y_test, y_proba_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest AUC: 0.8547379162468675
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1035
           1       0.54      0.79      0.64       374

    accuracy                           0.77      1409
   macro avg       0.73      0.77      0.74      1409
weighted avg       0.81      0.77      0.78      1409



#### XGBoost

In [33]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train_transformed, y_train)

y_pred_xgb = xgb.predict(X_test_transformed)
y_proba_xgb = xgb.predict_proba(X_test_transformed)[:, 1]

print("XGBoost AUC:", roc_auc_score(y_test, y_proba_xgb))
print(classification_report(y_test, y_pred_xgb))

XGBoost AUC: 0.8520357022914568
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.65      0.53      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409



In [48]:
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_rf)

pr_df = pd.DataFrame({
    "threshold": thresholds,
    "precision": precision[:-1],
    "recall": recall[:-1]
})

px.line(
    pr_df,
    x="threshold",
    y=["precision", "recall"],
    title="Precision–Recall vs Threshold (Random Forest)"
)

In [49]:
pr_df.sort_values(
    by="recall",
    ascending=False
).head()

Unnamed: 0,threshold,precision,recall
0,0.003459,0.265436,1.0
1,0.003948,0.265625,1.0
2,0.004368,0.265814,1.0
3,0.004431,0.266003,1.0
4,0.004708,0.266192,1.0


The retention policy says 

        We want to catch at least 80% of churners, but avoid excessive false positives.

In [50]:
pr_df.sort_values(
    by="threshold",
    ascending=False
).head()

Unnamed: 0,threshold,precision,recall
1407,0.947589,1.0,0.002674
1406,0.946927,1.0,0.005348
1405,0.939302,1.0,0.008021
1404,0.938074,1.0,0.010695
1403,0.928104,0.8,0.010695


In [51]:
# pr_df already has: threshold, precision, recall
def best_threshold_by_recall_constraint(pr_df, min_recall=0.80):
    candidates = pr_df[pr_df["recall"] >= min_recall].copy()
    candidates = candidates.sort_values(["precision", "recall"], ascending=[False, False])
    return candidates.head(10)

In [52]:
best_threshold_by_recall_constraint(pr_df, min_recall=0.80)

Unnamed: 0,threshold,precision,recall
844,0.484533,0.530973,0.802139
843,0.480916,0.530035,0.802139
842,0.480786,0.529101,0.802139
838,0.479091,0.528897,0.807487
841,0.480327,0.528169,0.802139
839,0.479233,0.52807,0.804813
837,0.47802,0.527972,0.807487
840,0.479646,0.527241,0.802139
836,0.477783,0.527051,0.807487
835,0.477039,0.526132,0.807487


In [53]:
best_threshold_by_recall_constraint(pr_df, min_recall=0.85)


Unnamed: 0,threshold,precision,recall
792,0.444218,0.515397,0.850267
791,0.443998,0.514563,0.850267
790,0.443773,0.513732,0.850267
789,0.441856,0.512903,0.850267
788,0.441637,0.512077,0.850267
787,0.441241,0.511254,0.850267
786,0.441228,0.510433,0.850267
785,0.440932,0.509615,0.850267
784,0.440536,0.5088,0.850267
783,0.438121,0.507987,0.850267


Selecting a threshold around 0.48 to maintain ~80% recall while maximizing precision under that constraint.

In [54]:
DEFAULT_THRESHOLD = 0.48
AGGRESSIVE_THRESHOLD = 0.28

def predict_churn(proba, mode="default"):
    if mode == "aggressive":
        return (proba >= AGGRESSIVE_THRESHOLD).astype(int)
    return (proba >= DEFAULT_THRESHOLD).astype(int)


In [55]:
joblib.dump(preprocessor, "../artifacts/preprocessor.joblib")
joblib.dump(rf, "../artifacts/churn_model_rf.joblib")

['../artifacts/churn_model_rf.joblib']