In [None]:
import pandas as pd

# Load the processed churn dataset.
df = pd.read_csv("../data/processed/churn_fe_data.csv")

# The feature list lives in the "feature" column of its own CSV.
feature_table = pd.read_csv("../data/processed/selected_features.csv")
selected_features = feature_table["feature"].tolist()
selected_features

In [None]:
# Selected feature names that have no matching column in df;
# an empty set means every selected feature is available.
missing = set(selected_features).difference(df.columns)
missing

In [None]:
# Target: the "Churn" column.
y = df["Churn"]
# Predictors: only the columns named in selected_features.
X = df[selected_features]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing with a fixed seed. Stratifying on y
# keeps the class proportions of the target identical in the train and test
# partitions, so the reported test metrics are not skewed by the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, so nothing from
# the test rows influences the transformation.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score


In [None]:
# Baseline logistic regression: search over the inverse regularisation
# strength C with an L2 penalty, selecting by 5-fold cross-validated ROC-AUC.
lr_params = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['lbfgs'],
)
lr = LogisticRegression(max_iter=1000)

lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, scoring='roc_auc', cv=5, n_jobs=-1)
lr_grid.fit(X_train, y_train)

# Second predict_proba column = score for the second class (assumed churn = 1).
lr_test_scores = lr_grid.predict_proba(X_test)[:, 1]

print("Logistic Regression Best Params:", lr_grid.best_params_)
print("Test AUC:", roc_auc_score(y_test, lr_test_scores))


In [None]:
# Decision tree: search over depth and the split/leaf size limits,
# selecting by 5-fold cross-validated ROC-AUC.
dt_params = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)
dt = DecisionTreeClassifier(random_state=42)

dt_grid = GridSearchCV(estimator=dt, param_grid=dt_params, scoring='roc_auc', cv=5, n_jobs=-1)
dt_grid.fit(X_train, y_train)

# Second predict_proba column = score for the second class (assumed churn = 1).
dt_test_scores = dt_grid.predict_proba(X_test)[:, 1]

print("Decision Tree Best Params:", dt_grid.best_params_)
print("Test AUC:", roc_auc_score(y_test, dt_test_scores))


In [None]:
# Random forest: search over forest size, depth and the split/leaf size
# limits, selecting by 5-fold cross-validated ROC-AUC.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf = RandomForestClassifier(random_state=42)

rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='roc_auc', cv=5, n_jobs=-1)
rf_grid.fit(X_train, y_train)

# Second predict_proba column = score for the second class (assumed churn = 1).
rf_test_scores = rf_grid.predict_proba(X_test)[:, 1]

print("Random Forest Best Params:", rf_grid.best_params_)
print("Test AUC:", roc_auc_score(y_test, rf_test_scores))


In [None]:
# Gradient boosting: search over the number of stages, learning rate and tree
# depth, selecting by 5-fold cross-validated ROC-AUC.
gb_params = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
gb = GradientBoostingClassifier(random_state=42)

gb_grid = GridSearchCV(estimator=gb, param_grid=gb_params, scoring='roc_auc', cv=5, n_jobs=-1)
gb_grid.fit(X_train, y_train)

# Second predict_proba column = score for the second class (assumed churn = 1).
gb_test_scores = gb_grid.predict_proba(X_test)[:, 1]

print("Gradient Boosting Best Params:", gb_grid.best_params_)
print("Test AUC:", roc_auc_score(y_test, gb_test_scores))


In [None]:
# Side-by-side test ROC-AUC for the four tuned models, in the order they
# were fitted above.
fitted_searches = {
    "Logistic Regression": lr_grid,
    "Decision Tree": dt_grid,
    "Random Forest": rf_grid,
    "Gradient Boosting": gb_grid,
}

results = {
    label: roc_auc_score(y_test, search.predict_proba(X_test)[:, 1])
    for label, search in fitted_searches.items()
}

for model, auc in results.items():
    print(model, "AUC:", round(auc, 3))


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Estimator for the final logistic-regression search. liblinear supports both
# the "l1" and "l2" penalties; class_weight="balanced" weights classes
# inversely to their frequency. random_state pins liblinear's data shuffling
# so reruns reproduce the same fit, using the same seed (42) as the other
# seeded models in this notebook.
lr = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
)

In [None]:
# Search space: penalty type crossed with the inverse regularisation
# strength C (smaller C means stronger regularisation).
parameters = dict(
    penalty=["l1", "l2"],
    C=[0.01, 0.1, 1, 10],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over `parameters`, selecting on ROC-AUC.
#
# The search is fit on the *scaled* training matrix: every consumer of `grid`
# in this notebook (the test-set evaluation, the new-customer prediction, and
# the scaler that is serialized next to the model) feeds it StandardScaler
# output, so fitting on the raw X_train would make those predictions invalid.
grid = GridSearchCV(
    estimator=lr,
    param_grid=parameters,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)

grid.fit(X_train_scaled, y_train)


In [None]:
# Winning hyper-parameters and the cross-validated score they achieved.
for label, value in (
    ("Best Parameters", grid.best_params_),
    ("Best Score", grid.best_score_),
):
    print(label, value)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Evaluate the refit best estimator on the held-out, scaled test rows.
best_lr = grid.best_estimator_

y_prob = best_lr.predict_proba(X_test_scaled)[:, 1]  # score for the second class
y_pred = best_lr.predict(X_test_scaled)              # hard class labels

report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print(report)
print("ROC-AUC:", test_auc)

## New Data Prediction

In [None]:
# One hand-written customer record used to demonstrate inference.
# It is later turned into a one-row DataFrame and re-indexed by
# selected_features, so its keys need to cover every name in that list.
# NOTE(review): values look like already-encoded model inputs (0/1 flags plus
# raw tenure / MonthlyCharges) — confirm they follow the training encoding.
new_customer = {
    'early_customer': 1,
    'SeniorCitizen': 1,
    'TechSupport_Yes': 0,
    'DeviceProtection_Yes': 0,
    'StreamingTV_Yes': 0,
    'StreamingMovies_Yes': 0,
    'tenure': 1,
    'contract_tenure_risk': 1,
    'MonthlyCharges': 99.1,
    'Dependents': 0,
    'OnlineBackup_Yes': 0,
    'PaymentMethod_Credit card (automatic)': 0,
    'InternetService_No': 0,
    'PaperlessBilling': 1,
    'contract_risk': 1,
    'OnlineSecurity_Yes': 0,
    'Partner': 0,
    'MultipleLines_Yes': 0,
    'InternetService_Fiber optic': 1,
    'PhoneService': 1,
    'PaymentMethod_Electronic check': 1,
    'service_count': 1,
    'PaymentMethod_Mailed check': 0
}

In [None]:
import pandas as pd

# One-row frame with its columns ordered exactly as selected_features;
# indexing raises KeyError if the record lacks any selected feature.
new_df = pd.DataFrame([new_customer])[selected_features]

# Apply the scaler fitted on the training data, then re-wrap the result in a
# DataFrame so the column labels are preserved.
new_df_scaled = pd.DataFrame(
    scaler.transform(new_df),
    columns=selected_features,
)

prediction = grid.predict(new_df_scaled)                # array holding one label
probability = grid.predict_proba(new_df_scaled)[0][1]   # second-class score for that row

print("Prediction:", prediction[0])
print("Churn Probability:", round(probability, 3))


In [None]:
# predict() returns an array; take its single element so the branch below
# compares a scalar rather than relying on a one-element array's truthiness.
churn_pred = prediction[0]
churn_prob = probability  # probability assigned to the churn class

if churn_pred == 1:
    print(f"⚠️ Customer is likely to CHURN (probability = {churn_prob:.2f})")
else:
    # Report the probability of the predicted outcome (staying), i.e. the
    # complement of the churn probability, so the number matches the message.
    print(f"✅ Customer is likely to STAY (probability = {1 - churn_prob:.2f})")


## Serialization

In [None]:
import joblib

# Persist the three objects inference needs: the fitted grid search, the
# fitted scaler, and the ordered feature list.
joblib.dump(value=grid, filename="churn_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=selected_features, filename="final_features.pkl")
