In [None]:
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Read the churn CSV into a DataFrame and print its column / dtype / non-null summary.
churn_csv = "cleaned_churn.csv"
df = pd.read_csv(churn_csv)
df.info()

In [None]:
# Input columns selected from the DataFrame, in the order they are passed on,
# followed by the name of the label column.
features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "tenure",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
    "num_services",
    "TotalCharges",
]
target = "Churn"

In [None]:
# Separate the selected input columns (X) from the label column (y).
X, y = df[features], df[target]

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Column groups routed to each preprocessing branch.
# ColumnTransformer uses remainder="drop" by default, so any column of
# `features` that is not listed in one of these groups is silently discarded
# before it reaches the model.

# Standardised columns. "num_services" was previously in `features` but in no
# group, so it never reached any model.
# assumes num_services is a numeric count — TODO confirm against the cleaning step
num_features=["tenure","MonthlyCharges","TotalCharges","SeniorCitizen","num_services"]

# Passed through without any encoding.
# assumes these columns are already numeric 0/1 in the CSV — TODO confirm
bin_features=["Partner","Dependents","PhoneService","PaperlessBilling"]

# One-hot encoded columns.
cat_features=["gender","MultipleLines","InternetService","OnlineSecurity","DeviceProtection",
              "OnlineBackup","TechSupport","StreamingTV","StreamingMovies","Contract","PaymentMethod"]

# Fail fast if a selected feature is not routed to any branch, instead of
# letting the transformer drop it without a trace.
_routed_columns=num_features+bin_features+cat_features
_unrouted_columns=[col for col in features if col not in _routed_columns]
assert not _unrouted_columns, f"features dropped by the preprocessor: {_unrouted_columns}"

preprocessor=ColumnTransformer(
    transformers=[
        ("num",StandardScaler(),num_features),
        ("bin","passthrough",bin_features),
        # drop="first" removes one redundant indicator per column.
        # NOTE(review): combined with handle_unknown="ignore", a category unseen
        # during fit is encoded as all zeros, i.e. the same as the dropped first
        # category — confirm that is acceptable for serving.
        ("cat",OneHotEncoder(drop="first",handle_unknown="ignore"),cat_features)
    ]
)

In [None]:
# Candidate classifiers, keyed by the short name later used in `results`.
# Every candidate handles class imbalance through its own weighting option.

# Class frequencies of the training labels, computed once.
# assumes Churn is encoded as integers 0 (negative) / 1 (positive) — TODO confirm
class_counts=y_train.value_counts()

models={
    "lr":LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ),
    "svc":SVC(
        # probability=True is needed because the evaluation loop calls predict_proba.
        probability=True,
        class_weight="balanced",
        # Probability estimates come from an internal cross-validation that
        # shuffles the data; seed it so scores are repeatable across runs,
        # like the other seeded candidates.
        random_state=42
    ),
    "rf":RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        random_state=42
    ),
    "xgb":XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        # Ratio of label-0 rows to label-1 rows in the training split.
        scale_pos_weight=(class_counts[0] / class_counts[1]),
        eval_metric="auc",
        random_state=42
    )
    }

In [None]:
# Fit every candidate on the training split, score it on the held-out split,
# and remember the fitted pipeline with the highest ROC-AUC.
#
# NOTE(review): the winner is chosen on the same test split that is used to
# report its score, so that score is optimistically biased. Consider selecting
# with cross-validation on the training split (StratifiedKFold / cross_val_score
# are already imported) and keeping the test split for the final report only.
results={}
best_roc_auc=-float("inf")
best_name=None
best_pipeline=None

print("\nTraining models...\n")

for name,model in models.items():
    print(f"Training {name}...")

    # Give each pipeline its own copy of the preprocessor. Pipeline.fit fits
    # its steps in place, so sharing one object would let every later fit
    # overwrite the transformer held inside an earlier (possibly best) pipeline.
    pipeline=Pipeline(steps=[
        ("preprocessor",clone(preprocessor)),
        ("model",model)
    ])

    pipeline.fit(X_train,y_train)

    # Column 1 of predict_proba is the probability of the positive class;
    # hard predictions use a fixed 0.5 threshold.
    prob=pipeline.predict_proba(X_test)[:,1]
    preds=(prob>=0.5).astype(int)

    roc=roc_auc_score(y_test,prob)
    f1=f1_score(y_test,preds)
    cr=classification_report(y_test,preds)
    cm=confusion_matrix(y_test,preds)

    # The text report was previously computed and thrown away; keep it with
    # the other metrics so it can be inspected per model.
    results[name]={
        "roc-auc":roc,
        "f1":f1,
        "confusion_matrix":cm,
        "classification_report":cr
    }


    print(f"ROC-AUC score: {roc} | f1-score: {f1:.4f}")

    if roc>best_roc_auc:
        best_roc_auc=roc
        best_name=name
        best_pipeline=pipeline


In [None]:
# Classifiers to be trained behind a SMOTE oversampling step, keyed by the
# name used in `results`. class_weight is left at its default here.
smote_models = dict(
    smote_lr=LogisticRegression(max_iter=1000),
)

In [None]:
# Fit the SMOTE variants: preprocess, oversample the minority class, then fit
# the classifier. The imblearn pipeline applies the sampler during fit only, so
# the held-out rows are scored without resampling.
print("\nTraining smote models...\n")

for name,model in smote_models.items():
    print(f"Training {name}...")

    # Use a private copy of the preprocessor. Fitting the shared object here
    # would refit, in place, the transformer that lives inside the pipeline
    # already stored in `best_pipeline`.
    pipeline=ImbPipeline(steps=[
        ("preprocessor",clone(preprocessor)),
        ("smote",SMOTE(random_state=42)),
        ("model",model)
    ])

    pipeline.fit(X_train,y_train)

    # Column 1 of predict_proba is the probability of the positive class;
    # hard predictions use a fixed 0.5 threshold.
    prob=pipeline.predict_proba(X_test)[:,1]
    preds=(prob>=0.5).astype(int)

    roc=roc_auc_score(y_test,prob)
    f1=f1_score(y_test,preds)
    cr=classification_report(y_test,preds)
    cm=confusion_matrix(y_test,preds)

    # Keep the text report with the other metrics instead of discarding it.
    results[name]={
        "roc-auc":roc,
        "f1":f1,
        "confusion_matrix":cm,
        "classification_report":cr
    }


    print(f"ROC-AUC score: {roc} | f1-score: {f1:.4f}")

    # Let the SMOTE variants compete with the candidates trained earlier.
    # Without this the "Best Model" printout and the saved pipeline ignore
    # them even though their scores land in the same `results` dict.
    if roc>best_roc_auc:
        best_roc_auc=roc
        best_name=name
        best_pipeline=pipeline




In [None]:

# Report the name of the pipeline that achieved the highest ROC-AUC.
best_model_banner = f"\nBest Model: {best_name}"
print(best_model_banner)

In [None]:
# Persist the selected fitted pipeline (preprocessing + classifier) to disk.
# joblib is imported with the other dependencies in the notebook's import cell.
if best_pipeline is None:
    # Avoid writing a pickle of None that would only fail later at load/predict time.
    raise RuntimeError("No model was selected; run the training cells before saving.")
joblib.dump(best_pipeline,"best_churn_pipeline.pkl")

In [None]:
# Sorted distinct values of every one-hot-encoded column, keyed by column name.
# Built from `cat_features` so this lookup cannot drift from the columns the
# encoder actually receives; it yields the same keys, in the same order, as the
# hand-written dict it replaces.
# dropna() keeps sorted() from raising TypeError if a column ever contains NaN
# alongside strings.
# presumably consumed by a downstream app to offer valid choices — verify against the consumer
categorical_values={
    col:sorted(df[col].dropna().unique())
    for col in cat_features
}

In [None]:
# Write the per-column category lookup to disk.
joblib.dump(value=categorical_values, filename="categorical_values.pkl")