In [16]:
import warnings

import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import(
    classification_report, accuracy_score, roc_auc_score
)


In [17]:
# Install an "ignore" filter with no category/module restriction, so every
# Python warning is silenced from this point on.
# NOTE(review): this blanket filter also hides any warnings emitted by the
# libraries used below -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [18]:
def load_and_explore_dataset(filepath):
    """Read the churn CSV and print an exploratory summary of it.

    The summary covers the shape, per-column missing-value counts, the first
    five rows, descriptive statistics, the ``DataFrame.info()`` report and the
    absolute / percentage distribution of the ``Exited`` column.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        pandas.DataFrame: The loaded data, unmodified.

    Raises:
        KeyError: If the file has no ``Exited`` column.
    """
    print("=" * 60)
    print("LOAD AND  EXPLORE DATASET")
    print("=" * 60)

    df = pd.read_csv(filepath)
    print("Shape of the dataset")
    print(df.shape)
    print("\nCheck for missing values")
    print(df.isnull().sum())
    print("\nFirst five rows:")
    print(df.head())
    print("\nDescriptive stats")
    print(df.describe())
    print("\nDataset Info")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("\nExited Distribution:")
    print(df["Exited"].value_counts())
    print("\nExited percentage Distribution:")
    print(df["Exited"].value_counts(normalize=True) * 100)

    return df


In [19]:
def identify_features(df):
    """Return the hard-coded column groups used by the churn model.

    Args:
        df: Accepted for interface symmetry with the other steps; it is not
            read here -- the column lists are fixed in this function.

    Returns:
        tuple: ``(numerical_features, categorical_features, feature_to_drop)``,
        each a list of column names.
    """
    banner = "=" * 60
    print(banner)
    print("IDENTIFY FEATURE DATASET")
    print(banner)

    # Columns excluded from modelling.
    feature_to_drop = ["CustomerId", "Surname"]

    numerical_features = [
        "CreditScore", "Age", "Tenure", "EstimatedSalary",
        "Balance", "NumOfProducts", "ServiceRating",
    ]

    categorical_features = ["Geography", "Gender", "HasCrCard", "IsActiveMember"]

    # Report each group with its label.
    report = (
        ("\nFeatures to drop :", feature_to_drop),
        ("\nNumerical features :", numerical_features),
        ("Categirical features :", categorical_features),
    )
    for label, columns in report:
        print(label, columns)

    return numerical_features, categorical_features, feature_to_drop

In [20]:
def prepare_data(df, numerical_features, categorical_features, feature_to_drop):
    """Split the frame into a feature matrix and the ``Exited`` target.

    Args:
        df: Source DataFrame; it is not modified.
        numerical_features: Not used by this function.
        categorical_features: Not used by this function.
        feature_to_drop: Column names removed before the split.

    Returns:
        tuple: ``(X, y, feature_columns)`` where ``X`` holds every remaining
        column except ``Exited``, ``y`` is the ``Exited`` column and
        ``feature_columns`` is the list of ``X``'s column names.
    """
    rule = "=" * 60
    print(rule, "PREPARE DATASET", rule, sep="\n")

    target_name = "Exited"

    # Work on a copy without the unwanted columns, then separate the target.
    trimmed = df.drop(columns=feature_to_drop)
    y = trimmed[target_name]
    X = trimmed.drop(columns=[target_name])
    feature_columns = X.columns.tolist()

    print("\nFeature shape:", X.shape)
    print("\nTarget shape :", y.shape)
    print("\nFeature columns list:", feature_columns)

    return X, y, feature_columns

In [21]:
def create_preprossesing_pipeline(numerical_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing transformer.

    Numerical columns go through a ``StandardScaler``; categorical columns go
    through a dense ``OneHotEncoder`` configured with ``handle_unknown="ignore"``.

    Args:
        numerical_features: Column names routed to the scaler.
        categorical_features: Column names routed to the one-hot encoder.

    Returns:
        ColumnTransformer: Transformer with a ``"num"`` and a ``"cat"`` branch.
    """
    divider = "=" * 60
    print(divider, "CREATING PREPROCESSNG PIPELINE", divider, sep="\n")

    # Step names ("scaler", "onehot") and branch names ("num", "cat") are
    # looked up by name elsewhere, so they are part of the contract.
    scaling_steps = [("scaler", StandardScaler())]
    encoding_steps = [
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=scaling_steps), numerical_features),
            ("cat", Pipeline(steps=encoding_steps), categorical_features),
        ]
    )

In [22]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions, stratified on ``y``.

    Args:
        X: Feature matrix.
        y: Target values; also passed as the ``stratify`` argument.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 42).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("SPLITING DATA INTO TEST/TRAIN")
    print(rule)

    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = partitions

    # Report partition sizes and the class balance of each side.
    print("\nTraining set size", X_train.shape[0])
    print("Testing set size", X_test.shape[0])
    print("Train churn distribution", y_train.value_counts())
    print("Test churn distribution", y_test.value_counts())

    return X_train, X_test, y_train, y_test

In [23]:
def create_model_pipeline(preprocessor):
    """Chain the preprocessor with a logistic-regression classifier.

    Args:
        preprocessor: Transformer placed in the ``"preprocessor"`` step.

    Returns:
        Pipeline: Unfitted pipeline with steps ``"preprocessor"`` and
        ``"classifier"``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("CREATING MODEL PIPELINE")
    print(rule)

    classifier = LogisticRegression(
        solver="lbfgs",
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    )
    model_pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", classifier),
        ]
    )

    print("\nModel pipeline created successfully")

    return model_pipeline

In [24]:
def train_model(model_pipeline, X_train, y_train):
    """Fit the pipeline and print its ten largest-magnitude coefficients.

    The coefficient report is best-effort: any exception raised while
    assembling it is caught and reported with a message, and the fitted
    pipeline is still returned.

    Args:
        model_pipeline: Pipeline exposing ``fit`` and ``named_steps`` with a
            ``"preprocessor"`` and a ``"classifier"`` entry.
        X_train: Training features.
        y_train: Training target.

    Returns:
        The same ``model_pipeline`` object, after ``fit`` has been called.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("TRAINING MODEL")
    print(rule)

    model_pipeline.fit(X_train, y_train)

    print("\nModel train successfully")

    try:
        steps = model_pipeline.named_steps
        column_transformer = steps["preprocessor"]

        # Column lists are read positionally: entry 0 is taken as the
        # numerical branch and entry 1 as the categorical branch.
        fitted_branches = column_transformer.transformers_
        numeric_columns = fitted_branches[0][2]
        categorical_columns = fitted_branches[1][2]

        # Expand the categorical columns into their one-hot output names.
        encoder = column_transformer.named_transformers_["cat"].named_steps["onehot"]
        encoded_names = encoder.get_feature_names_out(categorical_columns)

        labels = [*numeric_columns, *encoded_names]
        weights = steps["classifier"].coef_[0]

        print("\nTop 10 most important features (by coefficient magnitude)")
        ranking = pd.DataFrame({"feature": labels, "coefficient": weights})
        ranking = ranking.sort_values("coefficient", key=abs, ascending=False)
        print(ranking.head(10).to_string(index=False))
    except Exception as e:
        print(f"Can not extract  features names {e}")

    return model_pipeline

In [25]:
def evaluate_model(model_pipeline, X_train, X_test, y_train, y_test):
    """Score a fitted pipeline on both splits, print the results and return them.

    Prints accuracy, ROC AUC and a classification report for the train and
    test splits, then 5-fold cross-validated ROC AUC computed on the training
    split only.

    Args:
        model_pipeline: Fitted estimator exposing ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        dict: Keys ``y_train_acc_score``, ``y_test_acc_score``,
        ``y_train_roc_score``, ``y_test_roc_score``, ``y_train_pred``,
        ``y_test_pred``, ``y_train_proba``, ``y_test_proba`` and ``cv_scores``.
    """
    print("\n" + "=" * 60)
    print("EVALUATING MODEL")
    print("=" * 60)

    # Hard class predictions.
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # Probability of the second class column.
    # NOTE(review): assumes column 1 of predict_proba is the churn class (label 1) -- confirm.
    y_train_proba = model_pipeline.predict_proba(X_train)[:, 1]
    y_test_proba = model_pipeline.predict_proba(X_test)[:, 1]

    # Metrics.
    y_train_acc_score = accuracy_score(y_train, y_train_pred)
    y_test_acc_score = accuracy_score(y_test, y_test_pred)

    y_train_roc_score = roc_auc_score(y_train, y_train_proba)
    y_test_roc_score = roc_auc_score(y_test, y_test_proba)

    print("\nMETRICS EVALUATION")
    print(f"\nTrain accuracy score:, {float(y_train_acc_score):.4f}")
    print(f"\nTest accuracy score:, {float(y_test_acc_score):.4f}")
    print(f"\nTrain ROC score:, {float(y_train_roc_score):.4f}")
    print(f"\nTest ROC score:, {float(y_test_roc_score):.4f}")

    # NOTE(review): target_names assumes the sorted labels are 0 -> Retained, 1 -> Churned.
    print("\nCLASSIFICATION OF TRAIN SIZE")
    print("\nclassificatin report\n", classification_report(y_train, y_train_pred, target_names=["Retained", "Churned"]))

    # This report is for the test split; the header previously repeated "TRAIN".
    print("\nCLASSIFICATION OF TEST SIZE")
    print("\nclassificatin report\n", classification_report(y_test, y_test_pred, target_names=["Retained", "Churned"]))

    # Cross validation on the training split only, scored with ROC AUC
    # (so the values are labelled as ROC AUC, not R2).
    cv_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring="roc_auc")
    print("\nCross Validation (5-folds)")
    print(f" ROC AUC scores: {cv_scores}")
    print(f" Cross validation mean: {cv_scores.mean():.4f}")
    print(f" Cross validation STD: {cv_scores.std():.4f}")

    metrics = {
        "y_train_acc_score": y_train_acc_score,
        "y_test_acc_score": y_test_acc_score,
        "y_train_roc_score": y_train_roc_score,
        "y_test_roc_score": y_test_roc_score,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        "y_train_proba": y_train_proba,
        "y_test_proba": y_test_proba,
        "cv_scores": cv_scores,
    }

    # The dictionary used to be built and then discarded, leaving callers with None.
    return metrics

In [26]:
def save_model_artifact(model_pipeline, feature_columns):
    """Persist the pipeline and its feature-column list with ``joblib.dump``.

    Both files are written to fixed names relative to the current working
    directory: ``churned_model_prediction.pkl`` and
    ``churn_feature_columns.pkl``.

    Args:
        model_pipeline: Object dumped to ``churned_model_prediction.pkl``.
        feature_columns: Object dumped to ``churn_feature_columns.pkl``.

    Returns:
        None
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("SAVING MODEL")
    print(rule)

    # (object, destination file, confirmation message), written in this order.
    artifacts = (
        (
            model_pipeline,
            "churned_model_prediction.pkl",
            "Churned prediction model saved successfully",
        ),
        (
            feature_columns,
            "churn_feature_columns.pkl",
            "Feature columns saved successfully",
        ),
    )
    for payload, filename, confirmation in artifacts:
        joblib.dump(payload, filename)
        print(confirmation)

    print(f"\n{rule}")
    print("ALL MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(rule)

In [27]:
def predict_churn(customer_data):
    """Predict churn for one customer using the artifacts saved on disk.

    Loads ``churned_model_prediction.pkl`` and ``churn_feature_columns.pkl``
    from the current working directory, builds a one-row DataFrame from
    ``customer_data`` and runs the pipeline on it.

    Args:
        customer_data: Mapping of feature name to value for a single customer.
            It must contain every saved feature column; extra keys are ignored.

    Returns:
        dict: ``prediction`` ("Churned" / "Retained"), ``churn_probability``,
        ``retained_probability`` and ``risk_level`` ("High" when the churn
        probability is above 0.7, "Medium" when above 0.4, otherwise "Low").

    Raises:
        ValueError: If a required feature is missing from ``customer_data``.
    """
    # NOTE(security): joblib.load unpickles the files, which can execute
    # arbitrary code -- only load artifacts that come from a trusted source.
    model_pipeline = joblib.load("churned_model_prediction.pkl")
    feature_columns = joblib.load("churn_feature_columns.pkl")

    customer_df = pd.DataFrame([customer_data])

    for feature in feature_columns:
        if feature not in customer_df.columns:
            raise ValueError(f"Missing required feature {feature}")

    # Keep only the saved feature columns, in the saved order.
    customer_df = customer_df[feature_columns]

    prediction = model_pipeline.predict(customer_df)[0]
    probability = model_pipeline.predict_proba(customer_df)[0]

    # NOTE(review): assumes predict_proba columns are ordered
    # [retained (0), churned (1)] -- confirm against the classifier's classes_.
    retained_probability = probability[0]
    churn_probability = probability[1]

    # Every tier is decided by the churn probability. The "Medium" test used
    # to read the retained probability, which labelled low-churn customers
    # "Medium" and customers just under the "High" cut-off "Low".
    if churn_probability > 0.7:
        risk_level = "High"
    elif churn_probability > 0.4:
        risk_level = "Medium"
    else:
        risk_level = "Low"

    result = {
        "prediction": "Churned" if prediction == 1 else "Retained",
        "churn_probability": churn_probability,
        "retained_probability": retained_probability,
        "risk_level": risk_level,
    }

    return result

In [28]:
def test_predict():
    """Run ``predict_churn`` on one hard-coded sample customer and print the result."""
    customer_data = dict(
        CreditScore=400,
        Geography="Germany",
        Gender="Female",
        Age=45,
        Tenure=2,
        Balance=100_000.0,
        NumOfProducts=1,
        HasCrCard="Yes",
        IsActiveMember="No",
        EstimatedSalary=80_000.0,
        ServiceRating=1,
    )

    result = predict_churn(customer_data)

    # (label, result key) pairs, printed in this order.
    report_lines = (
        ("Prediction :", "prediction"),
        ("Churned Probability :", "churn_probability"),
        ("Retained Probability :", "retained_probability"),
        ("Risk Level :", "risk_level"),
    )
    for label, key in report_lines:
        print(label, result[key])

In [29]:
def main():
    """Run the whole workflow: load, prepare, split, train, evaluate, save, predict.

    The steps run in a fixed order; each one prints its own banner and report.
    """
    csv_path = "cleaned_bank_cusomer_churn.csv"

    # Data loading and column bookkeeping.
    dataset = load_and_explore_dataset(csv_path)
    num_cols, cat_cols, drop_cols = identify_features(dataset)
    X, y, feature_columns = prepare_data(dataset, num_cols, cat_cols, drop_cols)

    # Preprocessing is defined before the split, the model pipeline after it.
    preprocessor = create_preprossesing_pipeline(num_cols, cat_cols)
    X_train, X_test, y_train, y_test = split_data(X, y)
    pipeline = create_model_pipeline(preprocessor)

    # Fit, then evaluate; the evaluation result is not used further here.
    pipeline = train_model(pipeline, X_train, y_train)
    evaluate_model(pipeline, X_train, X_test, y_train, y_test)

    # Persist the artifacts, then run the sample prediction that reloads them.
    save_model_artifact(pipeline, feature_columns)
    test_predict()


if __name__ == "__main__":
    main()

LOAD AND  EXPLORE DATASET
Shape of the dataset
(9997, 14)

Check for missing values
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
EstimatedSalary    0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
Exited             0
ServiceRating      0
dtype: int64

First five rows:
   CustomerId   Surname  CreditScore Geography  Gender   Age  Tenure  \
0    15634602  Hargrave          619    France  Female  42.0       2   
1    15647311      Hill          608     Spain  Female  41.0       1   
2    15619304      Onio          502    France  Female  42.0       8   
3    15701354      Boni          699    France  Female  39.0       1   
4    15737888  Mitchell          850     Spain  Female  43.0       2   

   EstimatedSalary    Balance  NumOfProducts HasCrCard IsActiveMember  Exited  \
0        101348.88       0.00              1       Yes            Yes       1