In [26]:
import warnings

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [27]:
# Install a global "ignore" filter: no warning of any category is shown after
# this cell runs.
# NOTE(review): this is a blanket filter, so warnings that may signal a real
# problem are hidden too — consider narrowing it to a specific category/module.
warnings.filterwarnings("ignore")

In [28]:
def load_and_explore_dataset(filepath):
    """Read a CSV file into a DataFrame and print an exploratory summary.

    The printed summary covers, in order: the shape, the per-column count of
    missing values, the first five rows, descriptive statistics, the
    ``DataFrame.info()`` report, and the absolute and percentage distribution
    of the ``Exited`` column.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The loaded data, unmodified.

    Raises:
        KeyError: If the loaded data has no ``Exited`` column (raised after
            the earlier sections of the summary have been printed).
    """
    print("=" * 60)
    print("LOAD AND  EXPLORE DATASET")
    print("=" * 60)

    df = pd.read_csv(filepath)
    print("Shape of the dataset")
    print(df.shape)
    print("\nCheck for missing values")
    print(df.isnull().sum())
    print("\nFirst five rows:")
    print(df.head())
    print("\nDescriptive stats")
    print(df.describe())
    print("\nDataset Info")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print("\nExited Distribution:")
    print(df["Exited"].value_counts())
    print("\nExited percentage Distribution:")
    print(df["Exited"].value_counts(normalize=True) * 100)

    return df


In [29]:
def identify_features(df):
    """Declare which columns are dropped, numerical, or categorical.

    The three lists are fixed inside this function. ``df`` is used only to
    report declared columns that are absent from the data; it is never
    modified, and an object without a ``columns`` attribute (e.g. ``None``)
    simply skips that check.

    Args:
        df: The loaded dataset (anything exposing ``columns``), or ``None``.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(numerical_features,
        categorical_features, feature_to_drop)``.
    """
    print("=" * 60)
    print("IDENTIFY FEATURE DATASET")
    print("=" * 60)

    # Columns removed before modelling (flagged as not useful).
    feature_to_drop = [
        "CustomerId",
        "Surname"
    ]

    numerical_features = [
        "CreditScore",
        "Age",
        "Tenure",
        "EstimatedSalary",
        "Balance",
        "NumOfProducts",
        "ServiceRating"
    ]

    categorical_features = [
        "Geography",
        "Gender",
        "HasCrCard",
        "IsActiveMember"
    ]
    print("\nFeatures to drop :", feature_to_drop)
    print("\nNumerical features :", numerical_features)
    print("Categorical features :", categorical_features)

    # Surface a mismatch between the hard-coded lists and the actual data
    # here, instead of leaving it to fail later inside drop()/fit(). This is
    # deliberately a printed warning, not an exception, so existing callers
    # keep working.
    available = getattr(df, "columns", None)
    if available is not None:
        declared = feature_to_drop + numerical_features + categorical_features
        missing = [name for name in declared if name not in available]
        if missing:
            print("\nWARNING: declared columns missing from the dataset :", missing)

    return (numerical_features, categorical_features, feature_to_drop)

In [30]:
def prepare_data(df, numerical_features, categorical_features, feature_to_drop):
    """Separate the dataset into a feature frame and the ``Exited`` target.

    Args:
        df: Source DataFrame; it is not modified.
        numerical_features: Accepted for interface symmetry; not used here.
        categorical_features: Accepted for interface symmetry; not used here.
        feature_to_drop: Column names removed before the split.

    Returns:
        tuple: ``(X, y, columns)`` where ``X`` is every remaining column
        except ``Exited``, ``y`` is the ``Exited`` column, and ``columns`` is
        the list of column names of ``X``.
    """
    rule = "=" * 60
    print(rule)
    print("PREPARE DATASET")
    print(rule)

    # Work on a copy without the discarded columns.
    trimmed = df.drop(columns=feature_to_drop)

    # Everything except the target is a feature; the target is "Exited".
    X = trimmed.drop(columns="Exited")
    y = trimmed["Exited"]
    feature_names = list(X.columns)

    print("\nFeature shape:", X.shape)
    print("\nTarget shape :", y.shape)
    print("\nFeature columns list:", feature_names)

    return X, y, feature_names

In [31]:
def create_preprossesing_pipeline(numerical_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing transformer.

    Numerical columns go through a ``StandardScaler`` step named ``"scaler"``;
    categorical columns go through a dense ``OneHotEncoder`` step named
    ``"onehot"`` configured with ``handle_unknown="ignore"``. The two branches
    are registered in the ``ColumnTransformer`` as ``"num"`` and ``"cat"``.
    No ``remainder`` is passed, so the transformer's default applies.

    Args:
        numerical_features: Column names routed to the scaling branch.
        categorical_features: Column names routed to the encoding branch.

    Returns:
        ColumnTransformer: The assembled, not-yet-fitted preprocessor.
    """
    rule = "=" * 60
    print(rule)
    print("CREATING PREPROCESSNG PIPELINE")
    print(rule)

    # Branch for numerical columns.
    scale_numeric = Pipeline(steps=[("scaler", StandardScaler())])

    # Branch for categorical columns.
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    encode_categorical = Pipeline(steps=[("onehot", one_hot)])

    # Route each group of columns to its branch.
    return ColumnTransformer(transformers=[
        ("num", scale_numeric, numerical_features),
        ("cat", encode_categorical, categorical_features),
    ])

In [32]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into stratified train and test parts.

    Args:
        X: Feature data.
        y: Target values; also passed as ``stratify`` to ``train_test_split``.
        test_size: Forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Forwarded to ``train_test_split`` (default ``42``).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("SPLITING DATA INTO TEST/TRAIN")
    print(rule)

    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = parts

    # Report the split sizes and the target counts on each side.
    print("\nTraining set size", X_train.shape[0])
    print("Testing set size", X_test.shape[0])
    print("Train churn distribution", y_train.value_counts())
    print("Test churn distribution", y_test.value_counts())

    return X_train, X_test, y_train, y_test

In [33]:
def create_model_pipeline(preprocessor):
    """Chain the preprocessor with a logistic-regression classifier.

    The returned pipeline has two steps, ``"preprocessor"`` and
    ``"classifier"``. The classifier is a ``LogisticRegression`` using the
    ``lbfgs`` solver, ``class_weight="balanced"``, ``max_iter=1000`` and
    ``random_state=42``.

    Args:
        preprocessor: Transformer used as the first pipeline step.

    Returns:
        Pipeline: The assembled, not-yet-fitted model pipeline.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("CREATING MODEL PIPELINE")
    print(rule)

    classifier = LogisticRegression(
        solver="lbfgs",
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    )
    model_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ])

    print("\nModel pipeline created successfully")

    return model_pipeline

In [45]:
def train_model(model_pipeline, X_train, y_train):
    """Fit the pipeline and print its ten largest-magnitude coefficients.

    After fitting, the function tries to pair the classifier's first
    coefficient row with feature names (numerical names as declared, then the
    one-hot encoder's output names) and prints the top ten by absolute value.
    That report is best-effort: any exception raised while building it is
    caught and reported with a message instead of propagating.

    Args:
        model_pipeline: Pipeline exposing ``fit`` and ``named_steps`` with
            ``"preprocessor"`` and ``"classifier"`` entries.
        X_train: Training features passed to ``fit``.
        y_train: Training target passed to ``fit``.

    Returns:
        The same ``model_pipeline`` object, after ``fit`` has been called.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TRAINING MODEL")
    print(rule)

    model_pipeline.fit(X_train, y_train)

    print("\nModel train successfully")

    try:
        steps = model_pipeline.named_steps
        preprocessor = steps["preprocessor"]

        # Third element of each fitted transformer entry is its column list;
        # position 0 is the numerical branch, position 1 the categorical one.
        fitted = preprocessor.transformers_
        numeric_cols = fitted[0][2]
        categorical_cols = fitted[1][2]

        # Expanded column names produced by the one-hot encoder.
        encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        encoded_names = encoder.get_feature_names_out(categorical_cols)

        feature_names = [*numeric_cols, *encoded_names]

        # First row of the classifier's coefficient matrix.
        weights = steps["classifier"].coef_[0]
        print("\nTop 10 most important features (by coefficient magnitude)")
        ranking = pd.DataFrame({"feature": feature_names, "coefficient": weights})
        ranking = ranking.sort_values(
            "coefficient", key=lambda col: col.abs(), ascending=False
        )
        print(ranking.head(10).to_string(index=False))
    except Exception as e:
        print(f"Can not extract  features names {e}")

    return model_pipeline

In [46]:
def main(filepath="cleaned_bank_cusomer_churn.csv"):
    """Run the churn workflow end to end: load, prepare, split, and train.

    Args:
        filepath: CSV file to load. The default is the file name this
            function previously had hard-coded, so ``main()`` behaves as
            before.

    Returns:
        tuple: ``(model_pipeline, X_test, y_test)`` — the fitted pipeline and
        the held-out split, so the caller can evaluate the model. Callers
        that ignore the return value are unaffected.
    """
    df = load_and_explore_dataset(filepath)

    numerical_features, categorical_features, feature_to_drop = identify_features(df)

    # The third return value (the list of feature column names) is not
    # needed here.
    X, y, _ = prepare_data(df, numerical_features, categorical_features, feature_to_drop)

    preprocessor = create_preprossesing_pipeline(numerical_features, categorical_features)

    X_train, X_test, y_train, y_test = split_data(X, y)

    model_pipeline = create_model_pipeline(preprocessor)

    model_pipeline = train_model(model_pipeline, X_train, y_train)

    return model_pipeline, X_test, y_test

    

# Entry point: run the whole workflow when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

LOAD AND  EXPLORE DATASET
Shape of the dataset
(9997, 14)

Check for missing values
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
EstimatedSalary    0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
Exited             0
ServiceRating      0
dtype: int64

First five rows:
   CustomerId   Surname  CreditScore Geography  Gender   Age  Tenure  \
0    15634602  Hargrave          619    France  Female  42.0       2   
1    15647311      Hill          608     Spain  Female  41.0       1   
2    15619304      Onio          502    France  Female  42.0       8   
3    15701354      Boni          699    France  Female  39.0       1   
4    15737888  Mitchell          850     Spain  Female  43.0       2   

   EstimatedSalary    Balance  NumOfProducts HasCrCard IsActiveMember  Exited  \
0        101348.88       0.00              1       Yes            Yes       1