In [1]:
#!pip install --upgrade seaborn matplotlib
import pandas as pd
import os
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler


**Loading The Data**

In [2]:
current_dir = os.getcwd() 
relative_path = os.path.join('..', '..','data', 'train.csv')
train_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [3]:
relative_path = os.path.join('..', '..','data', 'test.csv')
test_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [4]:
x_train = train_data.drop(["Attrition"], axis = 1)
y_train = train_data["Attrition"]

x_test = test_data.drop(["Attrition"], axis = 1)
y_test = test_data["Attrition"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

In [6]:
# Calculate the ratio of negative samples to positive samples
ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

**Trying different Learning Rates (eta)**

In [11]:
learning_rates = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
for lr in learning_rates:
    params = {
        'eta': lr,                   # Learning rate (step size shrinkage)
        'max_depth': 6,               # Maximum depth of a tree
        'gamma': 0.1,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': 100,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': 1,                  # L2 regularization term on weights
        'alpha': 0,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For learning rate :", lr)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')


For learning rate : 0.01
Average Accuracy: 0.7659770114942529
Average F1 Score: 0.17444444444444446
Average Precision: 0.15178571428571427
Average Recall: 0.22333333333333333
Average Training Time (seconds): 0.3886226177215576
-----------------------------------------------------
For learning rate : 0.05
Average Accuracy: 0.8434482758620689
Average F1 Score: 0.21746031746031744
Average Precision: 0.22333333333333333
Average Recall: 0.21833333333333332
Average Training Time (seconds): 0.22903568744659425
-----------------------------------------------------
For learning rate : 0.1
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.19183757305145263
-----------------------------------------------------
For learning rate : 0.3
Average Accuracy: 0.8331034482758621
Average F1 Score: 0.19025252525252526
Average Precision: 0.19357142857142856
Average Recall: 0.198333333333333

**Trying Different Number Of Trees (num_boost_round)**

In [12]:
num_boost_round_list = [50, 100, 200, 300, 500, 800, 1000]
for num_boost_round in num_boost_round_list:
    params = {
        'eta': 0.1,                   # Learning rate (step size shrinkage)
        'max_depth': 6,               # Maximum depth of a tree
        'gamma': 0.1,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': num_boost_round,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': 1,                  # L2 regularization term on weights
        'alpha': 0,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For Number of trees :", num_boost_round)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')


For Number of trees : 50
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.6980184316635132
-----------------------------------------------------
For Number of trees : 100
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.24061594009399415
-----------------------------------------------------
For Number of trees : 200
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.30039396286010744
-----------------------------------------------------
For Number of trees : 300
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.34

**Trying Different Trees Depth (max_depth)**

In [13]:
max_depth_list = [2, 4, 6, 8, 10, 12, 14, 16]
for max_depth in max_depth_list:
    params = {
        'eta': 0.1,                   # Learning rate (step size shrinkage)
        'max_depth': max_depth,               # Maximum depth of a tree
        'gamma': 0.1,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': 100,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': 1,                  # L2 regularization term on weights
        'alpha': 0,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For max_depth:", max_depth)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')


For max_depth: 2
Average Accuracy: 0.7897701149425288
Average F1 Score: 0.21207070707070708
Average Precision: 0.2542857142857143
Average Recall: 0.22333333333333333
Average Training Time (seconds): 0.19703178405761718
-----------------------------------------------------
For max_depth: 4
Average Accuracy: 0.8267816091954023
Average F1 Score: 0.1773809523809524
Average Precision: 0.18666666666666665
Average Recall: 0.17333333333333334
Average Training Time (seconds): 0.31042745113372805
-----------------------------------------------------
For max_depth: 6
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.20820105075836182
-----------------------------------------------------
For max_depth: 8
Average Accuracy: 0.8502298850574712
Average F1 Score: 0.21904761904761907
Average Precision: 0.22666666666666666
Average Recall: 0.21833333333333332
Average Training Time (secon

**Trying Different combinations of max_depth & num_boost_round**

In [7]:
num_boost_round_list = [50, 100, 200, 300, 500, 800, 1000]
max_depth_list = [2, 4, 6, 8, 10, 12, 14, 16]

max_f1_score = 0
best_accuracy = 0
best_max_depth = None
best_num_boost_round = None

for num_boost_round in num_boost_round_list:
    for max_depth in max_depth_list:
        params = {
            'eta': 0.1,                   # Learning rate (step size shrinkage)
            'max_depth': max_depth,               # Maximum depth of a tree
            'gamma': 0.1,                    # Minimum loss reduction required to make a further partition on a leaf node
            'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
            'num_boost_round': num_boost_round,       # Number of boosting rounds (trees) to run
            'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
            'lambda': 1,                  # L2 regularization term on weights
            'alpha': 0,                   # L1 regularization term on weights
            
            'eval_metric': 'error',           # Evaluation metric used during training
            'booster': 'gbtree',          # Type of boosting model

            'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
            'objective': 'binary:logistic',  # Learning task and corresponding objective function
            'verbosity': 0,               # Verbosity of output messages
        }


        # Initialize CatBoost classifier
        xgb_model = xgb.XGBClassifier(**params)

        # Train the model using KFold cross-validation
        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []
        training_times = []

        for train_index, val_index in k_fold.split(x_train):
            X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
            
            # Record start time
            start_time = time.time()

            # Fit the model
            xgb_model.fit(X_train_fold, y_train_fold)

            # Record end time
            end_time = time.time()

            # Calculate training time
            training_time = end_time - start_time
            training_times.append(training_time)
            
            # Predict on validation set
            y_pred = xgb_model.predict(X_val_fold)
            
            # Calculate metrics
            accuracy = accuracy_score(y_val_fold, y_pred)
            precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
            recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
            f1 = f1_score(y_val_fold, y_pred)
            
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1)


        # Calculate and print average metrics
        avg_accuracy = sum(accuracies) / len(accuracies)
        avg_precision = sum(precisions) / len(precisions)
        avg_recall = sum(recalls) / len(recalls)
        avg_f1_score = sum(f1_scores) / len(f1_scores)

        if avg_f1_score > max_f1_score:
            max_f1_score = avg_f1_score
            best_accuracy = avg_accuracy
            best_num_boost_round = num_boost_round
            best_max_depth = max_depth

# Print the results for the best F1 score
print("Best F1 Score:", max_f1_score)
print("Corresponding Accuracy:", best_accuracy)
print("Corresponding num_boost_round:", num_boost_round)
print("Corresponding max_depth:", best_max_depth)

Best F1 Score: 0.23621933621933625
Corresponding Accuracy: 0.8504597701149426
Corresponding num_boost_round: 1000
Corresponding max_depth: 6


**Trying Different gamma**

In [8]:
gamma_list = [0.001, 0.01, 0.1, 1, 5, 10, 0]
for gamma in gamma_list:
    params = {
        'eta': 0.1,                   # Learning rate (step size shrinkage)
        'max_depth': 6,               # Maximum depth of a tree
        'gamma': gamma,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': 100,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': 1,                  # L2 regularization term on weights
        'alpha': 0,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For gamma:", gamma)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')

For gamma: 0.001
Average Accuracy: 0.8470114942528737
Average F1 Score: 0.23185425685425684
Average Precision: 0.22833333333333333
Average Recall: 0.23833333333333334
Average Training Time (seconds): 0.36702957153320315
-----------------------------------------------------
For gamma: 0.01
Average Accuracy: 0.8470114942528735
Average F1 Score: 0.2219047619047619
Average Precision: 0.22666666666666666
Average Recall: 0.21833333333333332
Average Training Time (seconds): 0.39857404232025145
-----------------------------------------------------
For gamma: 0.1
Average Accuracy: 0.8504597701149426
Average F1 Score: 0.23621933621933625
Average Precision: 0.24
Average Recall: 0.23833333333333334
Average Training Time (seconds): 1.7477010488510132
-----------------------------------------------------
For gamma: 1
Average Accuracy: 0.8299999999999998
Average F1 Score: 0.21833333333333332
Average Precision: 0.20166666666666666
Average Recall: 0.24333333333333332
Average Training Time (seconds): 0.

**Trying Different values for lambda regularization parameter**

In [9]:
lambda_list = [0.001, 0.01, 0.1, 1, 10, 100]
for lambda_l2 in lambda_list:
    params = {
        'eta': 0.1,                   # Learning rate (step size shrinkage)
        'max_depth': 6,               # Maximum depth of a tree
        'gamma': 5,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': 100,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': lambda_l2,                  # L2 regularization term on weights
        'alpha': 0,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For lambda:", lambda_l2)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')

For lambda: 0.001
Average Accuracy: 0.7788505747126437
Average F1 Score: 0.1808080808080808
Average Precision: 0.15583333333333332
Average Recall: 0.22333333333333333
Average Training Time (seconds): 0.375748348236084
-----------------------------------------------------
For lambda: 0.01
Average Accuracy: 0.772183908045977
Average F1 Score: 0.1758080808080808
Average Precision: 0.15011904761904762
Average Recall: 0.22333333333333333
Average Training Time (seconds): 0.22609076499938965
-----------------------------------------------------
For lambda: 0.1
Average Accuracy: 0.7857471264367817
Average F1 Score: 0.19166666666666668
Average Precision: 0.16202380952380951
Average Recall: 0.24333333333333332
Average Training Time (seconds): 0.35608491897583006
-----------------------------------------------------
For lambda: 1
Average Accuracy: 0.782528735632184
Average F1 Score: 0.2363869463869464
Average Precision: 0.18845238095238095
Average Recall: 0.32166666666666666
Average Training Time

**Trying Different values for alpha regularization parameter**

In [10]:
alpha_list = [0, 0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    params = {
        'eta': 0.1,                   # Learning rate (step size shrinkage)
        'max_depth': 6,               # Maximum depth of a tree
        'gamma': 5,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': 100,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': 100,                  # L2 regularization term on weights
        'alpha': alpha,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For alpha:", alpha)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')

For alpha: 0
Average Accuracy: 0.6435632183908047
Average F1 Score: 0.26531215860163226
Average Precision: 0.17679875679875678
Average Recall: 0.6016666666666666
Average Training Time (seconds): 0.2508460760116577
-----------------------------------------------------
For alpha: 0.001
Average Accuracy: 0.6435632183908047
Average F1 Score: 0.26531215860163226
Average Precision: 0.17679875679875678
Average Recall: 0.6016666666666666
Average Training Time (seconds): 0.20649917125701905
-----------------------------------------------------
For alpha: 0.01
Average Accuracy: 0.6435632183908047
Average F1 Score: 0.26531215860163226
Average Precision: 0.17679875679875678
Average Recall: 0.6016666666666666
Average Training Time (seconds): 0.20129239559173584
-----------------------------------------------------
For alpha: 0.1
Average Accuracy: 0.6401149425287356
Average F1 Score: 0.2620928778823516
Average Precision: 0.17432400932400932
Average Recall: 0.6016666666666666
Average Training Time (s

**Trying Different values for min_child_weight**

In [13]:
min_child_weight_list = [0.1, 0.5, 1, 5, 10, 20, 40, 60, 80, 100]
for min_child_weight in min_child_weight_list:
    params = {
        'eta': 0.1,                   # Learning rate (step size shrinkage)
        'max_depth': 6,               # Maximum depth of a tree
        'gamma': 5,                    # Minimum loss reduction required to make a further partition on a leaf node
        'min_child_weight': min_child_weight,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
        'num_boost_round': 100,       # Number of boosting rounds (trees) to run
        'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
        'lambda': 100,                  # L2 regularization term on weights
        'alpha': 0,                   # L1 regularization term on weights
        
        'eval_metric': 'error',           # Evaluation metric used during training
        'booster': 'gbtree',          # Type of boosting model

        'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
        'objective': 'binary:logistic',  # Learning task and corresponding objective function
        'verbosity': 0,               # Verbosity of output messages
    }


    # Initialize CatBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Train the model using KFold cross-validation
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    training_times = []

    for train_index, val_index in k_fold.split(x_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Record start time
        start_time = time.time()

        # Fit the model
        xgb_model.fit(X_train_fold, y_train_fold)

        # Record end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time
        training_times.append(training_time)
        
        # Predict on validation set
        y_pred = xgb_model.predict(X_val_fold)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
        f1 = f1_score(y_val_fold, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)


    # Calculate and print average metrics
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    # Calculate average training time
    avg_training_time = sum(training_times) / len(training_times)

    print("For min_child_weight:", min_child_weight)
    print("Average Accuracy:", avg_accuracy)
    print("Average F1 Score:", avg_f1_score)
    print("Average Precision:", avg_precision)
    print("Average Recall:", avg_recall)
    print("Average Training Time (seconds):", avg_training_time)
    print('-----------------------------------------------------')

For min_child_weight: 0.1
Average Accuracy: 0.6435632183908047
Average F1 Score: 0.26531215860163226
Average Precision: 0.17679875679875678
Average Recall: 0.6016666666666666
Average Training Time (seconds): 0.2711900234222412
-----------------------------------------------------
For min_child_weight: 0.5
Average Accuracy: 0.6435632183908047
Average F1 Score: 0.26531215860163226
Average Precision: 0.17679875679875678
Average Recall: 0.6016666666666666
Average Training Time (seconds): 0.22164499759674072
-----------------------------------------------------
For min_child_weight: 1
Average Accuracy: 0.6435632183908047
Average F1 Score: 0.26531215860163226
Average Precision: 0.17679875679875678
Average Recall: 0.6016666666666666
Average Training Time (seconds): 0.2967196226119995
-----------------------------------------------------
For min_child_weight: 5
Average Accuracy: 0.6401149425287356
Average F1 Score: 0.26281215860163226
Average Precision: 0.17452602952602952
Average Recall: 0.60

**Trying Different Combinations Of Parameters**

In [8]:
learning_rates = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
num_boost_round_list = [50, 100, 200, 300, 500, 800, 1000]
max_depth_list = [2, 4, 6, 8, 10, 12, 14, 16]
gamma_list = [0.001, 0.01, 0.1, 1, 5, 10, 0]
lambda_list = [0.001, 0.01, 0.1, 1, 10, 100]
alpha_list = [0, 0.001, 0.01, 0.1, 1, 10, 100]
min_child_weight_list = [0.1, 0.5, 1, 5, 10, 20, 40, 60, 80, 100]
eval_metric_list = ['error', 'logloss', 'auc', 'aucpr']
objective_list = ['binary:logistic', 'binary:logitraw', 'binary:hinge']
booster_list = ['gbtree', 'gblinear', 'dart']

max_f1_score = 0
best_accuracy = 0
best_accuracy = None
best_num_boost_round = None
best_max_depth = None
best_lr = None
best_gamma = None
best_lambda = None
best_alpha = None
best_min_child_weight = None
best_eval_metric = None
best_objective = None
best_booster = None

for lr in learning_rates:
    for num_boost_round in num_boost_round_list:
        for max_depth in max_depth_list:
            for gamma in gamma_list:
                for lambda_l2 in lambda_list:
                    for alpha in alpha_list:
                        for min_child_weight in min_child_weight_list:
                            for eval_metric in eval_metric_list:
                                for objective in objective_list:
                                    for booster in booster_list:
                                        params = {
                                            'eta': lr,                   # Learning rate (step size shrinkage)
                                            'max_depth': max_depth,               # Maximum depth of a tree
                                            'gamma': gamma,                    # Minimum loss reduction required to make a further partition on a leaf node
                                            'min_child_weight': min_child_weight,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
                                            'num_boost_round': num_boost_round,       # Number of boosting rounds (trees) to run
                                            'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
                                            'lambda': lambda_l2,                  # L2 regularization term on weights
                                            'alpha': alpha,                   # L1 regularization term on weights
                                            
                                            'eval_metric': eval_metric,           # Evaluation metric used during training
                                            'booster': booster,          # Type of boosting model

                                            'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
                                            'objective': objective,  # Learning task and corresponding objective function
                                            'verbosity': 0,               # Verbosity of output messages
                                        }


                                        # Initialize CatBoost classifier
                                        xgb_model = xgb.XGBClassifier(**params)

                                        # Train the model using KFold cross-validation
                                        accuracies = []
                                        precisions = []
                                        recalls = []
                                        f1_scores = []
                                        training_times = []

                                        for train_index, val_index in k_fold.split(x_train):
                                            X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
                                            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                                            
                                            # Record start time
                                            start_time = time.time()

                                            # Fit the model
                                            xgb_model.fit(X_train_fold, y_train_fold)

                                            # Record end time
                                            end_time = time.time()

                                            # Calculate training time
                                            training_time = end_time - start_time
                                            training_times.append(training_time)
                                            
                                            # Predict on validation set
                                            y_pred = xgb_model.predict(X_val_fold)
                                            
                                            # Calculate metrics
                                            accuracy = accuracy_score(y_val_fold, y_pred)
                                            precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
                                            recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
                                            f1 = f1_score(y_val_fold, y_pred)
                                            
                                            accuracies.append(accuracy)
                                            precisions.append(precision)
                                            recalls.append(recall)
                                            f1_scores.append(f1)


                                        # Calculate and print average metrics
                                        avg_accuracy = sum(accuracies) / len(accuracies)
                                        avg_precision = sum(precisions) / len(precisions)
                                        avg_recall = sum(recalls) / len(recalls)
                                        avg_f1_score = sum(f1_scores) / len(f1_scores)

                                        if avg_f1_score > max_f1_score:
                                            max_f1_score = avg_f1_score
                                            best_accuracy = avg_accuracy
                                            best_num_boost_round = num_boost_round
                                            best_max_depth = max_depth
                                            best_lr = lr
                                            best_gamma = gamma
                                            best_lambda = lambda_l2
                                            best_alpha = alpha
                                            best_min_child_weight = min_child_weight
                                            best_eval_metric = eval_metric
                                            best_objective = objective
                                            best_booster = booster


# Print the results for the best F1 score
print("Best F1 Score:", max_f1_score)
print("Corresponding Accuracy:", best_accuracy)

print("Corresponding num_boost_round:", best_num_boost_round)
print("Corresponding max_depth:", best_max_depth)
print("Corresponding learning rate:", best_lr)
print("Corresponding gamma:", best_gamma)
print("Corresponding lambda:", best_lambda)
print("Corresponding alpha:", best_alpha)
print("Corresponding min_child_weight:", best_min_child_weight)
print("Corresponding eval_metric:", best_eval_metric)
print("Corresponding objective:", best_objective)
print("Corresponding booster:", best_booster)



**Evaluating the model on the train set using the best used parameters**

In [None]:
params = {
    'eta': best_lr,                   # Learning rate (step size shrinkage)
    'max_depth': best_max_depth,               # Maximum depth of a tree
    'gamma': best_gamma,                    # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': best_min_child_weight,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
    'num_boost_round': best_num_boost_round,       # Number of boosting rounds (trees) to run
    'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
    'lambda': best_lambda,                  # L2 regularization term on weights
    'alpha': best_alpha,                   # L1 regularization term on weights
    
    'eval_metric': best_eval_metric,           # Evaluation metric used during training
    'booster': best_booster,          # Type of boosting model

    'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
    'objective': best_objective,  # Learning task and corresponding objective function
    'verbosity': 0,               # Verbosity of output messages
}


# Initialize CatBoost classifier
xgb_model = xgb.XGBClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()

    # Fit the model
    xgb_model.fit(X_train_fold, y_train_fold)

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = xgb_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    f1 = f1_score(y_val_fold, y_pred)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)


# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Validation Average Accuracy:", avg_accuracy)
print("Validation Average F1 Score:", avg_f1_score)
print("Validation Average Precision:", avg_precision)
print("Validation Average Recall:", avg_recall)
print("Validation Average Training Time (seconds):", avg_training_time)
print('-----------------------------------------------------')

    # Predict labels for the test set
y_pred_test = xgb_model.predict(x_test)

# Calculate evaluation metrics
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

# Print evaluation metrics
print("Test Accuracy:", accuracy_test)
print("Test F1 Score:", f1_score_test)
print("Test Precision:", precision_test)
print("Test Recall:", recall_test)

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, 
            xticklabels=["Not Attrition", "Attrition"], 
            yticklabels=["Not Attrition", "Attrition"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix (Test Set)")
plt.show()

**Trying Resampling**

Random oversampling

In [None]:
# Random oversampling
oversampler = RandomOverSampler(random_state=42)
x_train_resampled, y_train_resampled = oversampler.fit_resample(x_train, y_train)

In [None]:
params = {
    'eta': best_lr,                   # Learning rate (step size shrinkage)
    'max_depth': best_max_depth,               # Maximum depth of a tree
    'gamma': best_gamma,                    # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': best_min_child_weight,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
    'num_boost_round': best_num_boost_round,       # Number of boosting rounds (trees) to run
    'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
    'lambda': best_lambda,                  # L2 regularization term on weights
    'alpha': best_alpha,                   # L1 regularization term on weights
    
    'eval_metric': best_eval_metric,           # Evaluation metric used during training
    'booster': best_booster,          # Type of boosting model

    'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
    'objective': best_objective,  # Learning task and corresponding objective function
    'verbosity': 0,               # Verbosity of output messages
}


# Initialize CatBoost classifier
xgb_model = xgb.XGBClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train_resampled):
    X_train_fold, X_val_fold = x_train_resampled.iloc[train_index], x_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]
    
    # Record start time
    start_time = time.time()

    # Fit the model
    xgb_model.fit(X_train_fold, y_train_fold)

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = xgb_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    f1 = f1_score(y_val_fold, y_pred)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)


# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Validation Average Accuracy:", avg_accuracy)
print("Validation Average F1 Score:", avg_f1_score)
print("Validation Average Precision:", avg_precision)
print("Validation Average Recall:", avg_recall)
print("Validation Average Training Time (seconds):", avg_training_time)
print('-----------------------------------------------------')

    # Predict labels for the test set
y_pred_test = xgb_model.predict(x_test)

# Calculate evaluation metrics
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

# Print evaluation metrics
print("Test Accuracy:", accuracy_test)
print("Test F1 Score:", f1_score_test)
print("Test Precision:", precision_test)
print("Test Recall:", recall_test)

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, 
            xticklabels=["Not Attrition", "Attrition"], 
            yticklabels=["Not Attrition", "Attrition"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix (Test Set)")
plt.show()

SMOTE

In [None]:
# SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [None]:
params = {
    'eta': best_lr,                   # Learning rate (step size shrinkage)
    'max_depth': best_max_depth,               # Maximum depth of a tree
    'gamma': best_gamma,                    # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': best_min_child_weight,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
    'num_boost_round': best_num_boost_round,       # Number of boosting rounds (trees) to run
    'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
    'lambda': best_lambda,                  # L2 regularization term on weights
    'alpha': best_alpha,                   # L1 regularization term on weights
    
    'eval_metric': best_eval_metric,           # Evaluation metric used during training
    'booster': best_booster,          # Type of boosting model

    'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
    'objective': best_objective,  # Learning task and corresponding objective function
    'verbosity': 0,               # Verbosity of output messages
}


# Initialize CatBoost classifier
xgb_model = xgb.XGBClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train_resampled):
    X_train_fold, X_val_fold = x_train_resampled.iloc[train_index], x_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]
    
    # Record start time
    start_time = time.time()

    # Fit the model
    xgb_model.fit(X_train_fold, y_train_fold)

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = xgb_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    f1 = f1_score(y_val_fold, y_pred)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)


# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Validation Average Accuracy:", avg_accuracy)
print("Validation Average F1 Score:", avg_f1_score)
print("Validation Average Precision:", avg_precision)
print("Validation Average Recall:", avg_recall)
print("Validation Average Training Time (seconds):", avg_training_time)
print('-----------------------------------------------------')

    # Predict labels for the test set
y_pred_test = xgb_model.predict(x_test)

# Calculate evaluation metrics
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

# Print evaluation metrics
print("Test Accuracy:", accuracy_test)
print("Test F1 Score:", f1_score_test)
print("Test Precision:", precision_test)
print("Test Recall:", recall_test)

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, 
            xticklabels=["Not Attrition", "Attrition"], 
            yticklabels=["Not Attrition", "Attrition"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix (Test Set)")
plt.show()

Random undersampling

In [None]:
# Random undersampling
undersampler = RandomUnderSampler(random_state=42)
x_train_resampled, y_train_resampled = undersampler.fit_resample(x_train, y_train)

In [None]:
params = {
    'eta': best_lr,                   # Learning rate (step size shrinkage)
    'max_depth': best_max_depth,               # Maximum depth of a tree
    'gamma': best_gamma,                    # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': best_min_child_weight,           # Minimum sum of instance weight (hessian) needed in a child'subsample': 1.0,             # Subsample ratio of the training instances
    'num_boost_round': best_num_boost_round,       # Number of boosting rounds (trees) to run
    'colsample_bytree': 1.0,      # Subsample ratio of columns when constructing each tree
    'lambda': best_lambda,                  # L2 regularization term on weights
    'alpha': best_alpha,                   # L1 regularization term on weights
    
    'eval_metric': best_eval_metric,           # Evaluation metric used during training
    'booster': best_booster,          # Type of boosting model

    'scale_pos_weight': ratio,        # Ratio of negative samples to positive samples
    'objective': best_objective,  # Learning task and corresponding objective function
    'verbosity': 0,               # Verbosity of output messages
}


# Initialize CatBoost classifier
xgb_model = xgb.XGBClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train_resampled):
    X_train_fold, X_val_fold = x_train_resampled.iloc[train_index], x_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]
    
    # Record start time
    start_time = time.time()

    # Fit the model
    xgb_model.fit(X_train_fold, y_train_fold)

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = xgb_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    recall = recall_score(y_val_fold, y_pred, zero_division=0)  # Set zero_division parameter here
    f1 = f1_score(y_val_fold, y_pred)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)


# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Validation Average Accuracy:", avg_accuracy)
print("Validation Average F1 Score:", avg_f1_score)
print("Validation Average Precision:", avg_precision)
print("Validation Average Recall:", avg_recall)
print("Validation Average Training Time (seconds):", avg_training_time)
print('-----------------------------------------------------')

    # Predict labels for the test set
y_pred_test = xgb_model.predict(x_test)

# Calculate evaluation metrics
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

# Print evaluation metrics
print("Test Accuracy:", accuracy_test)
print("Test F1 Score:", f1_score_test)
print("Test Precision:", precision_test)
print("Test Recall:", recall_test)

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, 
            xticklabels=["Not Attrition", "Attrition"], 
            yticklabels=["Not Attrition", "Attrition"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix (Test Set)")
plt.show()