In [9]:
#!pip install xgboost
import pandas as pd
import os
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate
import time


**Loading The Data**

In [2]:
# Locate the training CSV two directories above the current working directory.
current_dir = os.getcwd()
relative_path = os.path.join('..', '..', 'data', 'train.csv')
train_csv_path = os.path.join(current_dir, relative_path)
train_data = pd.read_csv(train_csv_path)

In [3]:
# Locate the test CSV two directories above the current working directory.
relative_path = os.path.join('..', '..', 'data', 'test.csv')
test_csv_path = os.path.join(current_dir, relative_path)
test_data = pd.read_csv(test_csv_path)

In [4]:
# Separate the "Attrition" label column from the remaining feature columns.
y_train = train_data["Attrition"]
x_train = train_data.drop(columns=["Attrition"])

y_test = test_data["Attrition"]
x_test = test_data.drop(columns=["Attrition"])

# 10-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

In [17]:
# Class-imbalance ratio: number of label-0 rows divided by number of label-1 rows.
negative_count = int((y_train == 0).sum())
positive_count = int((y_train == 1).sum())
ratio = negative_count / positive_count

In [18]:
# Hyper-parameters unpacked into xgb.XGBClassifier(**params).
# Note: the scikit-learn wrapper controls the number of trees through
# `n_estimators`; `num_boost_round` is an argument of `xgb.train` and is not
# a model parameter, so it would be ignored here.
params = {
    'eta': 0.3,                      # Learning rate (step size shrinkage)
    'max_depth': 6,                  # Maximum depth of a tree
    'gamma': 0.1,                    # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child
    'subsample': 1.0,                # Subsample ratio of the training instances
    'n_estimators': 100,             # Number of boosting rounds (trees) to run
    'colsample_bytree': 1.0,         # Subsample ratio of columns when constructing each tree
    'lambda': 1,                     # L2 regularization term on weights
    'alpha': 0,                      # L1 regularization term on weights

    'eval_metric': 'error',          # Evaluation metric used during training
    'booster': 'gbtree',             # Type of boosting model

    'scale_pos_weight': ratio,       # Ratio of negative samples to positive samples
    'objective': 'binary:logistic',  # Learning task and corresponding objective function
    'verbosity': 0,                  # Verbosity of output messages
}


In [19]:
# Initialize the XGBoost classifier from the `params` dictionary
xgb_model = xgb.XGBClassifier(**params)

# Per-fold results collected during KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit the model, timing only the training call
    fit_start = time.perf_counter()
    xgb_model.fit(X_train_fold, y_train_fold)
    training_times.append(time.perf_counter() - fit_start)

    # Predict on validation set
    y_pred = xgb_model.predict(X_val_fold)

    # Calculate metrics; zero_division=0 scores a fold with no positive
    # predictions (or no positive labels) as 0 without emitting a warning
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)


# Calculate average metrics across folds
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time (seconds per fold)
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (s):", avg_training_time)
print('-----------------------------------------------------')


Average Accuracy: 0.8331034482758621
Average F1 Score: 0.19025252525252526
Average Precision: 0.19357142857142856
Average Recall: 0.19833333333333333
-----------------------------------------------------
