In [9]:
#!pip install xgboost
import pandas as pd
import os
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate
import time


**Loading the Data**

In [2]:
# Locate train.csv two directory levels above the working directory, under data/.
current_dir = os.getcwd()
relative_path = os.path.join('..', '..', 'data', 'train.csv')

# Read the training split into a DataFrame.
train_csv_path = os.path.join(current_dir, relative_path)
train_data = pd.read_csv(train_csv_path)

In [3]:
# Same directory layout as the training file; only the file name differs.
relative_path = os.path.join('..', '..', 'data', 'test.csv')

# Read the held-out test split into a DataFrame.
test_csv_path = os.path.join(current_dir, relative_path)
test_data = pd.read_csv(test_csv_path)

In [4]:
# Separate the "Attrition" label column from the feature columns in both splits.
target_column = "Attrition"

x_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]

x_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

# Ten shuffled folds with a fixed seed, used for cross-validation on the training split.
k_fold = KFold(shuffle=True, random_state=42, n_splits=10)

In [6]:
# Hyperparameters for xgb.XGBClassifier.
# The keys use the scikit-learn wrapper's own keyword names (learning_rate,
# reg_alpha, reg_lambda, random_state) rather than the native Booster aliases
# (eta, alpha, lambda, seed). The aliases only reach the wrapper through its
# **kwargs passthrough, which XGBoost documents as not guaranteed to interact
# properly with scikit-learn. The values themselves are unchanged.
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Evaluation metric
    'learning_rate': 0.1,             # Learning rate (native alias: eta)
    'max_depth': 3,                    # Maximum depth of each tree
    'min_child_weight': 1,             # Minimum sum of instance weight needed in a child
    'subsample': 0.8,                  # Subsample ratio of the training instances
    'colsample_bytree': 0.8,           # Subsample ratio of columns when constructing each tree
    'reg_alpha': 0.01,                 # L1 regularization term on weights (native alias: alpha)
    'reg_lambda': 0.01,                # L2 regularization term on weights (native alias: lambda)
    'random_state': 42                 # Random seed for reproducibility (native alias: seed)
}

In [15]:
# Initialize the XGBoost classifier from the hyperparameters in `params`
xgb_model = xgb.XGBClassifier(**params)

# Per-fold results of the KFold cross-validation over the training split
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []  # wall-clock seconds spent inside fit() for each fold

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit on this fold's training rows, timing only the fit() call
    fit_start = time.perf_counter()
    xgb_model.fit(X_train_fold, y_train_fold)
    training_times.append(time.perf_counter() - fit_start)

    # Predict on this fold's validation rows
    y_pred = xgb_model.predict(X_val_fold)

    # Score the fold. zero_division=0 is passed to every ratio metric (F1
    # included, to match precision and recall) so a fold with an undefined
    # ratio scores 0 instead of emitting a warning.
    accuracies.append(accuracy_score(y_val_fold, y_pred))
    precisions.append(precision_score(y_val_fold, y_pred, zero_division=0))
    recalls.append(recall_score(y_val_fold, y_pred, zero_division=0))
    f1_scores.append(f1_score(y_val_fold, y_pred, zero_division=0))


# Average each metric over the folds
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Mean fit time per fold in seconds; computed for inspection but not printed
avg_training_time = sum(training_times) / len(training_times)

# Report the cross-validated averages
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print('-----------------------------------------------------')


Average Accuracy: 0.8571264367816092
Average F1 Score: 0.1841269841269841
Average Precision: 0.22333333333333333
Average Recall: 0.16499999999999998
-----------------------------------------------------
