In [24]:
import pandas as pd
import os
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

**Loading The Data**

In [11]:
current_dir = os.getcwd() 
relative_path = os.path.join('..', '..','data', 'train.csv')
train_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [12]:
relative_path = os.path.join('..', '..','data', 'test.csv')
test_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [13]:
x_train = train_data.drop(["Attrition"], axis = 1)
y_train = train_data["Attrition"]

x_test = test_data.drop(["Attrition"], axis = 1)
y_test = test_data["Attrition"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Trying Different Learning Rates**

for learning rate = 0.1

In [36]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 5]  # Adjust class weights because of unbalanced classes
}

In [37]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.793103448275862
Average F1 Score: 0.5559499082900619
Average Precision: 0.4786538461538462
Average Recall: 0.8633333333333333


In [30]:
# Initialize lists to store evaluation metric scores
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Train the model using KFold cross-validation
for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Initialize CatBoost classifier
    catboost_model = CatBoostClassifier(**params)
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    
    # Append scores to lists
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

# Calculate average scores
avg_accuracy = np.mean(accuracy_scores)
avg_f1 = np.mean(f1_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)

# Print average scores
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.8636781609195403
Average F1 Score: 0.1019047619047619
Average Precision: 0.18333333333333332
Average Recall: 0.07833333333333334
