In [73]:
import pandas as pd
import os
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time


**Loading The Data**

In [11]:
current_dir = os.getcwd() 
relative_path = os.path.join('..', '..','data', 'train.csv')
train_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [12]:
relative_path = os.path.join('..', '..','data', 'test.csv')
test_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [13]:
x_train = train_data.drop(["Attrition"], axis = 1)
y_train = train_data["Attrition"]

x_test = test_data.drop(["Attrition"], axis = 1)
y_test = test_data["Attrition"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Trying Different Class Weights**

for class weights [1,1]

In [56]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 1]  # Adjust class weights because of unbalanced classes
}

In [57]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.8704597701149426
Average F1 Score: 0.04
Average Precision: 0.1
Average Recall: 0.025


for class weights [1,5]

In [46]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 5]  # Adjust class weights because of unbalanced classes
}

In [47]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.8437931034482758
Average F1 Score: 0.40715728715728716
Average Precision: 0.4419047619047619
Average Recall: 0.44416666666666665


For class wieghts [1,10]

In [48]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [49]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.793103448275862
Average F1 Score: 0.5559499082900619
Average Precision: 0.4786538461538462
Average Recall: 0.8633333333333333


For class weights [1,15]

In [54]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 15]  # Adjust class weights because of unbalanced classes
}

In [55]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.6836781609195403
Average F1 Score: 0.5201533538146442
Average Precision: 0.419845372019285
Average Recall: 0.9291666666666666


For class wieghts [1,20]

In [50]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 20]  # Adjust class weights because of unbalanced classes
}

In [51]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)

Average Accuracy: 0.6757471264367816
Average F1 Score: 0.48895497827439743
Average Precision: 0.37577540106951873
Average Recall: 0.9291666666666666


**Trying Different Learning Rates**

For learning rate = 0.01

In [75]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.01,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [76]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.747816091954023
Average F1 Score: 0.47449381536338053
Average Precision: 0.35378193701723115
Average Recall: 0.8383333333333333
Average Training Time (seconds): 0.4982882261276245


For learning rate = 0.05

In [77]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.05,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [78]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.7766666666666666
Average F1 Score: 0.5323148178902655
Average Precision: 0.43923243423243424
Average Recall: 0.8633333333333333
Average Training Time (seconds): 0.46749582290649416


For learning rate = 0.1

In [79]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.1,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [80]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.793103448275862
Average F1 Score: 0.5559499082900619
Average Precision: 0.4786538461538462
Average Recall: 0.8633333333333333
Average Training Time (seconds): 0.5149653434753418


For learning rate = 0.3

In [81]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.3,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [82]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.7562068965517241
Average F1 Score: 0.41913838647853996
Average Precision: 0.3427790346907994
Average Recall: 0.6983333333333335
Average Training Time (seconds): 0.4889732599258423


For learning rate = 0.5

In [83]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.5,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [84]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.7164367816091953
Average F1 Score: 0.44377921950172594
Average Precision: 0.3470054945054945
Average Recall: 0.8183333333333334
Average Training Time (seconds): 0.5634777784347534


For learning rate = 0.7

In [85]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.7,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [86]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.7549425287356322
Average F1 Score: 0.44848959082974427
Average Precision: 0.37190559440559434
Average Recall: 0.7783333333333334
Average Training Time (seconds): 0.4836781740188599


For learning rate = 0.9

In [87]:
# Define CatBoost parameters
params = {
    'learning_rate': 0.9,               # Learning Rate
    'n_estimators': 100,                # Number of Trees
    'max_depth': 6,                     # Depth of Trees
    'l2_leaf_reg': 3,                   # Regularization Parameter: L2 regularization
    'min_child_samples': 5,             # Regularization Parameter: Minimum number of samples required to split a node
    'subsample': 0.8,                   # Subsampling
    'loss_function': 'Logloss',         # Objective Function
    'eval_metric': 'Accuracy',                # Evaluation Metric
    'border_count': 128,                 # Gradient Estimation
    'class_weights': [1, 10]  # Adjust class weights because of unbalanced classes
}

In [88]:
# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(**params)

# Train the model using KFold cross-validation
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

for train_index, val_index in k_fold.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Record start time
    start_time = time.time()
    
    # Fit the model
    catboost_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    
    # Record end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)
    
    # Predict on validation set
    y_pred = catboost_model.predict(X_val_fold)
    
    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, zero_division=0)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)

print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.6517241379310346
Average F1 Score: 0.37156410890426234
Average Precision: 0.27005183222574525
Average Recall: 0.8116666666666668
Average Training Time (seconds): 0.4952423095703125
