In [21]:
#!pip install lightgbm
import pandas as pd
import os
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time
#!pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

**Loading The Data**

In [4]:
current_dir = os.getcwd() 
relative_path = os.path.join('..', '..','data', 'train.csv')
train_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [5]:
relative_path = os.path.join('..', '..','data', 'test.csv')
test_data = pd.read_csv(os.path.join(current_dir, relative_path))

In [6]:
x_train = train_data.drop(["Attrition"], axis = 1)
y_train = train_data["Attrition"]

x_test = test_data.drop(["Attrition"], axis = 1)
y_test = test_data["Attrition"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

In [7]:
# Calculate the ratio of negative samples to positive samples
ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

Random oversampling

In [29]:
# Random oversampling
oversampler = RandomOverSampler(random_state=42)
x_train_resampled, y_train_resampled = oversampler.fit_resample(x_train, y_train)

In [30]:
# Define LightGBM parameters
params = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 6,
    'num_leaves': 65,  # Set num_leaves explicitly
    'min_child_samples': 5,
    'subsample': 0.8,
    'objective': 'binary',
    'metric': 'binary_error',
    'random_state': 42,
    'force_col_wise': True,
    'verbose': -1
}


# Initialize lists to store evaluation metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

# Train the model using k-fold cross-validation
for train_index, val_index in k_fold.split(x_train_resampled):
    X_train_fold, X_val_fold = x_train_resampled.iloc[train_index], x_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]

    # Record start time
    start_time = time.time()

    # Initialize LightGBM classifier
    lgb_model = lgb.LGBMClassifier(**params)

    # Fit the model
    lgb_model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_val_fold, y_val_fold)])

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)

    # Predict on validation set
    y_pred = lgb_model.predict(X_val_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.9666666666666668
Average F1 Score: 0.9666811571475332
Average Precision: 0.9368950564310605
Average Recall: 1.0
Average Training Time (seconds): 0.27126297950744627


SMOTE

In [31]:
# SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [32]:
# Define LightGBM parameters
params = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 6,
    'num_leaves': 65,  # Set num_leaves explicitly
    'min_child_samples': 5,
    'subsample': 0.8,
    'objective': 'binary',
    'metric': 'binary_error',
    'random_state': 42,
    'force_col_wise': True,
    'verbose': -1
}


# Initialize lists to store evaluation metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

# Train the model using k-fold cross-validation
for train_index, val_index in k_fold.split(x_train_resampled):
    X_train_fold, X_val_fold = x_train_resampled.iloc[train_index], x_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]

    # Record start time
    start_time = time.time()

    # Initialize LightGBM classifier
    lgb_model = lgb.LGBMClassifier(**params)

    # Fit the model
    lgb_model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_val_fold, y_val_fold)])

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)

    # Predict on validation set
    y_pred = lgb_model.predict(X_val_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.9294117647058824
Average F1 Score: 0.9259115588613753
Average Precision: 0.9193981621202386
Average Recall: 0.9370720116132409
Average Training Time (seconds): 0.3241485595703125


Random undersampling

In [33]:
# Random undersampling
undersampler = RandomUnderSampler(random_state=42)
x_train_resampled, y_train_resampled = undersampler.fit_resample(x_train, y_train)

In [34]:
# Define LightGBM parameters
params = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 6,
    'num_leaves': 65,  # Set num_leaves explicitly
    'min_child_samples': 5,
    'subsample': 0.8,
    'objective': 'binary',
    'metric': 'binary_error',
    'random_state': 42,
    'force_col_wise': True,
    'verbose': -1
}


# Initialize lists to store evaluation metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
training_times = []

# Train the model using k-fold cross-validation
for train_index, val_index in k_fold.split(x_train_resampled):
    X_train_fold, X_val_fold = x_train_resampled.iloc[train_index], x_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]

    # Record start time
    start_time = time.time()

    # Initialize LightGBM classifier
    lgb_model = lgb.LGBMClassifier(**params)

    # Fit the model
    lgb_model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_val_fold, y_val_fold)])

    # Record end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time
    training_times.append(training_time)

    # Predict on validation set
    y_pred = lgb_model.predict(X_val_fold)

    # Calculate metrics
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and print average metrics
avg_accuracy = sum(accuracies) / len(accuracies)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

# Calculate average training time
avg_training_time = sum(training_times) / len(training_times)
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average Training Time (seconds):", avg_training_time)


Average Accuracy: 0.6285714285714286
Average F1 Score: 0.60005772005772
Average Precision: 0.6449999999999999
Average Recall: 0.635
Average Training Time (seconds): 0.20353341102600098
