# LightGBM

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

data_path = os.environ.get('DATA_PATH') + 'AI_Cheats/'

In [3]:
# Load the training and testing data
train = pd.read_csv(data_path + 'employee_turnover/train.csv', index_col=0)
test = pd.read_csv(data_path + 'employee_turnover/test.csv', index_col=0)

# Process the Attrition column, converting 'Yes' to 1 and 'No' to 0
train['Attrition'] = train['Attrition'].map(lambda x: 1 if x == 'Yes' else 0)

# Check for missing values in each column
# print(train.isna().sum())

In [4]:
# Drop unnecessary columns: EmployeeNumber and StandardHours (since they are constant)
train = train.drop(['EmployeeNumber', 'StandardHours'], axis=1)
test = test.drop(['EmployeeNumber', 'StandardHours'], axis=1)

In [5]:
# Categorical features that need encoding
attr = ['Age', 'BusinessTravel', 'Department', 'Education', 'EducationField', 
        'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']

In [6]:
label_encoder = LabelEncoder()

for col in attr:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.transform(test[col])

X_train, X_valid, y_train, y_valid = train_test_split(train.drop('Attrition', axis=1), train['Attrition'], test_size=0.2, random_state=2023)

In [7]:
# LightGBM parameters for training
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'eta': 0.01,
    'max_depth': 6,
    'num_leaves': 8,  # Adjust based on performance
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'subsample_freq': 8,
    'alpha': 0.6,
    'lambda': 0,
    'device_type': 'cpu',
    'force_row_wise': True
}

# Prepare training and validation datasets
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid)

# Train the LightGBM model
model = lgb.train(params, trn_data, num_boost_round=100, valid_sets=[val_data], 
                  feature_name='auto', categorical_feature='auto', keep_training_booster=False)

# Use the trained model to make predictions
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

# Convert probability predictions to binary labels (0 or 1) using a threshold
threshold = 0.5
y_pred_binary = np.where(y_pred > threshold, 1, 0)

# Calculate the accuracy
accuracy = accuracy_score(y_valid, y_pred_binary)
print(f"Accuracy: {accuracy}")

[LightGBM] [Info] Number of positive: 153, number of negative: 787
[LightGBM] [Info] Total Bins 1128
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.162766 -> initscore=-1.637790
[LightGBM] [Info] Start training from score -1.637790
Accuracy: 0.864406779661017
