## Model Training for Employee Performance Prediction

In [3]:
# Installing module
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for lightgbm: filename=lightgbm-4.6.0-py3-none-linux_x86_64.whl size=2737776 sha256=eeb03bf869618ac3acd6f7173e1af6ce7688888cec6abd18d56e6eaddeb34cf0
  Stored in directory: /home/ec2-user/.cache/pip/wheels/bb/db/6d/7814aed03437129dc284a055c084f201b765deb54b6908efab
Successfully built lightgbm
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import joblib

In [6]:
# Importing data, Encoding and splitting data for the model
data_path = "/home/ec2-user/SageMaker/data/EmployeeData_Raw.csv"

try:
    employee_df = pd.read_csv(data_path)
    X = employee_df.drop(['PerformanceRating', 'EmpNumber'], axis=1)
    y = employee_df['PerformanceRating']

    # One hot encoding of categorical features
    categorical_features = X.select_dtypes(include='object').columns
    X_processed = pd.get_dummies(X, columns=categorical_features, drop_first=True)

    # Splitting data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)
    print("Data loaded, processed and split into training and testing sets.")
except FileNotFoundError:
    print(f"Error: {data_path} not found")
    X_train, X_test, y_train, y_test = None, None, None, None

Data loaded, processed and split into training and testing sets.


### Algorithm selection

Several classification algorithm are used to predict the Performance Rating. The classification tasks include:
- Logistic regression
- Random Forest Classifier
- LightGBM Classifier

In [9]:
# Model training using selected model
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lgbm_classifier = lgb.LGBMClassifier(random_state=42)

# Training models
print("Training logistic regression...")
logistic_regression.fit(X_train, y_train)
print("Logistic regression model trained.")

print("Training random forest classifier...")
rf_classifier.fit(X_train, y_train)
print("Random forest classifier model trained.")

print("Training lightGBM classifier...")
lgbm_classifier.fit(X_train, y_train)
print("lightGBM classifier model trained.")

print("\nAll models trained successfully.")

Training logistic regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic regression model trained.
Training random forest classifier...
Random forest classifier model trained.
Training lightGBM classifier...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 394
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 48
[LightGBM] [Info] Start training from score -1.823508
[LightGBM] [Info] Start training from score -0.317283
[LightGBM] [Info] Start training from score -2.203494
lightGBM classifier model trained.

All models trained successfully.


In [10]:
# Hypaparemeter tuning to get optimal output from the model
from sklearn.model_selection import GridSearchCV

param_grid_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }

grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)
best_rf_clf = grid_search_rf.best_estimator_
print(f"Best Random Forest parameters: {grid_search_rf.best_params_}")

Best Random Forest parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


In [13]:
# Saving trained models
try:
    model_dir = '/home/ec2-user/SageMaker/src/Models'
    import os
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    joblib.dump(logistic_regression, os.path.join(model_dir, 'logistic_regression_model.joblib'))
    print("Saved Logistic Regression model.")

    joblib.dump(rf_classifier, os.path.join(model_dir, 'random_forest_model.joblib'))
    print("Saved Random Forest model.")

    joblib.dump(lgbm_classifier, os.path.join(model_dir, 'lightgbm_model.joblib'))
    print("Saved LightGBM model.")

    print(f"Trained models saved to '{model_dir}' directory.")
except Exception as e:
    print(f"Error saving models: {e}")

Saved Logistic Regression model.
Saved Random Forest model.
Saved LightGBM model.
Trained models saved to '/home/ec2-user/SageMaker/src/Models' directory.
