In [203]:
# Importing Libraries
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib
import os

In [204]:
# Prepare Data
BASE_PATH = r'C:\Users\Mayank Meghwal\Desktop\DS GUVI\Projects\Employee\Models'
path = r'C:\Users\Mayank Meghwal\Desktop\DS GUVI\Projects\Employee\Preprocessed.csv'
df = pd.read_csv(path)

In [205]:
# Drop useless columns
drop_cols = ['Unnamed: 0', 'EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours']
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

In [206]:
df['Attrition_Yes'] = df['Attrition_Yes'].astype(int)

# Features & Target
X = df.drop("Attrition_Yes", axis=1)
y = df["Attrition_Yes"]

In [207]:
# Save feature column order (important for Streamlit reindexing)
FEATURES_FILE = os.path.join(BASE_PATH, "features.pkl")
joblib.dump(X.columns.tolist(), FEATURES_FILE)
print("Feature columns saved.")

Feature columns saved.


In [208]:
# Handle imbalance with SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

In [209]:
# Scale features
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)

In [210]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_res_scaled, y_res, test_size=0.2, random_state=42, stratify=y_res)

In [211]:
# Define Models & Hyperparameters
models = {
    "LogisticRegression": (
        LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
        {"C": [0.01, 0.1, 1, 10]}
    ),

    "DecisionTree": (
        DecisionTreeClassifier(class_weight='balanced', random_state=42),
        {"max_depth": [5, 10, 15, None],
         "min_samples_split": [2, 5, 10]}
    ),

    "RandomForest": (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        {"n_estimators": [100, 200, 300],
         "max_depth": [10, 20, None],
         "min_samples_split": [2, 5, 10]}
    )
}

results = {}

In [212]:
# Cross-validation for each model
for name, (model, params) in models.items():
    print(f"\nTraining {name}...")
    grid = GridSearchCV(
        model,
        params,
        cv=StratifiedKFold(5),
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f"\nBest Params for {name}: {grid.best_params_}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred))

    results[name] = {
        "model": best_model,
        "f1": grid.best_score_,
        "test_accuracy": best_model.score(X_test, y_test)
    }


Training LogisticRegression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best Params for LogisticRegression: {'C': 0.1}
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       247
           1       0.91      0.89      0.90       247

    accuracy                           0.90       494
   macro avg       0.90      0.90      0.90       494
weighted avg       0.90      0.90      0.90       494

Confusion Matrix:
 [[225  22]
 [ 27 220]]
ROC-AUC: 0.9008097165991903

Training DecisionTree...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best Params for DecisionTree: {'max_depth': 10, 'min_samples_split': 10}
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       247
           1       0.83      0.84      0.84       247

    accuracy                           0.84       494
   macro avg       0.84      0.84      0.84       494
weighted avg       0.84      0

In [213]:
# Select Best Model
best_model_name = max(results, key=lambda x: results[x]["f1"])
best_model = results[best_model_name]["model"]

print("\nBest Model Selected:", best_model_name)

MODEL_FILE = os.path.join(BASE_PATH, "Employee.pkl")
SCALER_FILE = os.path.join(BASE_PATH, "scaler.pkl")


Best Model Selected: RandomForest


In [214]:
# Save model & scaler
joblib.dump(best_model, MODEL_FILE)
print("Model saved successfully.")

joblib.dump(scaler, SCALER_FILE)
print("Scaler saved successfully.")

Model saved successfully.
Scaler saved successfully.
