In [None]:
%pip install catboost

In [3]:
# ============================================
# HR Employee Attrition - Feature Engineering
# ============================================

import pandas as pd
import numpy as np
import pickle, os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# =====================
# 1. Load Data
# =====================
df = pd.read_csv("HR-Employee-Attrition.csv")

# ---------------------
# Feature Engineering
# ---------------------
# Encode Over18 to binary (Y/N)
if "Over18" in df.columns:
    df["Over18"] = df["Over18"].map({"Y": 1, "N": 0})

# Age bands
df["AgeBand"] = pd.cut(df["Age"], bins=[17, 25, 35, 45, 55, 65], labels=False)

# Tenure ratio: YearsAtCompany vs. TotalWorkingYears
df["TenureRatio"] = df["YearsAtCompany"] / (df["TotalWorkingYears"] + 1)

# Interaction: MonthlyIncome per Year of Age
df["IncomePerAge"] = df["MonthlyIncome"] / (df["Age"] + 1)

# Scale continuous variables (example: MonthlyIncome)
scaler = MinMaxScaler()
df["MonthlyIncomeScaled"] = scaler.fit_transform(df[["MonthlyIncome"]])

# Encode remaining categorical columns
le = LabelEncoder()
for col in df.select_dtypes(include="object").columns:
    df[col] = le.fit_transform(df[col])

# =====================
# 2. Split Data
# =====================
X = df.drop("Attrition", axis=1)
y = df["Attrition"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# =====================
# 3. Define Models
# =====================
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=6, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=100, use_label_encoder=False, eval_metric="logloss", random_state=42
    ),
    "CatBoost": CatBoostClassifier(iterations=100, verbose=0, random_state=42),
}

# =====================
# 4. Train, Evaluate, Save
# =====================
os.makedirs("models", exist_ok=True)
results = {}
best_model_name = None
best_model = None
best_acc = 0.0

for name, model in models.items():
    print(f"\n================= {name} =================")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"✅ Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    # Save individual model
    with open(f"models/{name.replace(' ', '_')}.pkl", "wb") as f:
        pickle.dump(model, f)
    # Track best
    if acc > best_acc:
        best_acc = acc
        best_model = model
        best_model_name = name

# Save best model
with open("models/best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# =====================
# 5. Summary
# =====================
print("\n🔎 Model Performance Summary:")
for model, acc in results.items():
    print(f"{model:15s} -> Accuracy: {acc:.4f}")

print(f"\n🏆 Best Model: {best_model_name} with Accuracy: {best_acc:.4f}")
print("✅ Best model saved as models/best_model.pkl")



✅ Accuracy: 0.8163
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       247
           1       0.40      0.30      0.34        47

    accuracy                           0.82       294
   macro avg       0.64      0.61      0.62       294
weighted avg       0.80      0.82      0.81       294


✅ Accuracy: 0.8299
              precision    recall  f1-score   support

           0       0.85      0.96      0.90       247
           1       0.40      0.13      0.19        47

    accuracy                           0.83       294
   macro avg       0.63      0.55      0.55       294
weighted avg       0.78      0.83      0.79       294


✅ Accuracy: 0.8571
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       247
           1       0.60      0.32      0.42        47

    accuracy                           0.86       294
   macro avg       0.74      0.64      0.67       294
weighted avg  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Accuracy: 0.8571
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       247
           1       0.61      0.30      0.40        47

    accuracy                           0.86       294
   macro avg       0.74      0.63      0.66       294
weighted avg       0.84      0.86      0.84       294


✅ Accuracy: 0.8571
              precision    recall  f1-score   support

           0       0.86      0.98      0.92       247
           1       0.69      0.19      0.30        47

    accuracy                           0.86       294
   macro avg       0.78      0.59      0.61       294
weighted avg       0.84      0.86      0.82       294


🔎 Model Performance Summary:
Decision Tree   -> Accuracy: 0.8163
Random Forest   -> Accuracy: 0.8299
AdaBoost        -> Accuracy: 0.8571
XGBoost         -> Accuracy: 0.8571
CatBoost        -> Accuracy: 0.8571

🏆 Best Model: AdaBoost with Accuracy: 0.8571
✅ Best model saved as models/best_model.pkl
