# HR Employee Attrition Model Training

This notebook trains the XGBoost Classifier model used for the HR Prediction App.
It covers:
1. Data Loading (or Synthetic Generation)
2. Preprocessing (Label Encoding)
3. Model Training (XGBoost)
4. Evaluation
5. Exporting Model & Preprocessor

In [None]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import sys

# Add src to path
sys.path.append('../src')
from config import COLUMNS_TO_DROP, CATEGORICAL_COLUMNS, NUMERICAL_COLUMNS, MODELS_DIR, DATA_DIR

## 1. Load Data
If no data exists, we generate synthetic data for demonstration.

In [None]:
def generate_synthetic_data(n=1000, seed=42):
    """Generate a random HR attrition dataset for demonstration purposes.

    Every column is drawn independently, so the data carries no real
    signal; it only lets the rest of the pipeline run when the CSV file
    is not available.

    Args:
        n: Number of rows to generate.
        seed: Seed for the random generator. The default of 42 reproduces
            the values obtained by seeding the global NumPy RNG with 42.

    Returns:
        A ``pandas.DataFrame`` with ``n`` rows: 23 integer feature
        columns, 7 string categorical columns, and a 0/1 ``Attrition``
        target drawn with probabilities 0.84 / 0.16.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but does not reseed the process-wide
    # NumPy generator as a hidden side effect.
    rng = np.random.RandomState(seed)

    # (column, low, high) with `high` exclusive. The list order is also the
    # draw order, which must stay fixed for the output to be reproducible.
    integer_specs = [
        ('Age', 18, 60),
        ('DailyRate', 100, 1500),
        ('DistanceFromHome', 1, 30),
        ('Education', 1, 6),
        ('EnvironmentSatisfaction', 1, 5),
        ('HourlyRate', 30, 100),
        ('JobInvolvement', 1, 5),
        ('JobLevel', 1, 6),
        ('JobSatisfaction', 1, 5),
        ('MonthlyIncome', 2000, 20000),
        ('MonthlyRate', 2000, 27000),
        ('NumCompaniesWorked', 0, 10),
        ('PercentSalaryHike', 11, 26),
        ('PerformanceRating', 3, 5),
        ('RelationshipSatisfaction', 1, 5),
        ('StockOptionLevel', 0, 4),
        ('TotalWorkingYears', 0, 40),
        ('TrainingTimesLastYear', 0, 7),
        ('WorkLifeBalance', 1, 5),
        ('YearsAtCompany', 0, 20),
        ('YearsInCurrentRole', 0, 15),
        ('YearsSinceLastPromotion', 0, 15),
        ('YearsWithCurrManager', 0, 15),
    ]

    # Categorical columns, sampled uniformly from the listed levels.
    categorical_specs = [
        ('BusinessTravel', ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel']),
        ('Department', ['Sales', 'Research & Development', 'Human Resources']),
        ('EducationField', ['Life Sciences', 'Medical', 'Marketing', 'Technical Degree', 'Other']),
        ('Gender', ['Male', 'Female']),
        ('JobRole', ['Sales Executive', 'Research Scientist', 'Laboratory Technician', 'Manufacturing Director']),
        ('MaritalStatus', ['Single', 'Married', 'Divorced']),
        ('OverTime', ['Yes', 'No']),
    ]

    data = {name: rng.randint(low, high, n) for name, low, high in integer_specs}
    data.update((name, rng.choice(levels, n)) for name, levels in categorical_specs)

    # Target: imbalanced on purpose (about 16% positive class).
    data['Attrition'] = rng.choice([0, 1], n, p=[0.84, 0.16])
    return pd.DataFrame(data)

# Use the real CSV when it is present; otherwise fall back to synthetic rows
data_path = DATA_DIR / "HR_Employee_Attrition.csv"
if not data_path.exists():
    print("Data file not found. Generating synthetic data...")
    df = generate_synthetic_data()
else:
    df = pd.read_csv(data_path)

# Report the dimensions, then preview the first rows as the cell output
print(f"Data Shape: {df.shape}")
df.head()

## 2. Preprocessing

In [None]:
# Drop useless columns (errors='ignore' already skips names missing from df,
# so no separate membership filter is needed)
df_clean = df.drop(columns=list(COLUMNS_TO_DROP), errors='ignore')

# Encode Categorical: fit one LabelEncoder per column and keep each fitted
# encoder so the same string -> integer mapping can be exported later
encoders = {}
for col in CATEGORICAL_COLUMNS:
    if col not in df_clean.columns:
        continue
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    encoders[col] = le

# Encode Target
target_col = 'Attrition'
# Test for "not numeric" rather than `dtype == 'object'`: a string target may
# be stored with a dedicated string dtype or as a categorical, and the
# object-only check would then leave the raw 'Yes'/'No' labels in place.
if not pd.api.types.is_numeric_dtype(df_clean[target_col]):
    # 'Yes' -> 1, anything else (including missing values) -> 0
    df_clean[target_col] = df_clean[target_col].eq('Yes').astype('int64')

X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]

print("Preprocessing complete.")

## 3. Train Model

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters collected in one mapping and unpacked into the classifier
xgb_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## 4. Save Model

In [None]:
# Persist the trained model and the fitted encoders as two joblib files
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Model bundle: the estimator, a version label, and its held-out accuracy
test_accuracy = accuracy_score(y_test, y_pred)
model_data = dict(
    model=model,
    model_name='XGBoost_v1',
    metrics=dict(accuracy=test_accuracy),
)

# Preprocessor bundle: the per-column label encoders plus the feature order
# the model was trained on
preprocessor_data = dict(
    preprocessor=encoders,
    feature_names=[name for name in X.columns],
)

for filename, payload in (
    ("best_model.pkl", model_data),
    ("preprocessor.pkl", preprocessor_data),
):
    joblib.dump(payload, MODELS_DIR / filename)

print(f"Model saved to {MODELS_DIR}")