In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import warnings
warnings.filterwarnings('ignore')




df = pd.read_csv('hr_analytics_data.csv')

print("Dataset Shape:", df.shape)
print("\nBasic Statistics:")
print(df.describe())




Dataset Shape: (1470, 20)

Basic Statistics:
        EmployeeID          Age  MonthlyIncome  YearsAtCompany  YearsInRole  \
count  1470.000000  1470.000000    1470.000000     1470.000000  1470.000000   
mean    735.500000    41.208163   10835.821769       19.257823     9.424490   
std     424.496761    13.492208    5374.944376       11.485015     5.703898   
min       1.000000    18.000000    1516.000000        0.000000     0.000000   
25%     368.250000    29.000000    6169.750000        9.250000     5.000000   
50%     735.500000    42.000000   10960.500000       19.000000     9.000000   
75%    1102.750000    53.000000   15507.000000       29.000000    14.000000   
max    1470.000000    64.000000   19989.000000       39.000000    19.000000   

       MonthsInCurrentRole  PercentSalaryHike  NumCompaniesWorked  \
count          1470.000000        1470.000000         1470.000000   
mean             29.006803          17.165986            4.547619   
std              17.232679          

In [4]:
# Department-wise Attrition
dept_analysis = df.groupby('Department').agg({
    'EmployeeID': 'count',
    'Attrition': ['sum', 'mean']
}).round(4)
print("\nDepartment-wise Attrition:")
print(dept_analysis)

# Job Satisfaction Analysis
satisfaction_analysis = df.groupby('JobSatisfaction').agg({
    'EmployeeID': 'count',
    'Attrition': ['sum', 'mean']
}).round(4)
print("\nJob Satisfaction Analysis:")
print(satisfaction_analysis)

# Correlation Analysis
numerical_cols = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'JobSatisfaction', 
                  'EnvironmentSatisfaction', 'WorkLifeBalance', 'DistanceFromHome']
correlations = df[numerical_cols + ['Attrition']].corr()['Attrition'].sort_values()
print("\nAttrition Correlations:")
print(correlations)


Department-wise Attrition:
           EmployeeID Attrition        
                count       sum    mean
Department                             
Finance           318        60  0.1887
HR                264        41  0.1553
IT                303        46  0.1518
Operations        303        43  0.1419
Sales             282        45  0.1596

Job Satisfaction Analysis:
                EmployeeID Attrition        
                     count       sum    mean
JobSatisfaction                             
1                      342        58  0.1696
2                      383        73  0.1906
3                      368        55  0.1495
4                      377        49  0.1300

Attrition Correlations:
EnvironmentSatisfaction   -0.064084
WorkLifeBalance           -0.055028
JobSatisfaction           -0.049754
MonthlyIncome              0.003756
Age                        0.010471
YearsAtCompany             0.015588
DistanceFromHome           0.028999
Attrition                  1.000

In [5]:
# Create a copy for modeling
df_model = df.copy()

# Encode categorical variables
label_encoders = {}
categorical_cols = ['Department', 'JobRole', 'Gender', 'MaritalStatus', 'EducationField']

for col in categorical_cols:
    le = LabelEncoder()
    df_model[col + '_Encoded'] = le.fit_transform(df_model[col])
    label_encoders[col] = le

# Select features
feature_cols = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'YearsInRole', 
                'MonthsInCurrentRole', 'PercentSalaryHike', 'NumCompaniesWorked',
                'DistanceFromHome', 'JobSatisfaction', 'EnvironmentSatisfaction',
                'WorkLifeBalance', 'TrainingTimesLastYear', 'PromotionInLast5Years',
                'Department_Encoded', 'JobRole_Encoded', 'Gender_Encoded', 
                'MaritalStatus_Encoded', 'EducationField_Encoded']

X = df_model[feature_cols]
y = df_model['Attrition']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                      random_state=42, stratify=y)

print(f"\nTraining set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Scale features for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



Training set size: (1176, 18)
Test set size: (294, 18)


In [6]:
# Model 1: Logistic Regression
print("\n" + "="*80)
print("MODEL 1: LOGISTIC REGRESSION")
print("="*80)

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

lr_accuracy = (y_pred_lr == y_test).mean()
lr_auc = roc_auc_score(y_test, y_pred_proba_lr)

print(f"Accuracy: {lr_accuracy:.4f}")
print(f"AUC-ROC: {lr_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=['No Attrition', 'Attrition']))

# Model 2: Decision Tree
print("\n" + "="*80)
print("MODEL 2: DECISION TREE")
print("="*80)

dt_model = DecisionTreeClassifier(max_depth=10, random_state=42, min_samples_split=20)
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)
y_pred_proba_dt = dt_model.predict_proba(X_test)[:, 1]

dt_accuracy = (y_pred_dt == y_test).mean()
dt_auc = roc_auc_score(y_test, y_pred_proba_dt)

print(f"Accuracy: {dt_accuracy:.4f}")
print(f"AUC-ROC: {dt_auc:.4f}")

# Model 3: Random Forest
print("\n" + "="*80)
print("MODEL 3: RANDOM FOREST")
print("="*80)

rf_model = RandomForestClassifier(n_estimators=100, max_depth=15, 
                                  random_state=42, min_samples_split=20, n_jobs=-1)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

rf_accuracy = (y_pred_rf == y_test).mean()
rf_auc = roc_auc_score(y_test, y_pred_proba_rf)

print(f"Accuracy: {rf_accuracy:.4f}")
print(f"AUC-ROC: {rf_auc:.4f}")


MODEL 1: LOGISTIC REGRESSION
Accuracy: 0.8401
AUC-ROC: 0.5435

Confusion Matrix:
[[247   0]
 [ 47   0]]

Classification Report:
              precision    recall  f1-score   support

No Attrition       0.84      1.00      0.91       247
   Attrition       0.00      0.00      0.00        47

    accuracy                           0.84       294
   macro avg       0.42      0.50      0.46       294
weighted avg       0.71      0.84      0.77       294


MODEL 2: DECISION TREE
Accuracy: 0.7891
AUC-ROC: 0.4428

MODEL 3: RANDOM FOREST
Accuracy: 0.8401
AUC-ROC: 0.4644


In [10]:
print("\n" + "="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

# Random Forest Feature Importance
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 15 Most Important Features (Random Forest):")
print(feature_importance.head(15))




FEATURE IMPORTANCE ANALYSIS

Top 15 Most Important Features (Random Forest):
                    Feature  Importance
1             MonthlyIncome    0.144761
4       MonthsInCurrentRole    0.103295
0                       Age    0.099148
2            YearsAtCompany    0.095428
7          DistanceFromHome    0.082084
3               YearsInRole    0.079674
5         PercentSalaryHike    0.070002
6        NumCompaniesWorked    0.053310
13       Department_Encoded    0.039847
9   EnvironmentSatisfaction    0.038142
11    TrainingTimesLastYear    0.034358
10          WorkLifeBalance    0.034206
17   EducationField_Encoded    0.030957
14          JobRole_Encoded    0.027402
8           JobSatisfaction    0.024100


In [8]:
predictions_df = pd.DataFrame({
    'EmployeeID': X_test.index,
    'Department': df.loc[X_test.index, 'Department'].values,
    'JobRole': df.loc[X_test.index, 'JobRole'].values,
    'Age': X_test['Age'].values,
    'MonthlyIncome': X_test['MonthlyIncome'].values,
    'YearsAtCompany': X_test['YearsAtCompany'].values,
    'Predicted_Probability': y_pred_proba_rf,
    'Predicted_Attrition': y_pred_rf,
    'Actual_Attrition': y_test.values
})

predictions_df['Risk_Level'] = predictions_df['Predicted_Probability'].apply(
    lambda x: 'High' if x >= 0.7 else ('Medium' if x >= 0.4 else 'Low')
)

predictions_df.to_csv('model_predictions.csv', index=False)
print("\n✓ Predictions saved to 'model_predictions.csv'")


✓ Predictions saved to 'model_predictions.csv'


In [9]:

print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)

comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest'],
    'Accuracy': [lr_accuracy, dt_accuracy, rf_accuracy],
    'AUC-ROC': [lr_auc, dt_auc, rf_auc]
})

print(comparison)
print(f"\n✓ Best Model: Random Forest with {rf_accuracy:.4f} accuracy")


MODEL COMPARISON
                 Model  Accuracy   AUC-ROC
0  Logistic Regression  0.840136  0.543544
1        Decision Tree  0.789116  0.442846
2        Random Forest  0.840136  0.464381

✓ Best Model: Random Forest with 0.8401 accuracy
