# Important Library, Dataset & Profiling

In [1]:
#importing libraries

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Setting path for saving images

from pathlib import Path
IMAGES_PATH = Path() / "Diagrams"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### Reading Dataset

In [3]:
# Reading the Dataset

df = pd.read_csv("../Dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


### Applying Profiling

In [4]:
 #Applying Profiling for Data Preprocessing
#!pip install ydata-profiling
#!pip install ydata-profiling[notebook]

from ydata_profiling import ProfileReport
prof = ProfileReport(df, title="Employee Attrition Dataset Profile Report")
prof.to_file(output_file='Dataset Profile.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/35 [00:00<?, ?it/s][A
 23%|██████████████████▉                                                                | 8/35 [00:00<00:00, 68.25it/s][A
 46%|█████████████████████████████████████▍                                            | 16/35 [00:00<00:00, 64.88it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 94.62it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Exploratory Data Analysis (EDA) and visualization

### Target Variable Analysis (Attrition) 

In [29]:
sns.countplot(data=df, x='Attrition')
plt.title('Employee Attrition Distribution')
plt.savefig(IMAGES_PATH / 'Target Distribution' / 'attrition_distribution.png')
plt.show()


  plt.show()


### Categorical Feature Distributions 

In [6]:
categorical_features = ['BusinessTravel', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
for feature in categorical_features:
    if feature == 'JobRole':
        plt.figure(figsize=(14,12))  
        plt.xticks(rotation=25)
    else:
        plt.figure(figsize=(10,8))
        plt.xticks(rotation=0)
    sns.countplot(data=df, x=feature, hue='Attrition')
    plt.title(f'{feature} Distribution by Attrition')
    plt.savefig(IMAGES_PATH / 'Catagorical Distribution' / f'{feature}_attrition.png')
    plt.show()


  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()


### Numerical Feature Distributions 

In [7]:
numerical_features = ['Age', 'MonthlyIncome', 'DistanceFromHome']
for feature in numerical_features:
    plt.figure(figsize=(8,4))
    sns.kdeplot(data=df, x=feature, hue='Attrition', fill=True)
    plt.title(f'Distribution of {feature} by Attrition')
    plt.savefig(IMAGES_PATH / 'Numerical Distribution' / f'{feature}_attrition_dist.png')
    plt.show()


  plt.show()
  plt.show()
  plt.show()


### Correlation Heatmap 

In [8]:
plt.figure(figsize=(18,17))
numeric_df = df.select_dtypes(include=['number'])
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.savefig(IMAGES_PATH / 'Correlation Heatmap' / 'correlation_heatmap.png')
plt.show()


  plt.show()


### PairPlot/ ScatterPlot

In [30]:
# Important numeric features based on logistic regression & domain knowledge
important_features = [
    'Age',
    'TotalWorkingYears',
    'NumCompaniesWorked',
    'MonthlyIncome',
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'YearsAtCompany',
    'WorkLifeBalance',
    'DistanceFromHome'
]

# Convert Attrition to categorical if not already
df['Attrition'] = df['Attrition'].astype('category')

# Generate pairplot for the important features
sns.pairplot(
    df,
    vars=important_features,
    hue='Attrition',
    diag_kind='kde',
    palette='Set2',
    corner=True
)

plt.suptitle('Pairplot of Important Numeric Features by Attrition', y=1.02)
plt.savefig(IMAGES_PATH / 'PairPlot' /'pairplot_important_features.png', dpi=300, bbox_inches='tight')
plt.show()


  plt.show()


# Dataset Preprocessing

## Setting Independant & Dependant Features

In [31]:
# Separating the Independent and Dependent Variables
columns_to_drop = [
    'EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber',
    'JobLevel', 'Department', 'PerformanceRating', 'DailyRate', 'Education',
    'HourlyRate', 'JobInvolvement', 'MonthlyRate', 'PercentSalaryHike',
    'RelationshipSatisfaction', 'TrainingTimesLastYear', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition'
]
X = df.drop(columns=columns_to_drop)
y = df['Attrition'].map({'Yes': 1, 'No': 0})

In [32]:
X.shape

(1470, 16)

In [33]:
X.head()

Unnamed: 0,Age,BusinessTravel,DistanceFromHome,EducationField,EnvironmentSatisfaction,Gender,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
0,41,Travel_Rarely,1,Life Sciences,2,Female,Sales Executive,4,Single,5993,8,Yes,0,8,1,6
1,49,Travel_Frequently,8,Life Sciences,3,Male,Research Scientist,2,Married,5130,1,No,1,10,3,10
2,37,Travel_Rarely,2,Other,4,Male,Laboratory Technician,3,Single,2090,6,Yes,0,7,3,0
3,33,Travel_Frequently,3,Life Sciences,4,Female,Research Scientist,3,Married,2909,1,Yes,0,8,3,8
4,27,Travel_Rarely,2,Medical,1,Male,Laboratory Technician,2,Married,3468,9,No,1,6,3,2


## Splitting Dataset

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [35]:
X_train.head()

Unnamed: 0,Age,BusinessTravel,DistanceFromHome,EducationField,EnvironmentSatisfaction,Gender,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
1194,47,Travel_Rarely,2,Life Sciences,2,Female,Manager,2,Divorced,15972,6,No,3,29,3,3
128,22,Travel_Rarely,2,Technical Degree,3,Male,Laboratory Technician,4,Married,2523,0,No,1,3,3,2
810,46,Travel_Rarely,3,Marketing,1,Male,Manager,3,Married,17465,3,No,1,23,3,12
478,25,Travel_Rarely,13,Medical,2,Male,Sales Representative,3,Married,2096,1,No,0,7,3,7
491,43,Travel_Frequently,9,Medical,4,Male,Laboratory Technician,3,Divorced,5679,3,Yes,1,10,3,8


In [36]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

## One Hot Embedding for Catagorical attributes

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['BusinessTravel', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' 
)


In [38]:
# Applying the preprocessor to X
X_transformed = preprocessor.fit_transform(X_train)

# Getting encoded column names from OneHotEncoder
encoded_col_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

# Getting remaining numerical feature names
numerical_features = [col for col in X_train.columns
                      if col not in categorical_features]

# Combining all column names
all_column_names = list(encoded_col_names) + numerical_features

# Converting the result into a DataFrame with column names
X_train = pd.DataFrame(X_transformed, columns=all_column_names)

# Transforming test set with the same fitted preprocessor
X_test_transformed = preprocessor.transform(X_test)
X_test = pd.DataFrame(X_test_transformed, columns=all_column_names)

X_train.head()

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,JobRole_Human Resources,JobRole_Laboratory Technician,...,Age,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,47.0,2.0,2.0,2.0,15972.0,6.0,3.0,29.0,3.0,3.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,22.0,2.0,3.0,4.0,2523.0,0.0,1.0,3.0,3.0,2.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,46.0,3.0,1.0,3.0,17465.0,3.0,1.0,23.0,3.0,12.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,25.0,13.0,2.0,3.0,2096.0,1.0,0.0,7.0,3.0,7.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,43.0,9.0,4.0,3.0,5679.0,3.0,1.0,10.0,3.0,8.0


In [39]:
X_test.head()

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,JobRole_Human Resources,JobRole_Laboratory Technician,...,Age,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.0,13.0,4.0,2.0,2033.0,1.0,1.0,1.0,3.0,1.0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,44.0,2.0,1.0,4.0,2011.0,1.0,1.0,10.0,3.0,10.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,31.0,7.0,3.0,4.0,11557.0,9.0,1.0,10.0,2.0,5.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,44.0,4.0,3.0,1.0,19190.0,1.0,2.0,26.0,2.0,25.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,36.0,9.0,4.0,3.0,3388.0,0.0,1.0,2.0,2.0,1.0


In [40]:
X_train.columns

Index(['BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_Yes', 'Age',
       'DistanceFromHome', 'EnvironmentSatisfaction', 'JobSatisfaction',
       'MonthlyIncome', 'NumCompaniesWorked', 'StockOptionLevel',
       'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany'],
      dtype='object')

In [41]:
# Checking for Missing Values
X_train.isnull().sum()

BusinessTravel_Travel_Frequently    0
BusinessTravel_Travel_Rarely        0
EducationField_Life Sciences        0
EducationField_Marketing            0
EducationField_Medical              0
EducationField_Other                0
EducationField_Technical Degree     0
Gender_Male                         0
JobRole_Human Resources             0
JobRole_Laboratory Technician       0
JobRole_Manager                     0
JobRole_Manufacturing Director      0
JobRole_Research Director           0
JobRole_Research Scientist          0
JobRole_Sales Executive             0
JobRole_Sales Representative        0
MaritalStatus_Married               0
MaritalStatus_Single                0
OverTime_Yes                        0
Age                                 0
DistanceFromHome                    0
EnvironmentSatisfaction             0
JobSatisfaction                     0
MonthlyIncome                       0
NumCompaniesWorked                  0
StockOptionLevel                    0
TotalWorking

## Feature Scaling

In [42]:
from sklearn.preprocessing import MinMaxScaler
scaleVars = ['Age', 'DistanceFromHome', 'EnvironmentSatisfaction', 'JobSatisfaction', 
             'MonthlyIncome','NumCompaniesWorked', 'StockOptionLevel','TotalWorkingYears',
             'WorkLifeBalance','YearsAtCompany']
mms = MinMaxScaler()
X_train[scaleVars] = mms.fit_transform(X_train[scaleVars])
X_test[scaleVars] = mms.transform(X_test[scaleVars])

In [43]:
X_train.head()

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,JobRole_Human Resources,JobRole_Laboratory Technician,...,Age,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.690476,0.035714,0.333333,0.333333,0.789021,0.666667,1.0,0.725,0.666667,0.081081
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.095238,0.035714,0.666667,1.0,0.079835,0.0,0.333333,0.075,0.666667,0.054054
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.666667,0.071429,0.0,0.666667,0.867749,0.333333,0.333333,0.575,0.666667,0.324324
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.166667,0.428571,0.333333,0.666667,0.057319,0.111111,0.0,0.175,0.666667,0.189189
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.595238,0.285714,1.0,0.666667,0.246256,0.333333,0.333333,0.25,0.666667,0.216216


In [44]:
X_test.head()

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,JobRole_Human Resources,JobRole_Laboratory Technician,...,Age,DistanceFromHome,EnvironmentSatisfaction,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.428571,1.0,0.333333,0.053997,0.111111,0.333333,0.025,0.666667,0.027027
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.619048,0.035714,0.0,1.0,0.052837,0.111111,0.333333,0.25,0.666667,0.27027
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.309524,0.214286,0.666667,1.0,0.556212,1.0,0.333333,0.25,0.333333,0.135135
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.619048,0.107143,0.666667,0.0,0.958711,0.111111,0.666667,0.65,0.333333,0.675676
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.428571,0.285714,1.0,0.666667,0.125448,0.0,0.333333,0.05,0.333333,0.027027


# Finding Important Features Using Random Forest

In [49]:
# Train a Random Forest classifier on your training data
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Get feature importances from the trained model
importances = rf.feature_importances_

# Create a DataFrame to see features and their importance scores
feature_importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})

# Sort features by importance descending
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Show the top 10 important features
print(feature_importances.head(10))

# Plot feature importances
plt.figure(figsize=(16,6))
plt.barh(feature_importances['feature'][:10][::-1], feature_importances['importance'][:10][::-1], color='skyblue')
plt.xlabel('Feature Importance')
plt.title('Top 10 Important Features from Random Forest')
plt.savefig(IMAGES_PATH /'important_features.png')
plt.show()

                    feature  importance
23            MonthlyIncome    0.121646
19                      Age    0.099001
26        TotalWorkingYears    0.095568
20         DistanceFromHome    0.083496
28           YearsAtCompany    0.081414
24       NumCompaniesWorked    0.058113
18             OverTime_Yes    0.049336
21  EnvironmentSatisfaction    0.048252
27          WorkLifeBalance    0.047471
22          JobSatisfaction    0.043711


  plt.show()


# Classification Report To Find the Best Model

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from xgboost import XGBClassifier

top_features = [
    'MonthlyIncome', 'Age', 'TotalWorkingYears', 'DistanceFromHome', 'YearsAtCompany',
    'NumCompaniesWorked', 'OverTime_Yes', 'EnvironmentSatisfaction', 'WorkLifeBalance', 'JobSatisfaction'
]

# Select only these features for training and testing
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

def evaluate_classifiers(train_X, train_y, test_X, test_y):
    classifiers = {
        'KNN': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(),
        'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear'),
        'SVC': SVC(random_state=20, probability=True),
        'Random Forest': RandomForestClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=42),
       # 'XGBoost': XGBClassifier()
    }
    
    results = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'])
    
    for name, classifier in classifiers.items():
        classifier.fit(train_X, train_y)
        predictions = classifier.predict(test_X)
        probabilities = classifier.predict_proba(test_X)[:,1] if hasattr(classifier, "predict_proba") else preds
        
        accuracy = accuracy_score(test_y, predictions)
        precision = precision_score(test_y, predictions)
        recall = recall_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        roc_auc = roc_auc_score(test_y, probabilities)
        
        results.loc[name] = [accuracy, precision, recall, f1, roc_auc]
    
    return results

### Using All the Features for Report 

In [61]:
results_df = evaluate_classifiers(X_train, y_train, X_test, y_test)
print(results_df)

                     Accuracy  Precision    Recall  F1 Score   ROC AUC
KNN                  0.853741   0.700000  0.148936  0.245614  0.665044
Decision Tree        0.802721   0.377778  0.361702  0.369565  0.624171
Logistic Regression  0.867347   0.750000  0.255319  0.380952  0.785081
SVC                  0.857143   0.777778  0.148936  0.250000  0.783788
Random Forest        0.836735   0.461538  0.127660  0.200000  0.793953
AdaBoost             0.819728   0.333333  0.127660  0.184615  0.788483


### Using only the Top Features for Report

In [62]:
results_top_features = evaluate_classifiers(X_train_top, y_train, X_test_top, y_test)
print(results_top_features)

                     Accuracy  Precision    Recall  F1 Score   ROC AUC
KNN                  0.860544   0.687500  0.234043  0.349206  0.711775
Decision Tree        0.772109   0.307692  0.340426  0.323232  0.597338
Logistic Regression  0.853741   0.700000  0.148936  0.245614  0.767508
SVC                  0.850340   0.615385  0.170213  0.266667  0.687915
Random Forest        0.833333   0.450000  0.191489  0.268657  0.773538
AdaBoost             0.809524   0.304348  0.148936  0.200000  0.753596


### Using Top Features Plus Two More Feature Close to Important

In [65]:
enhanced_top_features = [
    'MonthlyIncome', 'Age', 'TotalWorkingYears', 'DistanceFromHome', 'YearsAtCompany',
    'NumCompaniesWorked', 'OverTime_Yes', 'EnvironmentSatisfaction', 'WorkLifeBalance','JobSatisfaction', 
    
    'Gender_Male', 'StockOptionLevel'
]

X_train_tops = X_train[enhanced_top_features]
X_test_tops = X_test[enhanced_top_features]

results_top_features = evaluate_classifiers(X_train_tops, y_train, X_test_tops, y_test)
print(results_top_features)

                     Accuracy  Precision    Recall  F1 Score   ROC AUC
KNN                  0.860544   0.714286  0.212766  0.327869  0.690456
Decision Tree        0.755102   0.272727  0.319149  0.294118  0.578603
Logistic Regression  0.874150   0.916667  0.234043  0.372881  0.789215
SVC                  0.853741   0.700000  0.148936  0.245614  0.785942
Random Forest        0.833333   0.437500  0.148936  0.222222  0.760875
AdaBoost             0.826531   0.423077  0.234043  0.301370  0.787277


### Hghlighting Information for Project Report
<li>Adding features like Gender_Male and StockOptionLevel helped increase classification precision and ROC AUC, indicating better positive class identification.</li>
<li>Logistic Regression stands out as your best-performing model with these features.</li>

# Hyper Parameter Tuning on Logistic Regression,KNN,Random Forest
<li>As They Showed Improvement in Accuracy</li>
<li>Random Forest cause it has improving capability with hyperparameter tuning</li>

# Decision Tree

In [26]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Fit decision tree
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

# Draw and save the decision tree
def draw_decision_tree(decision_tree, feature_names=None, class_names=None):
    plt.figure(figsize=(12, 8))
    plot_tree(
        decision_tree,
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        rounded=True,
        fontsize=10
    )
    plt.savefig('my_plot.png', dpi=300, bbox_inches="tight")
    plt.show()

# Call the function
feature_names = list(X_train.columns)
draw_decision_tree(dt, feature_names=feature_names, class_names=["No Attrition", "Attrition"])


  plt.show()


## Logistic Regression

In [27]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scaling the data as shown in the Warning
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression
log_reg = LogisticRegression(max_iter=2000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Get feature importance (coefficients)
feature_names = list(X_train.columns)
coefficients = log_reg.coef_[0]  # For binary classification

# Sort by absolute value for better visualization
coef_sorted_idx = np.argsort(np.abs(coefficients))[::-1]
sorted_features = [feature_names[i] for i in coef_sorted_idx]
sorted_coefs = coefficients[coef_sorted_idx]

# Plot coefficients
plt.figure(figsize=(12, 8))
plt.barh(sorted_features, sorted_coefs, color='skyblue')
plt.xlabel("Coefficient Value")
plt.ylabel("Feature")
plt.title("Logistic Regression Feature Importance")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('log_reg_coefficients.png', dpi=300, bbox_inches="tight")
plt.show()

# Print top coefficients
for feat, coef in zip(sorted_features[:10], sorted_coefs[:10]):
    print(f"{feat}: {coef:.4f}")


OverTime_Yes: 0.7775
BusinessTravel_Travel_Frequently: 0.6983
JobRole_Laboratory Technician: 0.6693
JobRole_Sales Representative: 0.4908
TotalWorkingYears: -0.4807
NumCompaniesWorked: 0.4506
Age: -0.4427
EnvironmentSatisfaction: -0.4357
JobRole_Research Director: -0.4143
JobRole_Sales Executive: 0.4121


  plt.show()


# Shuffle Split/ Repeated Random train-test splits

### Decision Tree

In [28]:
from sklearn.model_selection import ShuffleSplit , cross_val_score
ssplit = ShuffleSplit(n_splits = 35, test_size = 0.20)
clf = DecisionTreeClassifier()
results = cross_val_score(clf, X, y, cv = ssplit)
print(results)
print()
print ("Results = ", np.mean(results), "+/-", np.std(results))

ValueError: 
All the 35 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 1009, in fit
    super()._fit(
  File "E:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 645, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Travel_Rarely'

--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 1009, in fit
    super()._fit(
  File "E:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 645, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Travel_Frequently'

--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 1009, in fit
    super()._fit(
  File "E:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 645, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Non-Travel'


### Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.exceptions import ConvergenceWarning
import warnings

scaler = StandardScaler(with_mean=False)  

# Pipeline: scale -> logistic regression
clf = Pipeline([
    ('scaler', scaler),
    ('log_reg', LogisticRegression(max_iter=5000, random_state=42, solver='lbfgs'))
])

# ShuffleSplit config
ssplit = ShuffleSplit(n_splits=35, test_size=0.20, random_state=42)

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    results = cross_val_score(clf, X, y, cv=ssplit)

print(results)
print()
print("Results = ", np.mean(results), "+/-", np.std(results))

# XAI using LIME

In [None]:
!pip install -U lime

In [None]:
import lime
import lime.lime_tabular

In [None]:
X_train.columns

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(training_data = X_train.values,
    mode='classification',
    feature_names=X_train.columns)

In [None]:
exp = explainer.explain_instance(X_test.iloc[1], dt.predict_proba)
exp.show_in_notebook(show_table = True)

In [None]:
X_test.head()

In [None]:
exp.as_pyplot_figure()

                     Accuracy  Precision    Recall  F1 Score   ROC AUC
KNN                  0.860544   0.714286  0.212766  0.327869  0.690456
Decision Tree        0.755102   0.272727  0.319149  0.294118  0.578603
Logistic Regression  0.874150   0.916667  0.234043  0.372881  0.789215
SVC                  0.853741   0.700000  0.148936  0.245614  0.785942
Random Forest        0.833333   0.437500  0.148936  0.222222  0.760875
AdaBoost             0.826531   0.423077  0.234043  0.301370  0.787277
