## Simple Data Science Case: HR Analytics, Churn Rate
Data source: https://www.kaggle.com/datasets/vjchoudhary7/hr-analytics-case-study/code

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
# List the entries of the HR Analytics data folder that the cells below read from.
os.listdir("../data/HR_Analytics")

## Load and Join data

In [None]:
# Employee survey table, indexed by EmployeeID so it can be joined with the other tables.
df_emplo = pd.read_csv(
    '../data/HR_Analytics/employee_survey_data.csv',
    index_col='EmployeeID',
)
df_emplo.head()

In [None]:
# General employee table, indexed by EmployeeID.
df_gen = pd.read_csv(
    '../data/HR_Analytics/general_data.csv',
    index_col='EmployeeID',
)
df_gen.head()

In [None]:
# Show the column names of the general employee table.
df_gen.columns

In [None]:
# Manager survey table, indexed by EmployeeID.
df_manag = pd.read_csv(
    '../data/HR_Analytics/manager_survey_data.csv',
    index_col='EmployeeID',
)
df_manag.head()

In [None]:
# Left-join the employee survey onto the general data by EmployeeID,
# keeping every row of the general table.
df = df_gen.merge(df_emplo, how='left', on='EmployeeID')
df

In [None]:
# Left-join the manager survey onto the combined frame by EmployeeID.
df = df.merge(df_manag, how='left', on='EmployeeID')
df

## EDA

In [None]:
# Print the summary of the merged frame (pandas DataFrame.info).
df.info()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary of the object-dtype (string) columns only.
df.describe(include='object')

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# How many of those with missing data are employees with positive attrition (i.e. they leave)?
# Build the "row has at least one NaN" subset once and reuse it: the original evaluated the
# same filter twice and threw the first result away (a non-final expression in a cell is
# never displayed).
rows_with_missing = df[df.isna().any(axis=1)]
sns.countplot(x='Attrition', data=rows_with_missing)

In [None]:
# Drop every row that contains at least one missing value (pandas defaults spelled out).
df = df.dropna(axis=0, how='any')

In [None]:
# EmployeeID lives in the index, so check it separately from the attribute columns.
# Resetting the index first (as the original did) pulls the ID into the row comparison;
# if IDs are unique, every row then looks unique and repeated attribute rows are never
# reported. Identical attribute rows that land on both sides of a train/test split
# would inflate test scores, so they are worth surfacing here.
pd.Series({
    'duplicate_employee_ids': df.index.duplicated().sum(),
    'duplicate_attribute_rows': df.duplicated().sum(),
})

In [None]:
plt.figure(figsize=(12,9))
# Correlation is only defined for numeric columns. The frame still holds string columns at
# this point (Attrition, Department, ... are one-hot encoded only later), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0. Selecting numeric dtypes
# explicitly gives the same matrix older pandas produced by silently skipping them.
numeric_features = (
    df.drop(['EmployeeCount','StandardHours'], axis=1)
      .select_dtypes(include='number')
)
sns.heatmap(numeric_features.corr(), annot=False, cmap='coolwarm')
# exclude total working years, years at company, and percent salary hike due to possible multicollinearity

In [None]:
# Remove the three columns singled out after the correlation heatmap.
collinear_cols = ['PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany']
df = df.drop(columns=collinear_cols)

In [None]:
# Show the current column names of df.
df.columns

In [None]:
#sns.jointplot(x='DistanceFromHome',y='MonthlyIncome', data=df, hue='Attrition')
#sns.jointplot(x='YearsSinceLastPromotion',y='MonthlyIncome', data=df, hue='Attrition')
#sns.jointplot(x='YearsSinceLastPromotion',y='YearsWithCurrManager', data=df, hue='Attrition')

In [None]:
# Show the current column names of df (reference for the plots below).
df.columns

In [None]:
# Attrition counts split by each candidate feature. The original cell kept one
# commented-out countplot per feature (toggled by hand) and, with all of them commented,
# rendered only an empty figure. Drawing every plot in a loop makes the exploration
# reproducible on a fresh run.
hue_features = [
    'BusinessTravel',
    'Department',
    'Education',
    'EducationField',
    'Gender',
    'WorkLifeBalance',
    'JobSatisfaction',  # First one with insights lol
    'JobInvolvement',
    'PerformanceRating',
]
for feature in hue_features:
    fig, ax = plt.subplots(figsize=(12,4))
    sns.countplot(x='Attrition', data=df, hue=feature, ax=ax)
    ax.set_title(f'Attrition by {feature}')
    plt.show()

# As in the original, DistanceFromHome goes on the x-axis with Attrition as the hue.
fig, ax = plt.subplots(figsize=(12,4))
sns.countplot(x='DistanceFromHome', data=df, hue='Attrition', ax=ax)
ax.set_title('DistanceFromHome by Attrition')
plt.show()

# From a pure qualitative analysis, I cannot see any striking patterns tbh..

## Logistic Regression for Binary Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
# Show the current column names of df before encoding.
df.columns

In [None]:
# One-hot encoder that drops the first level of each feature and returns a dense array.
# scikit-learn renamed the dense/sparse switch from `sparse` to `sparse_output` in 1.2 and
# removed the old keyword in 1.4, so try the new name first and fall back for older
# installs (an unknown keyword raises TypeError in the constructor).
try:
    enc = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(drop='first', sparse=False)

In [None]:
# Fit the encoder on the string-valued columns (the Attrition target included) and encode them.
categorical_cols = ['Attrition','Department','EducationField','Gender','JobRole','MaritalStatus','BusinessTravel']
encoded_data = enc.fit_transform(df[categorical_cols])

In [None]:
# Wrap the encoded array in a DataFrame whose column names come from the fitted encoder.
encoded_feature_names = enc.get_feature_names_out(
    ['Attrition','Department','EducationField','Gender','JobRole','MaritalStatus','BusinessTravel']
)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_feature_names)

In [None]:
# Display the one-hot encoded frame.
encoded_df

In [None]:
# The raw string columns are replaced by their encoded versions, so remove them from df.
raw_categorical_cols = ['Attrition','Department','EducationField','Gender','JobRole','MaritalStatus','BusinessTravel']
df = df.drop(columns=raw_categorical_cols)

In [None]:
# reset_index() moves EmployeeID into a column and gives df a fresh 0..n-1 index,
# which lines up with encoded_df's default index for the column-wise concat.
df_positional = df.reset_index()
df = pd.concat([df_positional, encoded_df], axis='columns')

In [None]:
# Show the column names of df after appending the encoded columns.
df.columns

In [None]:
# Features: everything except the identifier, the columns this notebook excludes
# (EmployeeCount, Over18, StandardHours) and the target itself.
non_feature_cols = ['EmployeeID','EmployeeCount','Over18','StandardHours','Attrition_Yes']
X = df.drop(columns=non_feature_cols)
# Target: the encoded attrition indicator.
y = df['Attrition_Yes']

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same transform to both splits.
Scaler_X = StandardScaler().fit(X_train)
X_train = Scaler_X.transform(X_train)
X_test = Scaler_X.transform(X_test)

In [None]:
# Fit a default logistic regression on the scaled training data (fit returns the estimator)
# and predict labels for the held-out split.
m = LogisticRegression().fit(X_train, y_train)
preds = m.predict(X_test)

In [None]:
# Display the logistic-regression predictions for the test split.
preds

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
# Classification report for the logistic-regression predictions on the test split.
print(classification_report(y_test, preds))

In [None]:
# Compare accuracy on the training split with the test accuracy computed earlier.
train_pred = m.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
for report_line in (
    f'Training Accuracy: {train_accuracy:.2f}',
    f'Test Accuracy: {accuracy:.2f}',
    'no overfit at least',
):
    print(report_line)

The model is good at predicting class 0.0 with a high recall of 98%. 

However, it struggles with class 1.0 (`Attrition_Yes`, i.e. employees who leave), achieving only 14% recall.

This indicates that while the model makes few errors in predicting 0.0, it misses a large portion of the 1.0 class, only catching 14% of them.

The model's precision for class 1.0 is 60%. So, when it predicts a sample as 1.0, it's correct 60% of the time.

Given the considerable difference in recall between the two classes and the large support difference, the data might be imbalanced, with a higher representation of class 0.0.

Improvements might be needed, especially if correctly identifying class 1.0 is crucial. Techniques like resampling, using different algorithms, or applying weighted loss functions can be explored.


## Try Random Forest

In [None]:
# Show the current column names of df.
df.columns

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so an unseeded
# model gives different predictions and scores on every run. Fix the seed (42, the same
# value the train/test split uses) so the reported metrics can be reproduced.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
preds_rfc = rfc.predict(X_test)

In [None]:
# Display the random-forest predictions for the test split.
preds_rfc

In [None]:
# Test accuracy of the random forest; this rebinds `accuracy`, which previously held
# the logistic-regression score.
accuracy = accuracy_score(y_true=y_test, y_pred=preds_rfc)
accuracy

In [None]:
# Classification report for the random-forest predictions on the test split.
print(classification_report(y_test, preds_rfc))

In [None]:
# Narrative remark on the random-forest report.
# NOTE(review): before trusting the improvement, confirm it is not caused by identical
# attribute rows appearing in both the training and the test split.
print('This looks much much better!')

In [None]:
# Compare random-forest accuracy on the training split with its test accuracy
# (`accuracy` is the value assigned in the random-forest accuracy cell).
train_pred = rfc.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
for report_line in (
    f'Training Accuracy: {train_accuracy:.2f}',
    f'Test Accuracy: {accuracy:.2f}',
):
    print(report_line)