In [2]:
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.feature_selection import SelectKBest 
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from category_encoders import OrdinalEncoder

In [3]:
df = pd.read_csv('../data/clean_HR_Analytics.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                 

In [77]:
# vertical split 
target = "Attrition"
X = df.drop(columns=target)
y = df[target]

# split the datasets into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# Ordinal encoding
ordinal = OrdinalEncoder()
X_train_encode = ordinal.fit_transform(X_train)

# Label Encoding 
label = LabelEncoder()
y_train_encode = label.fit_transform(y_train)

In [91]:
df.select_dtypes("int")

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,2,94,3,2,4,...,3,1,0,8,0,1,6,4,0,5
1,49,279,8,1,2,3,61,2,2,2,...,4,4,1,10,3,3,10,7,1,7
2,37,1373,2,2,4,4,92,2,1,3,...,3,2,0,7,3,3,0,0,0,0
3,33,1392,3,4,5,4,56,3,1,3,...,3,3,0,8,3,3,8,7,3,0
4,27,591,2,1,7,1,40,3,1,2,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,2061,3,41,4,2,4,...,3,3,1,17,3,3,5,2,0,3
1466,39,613,6,1,2062,4,42,2,3,1,...,3,1,1,9,5,3,7,7,1,7
1467,27,155,4,3,2064,2,87,4,2,2,...,4,2,1,6,0,3,6,2,0,3
1468,49,1023,2,3,2065,4,63,2,2,2,...,3,4,0,17,3,2,9,6,0,8


In [79]:
# normalized the training set
scaler = MinMaxScaler()
# scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encode)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_encode.columns)

In [80]:
# oversampling the datasets
smote = SMOTE()
X_train_over, y_train_over = smote.fit_resample(X_train_scaled, y_train_encode)

In [82]:
clf = RandomForestClassifier(random_state=42)

In [83]:
params = {
    "n_estimators": range(50, 130, 5),
    "max_depth": range(10, 50, 5)
}

In [84]:
model = GridSearchCV(
    clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

model

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(10, 50, 5),
                         'n_estimators': range(50, 130, 5)},
             verbose=1)

In [85]:
# Train model
model.fit(X_train_over, y_train_over)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(10, 50, 5),
                         'n_estimators': range(50, 130, 5)},
             verbose=1)

In [86]:
# Extract the best hyperparameters
model.best_params_

{'max_depth': 15, 'n_estimators': 120}

In [87]:
# Ordinal encoding
ordinal = OrdinalEncoder()
X_test_encode = ordinal.fit_transform(X_test)

# Label Encoding 
label = LabelEncoder()
y_test_encode = label.fit_transform(y_test)


# normalized the training set
scaler = MinMaxScaler()
# scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test_encode)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_encode.columns)

In [88]:
train_acc = model.score(X_train_over, y_train_over)
test_acc = model.score(X_test_scaled, y_test_encode)

print("Training Accuracy: ", round(train_acc, 2))
print("Test Accuracy: ", round(test_acc, 2))

Training Accuracy:  1.0
Test Accuracy:  0.86
