In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
HR = pd.read_csv(r'C:\Users\ENVY\OneDrive\Documents\HREmployee.csv')

In [4]:
# Explore the dataset
print(HR.head())
print(HR.info())

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [7]:
HR[list(HR.columns)].head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [6]:
HR.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [None]:
# Define numerical and categorical features
cols_num = ['DistanceFromHome', 'EmployeeCount', 'EmployeeNumber', 'HourlyRate','MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
    'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']
cols_cat = ['Age','Attrition', 'BusinessTravel','Department','EducationField','EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus',]

# Visualize numerical features
sns.pairplot(HR[cols_num])
plt.show()

In [None]:
# Visualize categorical features
for col in cols_cat:
    plt.figure(figsize=(15, 6))
    sns.countplot(x=col, data=HR, hue='subscribed')
    plt.title(f'Distribution of {col}')
    plt.show()

In [None]:
# Preprocess categorical features using one-hot encoding
cols_new_cat = pd.get_dummies(HR[cols_cat], drop_first=False)
HRDATA = pd.concat([Bankdata, cols_new_cat], axis=1)
HRDATA.drop(cols_cat, axis=1, inplace=True)

In [192]:
# Define input columns
cols_input = cols_num + list(cols_new_cat.columns)

In [None]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(HRDATA[cols_input], HRDATA['subscribed'],
                                                      test_size=0.2, random_state=42)

In [194]:
# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[cols_num])
X_valid_scaled = scaler.transform(X_valid[cols_num])

In [195]:
# Combine scaled numerical features with one-hot encoded categorical features
X_train_processed = np.concatenate([X_train_scaled, X_train[cols_new_cat.columns].values], axis=1)
X_valid_processed = np.concatenate([X_valid_scaled, X_valid[cols_new_cat.columns].values], axis=1)

In [None]:
# Train a K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_processed, y_train)

In [197]:
def calc_specificity(y_actual, y_pred, thresh):
    # Calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) / sum(y_actual == 0)

def print_report(y_actual, y_pred, thresh):
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    f1 = 2 * (precision * recall) / (precision + recall)
    
    print('AUC:%.3f' % auc)
    print('Accuracy:%.3f' % accuracy)
    print('Recall:%.3f' % recall)
    print('Precision:%.3f' % precision)
    print('Specificity:%.3f' % specificity)
    print('Prevalence:%.3f' % np.mean(y_actual))
    print('F1 Score:%.3f' % f1)
    print('')

# Assuming 'yes' corresponds to True and 'no' corresponds to False
y_train_bool = (y_train == 'yes')
y_valid_bool = (y_valid == 'yes')

# Create and fit the StandardScaler on the training data
scaler = StandardScaler()
scaler.fit(X_train)

# Transform the training and validation sets
X_train_tf = scaler.transform(X_train)
X_valid_tf = scaler.transform(X_valid)

# Train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_tf, y_train_bool)

# Evaluate the KNN model on training and validation sets
print('K-Nearest Neighbors Model Evaluation:')
print('Training:')
y_train_preds_knn = knn_model.predict_proba(X_train_tf)[:, 1]
print_report(y_train_bool, y_train_preds_knn, 0.5)

print('Validation:')
y_valid_preds_knn = knn_model.predict_proba(X_valid_tf)[:, 1]
print_report(y_valid_bool, y_valid_preds_knn, 0.5)

K-Nearest Neighbors Model Evaluation:
Training:
AUC:0.791
Accuracy:0.891
Recall:0.169
Precision:0.646
Specificity:0.987
Prevalence:0.118
F1 Score:0.267

Validation:
AUC:0.767
Accuracy:0.892
Recall:0.166
Precision:0.617
Specificity:0.986
Prevalence:0.115
F1 Score:0.261



In [198]:
# Train a Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_processed, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
