In [17]:
import numpy as np
import pandas as pd

In [18]:
# Load the raw dataset from the project's inputs folder and preview
# its first five rows.
raw_csv_path = "../inputs/general_data.csv"
df = pd.read_csv(raw_csv_path)
df.head(5)


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [19]:
# Impute the two columns handled here with their column medians.
#
# The previous `df[df.col != None]` row filter was a no-op: an elementwise
# `!= None` comparison on a Series evaluates to True for every row (NaN
# included), so nothing was ever filtered out. `Series.median()` already
# skips NaN by default, which is why taking the median of the whole column
# yields the same values.
ncw = df['NumCompaniesWorked'].median()
twy = df['TotalWorkingYears'].median()

# `fillna` with a dict only touches the listed columns and returns a new
# frame, so the raw `df` stays untouched.
df_clean = df.fillna({'NumCompaniesWorked': ncw, 'TotalWorkingYears': twy})

With the cleaned dataframe, let's transform Attrition into a numeric value (1 and 0) and drop the original column from the dataframe. There are also other columns that have no predictive value, so we can drop them right away.

In [20]:
# Encode the Attrition labels as integer category codes and store them in a
# new target column.
# NOTE(review): presumably "No" -> 0 and "Yes" -> 1 (codes follow the
# category order) — confirm against the data.
attrition_codes = df_clean['Attrition'].astype('category').cat.codes
df_clean['AttritionNum'] = attrition_codes

# Remove the original label column together with the other columns that are
# not used as features. This mutates df_clean in place, so the cell cannot be
# re-run without first re-creating df_clean.
df_clean.drop(
    columns=['Attrition', 'EmployeeCount', 'Over18', 'StandardHours', 'EmployeeID'],
    inplace=True,
)


In [21]:
# Feature frame: every column of df_clean except the target.
# Index.difference returns the remaining names sorted, so the columns of
# df_preprocessed come out in alphabetical order (as the preview shows).
feature_columns = df_clean.columns.difference(['AttritionNum'])
df_preprocessed = df_clean[feature_columns]
df_preprocessed.head()


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,Married,131160,1.0,11,0,1.0,6,1,0,0
1,31,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,Single,41890,0.0,23,1,6.0,3,5,1,4
2,32,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,Married,193280,1.0,15,3,5.0,2,5,0,3
3,38,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,Married,83210,3.0,11,3,13.0,5,8,7,5
4,32,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,Single,23420,4.0,12,2,9.0,2,6,0,4


In [22]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
is_object_col = df_preprocessed.dtypes == 'object'
num_cols = is_object_col[~is_object_col].index.tolist()
cat_cols = is_object_col[is_object_col].index.tolist()

In [23]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Numeric feature matrix used for modelling.
X = df_preprocessed[num_cols]

# Standardise every numeric column; the scaler is fitted on all rows of X.
scaler = StandardScaler()
scaled_values = scaler.fit(X).transform(X)

# Wrap the resulting array back into a DataFrame that keeps X's row index
# and column names.
# NOTE(review): the visible later cells pass X (not X_scaled) to
# train_test_split — confirm whether X_scaled is used anywhere.
X_scaled = pd.DataFrame(data=scaled_values, columns=X.columns, index=X.index)


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, df_clean['AttritionNum'], random_state=42, test_size=0.2)

# k-NN is distance-based, so features on very different scales (e.g.
# MonthlyIncome in the tens of thousands vs. Education in single digits) must
# be standardised or the largest-valued column dominates the neighbour
# search. Previously the classifier was fitted on the raw, unscaled X.
#
# Putting the scaler inside a pipeline means that:
#   * it is fitted on the training split only (no leakage from X_test), and
#   * the fitted estimator still accepts raw feature values in predict(),
#     so callers do not have to apply the scaling themselves.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# Pipeline.fit returns the pipeline itself, so KNN_Model and model refer to
# the same fitted estimator.
KNN_Model = model.fit(X_train, y_train)

In [37]:
import pickle

# Destination of the serialized model, relative to the notebook's folder.
Pkl_Filename = "../model.pkl"

# Persist the fitted estimator as a pickle file.
# NOTE: pickle files must only be loaded from trusted sources — unpickling
# can execute arbitrary code.
with open(Pkl_Filename, mode='wb') as pkl_handle:
    pkl_handle.write(pickle.dumps(KNN_Model))


In [26]:
# Inspect the numeric features of the row labelled 6. Columns are selected
# before the row so the result is taken from the all-numeric sub-frame.
df_preprocessed.loc[:, num_cols].loc[6]

Age                           28.0
DistanceFromHome              11.0
Education                      2.0
JobLevel                       2.0
MonthlyIncome              58130.0
NumCompaniesWorked             2.0
PercentSalaryHike             20.0
StockOptionLevel               1.0
TotalWorkingYears              5.0
TrainingTimesLastYear          2.0
YearsAtCompany                 0.0
YearsSinceLastPromotion        0.0
YearsWithCurrManager           0.0
Name: 6, dtype: float64

In [27]:
# Encoded attrition label of the row labelled 6.
df_clean.loc[6, 'AttritionNum']

1

In [29]:
# Export the numeric features of two rows as JSON files
# (row label -> output path).
sample_rows = {
    1: '../outputs/inference_sample.json',
    6: '../outputs/inference_sample_2.json',
}
numeric_features = df_preprocessed[num_cols]
for row_label, json_path in sample_rows.items():
    numeric_features.loc[row_label].to_json(json_path)


In [36]:
# JSON payload for the row labelled 65, returned as a string for display.
# The row is taken from df_preprocessed (the imputed feature frame the model
# was trained on) rather than the raw df, so a missing value in the source
# data can never show up as `null` in the payload and the sample is built the
# same way as the other inference samples in this notebook.
df_preprocessed[num_cols].loc[65].to_json()

'{"Age":28.0,"DistanceFromHome":9.0,"Education":4.0,"JobLevel":1.0,"MonthlyIncome":56730.0,"NumCompaniesWorked":5.0,"PercentSalaryHike":14.0,"StockOptionLevel":1.0,"TotalWorkingYears":5.0,"TrainingTimesLastYear":4.0,"YearsAtCompany":3.0,"YearsSinceLastPromotion":2.0,"YearsWithCurrManager":2.0}'