In [1]:
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
import pickle
import warnings
warnings.filterwarnings('ignore')

In [18]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score ,classification_report

In [3]:
#Importing the dataset
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head(7)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0


In [None]:
#Shape of the dataframe
df.shape

In [None]:
#Datatypes of the attributes 
df.dtypes

In [None]:
df.isna().sum()

In [None]:
df.describe()

In [None]:
df['Age'].describe()[1]

In [4]:
df['Attrition_result']=df["Attrition"]
df=df.drop('Attrition',axis=1)

In [None]:
#Data distribution analysis of Attrition_result
df['Attrition_result'].value_counts()

In [None]:
#Data distribution visualization
sns.countplot(df['Attrition_result'])

In [None]:
plt.subplots(figsize=(12,4))
sns.countplot(x='Age',hue='Attrition_result',data=df,palette='colorblind');

In [None]:
for column in df.columns:
    if df[column].dtype ==object:
        print(str(column)+' : '+str(df[column].unique())+' : '+str(len(df[column].unique())))
        print(df[column].value_counts())
        sns.countplot(df[column])
        plt.show()
        print("------------------------------------------------------------------------")

In [None]:
print(len(df['EmployeeNumber'].unique()))

In [None]:
print(len(df['StandardHours'].unique()))

In [None]:
df['EmployeeCount'].unique()

In [5]:
#Dropping the unnecessary attributes
df = df.drop('Over18',axis =1)
df = df.drop('EmployeeNumber',axis=1)
df = df.drop('StandardHours',axis=1)
df = df.drop('EmployeeCount',axis=1)

In [None]:
#Visualization of corelation between attribute in the dataset through heatmaps
plt.figure(figsize=(15,15))
sns.heatmap(df.corr(),annot=True,fmt='.0%')

In [6]:
#df = df.drop('PerformanceRating')
df = df.drop('BusinessTravel',axis=1)

In [7]:
#Data Transformation
#converting string into  binary number format using Label Encoder
from sklearn.preprocessing import LabelEncoder
for c in df.columns:
    if df[c].dtype == np.number:
        continue
    df[c] = LabelEncoder().fit_transform(df[c])

In [None]:
#sns.pairplot(df,hue="Attrition_result")

In [None]:
#print(df)

In [8]:
#splitting the data into dependent and independent variables 
x=df.iloc[:,:-1].values
y=df.iloc[:,-1].values

In [9]:
smote =SMOTE(sampling_strategy='minority')
x_sm,y_sm = smote.fit_resample(x,y)

In [10]:
#Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm, test_size = 0.25, random_state = 0,stratify=y_sm)

In [None]:
#visualizing the data to find out outliers
pos=1
fig = plt.figure(figsize=(20,90))
for i,var in enumerate(df):
    ax = fig.add_subplot(18,3,pos)
    pos+=1
    sns.boxplot(df[var],ax=ax)

In [None]:
#visualizing the data to find out the skewness of the attributes
pos = 1
fig = plt.figure(figsize=(20,75))
for i,var in enumerate(df):
    ax = fig.add_subplot(18,3,pos)
    pos = pos + 1
    sns.distplot(df[var],ax=ax)

In [11]:
#Feature Scaling
#standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [12]:
print(x_train)

[[ 1.04310457 -1.52436168 -2.35510927 ... -1.04057563 -0.66083046
  -0.13006755]
 [-0.82781731 -1.66606851 -0.39357907 ... -0.44802117  0.64843879
   0.4674353 ]
 [-1.26803422 -0.46358486  1.56795112 ... -1.04057563 -0.66083046
  -1.02632182]
 ...
 [ 0.60288766 -0.32187803  1.56795112 ...  1.03336499 -0.66083046
   1.06493814]
 [-0.93787154  1.65796878 -0.39357907 ... -0.7442984  -0.66083046
  -0.72757039]
 [ 0.3827792   1.40694526  1.56795112 ...  2.21847392  0.9757561
   1.66244099]]


In [13]:
print(y_train)

[0 1 1 ... 0 1 0]


In [None]:
#Logistic_regression
c_lr = LogisticRegression(random_state = 0)
c_lr.fit(x_train, y_train)
y_lr_pred = c_lr.predict(x_test)
cm = confusion_matrix(y_test, y_lr_pred)
print(accuracy_score(y_test, y_lr_pred))
print(classification_report(y_test, y_lr_pred))

In [None]:
#KNN
c_kn = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)
c_kn.fit(x_train, y_train)
y_k_pred = c_kn.predict(x_test)
cm = confusion_matrix(y_test, y_k_pred)
print(cm)
print(accuracy_score(y_test, y_k_pred))
print(classification_report(y_test, y_k_pred))

In [None]:
#Random forest
forest =RandomForestClassifier(n_estimators=20,criterion='entropy', random_state=0)
forest.fit(x_train,y_train)
forest.score(x_train,y_train)
y_rf_pred=forest.predict(x_test)
cm_rf = confusion_matrix(y_test, y_rf_pred)
print(cm_rf)
print(accuracy_score(y_test, y_rf_pred))
print(classification_report(y_test, y_rf_pred))

In [None]:
#svm
c_svm = SVC(kernel = 'rbf', random_state = 0)
c_svm.fit(x_train, y_train)
y_svm_pred = c_svm.predict(x_test)
cm_svm = confusion_matrix(y_test, y_svm_pred)
print(cm_svm)
print(accuracy_score(y_test, y_svm_pred))
print(classification_report(y_test, y_svm_pred))

In [None]:
#naive bayes
c_nb = GaussianNB()
c_nb.fit(x_train, y_train)
y_nb_pred = c_nb.predict(x_test)
y_nb_pred
cm_nb = confusion_matrix(y_test, y_nb_pred)
print(cm_nb)
print(accuracy_score(y_test, y_nb_pred))
print(classification_report(y_test, y_svm_pred))

In [None]:
#final_result_majority voting
i=0
final = y_svm_pred
while i<len(y_nb_pred):
    s = 0
    s = y_svm_pred[i]+y_rf_pred[i]+y_lr_pred[i]
    if s>=2:
        final[i] = 1
    else:
        final[i] = 0
    i+=1
cm_nb = confusion_matrix(y_test, final)
print(cm_nb)
print(accuracy_score(y_test, final))
print(classification_report(y_test, y_svm_pred))

In [19]:
#final_result_stacking
clf = [
            ('svm',SVC(kernel = 'rbf', random_state = 0)),
            ('rfc',RandomForestClassifier(n_estimators=20,criterion='entropy', random_state=0) ),
            ('c_kn',KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2))
      ] 
lr = LogisticRegression()
stack_model = StackingClassifier( estimators = clf,final_estimator = lr)
stack_model.fit(x_train, y_train)
stack_model_pred = stack_model.predict(x_test)
stack_model_pred
cm_stack_model = confusion_matrix(y_test, stack_model_pred)
print(cm_stack_model)
print(accuracy_score(y_test, stack_model_pred))
print(classification_report(y_test, stack_model_pred))

[[284  25]
 [ 32 276]]
0.9076175040518638
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       309
           1       0.92      0.90      0.91       308

    accuracy                           0.91       617
   macro avg       0.91      0.91      0.91       617
weighted avg       0.91      0.91      0.91       617

