# Classification using Naive Bayes

### 1 - Import necessary libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 2 - Import dataset

In [2]:
# Read the employee records from the CSV file into a DataFrame.
DATA_FILE = 'employees_data.csv'
ds = pd.read_csv(DATA_FILE)

### 3 - Allocate 'Age', 'BusinessTravel', 'MonthlyIncome' and 'JobSatisfaction' attributes as input and 'Attrition' as output

In [3]:
# Column positions in `ds` for the model inputs and for the label.
FEATURE_COLUMNS = [0, 1, 2, 3]  # Age, BusinessTravel, MonthlyIncome, JobSatisfaction
TARGET_COLUMN = 23              # Attrition

# Extract both selections as NumPy arrays.
x = ds.iloc[:, FEATURE_COLUMNS].values
y = ds.iloc[:, TARGET_COLUMN].values

In [4]:
# Display the selected feature array.
x

array([[41, 'Travel_Rarely', 5993, 4],
       [49, 'Travel_Frequently', 5130, 2],
       [37, 'Travel_Rarely', 2090, 3],
       ...,
       [27, 'Travel_Rarely', 6142, 2],
       [49, 'Travel_Frequently', 5390, 2],
       [34, 'Travel_Rarely', 4404, 3]], dtype=object)

In [5]:
# Display the target (Attrition) array.
y

array(['Yes', 'No', 'Yes', ..., 'No', 'No', 'No'], dtype=object)

### 4 - Use LabelEncoder to encode categorical data

In [6]:
from sklearn.preprocessing import LabelEncoder

# Replace the BusinessTravel strings with integer codes, writing them back
# into the same column of x.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent. Kept as-is here.
BUSINESS_TRAVEL_COL = 1
labelencoder = LabelEncoder()
travel_codes = labelencoder.fit_transform(x[:, BUSINESS_TRAVEL_COL])
x[:, BUSINESS_TRAVEL_COL] = travel_codes

In [7]:
# Display x again to check the encoded BusinessTravel column.
x

array([[41, 2, 5993, 4],
       [49, 1, 5130, 2],
       [37, 2, 2090, 3],
       ...,
       [27, 2, 6142, 2],
       [49, 1, 5390, 2],
       [34, 2, 4404, 3]], dtype=object)

### 5 - Split the data into training and test sets with the appropriate proportions

In [8]:
# Use 80:20 as train:test proportion

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [9]:
# Show the shape of each of the four arrays produced by the split.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((1176, 4), (294, 4), (1176,), (294,))

### 6 - Normalize the data using StandardScaler

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits. The test set is never used for fitting, so it
# is only transformed.
# NOTE: this cell overwrites x_train and x_test; re-run the split cell before
# re-running this one, otherwise already-scaled data gets scaled again.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [11]:
# Display the scaled training features.
x_train

array([[ 2.3389367 ,  0.595307  ,  2.41725694,  1.14972558],
       [ 0.9043263 ,  0.595307  , -0.91612115,  1.14972558],
       [ 0.35255307,  0.595307  ,  0.41020041, -1.57257768],
       ...,
       [ 0.68361701,  0.595307  ,  0.29395671,  1.14972558],
       [ 0.13184377,  0.595307  , -0.72026428,  0.24229116],
       [ 0.35255307,  0.595307  ,  0.68736435, -0.66514326]])

In [12]:
# Display the scaled test features.
x_test

array([[-0.08886552, -2.37113807,  0.74559207,  1.14972558],
       [-0.41992946,  0.595307  , -0.45284024, -0.66514326],
       [-0.19922017, -0.88791553, -0.39058952,  0.24229116],
       ...,
       [-0.64063875,  0.595307  , -0.06070303,  0.24229116],
       [ 0.13184377,  0.595307  ,  0.01996883,  0.24229116],
       [ 0.35255307,  0.595307  ,  0.01213456, -1.57257768]])

### 7 - Fit and predict results using the Naïve Bayes classifier

In [13]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the scaled training data, then predict
# a class label for every row of the test set.
classifier = GaussianNB().fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [14]:
# Display the predicted class labels for the test set.
y_pred

array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 

### 8 - Evaluate the results using confusion matrix and calculate the prediction accuracy

In [15]:
from sklearn.metrics import confusion_matrix
# Build the confusion matrix: rows are the actual classes and columns are the
# predicted classes, both in sorted label order.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[241,   4],
       [ 48,   1]], dtype=int64)

In [16]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the actual label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8231292517006803

### 9 - Briefly discuss the results and findings

Discussion:

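1 - From the confusion matrix obtained in Section B8 (rows are actual classes, columns are predicted classes, ordered 'No', 'Yes'), and taking 'No' as the positive class, it is observed that TP = 241, FP = 48, TN = 1, and FN = 4. This leads to an accuracy of 82%, which looks decent at first glance. However, the classes are imbalanced: always predicting 'No' would already score 245/294 ≈ 83%, and the model correctly identifies only 1 of the 49 employees who actually left, so accuracy alone overstates how useful the model is.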

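2 - Note that no smoothing technique was explicitly applied anywhere in the process. In a Naive Bayes model that estimates category frequencies, if a categorical variable has a category in the test data set that was not observed in the training data set, the model assigns that category zero probability and cannot make a meaningful prediction. This case is known as "Zero Frequency". (The GaussianNB classifier used here treats every feature, including the label-encoded "BusinessTravel", as a continuous Gaussian variable, so this concern would apply if a categorical Naive Bayes variant were used instead.)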

3 - Another thing to note is that Naive Bayes assumes that the predictors are independent of one another given the class. However, it cannot be guaranteed that "Age", "BusinessTravel", "MonthlyIncome", and "JobSatisfaction" are completely independent. If they are not, the predictions made by the fitted model are questionable.