# WorkExo Analysis

In [1]:
import pandas as pd
import numpy as np
import pickle as pk

In [2]:
#importing datasets
data = pd.read_csv("./datasets/Workexo.csv", usecols = ['JobInvolvement', 'Department','HourlyRate', 'StandardHours','PerformanceRating'])
data=data.reindex(columns=['Department','JobInvolvement', 'HourlyRate', 'StandardHours','PerformanceRating'])

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 5 columns):
Department           1470 non-null object
JobInvolvement       1470 non-null int64
HourlyRate           1470 non-null int64
StandardHours        1470 non-null int64
PerformanceRating    1470 non-null int64
dtypes: int64(4), object(1)
memory usage: 57.5+ KB


In [4]:
data.shape

(1470, 5)

In [5]:
data.head()

Unnamed: 0,Department,JobInvolvement,HourlyRate,StandardHours,PerformanceRating
0,Sales,3,94,80,3
1,Research & Development,2,61,80,4
2,Research & Development,2,92,80,3
3,Research & Development,3,56,80,3
4,Research & Development,3,40,80,3


In [6]:
pd.value_counts(data.Department)

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

In [7]:
#distributing independent and dependent variables
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values


In [8]:
#independent variable
print(X)


[['Sales' 3 94 80]
 ['Research & Development' 2 61 80]
 ['Research & Development' 2 92 80]
 ...
 ['Research & Development' 4 87 80]
 ['Sales' 2 63 80]
 ['Research & Development' 4 82 80]]


In [9]:
#Integer Encoding of Department type
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X[:,0] = label_encoder.fit_transform(X[:,0])
X = X.astype(np.int64)

In [10]:
#Independent variable after Integer Encoding
print(X)

[[ 2  3 94 80]
 [ 1  2 61 80]
 [ 1  2 92 80]
 ...
 [ 1  4 87 80]
 [ 2  2 63 80]
 [ 1  4 82 80]]


In [11]:
#Splitting the dataset into Training and Test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [12]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [13]:
#Predicting the Test Set Results
y_pred = classifier.predict(X_test)

In [14]:
#Accuracy/100
print(classifier.score(X_test, y_test))

0.8641304347826086


In [15]:
# Calculating the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [16]:
print(cm)

[[306   1]
 [ 49  12]]


In [17]:
#Calculating and printing the accuracy
# accuracy = (cm[0][0] + cm [1][1])/(cm[0][0] + cm [0][1]+cm[1][0] + cm [1][1])
# print(accuracy*100,"%")

In [18]:
pk.dump(classifier, open( "model.sav", "wb" ))