In [1]:
# Import the core data-analysis libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Read the HR dataset from the CSV file (path is relative to the working directory)
# and display the resulting frame.

df = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low


In [3]:
# Count the missing values in each column (isna is the same check as isnull).

df.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

In [4]:
# Keep only the rows of employees who left (left == 1);
# the first entry of .shape is the number of such employees.

a = df.loc[df['left'] == 1]
a.shape

(3571, 10)

In [5]:
# Keep only the rows of employees who are still in the organization (left == 0);
# the first entry of .shape is the number of such employees.

b = df.loc[df['left'] == 0]
b.shape

(11428, 10)

In [6]:
# Build a new frame holding only the columns used as predictors.
# The target column 'left' and the 'Department' column are not selected here.

new_data = df.loc[
    :,
    [
        'satisfaction_level',
        'last_evaluation',
        'number_project',
        'average_montly_hours',
        'time_spend_company',
        'Work_accident',
        'promotion_last_5years',
        'salary',
    ],
]
new_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary
0,0.38,0.53,2,157,3,0,0,low
1,0.8,0.86,5,262,6,0,0,medium
2,0.11,0.88,7,272,4,0,0,medium
3,0.72,0.87,5,223,5,0,0,low
4,0.37,0.52,2,159,3,0,0,low


In [7]:
# One-hot encode the text 'salary' column into indicator columns
# (salary_data_high / salary_data_low / salary_data_medium) and append them
# to the selected predictors.
# dtype=int keeps the indicators as 0/1 on every pandas version; recent pandas
# releases would otherwise produce True/False boolean columns.

salary_dummies = pd.get_dummies(new_data['salary'], prefix='salary_data', dtype=int)
df_with_dummies = pd.concat([new_data, salary_dummies], axis='columns')
df_with_dummies

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,salary_data_high,salary_data_low,salary_data_medium
0,0.38,0.53,2,157,3,0,0,low,0,1,0
1,0.80,0.86,5,262,6,0,0,medium,0,0,1
2,0.11,0.88,7,272,4,0,0,medium,0,0,1
3,0.72,0.87,5,223,5,0,0,low,0,1,0
4,0.37,0.52,2,159,3,0,0,low,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,low,0,1,0
14995,0.37,0.48,2,160,3,0,0,low,0,1,0
14996,0.37,0.53,2,143,3,0,0,low,0,1,0
14997,0.11,0.96,6,280,4,0,0,low,0,1,0


In [8]:
# Drop the original text 'salary' column now that its indicator columns exist.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# makes the cell safe to re-run: a second run finds no 'salary' column and
# leaves the frame unchanged rather than raising KeyError.

df_with_dummies = df_with_dummies.drop(columns='salary', errors='ignore')
df_with_dummies

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_data_high,salary_data_low,salary_data_medium
0,0.38,0.53,2,157,3,0,0,0,1,0
1,0.80,0.86,5,262,6,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,0,1,0
14995,0.37,0.48,2,160,3,0,0,0,1,0
14996,0.37,0.53,2,143,3,0,0,0,1,0
14997,0.11,0.96,6,280,4,0,0,0,1,0


In [9]:
# Feature matrix X: the encoded frame of predictors, displayed for inspection.
# Note: X is a second name for the df_with_dummies object, not a copy of it.

X = df_with_dummies
X

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_data_high,salary_data_low,salary_data_medium
0,0.38,0.53,2,157,3,0,0,0,1,0
1,0.80,0.86,5,262,6,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,0,1,0
14995,0.37,0.48,2,160,3,0,0,0,1,0
14996,0.37,0.53,2,143,3,0,0,0,1,0
14997,0.11,0.96,6,280,4,0,0,0,1,0


In [10]:
# Target vector y: the 'left' column of the original frame
# (1 = employee left, 0 = employee stayed), displayed for inspection.

y = df.loc[:, 'left']
y

0        1
1        1
2        1
3        1
4        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: left, Length: 14999, dtype: int64

In [11]:
# Import train_test_split from sklearn.model_selection and hold out 20% of the
# rows as a test set; random_state=0 keeps the split reproducible.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Standardize the features with StandardScaler. The scaler is fitted on the
# training split only and then applied to both splits, so the test set is
# scaled with the training-set statistics.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [13]:
# Import LogisticRegression from sklearn.linear_model and create the classifier
# with its default settings. The model is not fitted in this cell.

from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()

In [14]:
# Fit the logistic-regression model on the scaled training features and the
# training labels.

reg.fit(X_train, y_train)

LogisticRegression()

In [15]:
# Predict the class label for every row of the scaled test set.
# The predictions are only displayed here; they are not stored in a variable.

reg.predict(X_test)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [16]:
# Compute the model's accuracy on the held-out test set.
# NOTE(review): the classes are imbalanced in the full data (11428 stayed vs
# 3571 left, i.e. about 76% stayed), so this score should be read against that
# majority-class baseline - TODO consider precision/recall as well.

reg.score(X_test, y_test)

0.7856666666666666