In [None]:
# Install the notebook's dependencies into the environment of the running
# kernel. %pip (unlike !pip) always targets the kernel's own interpreter.
# Consider pinning versions (pkg==x.y.z) for reproducible runs.
%pip install numpy pandas scikit-learn matplotlib seaborn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the employee dataset.
# Decoding the file as latin-1 garbles the curly apostrophe in
# education_level (the saved outputs show "Masterâs Degree"), which is the
# usual symptom of UTF-8 bytes read as latin-1. Try UTF-8 first and only
# fall back to latin-1 if the file is not valid UTF-8.
DATASET_PATH = "./dataset/employee_dataset.csv"
try:
    data = pd.read_csv(DATASET_PATH, encoding="utf-8")
except UnicodeDecodeError:
    data = pd.read_csv(DATASET_PATH, encoding="latin-1")
data.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,52685,36,Male,13,Healthcare,8029,Excellent,High,Average,1,...,1,Mid,Large,22,No,No,No,Poor,Medium,Stayed
1,30585,35,Male,7,Education,4563,Good,High,Average,1,...,4,Entry,Medium,27,No,No,No,Good,High,Left
2,54656,50,Male,7,Education,5583,Fair,High,Average,3,...,2,Senior,Medium,76,No,No,Yes,Good,Low,Stayed
3,33442,58,Male,44,Media,5525,Fair,Very High,High,0,...,4,Entry,Medium,96,No,No,No,Poor,Low,Left
4,15667,39,Male,24,Education,4604,Good,High,Average,0,...,6,Mid,Large,45,Yes,No,No,Good,High,Stayed


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14900 entries, 0 to 14899
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               14900 non-null  int64 
 1   Age                       14900 non-null  int64 
 2   Gender                    14900 non-null  object
 3   Years at Company          14900 non-null  int64 
 4   Job Role                  14900 non-null  object
 5   Monthly Income            14900 non-null  int64 
 6   Work-Life Balance         14900 non-null  object
 7   Job Satisfaction          14900 non-null  object
 8   Performance Rating        14900 non-null  object
 9   Number of Promotions      14900 non-null  int64 
 10  Overtime                  14900 non-null  object
 11  Distance from Home        14900 non-null  int64 
 12  Education Level           14900 non-null  object
 13  Marital Status            14900 non-null  object
 14  Number of Dependents  

In [4]:
data.shape

(14900, 24)

In [5]:
data.isnull()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14896,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14897,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14898,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
data.isnull().sum()

Employee ID                 0
Age                         0
Gender                      0
Years at Company            0
Job Role                    0
Monthly Income              0
Work-Life Balance           0
Job Satisfaction            0
Performance Rating          0
Number of Promotions        0
Overtime                    0
Distance from Home          0
Education Level             0
Marital Status              0
Number of Dependents        0
Job Level                   0
Company Size                0
Company Tenure              0
Remote Work                 0
Leadership Opportunities    0
Innovation Opportunities    0
Company Reputation          0
Employee Recognition        0
Attrition                   0
dtype: int64

In [7]:
# Convert the human-readable headers to snake_case: lower-case each header and
# turn spaces and hyphens into underscores. "Employee ID" is not in the list,
# so it keeps its original name.
headers_to_rename = [
    "Age",
    "Gender",
    "Years at Company",
    "Job Role",
    "Monthly Income",
    "Work-Life Balance",
    "Job Satisfaction",
    "Performance Rating",
    "Number of Promotions",
    "Overtime",
    "Distance from Home",
    "Education Level",
    "Marital Status",
    "Number of Dependents",
    "Job Level",
    "Company Size",
    "Company Tenure",
    "Remote Work",
    "Leadership Opportunities",
    "Innovation Opportunities",
    "Company Reputation",
    "Employee Recognition",
    "Attrition",
]
snake_case_names = {
    header: header.lower().replace(" ", "_").replace("-", "_")
    for header in headers_to_rename
}
data = data.rename(columns=snake_case_names)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14900 entries, 0 to 14899
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               14900 non-null  int64 
 1   age                       14900 non-null  int64 
 2   gender                    14900 non-null  object
 3   years_at_company          14900 non-null  int64 
 4   job_role                  14900 non-null  object
 5   monthly_income            14900 non-null  int64 
 6   work_life_balance         14900 non-null  object
 7   job_satisfaction          14900 non-null  object
 8   performance_rating        14900 non-null  object
 9   number_of_promotions      14900 non-null  int64 
 10  overtime                  14900 non-null  object
 11  distance_from_home        14900 non-null  int64 
 12  education_level           14900 non-null  object
 13  marital_status            14900 non-null  object
 14  number_of_dependents  

<br><br>
## Preprocess

### Age Column

In [9]:
# The age column is kept as-is: no encoding or scaling is applied to it here.
# Show how many employees fall on each distinct age.
data["age"].value_counts()

age
32    390
40    386
53    386
33    385
56    384
19    379
45    379
35    378
34    374
43    373
29    372
36    370
27    369
21    368
30    368
55    368
57    367
38    364
31    362
22    358
46    358
18    356
51    356
24    352
37    350
58    345
20    344
54    343
39    343
52    342
42    341
26    340
28    340
23    332
41    330
50    326
44    326
25    326
59    325
49    317
48    315
47    313
Name: count, dtype: int64

In [10]:
# Number of distinct ages present in the column.
data["age"].nunique()

42

In [11]:
# Oldest age in the dataset, via the Series reduction, shown as a plain int.
int(data["age"].max())

59

In [12]:
# Youngest age in the dataset, via the Series reduction, shown as a plain int.
int(data["age"].min())

18

In [13]:
# Count the employees older than 50 by summing the boolean mask.
int((data["age"] > 50).sum())

3216

<br><br>
### Gender Column

In [14]:
data["gender"].value_counts()

gender
Male      8087
Female    6813
Name: count, dtype: int64

In [15]:
# Number of distinct gender labels.
data["gender"].nunique()

2

In [16]:
# Encode gender as a 0/1 indicator column (Male -> 1, Female -> 0).
gender_codes = {"Male": 1, "Female": 0}
data["gender_binary"] = data["gender"].map(gender_codes)
# Series.map yields NaN for any label missing from the mapping; fail loudly
# here instead of letting NaN leak into the model features.
assert data["gender_binary"].notna().all(), "unmapped value in gender column"
data["gender_binary"].value_counts()

gender_binary
1    8087
0    6813
Name: count, dtype: int64

In [17]:
# Remove the original text column and preview the resulting frame.
data = data.drop(columns=["gender"])
data.head()

Unnamed: 0,Employee ID,age,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,...,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,attrition,gender_binary
0,52685,36,13,Healthcare,8029,Excellent,High,Average,1,Yes,...,Mid,Large,22,No,No,No,Poor,Medium,Stayed,1
1,30585,35,7,Education,4563,Good,High,Average,1,Yes,...,Entry,Medium,27,No,No,No,Good,High,Left,1
2,54656,50,7,Education,5583,Fair,High,Average,3,Yes,...,Senior,Medium,76,No,No,Yes,Good,Low,Stayed,1
3,33442,58,44,Media,5525,Fair,Very High,High,0,Yes,...,Entry,Medium,96,No,No,No,Poor,Low,Left,1
4,15667,39,24,Education,4604,Good,High,Average,0,Yes,...,Mid,Large,45,Yes,No,No,Good,High,Stayed,1


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14900 entries, 0 to 14899
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               14900 non-null  int64 
 1   age                       14900 non-null  int64 
 2   years_at_company          14900 non-null  int64 
 3   job_role                  14900 non-null  object
 4   monthly_income            14900 non-null  int64 
 5   work_life_balance         14900 non-null  object
 6   job_satisfaction          14900 non-null  object
 7   performance_rating        14900 non-null  object
 8   number_of_promotions      14900 non-null  int64 
 9   overtime                  14900 non-null  object
 10  distance_from_home        14900 non-null  int64 
 11  education_level           14900 non-null  object
 12  marital_status            14900 non-null  object
 13  number_of_dependents      14900 non-null  int64 
 14  job_level             

In [19]:
data.shape

(14900, 24)

<br><br>
### Years at Company Column

In [20]:
data["years_at_company"].value_counts()

years_at_company
1     635
5     628
8     612
6     608
10    607
4     601
7     597
3     595
2     586
11    567
9     565
12    552
13    499
15    444
14    439
17    424
16    419
18    406
20    364
19    351
23    334
21    334
22    322
24    284
26    257
25    250
27    223
28    209
30    205
29    203
31    182
32    166
33    160
36    145
35    130
37    120
34    111
38    106
40     89
41     88
39     86
42     78
43     74
45     61
44     50
47     35
46     34
48     33
49     19
51      7
50      6
Name: count, dtype: int64

In [21]:
# Number of distinct tenure values.
data["years_at_company"].nunique()

51

In [22]:
# Longest tenure in the dataset, shown as a plain int.
int(data["years_at_company"].max())

51

In [23]:
# Shortest tenure in the dataset, shown as a plain int.
int(data["years_at_company"].min())

1

In [24]:
# Count the rows with more than 50 years at the company.
int((data["years_at_company"] > 50).sum())

7

In [25]:
# Display up to seven rows whose tenure exceeds 50 years.
data.loc[data["years_at_company"].gt(50)].head(7)

Unnamed: 0,Employee ID,age,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,...,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,attrition,gender_binary
2028,24278,59,51,Finance,9088,Fair,High,Average,0,No,...,Mid,Medium,58,No,No,No,Poor,High,Stayed,0
2769,2731,59,51,Education,3825,Poor,High,Average,1,No,...,Entry,Small,115,No,No,No,Excellent,Low,Stayed,1
3929,11731,59,51,Technology,9964,Good,Medium,Average,2,No,...,Mid,Small,65,No,No,No,Good,Low,Left,1
5526,70285,59,51,Technology,8784,Good,Low,Average,1,No,...,Mid,Small,116,No,No,No,Fair,Medium,Stayed,1
7207,72932,59,51,Education,4199,Good,Very High,High,0,No,...,Entry,Small,93,Yes,No,No,Fair,High,Stayed,0
11293,43405,59,51,Media,6035,Excellent,Low,Average,2,No,...,Entry,Medium,70,Yes,No,No,Fair,High,Left,0
14447,47213,59,51,Healthcare,8448,Fair,Very High,Low,0,Yes,...,Mid,Small,87,No,No,Yes,Poor,High,Left,0


In [26]:
# Keep only rows with at most MAX_YEARS_AT_COMPANY years of tenure; this drops
# the 7 rows with more than 50 years shown in the previous cell.
# .copy() makes the result an independent frame, so later column assignments
# on `data` cannot trigger SettingWithCopyWarning.
MAX_YEARS_AT_COMPANY = 50
data = data[data["years_at_company"] <= MAX_YEARS_AT_COMPANY].copy()

In [27]:
# Sanity check: no rows above 50 years should remain, so this shows an empty frame.
data.loc[data["years_at_company"].gt(50)].head(7)

Unnamed: 0,Employee ID,age,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,...,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,attrition,gender_binary


<br><br>
### Job Role Column

In [28]:
data["job_role"].value_counts()

job_role
Technology    3813
Healthcare    3431
Education     3166
Media         2421
Finance       2062
Name: count, dtype: int64

In [29]:
# Number of distinct job roles.
data["job_role"].nunique()

5

In [30]:
data.head()

Unnamed: 0,Employee ID,age,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,...,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,attrition,gender_binary
0,52685,36,13,Healthcare,8029,Excellent,High,Average,1,Yes,...,Mid,Large,22,No,No,No,Poor,Medium,Stayed,1
1,30585,35,7,Education,4563,Good,High,Average,1,Yes,...,Entry,Medium,27,No,No,No,Good,High,Left,1
2,54656,50,7,Education,5583,Fair,High,Average,3,Yes,...,Senior,Medium,76,No,No,Yes,Good,Low,Stayed,1
3,33442,58,44,Media,5525,Fair,Very High,High,0,Yes,...,Entry,Medium,96,No,No,No,Poor,Low,Left,1
4,15667,39,24,Education,4604,Good,High,Average,0,Yes,...,Mid,Large,45,Yes,No,No,Good,High,Stayed,1


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               14893 non-null  int64 
 1   age                       14893 non-null  int64 
 2   years_at_company          14893 non-null  int64 
 3   job_role                  14893 non-null  object
 4   monthly_income            14893 non-null  int64 
 5   work_life_balance         14893 non-null  object
 6   job_satisfaction          14893 non-null  object
 7   performance_rating        14893 non-null  object
 8   number_of_promotions      14893 non-null  int64 
 9   overtime                  14893 non-null  object
 10  distance_from_home        14893 non-null  int64 
 11  education_level           14893 non-null  object
 12  marital_status            14893 non-null  object
 13  number_of_dependents      14893 non-null  int64 
 14  job_level                 1

In [32]:
data.shape

(14893, 24)

In [33]:
# One-hot encode job_role. dtype=int makes the indicator columns 0/1 integers
# directly, instead of casting them in a second step.
data = pd.get_dummies(data, columns=["job_role"], dtype=int)
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,distance_from_home,...,innovation_opportunities,company_reputation,employee_recognition,attrition,gender_binary,job_role_Education,job_role_Finance,job_role_Healthcare,job_role_Media,job_role_Technology
0,52685,36,13,8029,Excellent,High,Average,1,Yes,83,...,No,Poor,Medium,Stayed,1,0,0,1,0,0
1,30585,35,7,4563,Good,High,Average,1,Yes,55,...,No,Good,High,Left,1,1,0,0,0,0
2,54656,50,7,5583,Fair,High,Average,3,Yes,14,...,Yes,Good,Low,Stayed,1,1,0,0,0,0
3,33442,58,44,5525,Fair,Very High,High,0,Yes,43,...,No,Poor,Low,Left,1,0,0,0,1,0
4,15667,39,24,4604,Good,High,Average,0,Yes,47,...,No,Good,High,Stayed,1,1,0,0,0,0


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               14893 non-null  int64 
 1   age                       14893 non-null  int64 
 2   years_at_company          14893 non-null  int64 
 3   monthly_income            14893 non-null  int64 
 4   work_life_balance         14893 non-null  object
 5   job_satisfaction          14893 non-null  object
 6   performance_rating        14893 non-null  object
 7   number_of_promotions      14893 non-null  int64 
 8   overtime                  14893 non-null  object
 9   distance_from_home        14893 non-null  int64 
 10  education_level           14893 non-null  object
 11  marital_status            14893 non-null  object
 12  number_of_dependents      14893 non-null  int64 
 13  job_level                 14893 non-null  object
 14  company_size              1

In [35]:
data.shape

(14893, 28)

<br><br>
### Monthly Income Column

This column can be handled in one of three ways:
* Leave it unchanged (default)
* Standardization
* Normalization

In [36]:
data["monthly_income"].value_counts()

monthly_income
7687     11
8626     10
8342      9
6074      9
5143      9
         ..
5130      1
4136      1
4137      1
12514     1
10449     1
Name: count, Length: 6928, dtype: int64

In [37]:
# Number of distinct monthly income values.
data["monthly_income"].nunique()

6928

In [38]:
# Highest monthly income in the dataset, shown as a plain int.
int(data["monthly_income"].max())

15063

In [39]:
# Lowest monthly income in the dataset, shown as a plain int.
int(data["monthly_income"].min())

1226

#### Standardization

In [38]:
# Disabled experiment: z-score standardization of monthly_income.
# This cell is fully commented out, so it has no effect when the notebook runs.
# The saved output below comes from an earlier run in which the new column was
# named "Monthly_Income_Standardized".
# NOTE(review): the scaler would be fitted on the whole frame; if a train/test
# split happens later, fit on the training part only to avoid leakage - confirm.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# data["monthly_income_standardized"] = scaler.fit_transform(data[["monthly_income"]])
# data

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,distance_from_home,...,company_reputation,employee_recognition,attrition,gender_binary,job_role_Education,job_role_Finance,job_role_Healthcare,job_role_Media,job_role_Technology,Monthly_Income_Standardized
0,52685,36,13,8029,Excellent,High,Average,1,Yes,83,...,Poor,Medium,Stayed,1,0,0,1,0,0,0.343898
1,30585,35,7,4563,Good,High,Average,1,Yes,55,...,Good,High,Left,1,1,0,0,0,0,-1.263265
2,54656,50,7,5583,Fair,High,Average,3,Yes,14,...,Good,Low,Stayed,1,1,0,0,0,0,-0.790297
3,33442,58,44,5525,Fair,Very High,High,0,Yes,43,...,Poor,Low,Left,1,0,0,0,1,0,-0.817191
4,15667,39,24,4604,Good,High,Average,0,Yes,47,...,Good,High,Stayed,1,1,0,0,0,0,-1.244253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,16243,56,42,7830,Poor,Medium,Average,0,Yes,40,...,Poor,Medium,Stayed,0,0,0,1,0,0,0.251623
14896,47175,30,15,3856,Good,Medium,Average,2,Yes,45,...,Good,Medium,Left,0,1,0,0,0,0,-1.591097
14897,12409,52,5,5654,Good,Very High,Below Average,0,No,4,...,Good,High,Left,1,1,0,0,0,0,-0.757375
14898,9554,18,4,5276,Fair,High,Average,0,No,13,...,Poor,High,Stayed,1,1,0,0,0,0,-0.932651


#### Normalization

In [41]:
# Disabled experiment: min-max normalization of monthly_income.
# This cell is fully commented out, so it has no effect when the notebook runs.
# The saved output below comes from an earlier run in which the new column was
# named "Monthly_Income_Scaled".
# NOTE(review): the scaler would be fitted on the whole frame; if a train/test
# split happens later, fit on the training part only to avoid leakage - confirm.
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# data["monthly_income_normalized"] = scaler.fit_transform(data[["monthly_income"]])
# data

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,distance_from_home,...,company_reputation,employee_recognition,attrition,gender_binary,job_role_Education,job_role_Finance,job_role_Healthcare,job_role_Media,job_role_Technology,Monthly_Income_Scaled
0,52685,36,13,8029,Excellent,High,Average,1,Yes,83,...,Poor,Medium,Stayed,1,0,0,1,0,0,0.491653
1,30585,35,7,4563,Good,High,Average,1,Yes,55,...,Good,High,Left,1,1,0,0,0,0,0.241165
2,54656,50,7,5583,Fair,High,Average,3,Yes,14,...,Good,Low,Stayed,1,1,0,0,0,0,0.314880
3,33442,58,44,5525,Fair,Very High,High,0,Yes,43,...,Poor,Low,Left,1,0,0,0,1,0,0.310689
4,15667,39,24,4604,Good,High,Average,0,Yes,47,...,Good,High,Stayed,1,1,0,0,0,0,0.244128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,16243,56,42,7830,Poor,Medium,Average,0,Yes,40,...,Poor,Medium,Stayed,0,0,0,1,0,0,0.477271
14896,47175,30,15,3856,Good,Medium,Average,2,Yes,45,...,Good,Medium,Left,0,1,0,0,0,0,0.190070
14897,12409,52,5,5654,Good,Very High,Below Average,0,No,4,...,Good,High,Left,1,1,0,0,0,0,0.320012
14898,9554,18,4,5276,Fair,High,Average,0,No,13,...,Poor,High,Stayed,1,1,0,0,0,0,0.292694


<br><br>
### Work-Life Balance Column

In [40]:
data["work_life_balance"].value_counts()

work_life_balance
Good         5627
Fair         4481
Excellent    2712
Poor         2073
Name: count, dtype: int64

In [41]:
# Number of distinct work-life balance levels.
data["work_life_balance"].nunique()

4

In [42]:
data.shape

(14893, 28)

In [43]:
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,distance_from_home,...,innovation_opportunities,company_reputation,employee_recognition,attrition,gender_binary,job_role_Education,job_role_Finance,job_role_Healthcare,job_role_Media,job_role_Technology
0,52685,36,13,8029,Excellent,High,Average,1,Yes,83,...,No,Poor,Medium,Stayed,1,0,0,1,0,0
1,30585,35,7,4563,Good,High,Average,1,Yes,55,...,No,Good,High,Left,1,1,0,0,0,0
2,54656,50,7,5583,Fair,High,Average,3,Yes,14,...,Yes,Good,Low,Stayed,1,1,0,0,0,0
3,33442,58,44,5525,Fair,Very High,High,0,Yes,43,...,No,Poor,Low,Left,1,0,0,0,1,0
4,15667,39,24,4604,Good,High,Average,0,Yes,47,...,No,Good,High,Stayed,1,1,0,0,0,0


In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               14893 non-null  int64 
 1   age                       14893 non-null  int64 
 2   years_at_company          14893 non-null  int64 
 3   monthly_income            14893 non-null  int64 
 4   work_life_balance         14893 non-null  object
 5   job_satisfaction          14893 non-null  object
 6   performance_rating        14893 non-null  object
 7   number_of_promotions      14893 non-null  int64 
 8   overtime                  14893 non-null  object
 9   distance_from_home        14893 non-null  int64 
 10  education_level           14893 non-null  object
 11  marital_status            14893 non-null  object
 12  number_of_dependents      14893 non-null  int64 
 13  job_level                 14893 non-null  object
 14  company_size              1

In [45]:
# One-hot encode work_life_balance. dtype=int makes the indicator columns 0/1
# integers directly, instead of casting them in a second step.
data = pd.get_dummies(data, columns=["work_life_balance"], dtype=int)
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,job_satisfaction,performance_rating,number_of_promotions,overtime,distance_from_home,education_level,...,gender_binary,job_role_Education,job_role_Finance,job_role_Healthcare,job_role_Media,job_role_Technology,work_life_balance_Excellent,work_life_balance_Fair,work_life_balance_Good,work_life_balance_Poor
0,52685,36,13,8029,High,Average,1,Yes,83,Masterâs Degree,...,1,0,0,1,0,0,1,0,0,0
1,30585,35,7,4563,High,Average,1,Yes,55,Associate Degree,...,1,1,0,0,0,0,0,0,1,0
2,54656,50,7,5583,High,Average,3,Yes,14,Associate Degree,...,1,1,0,0,0,0,0,1,0,0
3,33442,58,44,5525,Very High,High,0,Yes,43,Masterâs Degree,...,1,0,0,0,1,0,0,1,0,0
4,15667,39,24,4604,High,Average,0,Yes,47,Masterâs Degree,...,1,1,0,0,0,0,0,0,1,0


In [46]:
data.shape

(14893, 31)

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Employee ID                  14893 non-null  int64 
 1   age                          14893 non-null  int64 
 2   years_at_company             14893 non-null  int64 
 3   monthly_income               14893 non-null  int64 
 4   job_satisfaction             14893 non-null  object
 5   performance_rating           14893 non-null  object
 6   number_of_promotions         14893 non-null  int64 
 7   overtime                     14893 non-null  object
 8   distance_from_home           14893 non-null  int64 
 9   education_level              14893 non-null  object
 10  marital_status               14893 non-null  object
 11  number_of_dependents         14893 non-null  int64 
 12  job_level                    14893 non-null  object
 13  company_size                 14893 n

<br><br>
### Job Satisfaction Column

In [48]:
data["job_satisfaction"].value_counts()

job_satisfaction
High         7464
Very High    2966
Medium       2899
Low          1564
Name: count, dtype: int64

In [49]:
# Number of distinct job satisfaction levels.
data["job_satisfaction"].nunique()

4

In [50]:
data.shape

(14893, 31)

In [51]:
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,job_satisfaction,performance_rating,number_of_promotions,overtime,distance_from_home,education_level,...,gender_binary,job_role_Education,job_role_Finance,job_role_Healthcare,job_role_Media,job_role_Technology,work_life_balance_Excellent,work_life_balance_Fair,work_life_balance_Good,work_life_balance_Poor
0,52685,36,13,8029,High,Average,1,Yes,83,Masterâs Degree,...,1,0,0,1,0,0,1,0,0,0
1,30585,35,7,4563,High,Average,1,Yes,55,Associate Degree,...,1,1,0,0,0,0,0,0,1,0
2,54656,50,7,5583,High,Average,3,Yes,14,Associate Degree,...,1,1,0,0,0,0,0,1,0,0
3,33442,58,44,5525,Very High,High,0,Yes,43,Masterâs Degree,...,1,0,0,0,1,0,0,1,0,0
4,15667,39,24,4604,High,Average,0,Yes,47,Masterâs Degree,...,1,1,0,0,0,0,0,0,1,0


In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Employee ID                  14893 non-null  int64 
 1   age                          14893 non-null  int64 
 2   years_at_company             14893 non-null  int64 
 3   monthly_income               14893 non-null  int64 
 4   job_satisfaction             14893 non-null  object
 5   performance_rating           14893 non-null  object
 6   number_of_promotions         14893 non-null  int64 
 7   overtime                     14893 non-null  object
 8   distance_from_home           14893 non-null  int64 
 9   education_level              14893 non-null  object
 10  marital_status               14893 non-null  object
 11  number_of_dependents         14893 non-null  int64 
 12  job_level                    14893 non-null  object
 13  company_size                 14893 n

In [57]:
# One-hot encode job_satisfaction. dtype=int makes the indicator columns 0/1
# integers directly, instead of casting them in a second step.
data = pd.get_dummies(data, columns=["job_satisfaction"], dtype=int)
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,performance_rating,number_of_promotions,overtime,distance_from_home,education_level,marital_status,...,job_role_Media,job_role_Technology,work_life_balance_Excellent,work_life_balance_Fair,work_life_balance_Good,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High
0,52685,36,13,8029,Average,1,Yes,83,Masterâs Degree,Married,...,0,0,1,0,0,0,1,0,0,0
1,30585,35,7,4563,Average,1,Yes,55,Associate Degree,Single,...,0,0,0,0,1,0,1,0,0,0
2,54656,50,7,5583,Average,3,Yes,14,Associate Degree,Divorced,...,0,0,0,1,0,0,1,0,0,0
3,33442,58,44,5525,High,0,Yes,43,Masterâs Degree,Single,...,1,0,0,1,0,0,0,0,0,1
4,15667,39,24,4604,Average,0,Yes,47,Masterâs Degree,Married,...,0,0,0,0,1,0,1,0,0,0


In [60]:
data.shape

(14893, 34)

In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Employee ID                  14893 non-null  int64 
 1   age                          14893 non-null  int64 
 2   years_at_company             14893 non-null  int64 
 3   monthly_income               14893 non-null  int64 
 4   performance_rating           14893 non-null  object
 5   number_of_promotions         14893 non-null  int64 
 6   overtime                     14893 non-null  object
 7   distance_from_home           14893 non-null  int64 
 8   education_level              14893 non-null  object
 9   marital_status               14893 non-null  object
 10  number_of_dependents         14893 non-null  int64 
 11  job_level                    14893 non-null  object
 12  company_size                 14893 non-null  object
 13  company_tenure               14893 n

<br><br>
### Performance Rating Column

In [64]:
data["performance_rating"].value_counts()

performance_rating
Average          8904
High             3021
Below Average    2189
Low               779
Name: count, dtype: int64

In [65]:
# Number of distinct performance rating levels.
data["performance_rating"].nunique()

4

In [66]:
data.shape

(14893, 34)

In [67]:
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,performance_rating,number_of_promotions,overtime,distance_from_home,education_level,marital_status,...,job_role_Media,job_role_Technology,work_life_balance_Excellent,work_life_balance_Fair,work_life_balance_Good,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High
0,52685,36,13,8029,Average,1,Yes,83,Masterâs Degree,Married,...,0,0,1,0,0,0,1,0,0,0
1,30585,35,7,4563,Average,1,Yes,55,Associate Degree,Single,...,0,0,0,0,1,0,1,0,0,0
2,54656,50,7,5583,Average,3,Yes,14,Associate Degree,Divorced,...,0,0,0,1,0,0,1,0,0,0
3,33442,58,44,5525,High,0,Yes,43,Masterâs Degree,Single,...,1,0,0,1,0,0,0,0,0,1
4,15667,39,24,4604,Average,0,Yes,47,Masterâs Degree,Married,...,0,0,0,0,1,0,1,0,0,0


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Employee ID                  14893 non-null  int64 
 1   age                          14893 non-null  int64 
 2   years_at_company             14893 non-null  int64 
 3   monthly_income               14893 non-null  int64 
 4   performance_rating           14893 non-null  object
 5   number_of_promotions         14893 non-null  int64 
 6   overtime                     14893 non-null  object
 7   distance_from_home           14893 non-null  int64 
 8   education_level              14893 non-null  object
 9   marital_status               14893 non-null  object
 10  number_of_dependents         14893 non-null  int64 
 11  job_level                    14893 non-null  object
 12  company_size                 14893 non-null  object
 13  company_tenure               14893 n

In [69]:
# One-hot encode performance_rating. dtype=int makes the indicator columns 0/1
# integers directly, instead of casting them in a second step.
data = pd.get_dummies(data, columns=["performance_rating"], dtype=int)
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,number_of_promotions,overtime,distance_from_home,education_level,marital_status,number_of_dependents,...,work_life_balance_Good,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High,performance_rating_Average,performance_rating_Below Average,performance_rating_High,performance_rating_Low
0,52685,36,13,8029,1,Yes,83,Masterâs Degree,Married,1,...,0,0,1,0,0,0,1,0,0,0
1,30585,35,7,4563,1,Yes,55,Associate Degree,Single,4,...,1,0,1,0,0,0,1,0,0,0
2,54656,50,7,5583,3,Yes,14,Associate Degree,Divorced,2,...,0,0,1,0,0,0,1,0,0,0
3,33442,58,44,5525,0,Yes,43,Masterâs Degree,Single,4,...,0,0,0,0,0,1,0,0,1,0
4,15667,39,24,4604,0,Yes,47,Masterâs Degree,Married,6,...,1,0,1,0,0,0,1,0,0,0


In [70]:
data.shape

(14893, 37)

In [71]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 37 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Employee ID                       14893 non-null  int64 
 1   age                               14893 non-null  int64 
 2   years_at_company                  14893 non-null  int64 
 3   monthly_income                    14893 non-null  int64 
 4   number_of_promotions              14893 non-null  int64 
 5   overtime                          14893 non-null  object
 6   distance_from_home                14893 non-null  int64 
 7   education_level                   14893 non-null  object
 8   marital_status                    14893 non-null  object
 9   number_of_dependents              14893 non-null  int64 
 10  job_level                         14893 non-null  object
 11  company_size                      14893 non-null  object
 12  company_tenure         

<br><br>
### Number of Promotions Column

In [72]:
# Frequency of each distinct value in number_of_promotions.
data["number_of_promotions"].value_counts()

number_of_promotions
0    7426
1    3714
2    2748
3     804
4     201
Name: count, dtype: int64

In [73]:
# Number of distinct values in number_of_promotions. nunique() returns the count
# directly (NaN excluded, like value_counts) instead of building a whole
# frequency table only to take its length.
data["number_of_promotions"].nunique()

5

<br><br>
### Overtime Column

In [74]:
# Frequency of each distinct label in the overtime column.
data["overtime"].value_counts()

overtime
No     10003
Yes     4890
Name: count, dtype: int64

In [75]:
# Number of distinct overtime labels. nunique() returns the count directly
# (NaN excluded, like value_counts) instead of building a whole frequency
# table only to take its length.
data["overtime"].nunique()

2

In [77]:
# Preview the first rows before the overtime column is encoded in a later cell.
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,number_of_promotions,overtime,distance_from_home,education_level,marital_status,number_of_dependents,...,work_life_balance_Good,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High,performance_rating_Average,performance_rating_Below Average,performance_rating_High,performance_rating_Low
0,52685,36,13,8029,1,Yes,83,Masterâs Degree,Married,1,...,0,0,1,0,0,0,1,0,0,0
1,30585,35,7,4563,1,Yes,55,Associate Degree,Single,4,...,1,0,1,0,0,0,1,0,0,0
2,54656,50,7,5583,3,Yes,14,Associate Degree,Divorced,2,...,0,0,1,0,0,0,1,0,0,0
3,33442,58,44,5525,0,Yes,43,Masterâs Degree,Single,4,...,0,0,0,0,0,1,0,0,1,0
4,15667,39,24,4604,0,Yes,47,Masterâs Degree,Married,6,...,1,0,1,0,0,0,1,0,0,0


In [78]:
# Inspect column dtypes before the overtime column is encoded.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 37 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Employee ID                       14893 non-null  int64 
 1   age                               14893 non-null  int64 
 2   years_at_company                  14893 non-null  int64 
 3   monthly_income                    14893 non-null  int64 
 4   number_of_promotions              14893 non-null  int64 
 5   overtime                          14893 non-null  object
 6   distance_from_home                14893 non-null  int64 
 7   education_level                   14893 non-null  object
 8   marital_status                    14893 non-null  object
 9   number_of_dependents              14893 non-null  int64 
 10  job_level                         14893 non-null  object
 11  company_size                      14893 non-null  object
 12  company_tenure         

In [79]:
# Encode the binary overtime flag as integers (Yes -> 1, No -> 0).
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on an already-encoded column would wipe it out. Skip the
# mapping when the column already consists solely of 0/1 so the cell can be
# executed repeatedly (or out of order) without destroying the data.
overtime_codes = {"Yes": 1, "No": 0}
if not data["overtime"].isin([0, 1]).all():
    data["overtime"] = data["overtime"].map(overtime_codes)
data.head()

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,number_of_promotions,overtime,distance_from_home,education_level,marital_status,number_of_dependents,...,work_life_balance_Good,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High,performance_rating_Average,performance_rating_Below Average,performance_rating_High,performance_rating_Low
0,52685,36,13,8029,1,1,83,Masterâs Degree,Married,1,...,0,0,1,0,0,0,1,0,0,0
1,30585,35,7,4563,1,1,55,Associate Degree,Single,4,...,1,0,1,0,0,0,1,0,0,0
2,54656,50,7,5583,3,1,14,Associate Degree,Divorced,2,...,0,0,1,0,0,0,1,0,0,0
3,33442,58,44,5525,0,1,43,Masterâs Degree,Single,4,...,0,0,0,0,0,1,0,0,1,0
4,15667,39,24,4604,0,1,47,Masterâs Degree,Married,6,...,1,0,1,0,0,0,1,0,0,0


In [80]:
# Re-check column dtypes after mapping the overtime column to integer codes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14893 entries, 0 to 14899
Data columns (total 37 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Employee ID                       14893 non-null  int64 
 1   age                               14893 non-null  int64 
 2   years_at_company                  14893 non-null  int64 
 3   monthly_income                    14893 non-null  int64 
 4   number_of_promotions              14893 non-null  int64 
 5   overtime                          14893 non-null  int64 
 6   distance_from_home                14893 non-null  int64 
 7   education_level                   14893 non-null  object
 8   marital_status                    14893 non-null  object
 9   number_of_dependents              14893 non-null  int64 
 10  job_level                         14893 non-null  object
 11  company_size                      14893 non-null  object
 12  company_tenure         

<br><br>
### Distance from Home Column

In [90]:
# Frequency of each distinct value in distance_from_home (value_counts sorts by count, descending).
data["distance_from_home"].value_counts()

distance_from_home
8     185
53    177
66    176
67    173
62    169
     ... 
21    128
65    126
32    126
73    121
60    121
Name: count, Length: 99, dtype: int64

In [82]:
# Number of distinct distance_from_home values. nunique() returns the count
# directly (NaN excluded, like value_counts) instead of building a whole
# frequency table only to take its length.
data["distance_from_home"].nunique()

99

In [84]:
# Largest distance_from_home value. Series.max() is computed vectorised, whereas
# the builtin max() walks the column element by element in Python.
# int() keeps the displayed result a plain Python integer.
int(data["distance_from_home"].max())

99

In [85]:
# Smallest distance_from_home value. Series.min() is computed vectorised, whereas
# the builtin min() walks the column element by element in Python.
# int() keeps the displayed result a plain Python integer.
int(data["distance_from_home"].min())

1

In [91]:
# Row count of the most common distance_from_home value. The vectorised
# Series.max() replaces the builtin max(), which iterates in Python.
# int() keeps the displayed result a plain Python integer.
int(data["distance_from_home"].value_counts().max())

185

In [93]:
# Row count of the least common distance_from_home value. The vectorised
# Series.min() replaces the builtin min(), which iterates in Python.
# int() keeps the displayed result a plain Python integer.
int(data["distance_from_home"].value_counts().min())

121

#### Standardization

In [94]:
# NOTE: this experiment is disabled. Every line below is commented out, so running
# the cell does nothing; the table still rendered under it (with a
# distance_from_home_standardized column) appears to be left over from an
# earlier run and will not be reproduced by Restart & Run All.
# NOTE(review): if re-enabled, this fits the scaler on the whole frame; when the
# data is later split into train/test sets, fit on the training split only to
# avoid leaking test statistics - confirm against the modelling cells.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# data["distance_from_home_standardized"] = scaler.fit_transform(data[["distance_from_home"]])
# data

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,number_of_promotions,overtime,distance_from_home,education_level,marital_status,number_of_dependents,...,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High,performance_rating_Average,performance_rating_Below Average,performance_rating_High,performance_rating_Low,distance_from_home_standardized
0,52685,36,13,8029,1,1,83,Masterâs Degree,Married,1,...,0,1,0,0,0,1,0,0,0,1.152416
1,30585,35,7,4563,1,1,55,Associate Degree,Single,4,...,0,1,0,0,0,1,0,0,0,0.176826
2,54656,50,7,5583,3,1,14,Associate Degree,Divorced,2,...,0,1,0,0,0,1,0,0,0,-1.251716
3,33442,58,44,5525,0,1,43,Masterâs Degree,Single,4,...,0,0,0,0,1,0,0,1,0,-0.241284
4,15667,39,24,4604,0,1,47,Masterâs Degree,Married,6,...,0,1,0,0,0,1,0,0,0,-0.101914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,16243,56,42,7830,0,1,40,Associate Degree,Single,0,...,1,0,0,1,0,1,0,0,0,-0.345812
14896,47175,30,15,3856,2,1,45,Masterâs Degree,Married,0,...,0,0,0,1,0,1,0,0,0,-0.171599
14897,12409,52,5,5654,0,0,4,Associate Degree,Married,4,...,0,0,0,0,1,0,1,0,0,-1.600141
14898,9554,18,4,5276,0,0,13,Bachelorâs Degree,Divorced,3,...,0,1,0,0,0,1,0,0,0,-1.286559


#### Normalization

In [95]:
# NOTE: this experiment is disabled. Every line below is commented out, so running
# the cell does nothing; the table still rendered under it (with a
# distance_from_home_normalized column) appears to be left over from an
# earlier run and will not be reproduced by Restart & Run All.
# NOTE(review): if re-enabled, this fits the scaler on the whole frame; when the
# data is later split into train/test sets, fit on the training split only to
# avoid leaking test statistics - confirm against the modelling cells.
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# data["distance_from_home_normalized"] = scaler.fit_transform(data[["distance_from_home"]])
# data

Unnamed: 0,Employee ID,age,years_at_company,monthly_income,number_of_promotions,overtime,distance_from_home,education_level,marital_status,number_of_dependents,...,work_life_balance_Poor,job_satisfaction_High,job_satisfaction_Low,job_satisfaction_Medium,job_satisfaction_Very High,performance_rating_Average,performance_rating_Below Average,performance_rating_High,performance_rating_Low,distance_from_home_normalized
0,52685,36,13,8029,1,1,83,Masterâs Degree,Married,1,...,0,1,0,0,0,1,0,0,0,0.836735
1,30585,35,7,4563,1,1,55,Associate Degree,Single,4,...,0,1,0,0,0,1,0,0,0,0.551020
2,54656,50,7,5583,3,1,14,Associate Degree,Divorced,2,...,0,1,0,0,0,1,0,0,0,0.132653
3,33442,58,44,5525,0,1,43,Masterâs Degree,Single,4,...,0,0,0,0,1,0,0,1,0,0.428571
4,15667,39,24,4604,0,1,47,Masterâs Degree,Married,6,...,0,1,0,0,0,1,0,0,0,0.469388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,16243,56,42,7830,0,1,40,Associate Degree,Single,0,...,1,0,0,1,0,1,0,0,0,0.397959
14896,47175,30,15,3856,2,1,45,Masterâs Degree,Married,0,...,0,0,0,1,0,1,0,0,0,0.448980
14897,12409,52,5,5654,0,0,4,Associate Degree,Married,4,...,0,0,0,0,1,0,1,0,0,0.030612
14898,9554,18,4,5276,0,0,13,Bachelorâs Degree,Divorced,3,...,0,1,0,0,0,1,0,0,0,0.122449
