# Attr data cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
dat = pd.read_csv("dataset_used/attr.csv")
data = dat.copy()
pd.set_option('display.max_columns', None)

In [3]:
data.sample(5)

Unnamed: 0,id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1279,1280,44,Yes,Travel_Frequently,429,Research & Development,1,2,Medical,1,1792,3,Male,99,3,1,Research Scientist,2,Divorced,2342,11092,1,Y,Yes,12,3,3,80,3,6,2,2,5,3,2,3
599,600,36,No,Travel_Rarely,1041,Human Resources,13,3,Human Resources,1,829,3,Male,36,3,1,Human Resources,2,Married,2143,25527,4,Y,No,13,3,2,80,1,8,2,3,5,2,0,4
1201,1202,23,Yes,Travel_Rarely,1320,Research & Development,8,1,Medical,1,1684,4,Male,93,2,1,Laboratory Technician,3,Single,3989,20586,1,Y,Yes,11,3,1,80,0,5,2,3,5,4,1,2
540,541,28,Yes,Travel_Rarely,654,Research & Development,1,2,Life Sciences,1,741,1,Female,67,1,1,Research Scientist,2,Single,2216,3872,7,Y,Yes,13,3,4,80,0,10,4,3,7,7,3,7
1040,1041,40,No,Non-Travel,218,Research & Development,8,1,Medical,1,1468,4,Male,55,2,3,Research Director,2,Divorced,13757,25178,2,Y,No,11,3,3,80,1,16,5,3,9,8,4,8


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        1470 non-null   int64 
 1   Age                       1470 non-null   int64 
 2   Attrition                 1470 non-null   object
 3   BusinessTravel            1470 non-null   object
 4   DailyRate                 1470 non-null   int64 
 5   Department                1470 non-null   object
 6   DistanceFromHome          1470 non-null   int64 
 7   Education                 1470 non-null   int64 
 8   EducationField            1470 non-null   object
 9   EmployeeCount             1470 non-null   int64 
 10  EmployeeNumber            1470 non-null   int64 
 11  EnvironmentSatisfaction   1470 non-null   int64 
 12  Gender                    1470 non-null   object
 13  HourlyRate                1470 non-null   int64 
 14  JobInvolvement          

The `data.info()` allows us to see a quick overview of our data and with that we were able to deduce that the data has no missing values and it also has 36 columns and 1470 rows of data, with each column being either an integer ot an object

# Cleaning the data column by column

### Age

For this we can create a bin of ages but for that we would have to get the youngest and oldest age in the column so we know how to create the bin

In [5]:
for i in sorted(data.Age.unique()):
    print(i, end=", ")

18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 

In [6]:
print(f"The youngest age in the data is {data.Age.unique().min()}\n\
The oldest age in the data is {data.Age.unique().max()}")

The youngest age in the data is 18
The oldest age in the data is 60


In [7]:
bins = [18, 20, 30, 40, 50, 60]
group = ['<20', '20-29', '30-39', '40-49', '50-59']

# Filling NaN with 60+ because we used the right False keyword which will by default ignore the rightmost digit (60)
data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=group, right=False).cat.add_categories('60+').fillna('60+')

### Attrition

In [8]:
data.Attrition.unique()

array(['Yes', 'No'], dtype=object)

Everything looks okay here so we move on to the next
### BusinessTravel

In [9]:
data.BusinessTravel.unique()

array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], dtype=object)

In [10]:
data.BusinessTravel = data.BusinessTravel.str.replace('_', ' ').str.replace("-", " ")
data.BusinessTravel.unique()

array(['Travel Rarely', 'Travel Frequently', 'Non Travel'], dtype=object)

Everything looks okay here so we move on to the next
### Department

In [11]:
data.Department.unique()

array(['Sales', 'Research & Development', 'Human Resources'], dtype=object)

Everything looks okay here so we move on to the next
### DistanceFromHome

In [12]:
for i in sorted(data.DistanceFromHome.unique()):
    print(i, end=', ')

1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 

In [13]:
# I will create a bin for the distance
bins = [1, 11, 20, 30]
labels = ['Very Close', 'Close', 'Far']
data['DistanceFrom_Home'] = pd.cut(data['DistanceFromHome'], bins=bins, labels=labels, right=False)

Everything looks okay here so we move on to the next
### Education

In [14]:
data.Education.unique()

array([2, 1, 4, 3, 5], dtype=int64)

In [15]:
data.Education =  data.Education.map({1: "Below College",
                                      2: "College",
                                      3: "Bachelor",
                                      4: "Master",
                                      5: "Doctor"})

In [16]:
data.Education.unique()

array(['College', 'Below College', 'Master', 'Bachelor', 'Doctor'],
      dtype=object)

Everything looks okay here so we move on to the next
### EducationField

In [17]:
data.EducationField.unique()

array(['Life Sciences', 'Other', 'Medical', 'Marketing',
       'Technical Degree', 'Human Resources'], dtype=object)

Everything looks okay here so we move on to the next
### EmployeeCount

In [18]:
data.EmployeeCount.unique()

array([1], dtype=int64)

Everything looks okay here so we move on to the next
### EnvironmentSatisfaction

In [19]:
data.EnvironmentSatisfaction.unique()

array([2, 3, 4, 1], dtype=int64)

In [20]:
data.EnvironmentSatisfaction = data.EnvironmentSatisfaction.map({1: 'Low',
                                                                 2: 'Medium',
                                                                 3: 'High',
                                                                 4: 'Very High'})

In [21]:
data.EnvironmentSatisfaction.unique()

array(['Medium', 'High', 'Very High', 'Low'], dtype=object)

Everything looks okay here so we move on to the next
### Gender

In [22]:
data.Gender.unique()

array(['Female', 'Male'], dtype=object)

Everything looks okay here so we move on to the next
### JobInvolvement

In [23]:
data.JobInvolvement.unique()

array([3, 2, 4, 1], dtype=int64)

In [24]:
data.JobInvolvement = data.JobInvolvement.map({1: 'Low',
                                               2: 'Medium',
                                               3: 'High',
                                               4: 'Very High'})

In [25]:
data.JobInvolvement.unique()

array(['High', 'Medium', 'Very High', 'Low'], dtype=object)

Everything looks okay here so we move on to the next
### JobLevel

In [26]:
data.JobLevel.unique()

array([2, 1, 3, 4, 5], dtype=int64)

In [27]:
data.JobLevel = "Lvl " + data.JobLevel.astype(str)

In [28]:
data.JobLevel.unique()

array(['Lvl 2', 'Lvl 1', 'Lvl 3', 'Lvl 4', 'Lvl 5'], dtype=object)

Everything looks okay here so we move on to the next
### JobRole

In [29]:
data.JobRole.unique()

array(['Sales Executive', 'Research Scientist', 'Laboratory Technician',
       'Manufacturing Director', 'Healthcare Representative', 'Manager',
       'Sales Representative', 'Research Director', 'Human Resources'],
      dtype=object)

Everything looks okay here so we move on to the next
### JobSatisfaction

In [30]:
data.JobSatisfaction.unique()

array([4, 2, 3, 1], dtype=int64)

In [31]:
data.JobSatisfaction =data.JobSatisfaction.map({1: 'Low',
                                                2: 'Medium',
                                                3: 'High',
                                                4: 'Very High'})

In [32]:
data.JobSatisfaction.unique()

array(['Very High', 'Medium', 'High', 'Low'], dtype=object)

Everything looks okay here so we move on to the next
### MaritalStatus

In [33]:
data.MaritalStatus.unique()

array(['Single', 'Married', 'Divorced'], dtype=object)

Everything looks okay here so we move on to the next
### OverTime

In [34]:
data.OverTime.unique()

array(['Yes', 'No'], dtype=object)

Everything looks okay here so we move on to the next
### PerformanceRating

In [35]:
data.PerformanceRating.unique()

array([3, 4], dtype=int64)

In [36]:
data.PerformanceRating = data.PerformanceRating.map({3: "Excellent",
                                                     4: "Outstanding"})

In [37]:
data.PerformanceRating.unique()

array(['Excellent', 'Outstanding'], dtype=object)

Everything looks okay here so we move on to the next
### RelationshipSatisfaction

In [38]:
data.RelationshipSatisfaction.unique()

array([1, 4, 2, 3], dtype=int64)

In [39]:
data.RelationshipSatisfaction = data.RelationshipSatisfaction.map({1: 'Low',
                                                                   2: 'Medium',
                                                                   3: 'High',
                                                                   4: 'Very High'})

In [40]:
data.RelationshipSatisfaction.unique()

array(['Low', 'Very High', 'Medium', 'High'], dtype=object)

Everything looks okay here so we move on to the next
### StockOptionLevel

In [41]:
data.StockOptionLevel.unique()

array([0, 1, 3, 2], dtype=int64)

In [42]:
data.StockOptionLevel = "Lvl " + data.StockOptionLevel.astype(str)

In [43]:
data.StockOptionLevel.unique()

array(['Lvl 0', 'Lvl 1', 'Lvl 3', 'Lvl 2'], dtype=object)

Everything looks okay here so we move on to the next
### WorkLifeBalance

In [44]:
data.WorkLifeBalance.unique()

array([1, 3, 2, 4], dtype=int64)

In [45]:
data.WorkLifeBalance = data.WorkLifeBalance.map({1: 'Bad',
                                                 2: 'Good',
                                                 3: 'Better',
                                                 4: 'Best'})

In [46]:
data.WorkLifeBalance.unique()

array(['Bad', 'Better', 'Good', 'Best'], dtype=object)

Everything looks okay here so we move on to the next
### All columns that deals with years

In [47]:
# We start by creating BINS for this year group
bins = [0, 10, 20, 30]
group = ['0-10 years', '11-20 years', '21-29 years']

# Creating BINS
data['TotalWorkingYears'] = pd.cut(data['TotalWorkingYears'],
                                   bins=bins,
                                   labels=group,
                                   right=False).cat.add_categories('31-40 years').fillna('31-40 years')


data['YearsAtCompany'] = pd.cut(data['YearsAtCompany'],
                                bins=bins,
                                labels=group,
                                right=False).cat.add_categories('31-40 years').fillna('31-40 years')

In [48]:
# Appending years to the remainig data that deals with years
yrs = ['TrainingTimesLastYear', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [49]:
# Funciton to append years or year to data
def years(col):
    data[col] = data[col].astype(str) + " " + np.where(data[col] == 0, 'year', 'years')

In [50]:
for i in yrs:
    years(i)
    print(f"{i} Successfully imputed\n")

TrainingTimesLastYear Successfully imputed

YearsInCurrentRole Successfully imputed

YearsSinceLastPromotion Successfully imputed

YearsWithCurrManager Successfully imputed



In [51]:
data.sample(4)

Unnamed: 0,id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,AgeGroup,DistanceFrom_Home
1151,1152,28,No,Travel Rarely,580,Research & Development,27,Bachelor,Medical,1,1622,Medium,Female,39,Low,Lvl 2,Manufacturing Director,Low,Divorced,4877,20460,0,Y,No,21,Outstanding,Medium,80,Lvl 1,0-10 years,5 years,Good,0-10 years,3 years,0 year,0 year,20-29,Far
629,630,28,No,Travel Rarely,1169,Human Resources,8,College,Medical,1,869,Medium,Male,63,Medium,Lvl 1,Human Resources,Very High,Divorced,4936,23965,1,Y,No,13,Excellent,Very High,80,Lvl 1,0-10 years,6 years,Better,0-10 years,1 years,0 year,4 years,20-29,Very Close
446,447,41,No,Non Travel,267,Sales,10,College,Life Sciences,1,599,Very High,Male,56,High,Lvl 2,Sales Executive,Very High,Single,6230,13430,7,Y,No,14,Excellent,Very High,80,Lvl 0,11-20 years,3 years,Better,11-20 years,3 years,1 years,10 years,40-49,Very Close
584,585,42,No,Travel Frequently,570,Research & Development,8,Bachelor,Life Sciences,1,809,Medium,Male,66,High,Lvl 5,Manager,Very High,Divorced,18430,16225,1,Y,No,13,Excellent,Medium,80,Lvl 1,21-29 years,4 years,Good,21-29 years,7 years,14 years,9 years,40-49,Very Close


#### Selecting the columns that would bring more meaning during analysis

In [52]:
data = data[['AgeGroup', 'Gender', 'MaritalStatus', 'Attrition', 'BusinessTravel', 'DistanceFrom_Home', 'Department',
             'Education', 'EducationField', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobRole',
             'JobSatisfaction', 'HourlyRate', 'DailyRate', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
             'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
             'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 
             'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']]

In [53]:
# Old dataframe
dat.sample(10)

Unnamed: 0,id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
467,468,37,No,Non-Travel,142,Sales,9,4,Medical,1,626,1,Male,69,3,3,Sales Executive,2,Divorced,8834,24666,1,Y,No,13,3,4,80,1,9,6,3,9,5,7,7
502,503,53,No,Travel_Rarely,238,Sales,1,1,Medical,1,682,4,Female,34,3,2,Sales Executive,1,Single,8381,7507,7,Y,No,20,4,4,80,0,18,2,4,14,7,8,10
1266,1267,41,No,Travel_Rarely,548,Research & Development,9,4,Life Sciences,1,1772,3,Male,94,3,1,Laboratory Technician,1,Divorced,2289,20520,1,Y,No,20,4,2,80,2,5,2,3,5,3,0,4
438,439,35,No,Travel_Rarely,1276,Research & Development,16,3,Life Sciences,1,586,4,Male,72,3,3,Healthcare Representative,3,Married,7632,14295,4,Y,Yes,12,3,3,80,0,10,2,3,8,7,0,0
977,978,34,No,Non-Travel,999,Research & Development,26,1,Technical Degree,1,1374,1,Female,92,2,1,Research Scientist,3,Divorced,2029,15891,1,Y,No,20,4,3,80,3,5,2,3,5,4,0,0
1233,1234,30,No,Travel_Rarely,793,Research & Development,16,1,Life Sciences,1,1729,2,Male,33,3,1,Research Scientist,4,Married,2862,3811,1,Y,No,12,3,2,80,1,10,2,2,10,0,0,8
205,206,29,Yes,Travel_Rarely,121,Sales,27,3,Marketing,1,283,2,Female,35,3,3,Sales Executive,4,Married,7639,24525,1,Y,No,22,4,4,80,3,10,3,2,10,4,1,9
332,333,54,No,Travel_Frequently,928,Research & Development,20,4,Life Sciences,1,450,4,Female,31,3,2,Research Scientist,3,Single,4869,16885,3,Y,No,12,3,4,80,0,20,4,2,4,3,0,3
705,706,39,No,Travel_Rarely,903,Sales,2,5,Life Sciences,1,985,1,Male,41,4,3,Sales Executive,3,Single,7880,2560,0,Y,No,18,3,4,80,0,9,3,3,8,7,0,7
1283,1284,28,No,Travel_Rarely,1181,Research & Development,1,3,Life Sciences,1,1799,3,Male,82,3,1,Research Scientist,4,Married,2044,5531,1,Y,No,11,3,3,80,1,5,6,4,5,3,0,3


In [54]:
# New dataframe
data.sample(10)

Unnamed: 0,AgeGroup,Gender,MaritalStatus,Attrition,BusinessTravel,DistanceFrom_Home,Department,Education,EducationField,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobRole,JobSatisfaction,HourlyRate,DailyRate,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
719,40-49,Female,Single,No,Travel Rarely,Very Close,Sales,College,Life Sciences,Very High,High,Lvl 2,Sales Executive,Very High,83,955,4163,8571,1,Yes,17,Excellent,High,80,Lvl 0,0-10 years,0 year,Better,0-10 years,0 year,0 year,7 years
1058,30-39,Female,Single,Yes,Travel Rarely,Far,Sales,Master,Medical,Low,Medium,Lvl 2,Sales Executive,Medium,40,790,4599,7815,0,Yes,23,Outstanding,High,80,Lvl 0,11-20 years,2 years,Best,11-20 years,9 years,10 years,10 years
770,40-49,Male,Divorced,No,Travel Rarely,Very Close,Research & Development,Master,Medical,Very High,High,Lvl 5,Research Director,Very High,40,430,19627,21445,9,No,17,Excellent,Very High,80,Lvl 2,21-29 years,0 year,Better,0-10 years,2 years,2 years,2 years
1216,40-49,Male,Married,No,Travel Rarely,Very Close,Sales,Bachelor,Medical,Very High,High,Lvl 2,Sales Executive,Very High,73,1179,7847,6069,1,Yes,17,Excellent,Low,80,Lvl 1,11-20 years,3 years,Better,11-20 years,9 years,8 years,8 years
928,40-49,Female,Married,Yes,Travel Rarely,Close,Research & Development,Bachelor,Medical,Low,High,Lvl 3,Healthcare Representative,Very High,73,621,7978,14075,1,No,11,Excellent,Very High,80,Lvl 1,11-20 years,2 years,Better,11-20 years,7 years,0 year,5 years
29,40-49,Female,Single,No,Travel Rarely,Very Close,Sales,Master,Marketing,Medium,High,Lvl 5,Manager,Low,83,705,18947,22822,3,No,12,Excellent,Very High,80,Lvl 0,21-29 years,2 years,Good,0-10 years,2 years,2 years,1 years
1151,20-29,Female,Divorced,No,Travel Rarely,Far,Research & Development,Bachelor,Medical,Medium,Low,Lvl 2,Manufacturing Director,Low,39,580,4877,20460,0,No,21,Outstanding,Medium,80,Lvl 1,0-10 years,5 years,Good,0-10 years,3 years,0 year,0 year
1349,20-29,Female,Married,No,Travel Rarely,Very Close,Research & Development,College,Life Sciences,Medium,Medium,Lvl 1,Research Scientist,High,90,482,2933,14908,1,Yes,13,Excellent,High,80,Lvl 1,0-10 years,3 years,Good,0-10 years,0 year,1 years,0 year
774,50-59,Male,Single,No,Non Travel,Very Close,Research & Development,Below College,Medical,High,Medium,Lvl 4,Manager,Low,40,444,16756,17323,7,No,15,Excellent,Medium,80,Lvl 0,31-40 years,3 years,Best,0-10 years,7 years,6 years,2 years
1080,40-49,Female,Married,No,Travel Rarely,Very Close,Sales,Bachelor,Life Sciences,High,High,Lvl 4,Manager,Medium,51,228,16606,11380,8,No,12,Excellent,Very High,80,Lvl 1,21-29 years,2 years,Best,11-20 years,12 years,5 years,1 years


In [55]:
# Save to csv
data.to_csv("attr_cleaned.csv")

# The end