## Import libraries

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Preprocess data

In [68]:
# Load the raw absenteeism records from the CSV file in the working directory.
RAW_DATA_PATH = 'Absenteeism_data.csv'
raw_data = pd.read_csv(RAW_DATA_PATH)

In [69]:
# Work on an independent copy so raw_data itself is never modified.
data = raw_data.copy(deep=True)

In [70]:
# Preview the working copy of the raw data.
data

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
695,17,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,28,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,18,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,25,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [71]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
data.describe()

Unnamed: 0,ID,Reason for Absence,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,17.951429,19.411429,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,36.0,28.0,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0


In [72]:
# Count the missing values in each column.
data.isna().sum()

ID                           0
Reason for Absence           0
Date                         0
Transportation Expense       0
Distance to Work             0
Age                          0
Daily Work Load Average      0
Body Mass Index              0
Education                    0
Children                     0
Pets                         0
Absenteeism Time in Hours    0
dtype: int64

In [73]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


### Drop ID column

In [74]:
# Remove the 'ID' column from the working frame.
data = data.drop(columns=['ID'])

In [75]:
# Preview the frame; the 'ID' column should no longer be present.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


### Create dummies for reasons

In [76]:
# One indicator column per absence reason code.
# drop_first=True omits the column of the first category to avoid
# multicollinearity, so that category becomes the implicit baseline
# (all indicators equal to 0).
# dtype is pinned to uint8 so the indicators are numeric 0/1 regardless of
# the pandas version (newer releases default to bool), which keeps the
# additions in the grouping cells below producing 0/1 values.
reasons_dummies = pd.get_dummies(
    data['Reason for Absence'],
    prefix='Reason',
    drop_first=True,
    dtype='uint8',
)
reasons_dummies

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Reason_5,Reason_6,Reason_7,Reason_8,Reason_9,Reason_10,...,Reason_18,Reason_19,Reason_21,Reason_22,Reason_23,Reason_24,Reason_25,Reason_26,Reason_27,Reason_28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
697,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [77]:
# Append the Reason_* indicator columns to the working frame (matched on the row index).
data = data.join(other=reasons_dummies, how='left')

In [78]:
# Preview the frame with the Reason_* indicator columns appended.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,...,Reason_18,Reason_19,Reason_21,Reason_22,Reason_23,Reason_24,Reason_25,Reason_26,Reason_27,Reason_28
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,0,0,0,0,1,0,0
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,...,0,0,0,0,0,0,0,0,0,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,...,0,0,0,0,0,0,0,0,0,0
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,...,0,0,0,0,0,0,0,0,0,0
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,...,0,0,0,0,0,0,0,0,0,0
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,...,0,0,0,0,1,0,0,0,0,0


### Group dummies into 4 groups

In [79]:
# Reason_Group_1 flags rows whose reason code is in 1..14:
# add up the Reason_1 .. Reason_14 indicator columns.
data['Reason_Group_1'] = sum(
    (data[f'Reason_{code}'] for code in range(2, 15)),
    data['Reason_1'],
)

In [80]:
# Sanity check: list the distinct values stored in Reason_Group_1.
data['Reason_Group_1'].unique()

array([0, 1], dtype=uint8)

In [81]:
# Preview the frame with the new Reason_Group_1 column.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,...,Reason_19,Reason_21,Reason_22,Reason_23,Reason_24,Reason_25,Reason_26,Reason_27,Reason_28,Reason_Group_1
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,0,0,0,1,0,0,0
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,...,0,0,0,0,0,0,0,0,0,1
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,...,0,0,0,0,0,0,0,0,0,1
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,...,0,0,0,0,0,0,0,0,0,1
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,...,0,0,0,0,0,0,0,0,0,1
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,...,0,0,0,1,0,0,0,0,0,0


In [82]:
# Remove the individual Reason_1 .. Reason_14 indicators now that they have
# been combined into Reason_Group_1.
data = data.drop(columns=[f'Reason_{code}' for code in range(1, 15)])

In [83]:
# Preview the frame after dropping Reason_1 .. Reason_14.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,...,Reason_19,Reason_21,Reason_22,Reason_23,Reason_24,Reason_25,Reason_26,Reason_27,Reason_28,Reason_Group_1
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,0,0,0,1,0,0,0
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,...,0,0,0,0,0,0,0,0,0,1
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,...,0,0,0,0,0,0,0,0,0,1
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,...,0,0,0,0,0,0,0,0,0,1
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,...,0,0,0,0,0,0,0,0,0,1
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,...,0,0,0,1,0,0,0,0,0,0


In [84]:
# Reason_Group_2 flags rows whose reason code is 15, 16 or 17.
data['Reason_Group_2'] = sum(
    (data[f'Reason_{code}'] for code in (16, 17)),
    data['Reason_15'],
)

In [85]:
# Remove the individual indicators that were combined into Reason_Group_2.
data = data.drop(columns=[f'Reason_{code}' for code in (15, 16, 17)])

In [86]:
# Preview the frame with Reason_Group_2 added and Reason_15 .. Reason_17 removed.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,...,Reason_21,Reason_22,Reason_23,Reason_24,Reason_25,Reason_26,Reason_27,Reason_28,Reason_Group_1,Reason_Group_2
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,0,0,1,0,0,0,0
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,...,0,0,0,0,0,0,0,0,1,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,...,0,0,0,0,0,0,0,0,1,0
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,...,0,0,0,0,0,0,0,0,1,0
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,...,0,0,0,0,0,0,0,0,1,0
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,...,0,0,1,0,0,0,0,0,0,0


In [87]:
# Reason_Group_3 flags rows whose reason code is 18, 19 or 21.
# There is no Reason_20 column in the data, so code 20 is skipped.
data['Reason_Group_3'] = sum(
    (data[f'Reason_{code}'] for code in (19, 21)),
    data['Reason_18'],
)

In [88]:
# Remove the individual indicators that were combined into Reason_Group_3.
data = data.drop(columns=[f'Reason_{code}' for code in (18, 19, 21)])

In [89]:
# Preview the frame with Reason_Group_3 added and Reason_18/19/21 removed.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,...,Reason_22,Reason_23,Reason_24,Reason_25,Reason_26,Reason_27,Reason_28,Reason_Group_1,Reason_Group_2,Reason_Group_3
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,...,0,0,0,0,1,0,0,0,0,0
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,...,0,0,0,0,0,0,0,1,0,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,...,0,0,0,0,0,0,0,1,0,0
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,...,0,0,0,0,0,0,0,1,0,0
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,...,0,0,0,0,0,0,0,1,0,0
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,...,0,1,0,0,0,0,0,0,0,0


In [90]:
# Reason_Group_4 flags rows whose reason code is in 22..28.
data['Reason_Group_4'] = sum(
    (data[f'Reason_{code}'] for code in range(23, 29)),
    data['Reason_22'],
)

In [91]:
# Remove the individual indicators that were combined into Reason_Group_4.
data = data.drop(columns=[f'Reason_{code}' for code in range(22, 29)])

In [92]:
# Preview the frame: only the four Reason_Group_* flags remain from the indicators.
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


### A better way to group the reasons

In [93]:
# Build the four group flags directly from the indicator frame: for each
# group, take the row-wise maximum over its contiguous block of Reason_*
# columns (label slices are inclusive and follow the column order).
# NOTE(review): reason_groups is only displayed here; it is not referenced
# again in the visible cells.
group_column_slices = {
    'Reason_Group_1': slice('Reason_1', 'Reason_14'),
    'Reason_Group_2': slice('Reason_15', 'Reason_17'),
    'Reason_Group_3': slice('Reason_18', 'Reason_21'),
    'Reason_Group_4': slice('Reason_22', None),
}
reason_groups = pd.DataFrame({
    group_name: reasons_dummies.loc[:, column_slice].max(axis=1)
    for group_name, column_slice in group_column_slices.items()
})
reason_groups

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4
0,0,0,0,1
1,0,0,0,0
2,0,0,0,1
3,1,0,0,0
4,0,0,0,1
...,...,...,...,...
695,1,0,0,0
696,1,0,0,0
697,1,0,0,0
698,0,0,0,1


In [94]:
# Remove the original 'Reason for Absence' code column from the working frame.
data = data.drop(columns=['Reason for Absence'])

In [95]:
# Preview the frame after dropping 'Reason for Absence'.
data

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


## Create a checkpoint

In [96]:
# Checkpoint: continue on an independent copy so `data` keeps its current state.
data_modified_v1 = data.copy(deep=True)

In [97]:
# Preview the checkpoint copy.
data_modified_v1

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


## Rearrange data a bit

In [98]:
# List the current column names as an array.
data_modified_v1.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_Group_1',
       'Reason_Group_2', 'Reason_Group_3', 'Reason_Group_4'], dtype=object)

In [99]:
# Target column order: the four reason-group flags first, followed by the
# remaining columns.
reason_group_columns = [
    'Reason_Group_1',
    'Reason_Group_2',
    'Reason_Group_3',
    'Reason_Group_4',
]
remaining_columns = [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]
rearranged_columns = reason_group_columns + remaining_columns

In [100]:
# Reorder the columns. Selecting with a list of labels yields a frame that
# pandas may flag as a copy of the original; .copy() makes it an independent
# frame so the column assignments in later cells do not raise
# SettingWithCopyWarning.
data_modified_v1 = data_modified_v1[rearranged_columns].copy()

In [101]:
# Preview the frame with the reason-group columns moved to the front.
data_modified_v1

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,24/05/2018,235,16,32,237.656,25,3,0,0,2


### Extract month and day of week from 'Date' (the column itself is dropped later)

In [102]:
# Inspect the Python type of the 'Date' value at index label 0 (before conversion).
type(data_modified_v1['Date'][0])

str

In [103]:
# Parse the 'Date' strings into timestamps. The format is given explicitly
# (day/month/year) so that day and month cannot be swapped during parsing.
date_strings = data_modified_v1['Date']
data_modified_v1['Date'] = pd.to_datetime(date_strings, format='%d/%m/%Y')

In [104]:
# Inspect the type of the 'Date' value at index label 0 again (after conversion).
type(data_modified_v1['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [105]:
# Month number of the 'Date' value at index label 0.
data_modified_v1['Date'][0].month

7

In [106]:
# Preview the frame with 'Date' converted to timestamps.
data_modified_v1

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2


In [107]:
# Month number (1-12) of each date, taken with the vectorised .dt accessor
# instead of calling a Python lambda on every row.
data_modified_v1['Month'] = data_modified_v1['Date'].dt.month

In [109]:
# Inspect the type of the 'Month' value at index label 0.
type(data_modified_v1['Month'][0])

numpy.int64

In [111]:
# Preview the frame with the new 'Month' column.
data_modified_v1

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8,5
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3,5
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8,5
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2,5


In [112]:
# Day of the week of each date (Monday = 0 ... Sunday = 6), taken with the
# vectorised .dt accessor instead of calling a Python lambda on every row.
data_modified_v1['Day of week'] = data_modified_v1['Date'].dt.weekday

In [113]:
# Preview the frame with the new 'Day of week' column.
data_modified_v1

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8,5,2
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3,5,2
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8,5,3
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2,5,3


### Map 'Education' to binary: level 1 becomes 0, every other level becomes 1

In [114]:
# List the distinct education codes present in the data.
data_modified_v1['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [117]:
# Education codes in this dataset:
#   1 = high school
#   2 = graduate
#   3 = postgraduate
#   4 = master or doctor
#
# As the counts below show, almost 600 rows are at the high-school level
# (no degree) and only about 100 are above it. The higher levels are too
# small for separating "master/doctor" from "graduate" or "postgraduate"
# to be meaningful, so we only distinguish between people who hold a
# degree and people who do not.
data_modified_v1['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [118]:
# Collapse the education codes into a binary flag:
#   0 = no degree (code 1, high school)
#   1 = degree    (codes 2, 3 and 4)
education_to_binary = {1: 0, 2: 1, 3: 1, 4: 1}
data_modified_v1['Education'] = data_modified_v1['Education'].map(education_to_binary)

In [119]:
# Preview the frame with 'Education' mapped to 0/1.
data_modified_v1

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,1,2,0,8,5,2
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,0,1,2,3,5,2
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,1,0,0,8,5,3
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,1,0,0,2,5,3


## Final touches on the data

In [121]:
# Remove the 'Date' column; 'Month' and 'Day of week' were derived from it above.
data_modified_v1 = data_modified_v1.drop(columns=['Date'])

In [124]:
# List the column names after dropping 'Date'.
data_modified_v1.columns.values

array(['Reason_Group_1', 'Reason_Group_2', 'Reason_Group_3',
       'Reason_Group_4', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month',
       'Day of week'], dtype=object)

In [125]:
# Final column order: reason-group flags, then the two date-derived columns,
# then the remaining columns with 'Absenteeism Time in Hours' last.
final_reason_columns = [
    'Reason_Group_1',
    'Reason_Group_2',
    'Reason_Group_3',
    'Reason_Group_4',
]
final_date_columns = [
    'Month',
    'Day of week',
]
final_other_columns = [
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]
final_rearranged_columns = final_reason_columns + final_date_columns + final_other_columns

In [126]:
# Select every row with the columns in their final order.
data_preprocessed = data_modified_v1.loc[:, final_rearranged_columns]

In [127]:
# Preview the final preprocessed frame.
data_preprocessed

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Month,Day of week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


In [128]:
# Write the preprocessed frame to CSV without the row index.
data_preprocessed.to_csv(path_or_buf='Absenteeism_preprocessed.csv', index=False)