# Absenteeism Module

## Import The Relevant Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Load The Dataset

In [2]:
raw_csv_data = pd.read_csv('Absenteeism-data.csv')
raw_csv_data

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
695,17,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,28,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,18,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,25,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


## Data Preprocessing 

In [3]:
data = raw_csv_data
data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Drop the ID column 

In [4]:
data = data.drop(['ID'], axis = 1)

### Convert  Reason for Absence Column to Dummy Variable

In [5]:
reason = data['Reason for Absence']
len(reason)

700

In [6]:
# Check all unique value
reason.unique()

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16], dtype=int64)

In [7]:
len(reason.unique())

28

In [8]:
# Sorted the value 
sorted(reason.unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

In [9]:
reason = pd.get_dummies(reason)

In [10]:
reason

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Check for missing value
check = reason.sum(axis =1)
check

0      1
1      1
2      1
3      1
4      1
      ..
695    1
696    1
697    1
698    1
699    1
Length: 700, dtype: int64

The check should be contain valu 1 because each person only have 1 type of reason, if there is a 0 then it means that some values are missing

In [12]:
check == 0

0      False
1      False
2      False
3      False
4      False
       ...  
695    False
696    False
697    False
698    False
699    False
Length: 700, dtype: bool

### Grouping and join the reson column.
Grouping the reason_columns so there will be too many dummy varibles, The group was based qualitative analysis. There will be 4 groups, .loc[] funtion is used to split the df and based on label, the 1st : is indicated row and the 2nd indicated columns, .max() is used to return the highest value to find if each data have the particular type of reason


In [13]:
reason_type_1 = reason.loc[:, 1:14].max(axis = 1)
reason_type_2 = reason.loc[:, 15:17].max(axis = 1)
reason_type_3 = reason.loc[:, 18:21].max(axis = 1)
reason_type_4 = reason.loc[:, 22:].max(axis = 1)

### Combine our data with the reason types

In [14]:
data = pd.concat([data, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis = 1)
data

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


### Drop the reason for absence to avoid multicolinearity

In [15]:
data = data.drop(['Reason for Absence'], axis = 1)
data.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


### Rename and reoreder the column

In [16]:
data.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [17]:
col_name = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1','Reason_2','Reason_3','Reason_4']

In [18]:
# Rename the columns
data.columns = col_name
data.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [19]:
# Re-order the columns
col_name = [ 'Reason_1','Reason_2','Reason_3','Reason_4','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
data = data[col_name]
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Get Month and Date of the week data

In [20]:
data['Date'] = pd.to_datetime(data['Date'], format = '%d/%m/%Y')
data['Date'].dtypes

dtype('<M8[ns]')

In [21]:
type(data['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

#### Extract the month value

In [22]:
months = []

for i in range(0, len(data)):
    months.append(data['Date'][i].month)
months

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 

#### Extract the weekdays value

In [23]:
weekdays= []

for i in range(0, len(data)):
    weekdays.append(data['Date'][i].weekday())
weekdays

[1,
 1,
 2,
 3,
 3,
 4,
 4,
 4,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 4,
 4,
 0,
 3,
 2,
 2,
 0,
 0,
 4,
 0,
 0,
 1,
 2,
 2,
 4,
 0,
 3,
 3,
 0,
 0,
 0,
 1,
 3,
 4,
 4,
 1,
 0,
 1,
 1,
 2,
 6,
 0,
 3,
 4,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 4,
 4,
 4,
 1,
 1,
 2,
 3,
 4,
 4,
 1,
 2,
 2,
 3,
 2,
 2,
 3,
 1,
 1,
 2,
 4,
 4,
 3,
 2,
 3,
 3,
 3,
 0,
 0,
 0,
 2,
 4,
 2,
 2,
 4,
 4,
 0,
 1,
 2,
 3,
 1,
 1,
 2,
 2,
 3,
 4,
 1,
 2,
 3,
 4,
 1,
 2,
 4,
 4,
 4,
 2,
 0,
 1,
 1,
 2,
 3,
 3,
 4,
 0,
 1,
 1,
 2,
 3,
 4,
 0,
 1,
 1,
 1,
 2,
 3,
 3,
 0,
 1,
 3,
 4,
 2,
 2,
 3,
 3,
 4,
 4,
 0,
 1,
 0,
 1,
 1,
 1,
 3,
 0,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 2,
 2,
 3,
 4,
 4,
 0,
 0,
 0,
 1,
 3,
 3,
 4,
 4,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 3,
 4,
 4,
 0,
 2,
 2,
 1,
 2,
 3,
 4,
 4,
 2,
 3,
 4,
 4,
 4,
 4,
 4,
 1,
 2,
 2,
 3,
 4,
 0,
 2,
 3,
 0,
 1,
 1,
 2,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 4,
 0,
 0,
 1,
 3,
 4,
 0,
 0,
 0,
 2,
 4,
 1,
 2,
 3,
 4,
 0,
 3,
 1,
 2,
 4,
 4,
 1,
 3,
 4,
 0,
 1,
 0,
 1,
 2,
 3,
 0,


#### Combine the months and weekdays to data

In [24]:
data['Month Value'] = months
data['Date of the week '] = weekdays
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Date of the week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


#### Drop the date column and reorder the columns

In [25]:
# Drop date column
data = data.drop(['Date'], axis  = 1)

In [26]:
data.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
       'Date of the week '], dtype=object)

In [27]:
col_name = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Date of the week ', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
data = data[col_name]

In [28]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Date of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2


### Converting Education Column

In [29]:
# Check education column and data
data['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [30]:
data['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

#### Grouping The Education Column to 0 and 1 Value 

In [31]:
data['Education'] = data['Education'].map({1:0, 2:1, 3:1, 4:1})
data['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

In [32]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Date of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the Model

### Create Target 

They have to be categories and we must find a way to say if someone is 'being absent too much' or not, what we've decided to do is to take the median of the dataset as a cut-off line.In this way the dataset will be balanced (there will be roughly equal number of 0s and 1s for the logistic regression). As balancing is a great problem for ML, this will work great for us, alternatively, if we had more data, we could have found other ways to deal with the issue ,for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median


In [33]:
targets = np.where(data['Absenteeism Time in Hours'] > 
                   data['Absenteeism Time in Hours'].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

#### Check the targets 

In [34]:
# check if dataset is balanced (what % of targets are 1s)
targets.sum()/targets.shape[0]

0.45571428571428574

In [35]:
data['Ecessive absenteeism'] = targets
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Date of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Ecessive absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [36]:
# Drop the absenteeism TIme in Hours column
data = data.drop(['Absenteeism Time in Hours'], axis = 1)

In [37]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Date of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Ecessive absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


### Select Inputs

In [38]:
inputs = data.iloc[:, :-1]
inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Date of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


### Standardize the Inputs

Standardization is one of the most common preprocessing tools, since data of different magnitude (scale) can be biased towards high values,we want all inputs to be of similar magnitude, this is a peculiarity of machine learning in general - most (but not all) algorithms do badly with unscaled data

In [39]:
# define scaler as an object
scaler = StandardScaler()

dummies = inputs[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']]
to_scale = inputs[['Month Value',
       'Date of the week ', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Children', 'Pets']]

dummies = dummies.to_numpy()
scaler.fit(to_scale)

scaled_inputs_no_dummies = scaler.transform(to_scale)
 
scaled_inputs = []
for i in range(dummies.shape[0]):
    scaled_inputs.append(np.concatenate((scaled_inputs_no_dummies[i], dummies[i])))
scaled_intputs

col_name = ['Month Value',
       'Date of the week ', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Children', 'Pets','Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education'] 
inputs = inputs[col_name]
inputs[col_name] = scaled_inputs
inputs

NameError: name 'scaled_intputs' is not defined

### Split the dataset into training and testing set

In [None]:
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, train_size = 0.8, test_size = 0.2, random_state =12) 

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape, y_test.shape)

### Train the model

In [None]:
reg = LogisticRegression()

In [None]:
#Fit the data 
reg.fit(x_train, y_train)

In [None]:
#Check the accuracy
reg.score(x_train, y_train)

In [None]:
# Check the accuracy manually
model_outputs = reg.predict(x_train)
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

### Get Coefiecient and Intercept

In [None]:
# Get Intercepct 
reg.intercept_

In [None]:
# Get Coeficient
reg.coef_

In [None]:
# Make summary table for coeficiet
feature_name = inputs.columns.values

summary_table = pd.DataFrame(columns = ['Feature Name'], data = feature_name)
summary_table['Coeficient'] = np.transpose(reg.coef_)
summary_table

In [None]:
# Input intercept to summary table
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

### Interpreting coeficient value

In [None]:
# Make odds Ratio
summary_table['odds_ratio'] = np.exp(summary_table.Coeficient) 
summary_table.sort_values('odds_ratio', ascending = False)

### Test the model

In [None]:
# Cek the test accuracy
reg.score(x_test, y_test)

In [None]:
pred_proba = reg.predict_proba(x_test)
pred_proba

In [None]:
# Get probability if someone excessifly absent
pred_proba[:, 1]