# Creating a Logistic Regression model to predict absenteeism

In [1]:
import numpy as np
import pandas as pd

# Load the Data

In [2]:
# Read the preprocessed absenteeism data set (path is relative to the working directory).
data = pd.read_csv('preprocessed.csv')

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


# Create the targets

In [4]:
# Median absence length in hours; it serves as the cut-off for the binary target.
data['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary target: 1 when an absence lasts longer than the median number of hours,
# otherwise 0. The cut-off is derived from the data instead of being hard-coded,
# so it stays correct if the underlying CSV changes.
absence_hours = data['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [6]:
# Inspect the resulting 0/1 target array.
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
# Attach the binary target to the frame as a new column (this modifies data in place).
data['Excessive Absenteeism'] = targets

In [8]:
# Confirm the new 'Excessive Absenteeism' column was appended.
data.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [9]:
# Share of rows labelled 1; a value near 0.5 means the two classes are roughly balanced.
targets.mean()

0.45571428571428574

In [10]:
# The raw hours column is what the target was derived from, so it must not be a model input.
data_with_targets = data.drop(columns=['Absenteeism Time in Hours'])

In [11]:
# Check that drop() produced a separate object rather than returning data itself.
data_with_targets is data

False

In [12]:
# Preview the frame without the raw hours column.
data_with_targets.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1,0
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,7,2,0
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,7,3,1
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,7,3,0


In [13]:
# Remove the raw date column; 'Month Value' and 'Day of the Week' remain as inputs.
data_with_targets = data_with_targets.drop(columns=['Date'])

In [14]:
# Remove the 'Unnamed: 0' column — presumably a row index that was written into the CSV.
data_with_targets = data_with_targets.drop(columns=['Unnamed: 0'])

In [15]:
# Display the cleaned frame: model inputs followed by the target column.
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2,1
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2,0
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3,1
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3,0


# Select the Inputs for the regression

In [16]:
# (rows, columns) — the column count includes the target.
data_with_targets.shape

(700, 15)

In [17]:
# Preview every column except the last one, which holds the target.
data_with_targets.iloc[:, 0:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [18]:
# Model inputs: every column except the target. Dropping the target by name,
# rather than slicing off the last position, keeps the selection correct even
# if the column order changes.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

# Standardize the Data

In [19]:
from sklearn.preprocessing import StandardScaler
# Scaler created with its default settings.
scaler = StandardScaler()

In [20]:
# Compute the per-column scaling statistics from all rows of unscaled_inputs.
# NOTE(review): this fit runs before the train/test split further down, so the
# test rows contribute to the scaling statistics; consider fitting on the
# training rows only. The 0/1 dummy columns are standardized here as well.
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [21]:
# Apply the fitted scaling; the result is a NumPy array, so column names are no longer attached.
scaled_inputs = scaler.transform(unscaled_inputs)

In [22]:
# Inspect the standardized values.
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.00772546],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
        -0.3882935 ,  0.66825259]])

In [23]:
# Same number of rows as before, one column per input feature.
scaled_inputs.shape

(700, 14)

# Shuffle the data and split it into train and test sets

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Exploratory call: the returned arrays are only displayed, not stored, and
# no random_state is passed, so the shuffle is not reproducible between runs.
train_test_split(scaled_inputs, targets)

[array([[ 1.73205081, -0.09298136, -0.31448545, ...,  0.26848661,
         -1.53033319, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -1.53033319, -0.00772546],
        [-0.57735027, -0.09298136,  3.17979734, ..., -0.58968976,
          1.61027597, -0.68370352],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          0.18272635, -0.68370352],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
          1.03925612,  2.69618675],
        [-0.57735027, -0.09298136, -0.31448545, ...,  2.8430157 ,
          0.7537462 , -0.68370352]]),
 array([[ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.95931334, -0.68370352],
        [ 1.73205081, -0.09298136, -0.31448545, ...,  0.26848661,
          1.03925612,  2.69618675],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.67380342,  0.66825259],
        ...,
        [-0.57735027, -0.09298136,  3.17979734, ..., -

In [26]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed seed makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [27]:
# Sanity-check the size of the held-out set.
print(x_test.shape, y_test.shape)

(140, 14) (140,)


# Logistic Regression with sklearn

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [29]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
log_reg = LogisticRegression()

In [30]:
# Train on the training portion only.
log_reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Accuracy on the rows the model was trained on.
log_reg.score(x_train, y_train)

0.7839285714285714

# Finding the intercept and coefficients

In [32]:
# Fitted intercept (bias) term.
log_reg.intercept_

array([-0.22206736])

In [33]:
# Fitted weights: a single row with one value per input column.
log_reg.coef_

array([[ 2.07601767,  0.33504757,  1.56162303,  1.32927434,  0.70639316,
        -0.03986811, -0.20089491, -0.00456366,  0.31933564, -0.135508  ,
         0.38172443, -0.3332426 ,  0.18793677, -0.07062253]])

In [34]:
# Column names of the inputs, in the same order as the fitted weights.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of the Week'], dtype=object)

In [35]:
# Keep the input column names so each weight can be labelled.
feature_name = unscaled_inputs.columns.values

In [36]:
# Pair each input column with its fitted weight. For this binary model coef_
# holds a single row, so row 0 is the 1-D vector of weights; using it directly
# avoids assigning a 2-D (n_features, 1) array into one column.
summary_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': log_reg.coef_[0],
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.076018
1,Reason_2,0.335048
2,Reason_3,1.561623
3,Reason_4,1.329274
4,Transportation Expense,0.706393
5,Distance to Work,-0.039868
6,Age,-0.200895
7,Daily Work Load Average,-0.004564
8,Body Mass Index,0.319336
9,Education,-0.135508


In [37]:
# Put the intercept in the first row, above the feature weights.
# Any existing 'Intercept' row is filtered out first, so re-running this cell
# does not shift the index again or add a duplicate intercept.
feature_rows = summary_table[summary_table['Feature name'] != 'Intercept']
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [log_reg.intercept_[0]],
})
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.222067
1,Reason_1,2.076018
2,Reason_2,0.335048
3,Reason_3,1.561623
4,Reason_4,1.329274
5,Transportation Expense,0.706393
6,Distance to Work,-0.039868
7,Age,-0.200895
8,Daily Work Load Average,-0.004564
9,Body Mass Index,0.319336


# Interpreting the Coefficients

In [38]:
# The intercept and the coefficients were fitted on standardized inputs, so they are expressed in scaled units.
# It would be preferable to leave the dummy variables unscaled so that their coefficients are easier to interpret.

# Testing the model

In [39]:
# Accuracy on the held-out test rows.
log_reg.score(x_test, y_test)

0.7357142857142858

# Save the model

In [40]:
import pickle

In [41]:
# Serialize the fitted classifier to a file named 'model'
# ('wb' opens the file for writing in binary mode).
with open('model', 'wb') as model_file:
    pickle.dump(log_reg, model_file)

In [42]:
# Serialize the fitted scaler to a file named 'scaler' so the same scaling
# can be applied to new data alongside the saved model.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Deploying the model