# Creating a logistic regression to predict absenteeism

## Import the relevant libraries

In [223]:
import pandas as pd
import numpy as np

## Load the data

In [224]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [225]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [226]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [227]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [228]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [229]:
data_preprocessed['Excessive Absenteeism'] = targets

In [230]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


## A comment on the targets

In [231]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [232]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week',
                                            'Daily Work Load Average','Distance to Work'],axis=1)

In [233]:
data_with_targets is data_preprocessed

False

In [234]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## Select the inputs for the regression

In [235]:
data_with_targets.shape

(700, 12)

In [236]:
data_with_targets.iloc[:,:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [237]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [238]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the data

In [239]:
from sklearn.preprocessing import StandardScaler

absenteeism_scaler = StandardScaler()

In [240]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        if self.with_mean:
            self.mean_ = np.mean(X, axis=0)
        if self.with_std:
            self.std_ = np.std(X, axis=0)
        return self

    def transform(self, X):
        X_transformed = X
        if self.with_mean:
            X_transformed = X_transformed - self.mean_
        if self.with_std:
            X_transformed = X_transformed / self.std_
        return X_transformed


In [241]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [242]:
#columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [243]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [244]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [245]:
absenteeism_scaler.fit(unscaled_inputs)

In [246]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [247]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,-0.577350,-0.092981,-0.314485,0.821365,0.030796,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,0.030796,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,0.030796,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.030796,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,0.030796,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,-0.314485,-1.217485,-0.568019,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1.732051,-0.092981,-0.314485,-1.217485,-0.568019,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1.732051,-0.092981,-0.314485,-1.217485,-0.568019,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,-0.577350,-0.092981,-0.314485,0.821365,-0.568019,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


In [248]:
scaled_inputs.shape

(700, 11)

## Split the data into train & test and shuffle

### Import the relevant module

In [249]:
from sklearn.model_selection import train_test_split

### Split

In [250]:
train_test_split(scaled_inputs, targets)

[     Reason_1   Reason_2  Reason_3  Reason_4  Month Value  \
 60  -0.577350  -0.092981 -0.314485  0.821365     0.629611   
 264 -0.577350  -0.092981 -0.314485  0.821365     0.330204   
 320 -0.577350  -0.092981 -0.314485  0.821365     1.228426   
 444 -0.577350  -0.092981 -0.314485  0.821365     0.330204   
 668 -0.577350  10.754844 -0.314485 -1.217485    -0.867426   
 ..        ...        ...       ...       ...          ...   
 276 -0.577350  -0.092981 -0.314485 -1.217485     0.629611   
 0   -0.577350  -0.092981 -0.314485  0.821365     0.030796   
 493 -0.577350  -0.092981 -0.314485  0.821365     0.330204   
 127 -0.577350  -0.092981 -0.314485  0.821365     1.228426   
 565  1.732051  -0.092981 -0.314485 -1.217485     1.527833   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 60                -0.654143  0.248310         1.002633  -0.447980 -0.919030   
 264                0.387122  1.660180         1.237836  -0.447980  0.880469   
 320          

In [251]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [252]:
print (x_train.shape, y_train.shape)

(560, 11) (560,)


In [253]:
print (x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic regression with sklearn

In [254]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [255]:
reg = LogisticRegression()

In [256]:
reg.fit(x_train,y_train)

In [257]:
reg.score(x_train,y_train)

0.7857142857142857

### Manually check the accuracy

In [258]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [259]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [260]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [261]:
np.sum((model_outputs==y_train))

440

In [262]:
model_outputs.shape[0]

560

In [263]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7857142857142857

### Finding the intercept and coefficients

In [264]:
reg.intercept_

array([-0.21805902])

In [265]:
reg.coef_

array([[ 2.05375976,  0.32940745,  1.55175598,  1.295456  ,  0.01975052,
         0.71290575, -0.20310607,  0.33288059, -0.1389948 ,  0.37910854,
        -0.31894076]])

In [266]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [267]:
feature_name = unscaled_inputs.columns.values

In [268]:
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.05376
1,Reason_2,0.329407
2,Reason_3,1.551756
3,Reason_4,1.295456
4,Month Value,0.019751
5,Transportation Expense,0.712906
6,Age,-0.203106
7,Body Mass Index,0.332881
8,Education,-0.138995
9,Children,0.379109


In [269]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.218059
1,Reason_1,2.05376
2,Reason_2,0.329407
3,Reason_3,1.551756
4,Reason_4,1.295456
5,Month Value,0.019751
6,Transportation Expense,0.712906
7,Age,-0.203106
8,Body Mass Index,0.332881
9,Education,-0.138995


## Interpreting the coefficients

In [270]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [271]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.218059,0.804078
1,Reason_1,2.05376,7.797162
2,Reason_2,0.329407,1.390144
3,Reason_3,1.551756,4.719751
4,Reason_4,1.295456,3.652661
5,Month Value,0.019751,1.019947
6,Transportation Expense,0.712906,2.03991
7,Age,-0.203106,0.816192
8,Body Mass Index,0.332881,1.394981
9,Education,-0.138995,0.870233


In [272]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,2.05376,7.797162
3,Reason_3,1.551756,4.719751
4,Reason_4,1.295456,3.652661
6,Transportation Expense,0.712906,2.03991
10,Children,0.379109,1.460982
8,Body Mass Index,0.332881,1.394981
2,Reason_2,0.329407,1.390144
5,Month Value,0.019751,1.019947
9,Education,-0.138995,0.870233
7,Age,-0.203106,0.816192


## Testing the model

In [273]:
reg.score(x_test,y_test)

0.7285714285714285

In [274]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.75419772, 0.24580228],
       [0.59585028, 0.40414972],
       [0.4459363 , 0.5540637 ],
       [0.76179054, 0.23820946],
       [0.06685395, 0.93314605],
       [0.28106341, 0.71893659],
       [0.29012732, 0.70987268],
       [0.07130711, 0.92869289],
       [0.7464439 , 0.2535561 ],
       [0.75747164, 0.24252836],
       [0.47933931, 0.52066069],
       [0.15452665, 0.84547335],
       [0.03565258, 0.96434742],
       [0.72907287, 0.27092713],
       [0.22721519, 0.77278481],
       [0.50950968, 0.49049032],
       [0.47786365, 0.52213635],
       [0.48524527, 0.51475473],
       [0.36570263, 0.63429737],
       [0.0340704 , 0.9659296 ],
       [0.74194102, 0.25805898],
       [0.76179054, 0.23820946],
       [0.47413086, 0.52586914],
       [0.46823712, 0.53176288],
       [0.15662496, 0.84337504],
       [0.74756148, 0.25243852],
       [0.49018941, 0.50981059],
       [0.8993592 , 0.1006408 ],
       [0.16217057, 0.83782943],
       [0.76179054, 0.23820946],
       [0.

In [275]:
predicted_proba.shape

(140, 2)

In [276]:
predicted_proba[:,1]

array([0.24580228, 0.40414972, 0.5540637 , 0.23820946, 0.93314605,
       0.71893659, 0.70987268, 0.92869289, 0.2535561 , 0.24252836,
       0.52066069, 0.84547335, 0.96434742, 0.27092713, 0.77278481,
       0.49049032, 0.52213635, 0.51475473, 0.63429737, 0.9659296 ,
       0.25805898, 0.23820946, 0.52586914, 0.53176288, 0.84337504,
       0.25243852, 0.50981059, 0.1006408 , 0.83782943, 0.23820946,
       0.39846673, 0.73072911, 0.72722468, 0.52361162, 0.23820946,
       0.63729175, 0.25467694, 0.84709916, 0.45542601, 0.63049413,
       0.23713803, 0.47201318, 0.25021316, 0.10842763, 0.83589641,
       0.68630226, 0.73514437, 0.24036228, 0.2455922 , 0.23606993,
       0.48381645, 0.06564283, 0.71893659, 0.24015537, 0.847291  ,
       0.40700098, 0.94692247, 0.25223161, 0.08047104, 0.08090969,
       0.71143335, 0.72250736, 0.25559231, 0.84724115, 0.23446034,
       0.24470767, 0.01192714, 0.25580104, 0.83847784, 0.2756246 ,
       0.24800092, 0.07771159, 0.91090465, 0.45249436, 0.63180

## Save the model

In [277]:
import pickle

In [278]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [279]:
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)