# Creating a Logistic Regression to predict Absenteeism

## Import the relevant libraries

In [2]:
import pandas as pd
import numpy as np

## Load the data

In [3]:
data_preprocessed = pd.read_csv('absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary target: 1 when the hours absent are strictly above the median, 0 otherwise
# (rows exactly at the median therefore fall into class 0).
hours_absent = data_preprocessed['Absenteeism Time in Hours']
median_hours = hours_absent.median()
targets = np.where(hours_absent > median_hours, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


## A comment on the targets

In [7]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [80]:
# Remove the raw hours column (the binary target was derived from it) together
# with three predictors that are left out of the model. drop() returns a new
# frame, so data_preprocessed itself is not modified.
columns_to_drop = [
    'Absenteeism Time in Hours',
    'Day of the Week',
    'Daily Work Load Average',
    'Distance to Work',
]
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [81]:
data_with_targets is data_preprocessed

False

In [82]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## Select the inputs for the regression

In [83]:
data_with_targets.shape

(700, 12)

In [84]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
5,0,0,0,1,10,179,38,31,0,0,0
6,0,0,0,1,7,361,28,27,0,1,4
7,0,0,0,1,7,260,36,23,0,4,0
8,0,0,1,0,6,155,34,25,0,2,0
9,0,0,0,1,7,235,37,29,1,1,1


In [85]:
unscaled_inputs = data_with_targets.iloc[:,:-1] 

## Standardize the data

In [86]:
# from sklearn.preprocessing import StandardScaler

# absenteeism_scaler = StandardScaler()

In [87]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The columns named in ``columns`` are passed through a ``StandardScaler``;
    every other column (e.g. 0/1 dummy variables) is returned unchanged. The
    original column order and row index of the input are preserved.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that does the actual work.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (``ddof=0``) of the scaled
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # BaseEstimator derives get_params()/repr()/clone() from attributes
        # named exactly like the __init__ parameters, so each one is stored.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: StandardScaler's parameters are keyword-only in
        # current scikit-learn releases.
        self.scaler = StandardScaler(
            copy=copy, with_mean=with_mean, with_std=with_std
        )
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the scaling statistics from the selected columns of ``X``.

        Returns ``self`` so that calls can be chained.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions; ddof=0 gives the population
        # variance, matching what np.var computes by default.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` is accepted for API compatibility and ignored. ``copy`` is
        forwarded to the wrapped scaler's ``transform``.
        """
        init_col_order = X.columns
        # Reuse X's index so the concat below aligns row by row even when X
        # does not carry a default 0..n-1 index (e.g. a shuffled subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Re-select with the original labels to restore the column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [88]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [93]:
columns_to_scale = ['Month Value','Transportation Expense',
       'Age', 'Body Mass Index', 'Education', 'Children', 'Pet']

In [94]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [95]:
# Learn the scaling statistics for the columns listed in columns_to_scale.
# NOTE(review): the scaler is fitted on every row of unscaled_inputs and the
# train/test split happens only afterwards, so the held-out rows also
# contribute to these statistics - consider fitting on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)

  return self.partial_fit(X, y)


CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age', 'Body Mass Index', 'Education', 'Children', 'Pet'],
       copy=None, with_mean=None, with_std=None)

In [96]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)



In [97]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
5,0,0,0,1,0.929019,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
6,0,0,0,1,0.030796,2.092381,-1.320435,0.061825,-0.447980,-0.019280,2.843016
7,0,0,0,1,0.030796,0.568211,-0.065439,-0.878984,-0.447980,2.679969,-0.589690
8,0,0,1,0,-0.268611,-1.016322,-0.379188,-0.408580,-0.447980,0.880469,-0.589690
9,0,0,0,1,0.030796,0.190942,0.091435,0.532229,2.232242,-0.019280,0.268487


In [98]:
scaled_inputs.shape

(700, 11)

## Split the data into train & test and shuffle

### Import the relevant module

In [99]:
from sklearn.model_selection import train_test_split

### Split

In [100]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)



In [101]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [102]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic Regression with sklearn

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [104]:
reg = LogisticRegression()

In [105]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [106]:
reg.score(x_train, y_train)

0.775

### Manually check the accuracy

In [107]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [108]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [109]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [110]:
np.sum((model_outputs==y_train))

434

In [111]:
model_outputs.shape[0]

560

In [112]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.775

## Finding the intercept and coefficients

In [113]:
reg.intercept_

array([-1.47072937])

In [114]:
reg.coef_

array([[ 2.59769035,  0.84084545,  2.93637494,  0.63263326,  0.00558681,
         0.61934164, -0.17640849,  0.2842074 , -0.09802627,  0.35192912,
        -0.27364049]])

In [115]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [116]:
feature_names = unscaled_inputs.columns.values

In [117]:
# One row per input feature, paired with the weight the fitted model gave it.
summary_table = pd.DataFrame({'Feature name': feature_names})
# reg.coef_ has shape (1, n_features); transposing yields one value per row.
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.59769
1,Reason_2,0.840845
2,Reason_3,2.936375
3,Reason_4,0.632633
4,Month Value,0.005587
5,Transportation Expense,0.619342
6,Age,-0.176408
7,Body Mass Index,0.284207
8,Education,-0.098026
9,Children,0.351929


In [118]:
# Put the intercept at the top of the summary table.
# NOTE: this cell is not idempotent - every re-run shifts the index again and
# overwrites row 0, pushing the previous 'Intercept' row down to index 1.
# Shift every existing row down by one so that index 0 becomes free.
summary_table.index = summary_table.index + 1
# The new row is appended at the end under label 0 ...
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# ... so sort by index to move it to the first position.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.470729
1,Reason_1,2.59769
2,Reason_2,0.840845
3,Reason_3,2.936375
4,Reason_4,0.632633
5,Month Value,0.005587
6,Transportation Expense,0.619342
7,Age,-0.176408
8,Body Mass Index,0.284207
9,Education,-0.098026


## Interpreting the Coefficients

In [119]:
# Exponentiate each log-odds coefficient to obtain the corresponding odds ratio.
coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)

In [120]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.470729,0.229758
1,Reason_1,2.59769,13.432677
2,Reason_2,0.840845,2.318326
3,Reason_3,2.936375,18.847399
4,Reason_4,0.632633,1.882561
5,Month Value,0.005587,1.005602
6,Transportation Expense,0.619342,1.857705
7,Age,-0.176408,0.838275
8,Body Mass Index,0.284207,1.328708
9,Education,-0.098026,0.906625


In [121]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.936375,18.847399
1,Reason_1,2.59769,13.432677
2,Reason_2,0.840845,2.318326
4,Reason_4,0.632633,1.882561
6,Transportation Expense,0.619342,1.857705
10,Children,0.351929,1.421808
8,Body Mass Index,0.284207,1.328708
5,Month Value,0.005587,1.005602
9,Education,-0.098026,0.906625
7,Age,-0.176408,0.838275


A feature is not important if its coefficient is close to 0 or, equivalently, its odds ratio is close to 1.

A weight of 0 means that, whatever the feature's value, the model multiplies it by 0, so the feature contributes nothing to the prediction.

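For a one-unit change in a feature (one standard deviation for the standardized columns; switching from 0 to 1 for the unscaled Reason dummies), the odds are multiplied by the odds ratio (1 = no change, below 1 = the odds decrease).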

* Reason 0 = No reason
* Reason 1 = Various diseases
* Reason 2 = Pregnancy and giving birth
* Reason 3 = Poisoning
* Reason 4 = Light diseases

## Test the Model

In [122]:
reg.score(x_test, y_test)

0.7357142857142858

In [123]:
predict_prob = reg.predict_proba(x_test)
predict_prob

array([[0.75306909, 0.24693091],
       [0.60933987, 0.39066013],
       [0.48594824, 0.51405176],
       [0.75524002, 0.24475998],
       [0.08399874, 0.91600126],
       [0.30189304, 0.69810696],
       [0.30166553, 0.69833447],
       [0.11508708, 0.88491292],
       [0.73775515, 0.26224485],
       [0.75400107, 0.24599893],
       [0.50725162, 0.49274838],
       [0.19728599, 0.80271401],
       [0.06163655, 0.93836345],
       [0.70895125, 0.29104875],
       [0.29283186, 0.70716814],
       [0.52394081, 0.47605919],
       [0.50683352, 0.49316648],
       [0.50892391, 0.49107609],
       [0.36710468, 0.63289532],
       [0.06355852, 0.93644148],
       [0.73645858, 0.26354142],
       [0.75524002, 0.24475998],
       [0.47452254, 0.52547746],
       [0.47285445, 0.52714555],
       [0.22021908, 0.77978092],
       [0.73807864, 0.26192136],
       [0.51193081, 0.48806919],
       [0.87673721, 0.12326279],
       [0.23439205, 0.76560795],
       [0.75524002, 0.24475998],
       [0.

In [125]:
predict_prob.shape

(140, 2)

In [126]:
predict_prob[:,1]

array([0.24693091, 0.39066013, 0.51405176, 0.24475998, 0.91600126,
       0.69810696, 0.69833447, 0.88491292, 0.26224485, 0.24599893,
       0.49274838, 0.80271401, 0.93836345, 0.29104875, 0.70716814,
       0.47605919, 0.49316648, 0.49107609, 0.63289532, 0.93644148,
       0.26354142, 0.24475998, 0.52547746, 0.52714555, 0.77978092,
       0.26192136, 0.48806919, 0.12326279, 0.76560795, 0.24475998,
       0.38906856, 0.7158631 , 0.70056896, 0.49358459, 0.24475998,
       0.59541198, 0.26256861, 0.78565518, 0.43686254, 0.604478  ,
       0.2444509 , 0.49911937, 0.26127513, 0.44806935, 0.80688538,
       0.6084217 , 0.72250109, 0.24537893, 0.24849891, 0.24414209,
       0.50246481, 0.32700139, 0.69810696, 0.24694031, 0.82061487,
       0.39145679, 0.90718611, 0.26701353, 0.35898854, 0.35937355,
       0.70417845, 0.69916351, 0.26799682, 0.78121379, 0.24619926,
       0.24661998, 0.0906395 , 0.26289262, 0.76741323, 0.29243128,
       0.26062994, 0.35373347, 0.88318587, 0.43603969, 0.59379