# Creating a logistic regression to predict absenteeism

## Import the relevant libraries

In [1]:
# import pandas and numpy
import pandas as pd
import numpy as np

## Load the data

In [2]:
# load the preprocessed csv data
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

# brief overview of the data
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of Week,Month,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,1,7,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,1,7,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,2,7,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,3,7,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,3,7,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [3]:
# find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()

# create targets for the logistic regression using the median of 'Absenteeism Time in Hours'
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

# create a column in the dataframe for the targets
data_preprocessed['Excessive Absenteeism'] = targets

# create a checkpoint by dropping 'Absenteeism Time in Hours'
# also drop the variables with very small weights after exploring the coefficients (backward elimination)
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 
                                            'Day of Week', 
                                            'Daily Work Load Average', 
                                            'Distance to Work'
                                           ],axis=1)

## Select the inputs for the regression

In [4]:
# Create a variable that will contain the inputs (everything without the targets)
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the data

In [5]:
# import the libraries needed to create the Custom Scaler (365 Data Science, 'The Data Science Course', 2021)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class based on StandardScaler
class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # note the inclusion of a 'columns' parameter to choose which features to scale   
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [6]:
# select the columns to omit
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)

# fit the data
absenteeism_scaler.fit(unscaled_inputs)

# standardise the data using the transform method
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

# view the dataframe with the scaled inputs
scaled_inputs



Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


## Training the model

In [7]:
# import train_test_split
# import the LogisticRegression model and metrics module from sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 8)

# create a logistic regression object
reg = LogisticRegression()

# fit the train inputs
reg.fit(x_train,y_train)

# assess the train accuracy of the model
reg.score(x_train,y_train)

0.7625

### Finding the intercept and coefficients

In [8]:
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values

# create a summary table showing the features and their coefficients
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)

# add the intercept at index 0
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# add a new column showing the odd ratio of each feature (e^coefficient)
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

# sort the table according to odds ratio
summary_table.sort_values('Odds_ratio', ascending=False)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,2.75514,15.723246
2,Reason_2,0.878663,2.407678
3,Reason_3,3.073006,21.606759
4,Reason_4,0.900073,2.459783
5,Month,0.123995,1.13201
6,Transportation Expense,0.572999,1.773579
7,Age,-0.264644,0.767479
8,Body Mass Index,0.250265,1.284365
9,Education,-0.114178,0.892099
10,Children,0.487177,1.627714


## Testing the model

In [9]:
# assess the test accuracy of the model
reg.score(x_test,y_test)

0.75

## Save the model

In [10]:
# import the pickle module
import pickle

# pickle the model file
with open('model', 'wb') as file:
    pickle.dump(reg, file)

# pickle the scaler file
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)