## Import libraries

In [83]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

## Load data and prepare it for ML

In [84]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [85]:
data_preprocessed.head()

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Month,Day of week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [86]:
# we will use Absenteeism Time in Hours as a target to tell whether someone has Absenteeism
# We will use logistic regression, so we need to convert them into 1 and 0
# in this case, we decide to refer the values less than its median to be moderately absenteeism and set to 0
# and values over its median to be excessively absenteeism and set to 1

data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [87]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours']
                   > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [88]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [89]:
data_preprocessed['Excessive Absenteeism'] = targets

In [90]:
# Check the balanced class
targets.sum() / targets.shape[0]

# the ideal is 50-50, but 55-45 is actually ok
# 60-40 is still ok for logistic regression, but not ok for some other algorithms

0.45571428571428574

In [91]:
# Backward elimination
## Go down to the end to see why we drop some of these columns
### Basically, to the ends, we found they're useless so we comeback and delete them
unscaled_data = data_preprocessed.drop(['Absenteeism Time in Hours','Day of week',
                                        'Daily Work Load Average','Distance to Work'], axis = 1)

### Inputs and targets selection

In [92]:
# we already have targets
unscaled_inputs = unscaled_data.iloc[:,:-1]

In [93]:
unscaled_inputs

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


### Feature scaling

In [94]:
# Create custom scaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [95]:
omitted_columns = ['Reason_Group_1','Reason_Group_2','Reason_Group_3','Reason_Group_4','Education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in omitted_columns]

In [96]:
custom_scaler = CustomScaler(columns_to_scale)



In [97]:
custom_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [98]:
scaled_inputs = custom_scaler.transform(unscaled_inputs)

In [99]:
scaled_inputs

Unnamed: 0,Reason_Group_1,Reason_Group_2,Reason_Group_3,Reason_Group_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


## Another way to do specific scaling
Since it's harder to save ColumnTransformer with pickle,  
we go for creating a custom scaler method above

In [100]:
# we don't need to Scale Dummies columns
# use ColumnTransformer to scale specific columns
# ct = ColumnTransformer([
#     ('scaled', StandardScaler(), unscaled_inputs.drop(['Reason_Group_1','Reason_Group_2','Reason_Group_3',
#                                                        'Reason_Group_4','Education'], axis = 1).columns.values)
# ], remainder = 'passthrough')

In [101]:
# ct.fit_transform(unscaled_inputs)

In [102]:
# unscaled_inputs.columns.values

In [103]:
# we need this because ColumnTransformer will put the columns that 'passthrough' (unscaled) to the end of the dataframe
# new_ordered_columns = [ 'Month', 'Day of week', 'Transportation Expense',
#        'Distance to Work', 'Age', 'Daily Work Load Average',
#        'Body Mass Index', 'Children', 'Pets', 'Reason_Group_1', 'Reason_Group_2', 'Reason_Group_3',
#        'Reason_Group_4', 'Education']

In [104]:
# scaled_inputs = pd.DataFrame(columns = new_ordered_columns, data = ct.fit_transform(unscaled_inputs))

In [105]:
# scaled_inputs

In [106]:
# scaled_inputs.columns.values

In [107]:
# rearranged_back = ['Reason_Group_1',
#                    'Reason_Group_2', 'Reason_Group_3', 'Reason_Group_4', 'Education', 
#                    'Month', 'Day of week', 'Transportation Expense',
#                    'Distance to Work', 'Age', 'Daily Work Load Average',
#                    'Body Mass Index', 'Children', 'Pets']

In [108]:
# scaled_inputs = scaled_inputs[rearranged_back]

In [109]:
# scaled_inputs

In [110]:
# verify correctness after arranging
# np.sum(unscaled_inputs['Education'] == scaled_inputs['Education'])

In [111]:
# np.sum(unscaled_inputs['Reason_Group_1'] == scaled_inputs['Reason_Group_1'])

In [112]:
# np.sum(unscaled_inputs['Reason_Group_2'] == scaled_inputs['Reason_Group_2'])

In [113]:
# np.sum(unscaled_inputs['Reason_Group_3'] == scaled_inputs['Reason_Group_3'])

In [114]:
# np.sum(unscaled_inputs['Reason_Group_4'] == scaled_inputs['Reason_Group_4'])

### Train-test split

In [115]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size = 0.2, random_state = 20)

In [116]:
X_train.shape, X_test.shape

((560, 11), (140, 11))

In [117]:
y_train.shape, y_test.shape

((560,), (140,))

## Build a model !

In [118]:
reg = LogisticRegression()

In [119]:
reg.fit(X_train, y_train)

LogisticRegression()

In [120]:
reg.score(X_train, y_train)

0.7732142857142857

### Let's mannually check the accuracy

In [121]:
prediction = reg.predict(X_train)

In [122]:
np.sum(prediction == y_train) / X_train.shape[0]

0.7732142857142857

### Find the coefficient and intercepts (weights and bias)

In [123]:
reg.intercept_

array([-1.6474549])

In [124]:
reg.coef_

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])

In [125]:
# change to nice form
unscaled_inputs.columns.values

array(['Reason_Group_1', 'Reason_Group_2', 'Reason_Group_3',
       'Reason_Group_4', 'Month', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [126]:
summary_table = pd.DataFrame(data = {'Feature Name':scaled_inputs.columns.values,
                                     'Coefficients':reg.coef_[0]})

In [127]:
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Reason_Group_1,2.800197
1,Reason_Group_2,0.951884
2,Reason_Group_3,3.115553
3,Reason_Group_4,0.839001
4,Month,0.15893
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [128]:
# insert intercept on top
summary_table.index = summary_table.index + 1

In [129]:
summary_table

Unnamed: 0,Feature Name,Coefficients
1,Reason_Group_1,2.800197
2,Reason_Group_2,0.951884
3,Reason_Group_3,3.115553
4,Reason_Group_4,0.839001
5,Month,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533
10,Children,0.348262


In [130]:
summary_table.loc[0] = ['Intercept',reg.intercept_[0]]

In [131]:
summary_table

Unnamed: 0,Feature Name,Coefficients
1,Reason_Group_1,2.800197
2,Reason_Group_2,0.951884
3,Reason_Group_3,3.115553
4,Reason_Group_4,0.839001
5,Month,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533
10,Children,0.348262


In [132]:
summary_table = summary_table.sort_index()

In [133]:
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-1.647455
1,Reason_Group_1,2.800197
2,Reason_Group_2,0.951884
3,Reason_Group_3,3.115553
4,Reason_Group_4,0.839001
5,Month,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533


In [134]:
summary_table.sort_values('Coefficients', ascending = False)

Unnamed: 0,Feature Name,Coefficients
3,Reason_Group_3,3.115553
1,Reason_Group_1,2.800197
2,Reason_Group_2,0.951884
4,Reason_Group_4,0.839001
6,Transportation Expense,0.605284
10,Children,0.348262
8,Body Mass Index,0.279811
5,Month,0.15893
7,Age,-0.169891
9,Education,-0.210533


In [135]:
summary_table['Odds Ratio'] = np.exp(summary_table['Coefficients'])

In [136]:
# odds ratio tell how 1 unit of that column change, will affect multiple of the liklihood
# ex. if a person has a reason_group_3, he/she will be 22 time likely to be excessive absenteeism
# it's just an easier way to intepret of coefficients
# note that coef/weight = 0 means that feature is useless (contribute nothing)
# the same way with Odds ratio = 1

summary_table.sort_values('Coefficients', ascending = False)

Unnamed: 0,Feature Name,Coefficients,Odds Ratio
3,Reason_Group_3,3.115553,22.545903
1,Reason_Group_1,2.800197,16.447892
2,Reason_Group_2,0.951884,2.590585
4,Reason_Group_4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


### Backward Elimination 
From this point we see that 'Daily workload avg', 'Distance', 'Day of week' are useless.  
So, we will do backward elimination which is going back and delete useless features and train again  
if we do correctly coefficients won't change much  
<b>Note that</b> in the end, we get the same accuracy after backward elimination  
but it's always better to keep model simple, so we drop them anyways

## Test the model !

In [138]:
reg.score(X_test, y_test)

0.75

In [140]:
# the predicted liklihood for each person to be excessive absenteeism
reg.predict_proba(X_test)[:,1]

array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174

## Save the model

In [137]:
import pickle

In [141]:
with open('logistic_model', 'wb') as file:
    pickle.dump(reg, file)

In [142]:
with open('custom_scaler', 'wb') as file:
    pickle.dump(custom_scaler, file)