In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

In [29]:
# Show every column and every row whenever a frame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the preprocessed data set from the working directory.
data_preprocessed = pd.read_csv('df_preprocessed.csv')

In [30]:
# Preview the first rows of the loaded frame to check the columns and their values.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [31]:
# List the distinct values found in the raw absenteeism-hours column.
data_preprocessed['Absenteeism Time in Hours'].unique()

array([  4,   0,   2,   8,  40,   1,   7,   3,  32,   5,  16,  24,  64,
        56,  80, 120, 112, 104,  48], dtype=int64)

In [32]:
# Median of the hours column; this is the cut-off used to binarise the target.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [33]:
# Binary target: 1 when the absence is longer than the median number of hours,
# 0 otherwise. The cut-off is computed from the column itself rather than
# hard-coded, so it cannot go stale if the underlying data changes.
absenteeism_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absenteeism_hours > absenteeism_hours.median(), 1, 0)

In [34]:
# Attach the binary target as a new column (this modifies data_preprocessed in place).
data_preprocessed['Excessive Absenteeism'] = targets

In [35]:
# Preview again to confirm the new target column sits next to the raw hours.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [36]:
# Share of observations labelled 1 -- a quick class-balance check.
targets.mean()

0.45571428571428574

In [37]:
# New frame without the raw hours column; only the binary target derived from it is kept.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [38]:
# Preview the frame after the raw hours column has been removed.
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [39]:
# (rows, columns) of the frame that holds the inputs plus the target.
data_with_targets.shape

(700, 15)

In [40]:
# Model inputs: every column except the target. The target is removed by name
# so the selection does not silently depend on it being the last column.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

In [41]:
# Names of the input columns, in their current order.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [42]:
# Columns that will be standardised; the Reason_* and Education columns are
# deliberately left out of this selection.
columns_to_scale = [
    'Month_Value', 'Day', 'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets',
]
unscaled_inputs_without_dummy = unscaled_inputs.loc[:, columns_to_scale]

In [43]:
# Columns that stay unscaled: everything in unscaled_inputs that was not
# selected for scaling. The list is derived from unscaled_inputs_without_dummy
# instead of being typed out a second time, so the two subsets always remain
# complementary.
unscaled_inputs_with_dummy = unscaled_inputs.drop(
    columns=unscaled_inputs_without_dummy.columns
)

In [44]:
# Scaler object used to standardise the non-dummy input columns.
absenteeism_scaler = StandardScaler()

In [45]:
# Learn the scaling statistics from the non-dummy columns.
# NOTE(review): the scaler is fitted on every row of the data set, and the
# train/test split only happens in a later cell, so test rows influence the
# scaling statistics. Consider fitting on the training split only.
absenteeism_scaler.fit(unscaled_inputs_without_dummy)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [46]:
# Apply the fitted scaler to the non-dummy columns. The result is re-wrapped in
# a DataFrame with column labels in the next cell.
scaled_inputs_without_dummy = absenteeism_scaler.transform(unscaled_inputs_without_dummy)

  """Entry point for launching an IPython kernel.


In [47]:
# Wrap the scaled values back into a DataFrame. Column labels and the row index
# are both taken from the unscaled frame, so the column-wise concat that follows
# lines up row for row even when the source index is not the default 0..n-1
# range, and the column list is not typed out yet again.
scaled_inputs_without_dummy = pd.DataFrame(
    data=scaled_inputs_without_dummy,
    index=unscaled_inputs_without_dummy.index,
    columns=unscaled_inputs_without_dummy.columns,
)

In [48]:
# Place the untouched columns and the standardised columns side by side;
# rows are matched on the index.
scaled_inputs = pd.concat(
    [unscaled_inputs_with_dummy, scaled_inputs_without_dummy],
    axis='columns',
)

In [49]:
# Restore the original column order, which the concat above does not preserve.
column_order = [
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
    'Day', 'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets',
]
scaled_inputs = scaled_inputs.loc[:, column_order]

In [51]:
# Preview the final input matrix: unscaled dummy columns next to standardised ones.
scaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487


In [52]:
# (rows, columns) of the input matrix; no rows or columns should have been lost.
scaled_inputs.shape

(700, 14)

In [53]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)



In [54]:
# Print the shapes of the inputs and targets, first for the training split
# and then for the test split.
for split_inputs, split_targets in ((x_train, y_train), (x_test, y_test)):
    print(split_inputs.shape, split_targets.shape)

(560, 14) (560,)
(140, 14) (140,)


In [70]:
# Logistic regression classifier. The solver is named explicitly: the recorded
# output shows solver='warn', i.e. the model relied on the old implicit default
# (liblinear), while newer scikit-learn releases default to lbfgs. Pinning it
# keeps the fitted coefficients reproducible across library versions.
reg = LogisticRegression(solver='liblinear')

In [71]:
# Fit the classifier on the training split only.
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [72]:
# Accuracy of the fitted model on the data it was trained on.
reg.score(x_train,y_train)

0.7946428571428571

In [73]:
# Predicted class (0 or 1) for every training observation; displayed for inspection.
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [74]:
# Manual accuracy check: fraction of training predictions that equal the
# labels. It should reproduce the value returned by reg.score on the same data.
(model_outputs == y_train).mean()

0.7946428571428571

In [75]:
# Fitted intercept (bias) term of the model.
reg.intercept_

array([-1.40791592])

In [76]:
# Fitted coefficients, one per input column, in the column order of scaled_inputs.
reg.coef_

array([[ 2.57074456,  0.84642676,  3.23954688,  0.41784738,  0.2113315 ,
        -0.07029256,  0.5490557 , -0.00845051, -0.15881469, -0.02062654,
         0.24058572, -0.39990029,  0.31287165, -0.4306866 ]])

In [77]:
# Input column names, used to label the coefficients in the summary table.
feature_name = scaled_inputs.columns.values

In [78]:
# Start the summary table with one row per input feature.
summary_table = pd.DataFrame({'Feature_Name': feature_name})

In [79]:
# reg.coef_ is a single row of coefficients; transpose it into a column so
# each value lands next to its feature name.
summary_table['Coefficient'] = reg.coef_.T

In [80]:
# Display the feature/coefficient table.
summary_table

Unnamed: 0,Feature_Name,Coefficient
0,Reason_1,2.570745
1,Reason_2,0.846427
2,Reason_3,3.239547
3,Reason_4,0.417847
4,Month_Value,0.211331
5,Day,-0.070293
6,Transportation Expense,0.549056
7,Distance to Work,-0.008451
8,Age,-0.158815
9,Daily Work Load Average,-0.020627


In [81]:
# Shift the feature rows from labels 0..n-1 to 1..n so that label 0 is free
# for the intercept row. The guard makes the cell safe to re-run: the shift
# is skipped once label 0 is either vacant or already holds the intercept,
# instead of moving every label up by one again on each execution.
index_not_shifted = (
    0 in summary_table.index
    and summary_table.loc[0, 'Feature_Name'] != 'Intercept'
)
if index_not_shifted:
    summary_table.index += 1

In [82]:
# Store the model intercept under row label 0, which the index shift in the
# previous cell left free.
# NOTE(review): the row is supplied as exactly two values, so this presumably
# fails if the cell is re-run after summary_table has gained a third column
# (Odds_Ratio) -- confirm, and re-run from the table's creation if needed.
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

In [84]:
# Show the table ordered from the largest to the smallest coefficient.
# The sorted result is only displayed; summary_table itself is not reassigned.
summary_table.sort_values('Coefficient', ascending=False)

Unnamed: 0,Feature_Name,Coefficient
3,Reason_3,3.239547
1,Reason_1,2.570745
2,Reason_2,0.846427
7,Transportation Expense,0.549056
4,Reason_4,0.417847
13,Children,0.312872
11,Body Mass Index,0.240586
5,Month_Value,0.211331
8,Distance to Work,-0.008451
10,Daily Work Load Average,-0.020627


In [85]:
# Odds ratio = exp(coefficient): the multiplicative change in the odds for a
# one-unit increase of the feature.
summary_table['Odds_Ratio'] = np.exp(summary_table['Coefficient'])

In [86]:
# Display the table with the new odds-ratio column.
summary_table

Unnamed: 0,Feature_Name,Coefficient,Odds_Ratio
1,Reason_1,2.570745,13.075556
2,Reason_2,0.846427,2.331302
3,Reason_3,3.239547,25.522155
4,Reason_4,0.417847,1.518689
5,Month_Value,0.211331,1.235322
6,Day,-0.070293,0.932121
7,Transportation Expense,0.549056,1.731617
8,Distance to Work,-0.008451,0.991585
9,Age,-0.158815,0.853154
10,Daily Work Load Average,-0.020627,0.979585


In [87]:
# Show the table ordered from the largest to the smallest odds ratio
# (display only; summary_table is not reassigned).
summary_table.sort_values('Odds_Ratio', ascending=False)

Unnamed: 0,Feature_Name,Coefficient,Odds_Ratio
3,Reason_3,3.239547,25.522155
1,Reason_1,2.570745,13.075556
2,Reason_2,0.846427,2.331302
7,Transportation Expense,0.549056,1.731617
4,Reason_4,0.417847,1.518689
13,Children,0.312872,1.367346
11,Body Mass Index,0.240586,1.271994
5,Month_Value,0.211331,1.235322
8,Distance to Work,-0.008451,0.991585
10,Daily Work Load Average,-0.020627,0.979585


In [88]:
# Accuracy on the held-out test split.
reg.score(x_test,y_test)

0.7714285714285715

In [89]:
# Class probabilities for every test observation: one row per observation,
# one column per class (column 0 -> class 0, column 1 -> class 1).
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.73221711, 0.26778289],
       [0.66325918, 0.33674082],
       [0.44746696, 0.55253304],
       [0.81225159, 0.18774841],
       [0.07547175, 0.92452825],
       [0.29353956, 0.70646044],
       [0.337964  , 0.662036  ],
       [0.10791013, 0.89208987],
       [0.85713206, 0.14286794],
       [0.75717958, 0.24282042],
       [0.10206708, 0.89793292],
       [0.02039985, 0.97960015],
       [0.05961214, 0.94038786],
       [0.19058688, 0.80941312],
       [0.23624658, 0.76375342],
       [0.65303233, 0.34696767],
       [0.7192901 , 0.2807099 ],
       [0.1505673 , 0.8494327 ],
       [0.3908541 , 0.6091459 ],
       [0.0368259 , 0.9631741 ],
       [0.75761046, 0.24238954],
       [0.80489769, 0.19510231],
       [0.33132255, 0.66867745],
       [0.35270343, 0.64729657],
       [0.22388403, 0.77611597],
       [0.82135426, 0.17864574],
       [0.54566683, 0.45433317],
       [0.879577  , 0.120423  ],
       [0.12364598, 0.87635402],
       [0.79732761, 0.20267239],
       [0.

In [90]:
# Keep only the second column: the predicted probability of class 1
# (excessive absenteeism) for each test observation.
probability_excessive_absenteeism = predicted_proba[:,1]
probability_excessive_absenteeism

array([0.26778289, 0.33674082, 0.55253304, 0.18774841, 0.92452825,
       0.70646044, 0.662036  , 0.89208987, 0.14286794, 0.24282042,
       0.89793292, 0.97960015, 0.94038786, 0.80941312, 0.76375342,
       0.34696767, 0.2807099 , 0.8494327 , 0.6091459 , 0.9631741 ,
       0.24238954, 0.19510231, 0.66867745, 0.64729657, 0.77611597,
       0.17864574, 0.45433317, 0.120423  , 0.87635402, 0.20267239,
       0.88357224, 0.63623429, 0.70487075, 0.90818326, 0.19510231,
       0.91957442, 0.1533631 , 0.81470524, 0.35186509, 0.54055153,
       0.20478229, 0.42618864, 0.15877292, 0.44368737, 0.78727177,
       0.70292051, 0.71703345, 0.27331004, 0.19253962, 0.16348976,
       0.61735451, 0.37558114, 0.68315375, 0.28130896, 0.84452784,
       0.41050952, 0.86765258, 0.22968388, 0.38333575, 0.38994759,
       0.70785985, 0.67889808, 0.30280696, 0.83129849, 0.14006971,
       0.26549611, 0.1314282 , 0.16339206, 0.81504638, 0.8442298 ,
       0.15081343, 0.33393089, 0.91690934, 0.3750165 , 0.53660

In [49]:
# Serialise the fitted classifier to a binary file named 'model' in the
# working directory; the context manager closes the file afterwards.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)