In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

In [4]:
data_preprocessed = pd.read_csv('df_preprocessed.csv')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [6]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [7]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [10]:
targets.sum()/targets.shape[0]

0.45571428571428574

In [11]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

In [12]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [13]:
data_with_targets.shape

(700, 15)

In [14]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0
6,0,0,1,0,7,4,361,52,28,239.554,27,0,1,4
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0
9,0,0,1,0,7,0,235,11,37,239.554,29,1,1,1


In [15]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [16]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [17]:
unscaled_iputs_without_dummy = unscaled_inputs[['Month_Value',
       'Day', 'Transportation Expense', 'Distance to Work','Age',
       'Daily Work Load Average', 'Body Mass Index',
       'Children', 'Pets']]

In [18]:
unscaled_inputs = unscaled_inputs.drop(['Month_Value',
       'Day', 'Transportation Expense', 'Distance to Work','Age',
       'Daily Work Load Average', 'Body Mass Index',
       'Children', 'Pets'], axis=1)

In [19]:
absenteeism_scaler = StandardScaler()

In [20]:
absenteeism_scaler.fit(unscaled_iputs_without_dummy)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [21]:
scaled_inputs_without_dummy = absenteeism_scaler.transform(unscaled_iputs_without_dummy)

In [22]:
scaled_inputs_without_dummy = pd.DataFrame(columns=['Month_Value',
       'Day', 'Transportation Expense', 'Distance to Work','Age',
       'Daily Work Load Average', 'Body Mass Index',
       'Children', 'Pets'], data = scaled_inputs_without_dummy)

In [23]:
scaled_inputs = pd.concat([unscaled_inputs,scaled_inputs_without_dummy], axis=1)

In [24]:
scaled_inputs = scaled_inputs[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day', 'Transportation Expense', 'Distance to Work','Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets']]

In [25]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
5,0,0,0,1,0.182726,1.344231,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
6,0,0,1,0,0.182726,1.344231,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,-0.01928,2.843016
7,0,0,0,1,0.182726,1.344231,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,2.679969,-0.58969
8,0,0,1,0,0.182726,-1.359682,-1.016322,-1.209478,-0.379188,-0.806331,-0.40858,0,0.880469,-0.58969
9,0,0,1,0,0.182726,-1.359682,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,-0.01928,0.268487


In [26]:
scaled_inputs.shape

(700, 14)

In [27]:
x_train,x_test,y_train,y_test = train_test_split(scaled_inputs,targets, train_size = 0.8, random_state = 20)

In [28]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(560, 14) (560,)
(140, 14) (140,)


In [29]:
reg = LogisticRegression()

In [30]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
reg.score(x_train,y_train)

0.7982142857142858

In [32]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [33]:
np.sum(model_outputs == y_train)/model_outputs.shape[0]

0.7982142857142858

In [34]:
reg.intercept_

array([-1.58586855])

In [35]:
reg.coef_

array([[ 2.73828768,  0.93253671,  3.39124232,  0.58935717,  0.21615413,
        -0.07184056,  0.55489307, -0.00903914, -0.15630211, -0.01756899,
         0.24545234, -0.3747649 ,  0.31913466, -0.43095187]])

In [36]:
feature_name = scaled_inputs.columns.values

In [37]:
summary_table = pd.DataFrame(columns=['Feature_Name'], data = feature_name)

In [38]:
summary_table['Coefficient'] = np.transpose(reg.coef_)

In [39]:
summary_table

Unnamed: 0,Feature_Name,Coefficient
0,Reason_1,2.738288
1,Reason_2,0.932537
2,Reason_3,3.391242
3,Reason_4,0.589357
4,Month_Value,0.216154
5,Day,-0.071841
6,Transportation Expense,0.554893
7,Distance to Work,-0.009039
8,Age,-0.156302
9,Daily Work Load Average,-0.017569


In [40]:
summary_table.index +=1

In [41]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

In [42]:
summary_table.sort_index()

Unnamed: 0,Feature_Name,Coefficient
0,Intercept,-1.585869
1,Reason_1,2.738288
2,Reason_2,0.932537
3,Reason_3,3.391242
4,Reason_4,0.589357
5,Month_Value,0.216154
6,Day,-0.071841
7,Transportation Expense,0.554893
8,Distance to Work,-0.009039
9,Age,-0.156302


In [43]:
summary_table['Odds_Ratio'] = np.exp(summary_table.Coefficient)

In [44]:
summary_table

Unnamed: 0,Feature_Name,Coefficient,Odds_Ratio
1,Reason_1,2.738288,15.460489
2,Reason_2,0.932537,2.540947
3,Reason_3,3.391242,29.70283
4,Reason_4,0.589357,1.802829
5,Month_Value,0.216154,1.241294
6,Day,-0.071841,0.930679
7,Transportation Expense,0.554893,1.741755
8,Distance to Work,-0.009039,0.991002
9,Age,-0.156302,0.855301
10,Daily Work Load Average,-0.017569,0.982584


In [45]:
summary_table.sort_values('Odds_Ratio', ascending=False)

Unnamed: 0,Feature_Name,Coefficient,Odds_Ratio
3,Reason_3,3.391242,29.70283
1,Reason_1,2.738288,15.460489
2,Reason_2,0.932537,2.540947
4,Reason_4,0.589357,1.802829
7,Transportation Expense,0.554893,1.741755
13,Children,0.319135,1.375937
11,Body Mass Index,0.245452,1.278199
5,Month_Value,0.216154,1.241294
8,Distance to Work,-0.009039,0.991002
10,Daily Work Load Average,-0.017569,0.982584


In [46]:
reg.score(x_test,y_test)

0.7714285714285715

In [47]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.73361481, 0.26638519],
       [0.66279883, 0.33720117],
       [0.44252351, 0.55747649],
       [0.81553797, 0.18446203],
       [0.07511479, 0.92488521],
       [0.29801321, 0.70198679],
       [0.34111472, 0.65888528],
       [0.10920224, 0.89079776],
       [0.85938578, 0.14061422],
       [0.75951965, 0.24048035],
       [0.10343977, 0.89656023],
       [0.02058581, 0.97941419],
       [0.06031071, 0.93968929],
       [0.19077902, 0.80922098],
       [0.23995205, 0.76004795],
       [0.6507594 , 0.3492406 ],
       [0.71992404, 0.28007596],
       [0.15244173, 0.84755827],
       [0.3927871 , 0.6072129 ],
       [0.03702562, 0.96297438],
       [0.75832279, 0.24167721],
       [0.8081202 , 0.1918798 ],
       [0.33427697, 0.66572303],
       [0.35622688, 0.64377312],
       [0.23072885, 0.76927115],
       [0.82228704, 0.17771296],
       [0.54572455, 0.45427545],
       [0.87970087, 0.12029913],
       [0.12624977, 0.87375023],
       [0.80047712, 0.19952288],
       [0.

In [48]:
probability_excessive_absenteeism = predicted_proba[:,1]
probability_excessive_absenteeism

array([0.26638519, 0.33720117, 0.55747649, 0.18446203, 0.92488521,
       0.70198679, 0.65888528, 0.89079776, 0.14061422, 0.24048035,
       0.89656023, 0.97941419, 0.93968929, 0.80922098, 0.76004795,
       0.3492406 , 0.28007596, 0.84755827, 0.6072129 , 0.96297438,
       0.24167721, 0.1918798 , 0.66572303, 0.64377312, 0.76927115,
       0.17771296, 0.45427545, 0.12029913, 0.87375023, 0.19952288,
       0.88092398, 0.6327564 , 0.7032473 , 0.90720985, 0.1918798 ,
       0.91819341, 0.1507151 , 0.80978614, 0.35369832, 0.54177019,
       0.20224176, 0.42346486, 0.15690178, 0.40395037, 0.78881252,
       0.69359877, 0.71269032, 0.27267455, 0.18928995, 0.16111483,
       0.61819751, 0.34216715, 0.68027456, 0.27963197, 0.84429757,
       0.41250043, 0.86474027, 0.22731989, 0.34705235, 0.35296253,
       0.70882795, 0.67437197, 0.30175064, 0.8268475 , 0.13861274,
       0.26357609, 0.11081235, 0.16128214, 0.80913199, 0.84474107,
       0.14857211, 0.29937419, 0.91787603, 0.37793582, 0.53875

In [49]:
with open('model','wb') as file:
    pickle.dump(reg,file)