## IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np

## LOAD DATA

In [None]:
# Read the preprocessed absenteeism dataset from a CSV file; the relative
# path means the file is looked up in the current working directory.
data_preprocessed = pd.read_csv('AbsenteeismData_Preprocessed.csv')

In [None]:
# Preview the first rows of the loaded frame.
data_preprocessed.head()

## CREATE TARGETS

In [None]:
# Median of the absenteeism hours; it is used below as the cut-off between
# the two classes.
data_preprocessed['Absenteeism Time in Hours'].median()

In [None]:
# Binary targets: 1 where the hours are strictly above the median, 0 otherwise
# (rows exactly at the median are labelled 0).
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [None]:
targets

In [None]:
# Store the targets in the frame under the 'Excessive Absenteeism' column.
data_preprocessed['Excessive Absenteeism'] = targets

In [None]:
data_preprocessed.head()

In [None]:
targets.sum() / targets.shape[0] # fraction of 1s (a value between 0 and 1, not a percentage)

In [None]:
# Drop the raw hours (now encoded by the target) together with three features
# that are left out of the model. drop() is called without inplace=True, so it
# returns a new frame and data_preprocessed itself is not modified.
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week',
                                            'Daily Work Load Average','Distance to Work'],axis=1)

In [None]:
# Identity check: evaluates to False because drop() returned a new object.
data_with_targets is data_preprocessed

In [None]:
data_with_targets.head()

## SELECT INPUTS FOR REGRESSION

In [None]:
# (rows, columns) of the frame that holds both inputs and target.
data_with_targets.shape

In [None]:
data_with_targets.iloc[:, 0:14] # all rows, columns 0-13; assumes the frame has 15 columns so these are all the inputs - confirm against .shape above

In [None]:
data_with_targets.iloc[:, :-1] # every column except the last one; equals the slice above only when the frame has 15 columns

In [None]:
# Inputs = everything except the last column.
# NOTE(review): presumably the last column is 'Excessive Absenteeism' (assigned
# to the frame earlier) - verify with .head() before relying on it.
unscaled_inputs = data_with_targets.iloc[:, :-1]

## STANDARDIZE DATA

In [None]:
from sklearn.preprocessing import StandardScaler
# A plain StandardScaler would also scale the dummy variables, so a custom
# scaler is written below; this name is rebound to a CustomScaler later on.
absenteeism_scaler = StandardScaler()

In [None]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest as-is.

    A ``StandardScaler`` is fitted on, and applied to, only the columns named
    in ``columns``. Every other column of the input frame is passed through
    unchanged, and the original column order and row index are preserved.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame that should be standardized.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument as an attribute of the same name:
        # BaseEstimator.get_params() (used by repr() and sklearn.base.clone)
        # reads them back with getattr and fails if one is missing.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current
        # scikit-learn releases, so they must not be passed positionally.
        self.scaler = StandardScaler(
            copy=copy, with_mean=with_mean, with_std=with_std
        )
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``.

        ``X`` must be a pandas DataFrame containing every label in
        ``self.columns``; ``y`` is only forwarded to the wrapped scaler.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: calling np.mean / np.var on a
        # DataFrame depends on the installed pandas version.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used. The result has the same columns (in the same order) and the
        same index as ``X``.
        """
        # Record the initial order of the columns so it can be restored.
        init_col_order = X.columns

        # Scale the selected features. Reusing X's index is essential: with a
        # fresh RangeIndex, the concat below would align rows by label and
        # produce NaN-filled rows whenever X is not indexed 0..n-1 (for
        # example after a shuffle, a filter or a train/test split).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling is passed through.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Put both parts side by side and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [None]:
unscaled_inputs.columns.values

In [None]:
# Columns that must not be standardized.
# NOTE(review): presumably these are the dummy / categorical columns - confirm
# against the preprocessing step that produced the CSV.
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [None]:
# Every input column that is not in the omit list gets scaled.
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [None]:
# Rebinds the name that previously held a plain StandardScaler.
absenteeism_scaler = CustomScaler(columns_to_scale)

In [None]:
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test rows contribute to the scaling statistics; consider
# fitting on the training split only.
absenteeism_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [None]:
scaled_inputs

In [None]:
scaled_inputs.shape

## SPLIT, TRAIN, TEST, SHUFFLE

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Demonstration call only: the result is displayed but not stored, and no
# random_state is given, so the split differs from run to run.
train_test_split(scaled_inputs, targets)

In [None]:
# The fixed random_state makes the shuffled split reproducible.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state=20) #train size is 80%

In [None]:
# Sanity check: inputs and targets of the training split have matching row counts.
print(x_train.shape, y_train.shape)

In [None]:
# Same check for the test split.
print(x_test.shape, y_test.shape)

## LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## TRAIN MODEL

In [None]:
# Logistic regression with scikit-learn's default hyperparameters.
reg = LogisticRegression()

In [None]:
# Fit on the training split only.
reg.fit(x_train, y_train)

In [None]:
# Mean accuracy on the training data (reproduced by hand in the next section).
reg.score(x_train, y_train)

### MANUAL ACCURACY CHECK

In [None]:
# Predicted class labels for the training inputs.
model_outputs = reg.predict(x_train)
model_outputs

In [None]:
y_train

In [None]:
# Element-wise comparison: True where the prediction matches the target.
model_outputs == y_train

In [None]:
np.sum(model_outputs==y_train) # number of correct predictions (count of True values)

In [None]:
# Total number of predictions.
model_outputs.shape[0]

In [None]:
np.sum(model_outputs==y_train) / model_outputs.shape[0] # accuracy = correct / total

## FIND INTERCEPTS AND COEFFICIENTS

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
unscaled_inputs.columns.values

In [None]:
# Feature names, taken from the input columns.
feature_name = unscaled_inputs.columns.values

In [None]:
# One row per input feature; reg.coef_ is transposed so the coefficients run
# down the rows next to the feature names.
summary_table = pd.DataFrame(columns=['Feature Name'], data=feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

In [None]:
summary_table.index = summary_table.index + 1 # shift every index by one to free up index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]] # add the intercept as the row with index 0
# Sort by index so the intercept row comes first.
summary_table = summary_table.sort_index()
summary_table

## INTERPRETING COEFFICIENTS

In [None]:
# Odds ratio = exp(coefficient).
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table # for a one-unit change in a (standardized) feature, the odds are multiplied by the odds ratio (1 = no change)

In [None]:
# Displayed in descending order of odds ratio; the sorted frame is not stored.
summary_table.sort_values('Odds_ratio', ascending=False) 
# Features with a coefficient around 0 and an odds ratio around 1 are not important.
# Daily Work Load Average is the least important feature; the others are Day of the Week
# and Distance to Work (these three are the columns dropped when data_with_targets is built).

## TESTING MODEL

In [None]:
# Mean accuracy on the held-out test split.
reg.score(x_test, y_test)

In [None]:
predicted_proba = reg.predict_proba(x_test) # predicted class probabilities, one row per test sample
predicted_proba

In [None]:
predicted_proba.shape

In [None]:
# Second column only.
# NOTE(review): with 0/1 targets this should be the probability of class 1
# (excessive absenteeism) - confirm via reg.classes_.
predicted_proba[:, 1]

## SAVE MODEL

In [None]:
import pickle

In [None]:
# Serialize the fitted classifier to a file named 'model' in the working directory.
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [None]:
# Serialize the fitted scaler alongside it. Pickle stores classes by reference,
# so code that loads this file needs the CustomScaler class to be importable.
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)