This notebook is the machine learning part of Udemy's
[Integrating Python, SQL, and Tableau course](https://www.udemy.com/course/python-sql-tableau-integrating-python-sql-and-tableau), which I am currently taking.

In this notebook I:

1. Defined excessive absenteeism using the median of the values in the "Absenteeism Time in Hours" column.

2. Defined the targets of the logistic regression, with the value 1 meaning excessive absence (more than the median of 3 hours) and 0 otherwise.

3. Selected its inputs.

4. Standardized the data by scaling those inputs.

5. Omitted the dummy variables from standardization in order to keep their interpretability.

6. Split the data 80-20 for the purposes of training and testing the model.

In my understanding, training does the actual machine learning, and testing is performed to see how accurate
the model would be on data it has never seen before. Splitting is necessary because accuracy measured only on the
training data can hide overfitting, in which case the model could fail badly on new data.

7. Created the coefficients and odds ratios and interpreted them in general terms.

8. Saved the model so that it could later be used as part of a module for predicting the probability of excessive absenteeism.



## Creating a logistic regression to predict absenteeism

## Import the relevant libraries

In [61]:
import pandas as pd
import numpy as np

## Load the data

In [62]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [63]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Creating the targets

In [64]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [65]:
import numpy as np

In [66]:
# Label each record 1 when its absence time is strictly above the median of the
# column, and 0 otherwise.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [67]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [68]:
data_preprocessed['Excessive Absenteeism'] = targets

In [69]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


Proportion of targets with the value 1, i.e. excessively absent people

In [70]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [71]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week', 'Distance to Work',
       'Daily Work Load Average'], axis=1)

In [72]:
data_with_targets is data_preprocessed

False

In [73]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [74]:
data_with_targets.to_csv('absenteeism_data_with_targets.csv', index=False)

In [75]:
import pandas as pd
data_with_targets = pd.read_csv('absenteeism_data_with_targets.csv')

## Selecting the inputs for the regression

In [76]:
data_with_targets.shape

(700, 12)

In [77]:
# iloc[:, :-1] keeps every row and every column except the last one (here the
# 'Excessive Absenteeism' target); a stop of -k would leave out the last k columns.
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [78]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

## Standardizing the data

In [79]:
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [80]:
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    The listed columns are scaled with a ``StandardScaler``; every other column
    (e.g. dummy variables) is passed through unchanged so that it stays
    interpretable. The output keeps the column order and the row index of the
    input frame.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns`` only.
    mean_, var_ : per-column mean and (population) variance of ``columns``,
        computed in ``fit``; ``None`` until then.
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the scaler on ``X[self.columns]`` and record per-column statistics.

        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        subset = X[self.columns]
        self.scaler.fit(subset)
        # axis=0 is spelled out on purpose: reducing a DataFrame with the
        # implicit axis=None is deprecated in pandas and is slated to collapse
        # to a single scalar instead of one value per column.
        self.mean_ = np.mean(subset, axis=0)
        self.var_ = np.var(subset, axis=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for interface compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index for the scaled frame. Without it the new frame gets a
        # fresh 0..n-1 index and the column-wise concat below aligns on labels,
        # which misplaces rows and introduces NaNs whenever X is not indexed
        # 0..n-1 (for example a shuffled train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [81]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [82]:
#columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       #'Children', 'Pets']
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [83]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [84]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [85]:
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [86]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [87]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [88]:
scaled_inputs.shape

(700, 11)

## Splitting the data into train & test sets and shuffling

## Importing the relevant module


In [89]:
from sklearn.model_selection import train_test_split

## Splitting

In [90]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 150         0         0         0         1    -1.244823   
 106         0         0         0         1     1.610276   
 350         0         0         0         1     1.610276   
 519         0         0         0         1     1.039256   
 516         0         0         0         1     1.039256   
 ..        ...       ...       ...       ...          ...   
 69          0         0         0         1     1.039256   
 57          0         0         0         1     0.753746   
 688         0         0         0         0    -0.388293   
 241         0         1         1         0     0.182726   
 612         0         0         0         1    -1.244823   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 150                0.040034 -1.320435        -0.643782          0 -0.019280   
 106                0.040034 -1.320435        -0.643782          0 -0.019280   
 350                0.1909

In [91]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [92]:
print (x_train.shape, y_train.shape)

(560, 11) (560,)


In [93]:
print (x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic regression with sklearn

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the model

In [95]:
reg = LogisticRegression()

In [96]:
# This is the actual machine learning step: the regression object created above
# is fitted to the training inputs and their targets.
reg.fit(x_train, y_train)

In [97]:
reg.score(x_train, y_train)

0.7767857142857143

## Manually checking the accuracy

In [98]:
model_outputs = reg.predict(x_train)

In [99]:
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [100]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [101]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [102]:
np.sum((model_outputs==y_train))

435

In [103]:
model_outputs.shape[0]

560

In [104]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7767857142857143

## Finding the intercept and coefficients

In [105]:
reg.intercept_

array([-1.65498737])

In [106]:
reg.coef_

array([[ 2.81535397,  0.84606096,  2.89863212,  0.84904947,  0.15635225,
         0.61236826, -0.18276892,  0.2844636 , -0.23950897,  0.34860836,
        -0.29378892]])

In [107]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [108]:
feature_name = unscaled_inputs.columns.values

In [109]:
# Pair every input feature name with the coefficient the regression fitted for it.
coefficient_column = np.transpose(reg.coef_)
summary_table = pd.DataFrame({'Feature name': feature_name})
summary_table['Coefficient'] = coefficient_column
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.815354
1,Reason_2,0.846061
2,Reason_3,2.898632
3,Reason_4,0.849049
4,Month Value,0.156352
5,Transportation Expense,0.612368
6,Age,-0.182769
7,Body Mass Index,0.284464
8,Education,-0.239509
9,Children,0.348608


In [110]:
# Shift the existing rows from 0..n-1 to 1..n so that index 0 becomes free.
summary_table.index = summary_table.index +1 
# Add the intercept as a new row at the freed index 0 (it is appended at the bottom).
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# Sorting by index moves the intercept row to the top of the table.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.654987
1,Reason_1,2.815354
2,Reason_2,0.846061
3,Reason_3,2.898632
4,Reason_4,0.849049
5,Month Value,0.156352
6,Transportation Expense,0.612368
7,Age,-0.182769
8,Body Mass Index,0.284464
9,Education,-0.239509


## Interpreting the coefficients

In [111]:
import numpy as np
summary_table['odds_ratio'] = np.exp(summary_table.Coefficient)

In [112]:
summary_table

Unnamed: 0,Feature name,Coefficient,odds_ratio
0,Intercept,-1.654987,0.191094
1,Reason_1,2.815354,16.699086
2,Reason_2,0.846061,2.330449
3,Reason_3,2.898632,18.149302
4,Reason_4,0.849049,2.337424
5,Month Value,0.156352,1.169238
6,Transportation Expense,0.612368,1.844795
7,Age,-0.182769,0.832961
8,Body Mass Index,0.284464,1.329049
9,Education,-0.239509,0.787014


In my understanding, a coefficient describes a feature's positive or negative influence on the log-odds of excessive
absenteeism; the farther it is from zero, the stronger the influence.

In my understanding, an odds ratio is the factor by which the odds of excessive absenteeism are multiplied when a
feature increases by one unit (for the standardized features, one unit is one standard deviation). For the 4 groups of
reasons, it compares the odds of excessive absence for a person with that reason to the odds for a person without it.
An odds ratio close to 1 (a coefficient close to 0) means the feature has little influence.

In [113]:
summary_table.sort_values('odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficient,odds_ratio
3,Reason_3,2.898632,18.149302
1,Reason_1,2.815354,16.699086
4,Reason_4,0.849049,2.337424
2,Reason_2,0.846061,2.330449
6,Transportation Expense,0.612368,1.844795
10,Children,0.348608,1.417094
8,Body Mass Index,0.284464,1.329049
5,Month Value,0.156352,1.169238
7,Age,-0.182769,0.832961
9,Education,-0.239509,0.787014


## Testing the model

In [114]:
reg.score(x_test, y_test)

0.75

In [115]:
predicted_proba = reg.predict_proba(x_test)
# Each row holds two probabilities that sum to 1; the second column is read below
# as the probability of excessive absenteeism (class 1). If that probability is
# below 0.5 the logistic regression predicts 0, otherwise it predicts 1.
predicted_proba

array([[0.71232273, 0.28767727],
       [0.58860735, 0.41139265],
       [0.437957  , 0.562043  ],
       [0.77968175, 0.22031825],
       [0.08269235, 0.91730765],
       [0.33126001, 0.66873999],
       [0.30185953, 0.69814047],
       [0.16082456, 0.83917544],
       [0.78587101, 0.21412899],
       [0.74748664, 0.25251336],
       [0.49745668, 0.50254332],
       [0.21808096, 0.78191904],
       [0.08661585, 0.91338415],
       [0.73680393, 0.26319607],
       [0.20205009, 0.79794991],
       [0.54438121, 0.45561879],
       [0.55305631, 0.44694369],
       [0.54199759, 0.45800241],
       [0.41219207, 0.58780793],
       [0.06563828, 0.93436172],
       [0.70136638, 0.29863362],
       [0.77968175, 0.22031825],
       [0.41973614, 0.58026386],
       [0.41973614, 0.58026386],
       [0.2924967 , 0.7075033 ],
       [0.7459284 , 0.2540716 ],
       [0.50770855, 0.49229145],
       [0.85923909, 0.14076091],
       [0.24178999, 0.75821001],
       [0.77968175, 0.22031825],
       [0.

In [116]:
predicted_proba.shape

(140, 2)

In [117]:
predicted_proba[:,1]

array([0.28767727, 0.41139265, 0.562043  , 0.22031825, 0.91730765,
       0.66873999, 0.69814047, 0.83917544, 0.21412899, 0.25251336,
       0.50254332, 0.78191904, 0.91338415, 0.26319607, 0.79794991,
       0.45561879, 0.44694369, 0.45800241, 0.58780793, 0.93436172,
       0.29863362, 0.22031825, 0.58026386, 0.58026386, 0.7075033 ,
       0.2540716 , 0.49229145, 0.14076091, 0.75821001, 0.22031825,
       0.36893889, 0.68034781, 0.68821168, 0.52484293, 0.22031825,
       0.5398455 , 0.22173675, 0.84041679, 0.40191868, 0.60826174,
       0.21274585, 0.45021529, 0.23752556, 0.39914279, 0.82439221,
       0.71014221, 0.69450662, 0.28767727, 0.21995568, 0.20536517,
       0.57229979, 0.32859161, 0.66873999, 0.26931831, 0.83314649,
       0.43316956, 0.85716194, 0.23331778, 0.33428403, 0.34429053,
       0.69854054, 0.65877762, 0.29376038, 0.87572448, 0.20569933,
       0.2697341 , 0.08724093, 0.22173675, 0.68646796, 0.29925205,
       0.22173675, 0.28810733, 0.90554404, 0.45654085, 0.60529

## Saving the model

In [118]:
import pickle

In [119]:
 with open('model', 'wb') as file:
        pickle.dump(reg, file) 

In [120]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)