# ML Time

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

We can create two classes, one for people who have been moderaterly absent, and another for those that have been excessively absent.

In [3]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

Less or equal to 3 is moderately absent. Above 3 is excessively absent.

In [4]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(),
                   1,0)

In [5]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets

In [7]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [8]:
targets.sum()/targets.shape[0]

0.45571428571428574

Around 46% of the targets are 1s.
This is a fine balance for this type of work. Neural nets need a lot closer, but this is fine.

In [9]:
# initial run immediately below
# data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'],axis=1)

# run after full script to exclude low impact features
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week','Daily Work Load Average','Distance to Work'],axis=1)

In [10]:
data_with_targets is data_preprocessed

False

If the above is True, they are the same object, if it is False (which it is), they are different objects.

In [11]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## Select the inputs for the regression

In [12]:
data_with_targets.shape

(700, 12)

In [13]:
data_with_targets.iloc[:,0:14]

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [14]:
data_with_targets.iloc[:,:-1].head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


In [15]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardise the data

In [16]:
# from sklearn.preprocessing import StandardScaler

# absenteeism_scaler = StandardScaler()

The above code commented out in favour of a custom scaler which does not standardise all inputs, only the ones we choose. (We do not want to standardise dummy variables). In practice, you standardise before creating the dummy.

In [17]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin):
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
    
    def fit(self,X,y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self,X,y=None,copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled],axis=1)[init_col_order]


In [18]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [19]:
#columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',       'Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']

columns_to_omit = ['Reason_1','Reason_2','Reason_3','Reason_4','Education']

columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [20]:
absenteeism_scaler = CustomScaler(columns_to_scale)



In [21]:
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
                      'Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [22]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [23]:
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,-0.577350,-0.092981,-0.314485,0.821365,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [24]:
scaled_inputs.shape

(700, 11)

## Splitting the data for training

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
train_test_split(scaled_inputs,targets)

[     Reason 1  Reason 2  Reason 3  Reason 4  Month Value  \
 261  1.732051 -0.092981 -0.314485 -1.217485     0.468236   
 113 -0.577350 -0.092981 -0.314485  0.821365    -1.530333   
 349 -0.577350 -0.092981 -0.314485  0.821365     1.610276   
 590 -0.577350 -0.092981 -0.314485  0.821365    -1.244823   
 509 -0.577350 -0.092981 -0.314485  0.821365     1.039256   
 ..        ...       ...       ...       ...          ...   
 26  -0.577350 -0.092981 -0.314485  0.821365     0.468236   
 55  -0.577350 -0.092981 -0.314485 -1.217485     0.753746   
 628 -0.577350 -0.092981 -0.314485  0.821365    -0.959313   
 585  1.732051 -0.092981 -0.314485 -1.217485    -1.244823   
 73  -0.577350 -0.092981 -0.314485  0.821365     1.039256   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 261                0.688938 -0.536062        -0.408580          1 -0.919030   
 113                0.040034 -1.320435        -0.643782          0 -0.019280   
 349               -1.5746

The output of this are 4 arrays:
- a training dataset with inputs
- a training dataset with targets
- a test dataset with inputs
- a test dataset with targets

In [27]:
x_train,x_test,y_train,y_test = train_test_split(scaled_inputs,targets,train_size = 0.8, shuffle=True,random_state=20)
# shuffle defaults to True, but you might want to change it or add a constant random state to make it shuffle in the same way.

In [28]:
print(x_train.shape,y_train.shape)

(560, 11) (560,)


In [29]:
print(x_test.shape,y_test.shape)

(140, 11) (140,)


## Logistic regression with sklearn

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the model

In [31]:
reg = LogisticRegression()
reg.fit(x_train,y_train)

LogisticRegression()

In [32]:
reg.score(x_train,y_train)

0.7875

Based on the data we used, our model learned to classify 80% of records correctly.

## Manually check the accuracy

In [33]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [34]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [35]:
model_outputs == y_train
# compares the output of the two variables

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [36]:
np.sum((model_outputs == y_train))

441

In [37]:
model_outputs.shape[0]

560

In [38]:
np.sum((model_outputs == y_train))/model_outputs.shape[0]

0.7875

This is another way to get to the accuracy score we got above.

## Finding the intercept and coefficients

In [39]:
reg.intercept_

array([-0.17163628])

In [40]:
reg.coef_

array([[ 2.06866621,  0.33461133,  1.56034928,  1.31301781,  0.18521305,
         0.69049022, -0.19796143,  0.32782753, -0.31318895,  0.37213564,
        -0.32435396]])

In [41]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [42]:
feature_name = unscaled_inputs.columns.values

In [43]:
summary_table = pd.DataFrame(columns=['Feature name'],data= feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.068666
1,Reason 2,0.334611
2,Reason 3,1.560349
3,Reason 4,1.313018
4,Month Value,0.185213
5,Transportation Expense,0.69049
6,Age,-0.197961
7,Body Mass Index,0.327828
8,Education,-0.313189
9,Children,0.372136


In [44]:
summary_table.index = summary_table.index +1

In [45]:
summary_table

Unnamed: 0,Feature name,Coefficient
1,Reason 1,2.068666
2,Reason 2,0.334611
3,Reason 3,1.560349
4,Reason 4,1.313018
5,Month Value,0.185213
6,Transportation Expense,0.69049
7,Age,-0.197961
8,Body Mass Index,0.327828
9,Education,-0.313189
10,Children,0.372136


In [46]:
summary_table.loc[0] = ['Intercept',reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.171636
1,Reason 1,2.068666
2,Reason 2,0.334611
3,Reason 3,1.560349
4,Reason 4,1.313018
5,Month Value,0.185213
6,Transportation Expense,0.69049
7,Age,-0.197961
8,Body Mass Index,0.327828
9,Education,-0.313189


## Interpereting the coefficients

In [47]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [48]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.171636,0.842285
1,Reason 1,2.068666,7.91426
2,Reason 2,0.334611,1.397397
3,Reason 3,1.560349,4.760484
4,Reason 4,1.313018,3.717375
5,Month Value,0.185213,1.203475
6,Transportation Expense,0.69049,1.994693
7,Age,-0.197961,0.820401
8,Body Mass Index,0.327828,1.38795
9,Education,-0.313189,0.731112


In [49]:
summary_table.sort_values('Odds_ratio',ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason 1,2.068666,7.91426
3,Reason 3,1.560349,4.760484
4,Reason 4,1.313018,3.717375
6,Transportation Expense,0.69049,1.994693
10,Children,0.372136,1.45083
2,Reason 2,0.334611,1.397397
8,Body Mass Index,0.327828,1.38795
5,Month Value,0.185213,1.203475
0,Intercept,-0.171636,0.842285
7,Age,-0.197961,0.820401


You can consider dropping the features that dont seem to have much impact. I.e. have coefficients around 0 and odds ration around 1.

Standardisation is a choice as it increases model accuracy but makes the output less interperetable..

Machine learning engineers prefer models with higher accuracy, so they normally go for standardisation.

Econometricians and Statisticians prefer less accurate but more interperatable models because they care about the underlying reasons behind different phenomena.

Data Scientists may be in either position. Sometimes they need higher accuracy, in other times they must find the main drivers of a problem.

## Backward elimination

The idea that we can simplify our model by removing all features which have close to no contribution to the model.

When we have the p-values, we get rid of all coefficients with p-values less than 0.05. If the weight is small enough, it wont make a difference anyway.

If we remove the variables, the model shouldnt really change.

At this point the code above is changed to remove extra variables.

## Testing the model

In [50]:
reg.score(x_test,y_test)

0.7357142857142858

In [51]:
predicted_proba = reg.predict_proba(x_test)

In [52]:
predicted_proba

array([[0.70823956, 0.29176044],
       [0.57293437, 0.42706563],
       [0.39748224, 0.60251776],
       [0.78749486, 0.21250514],
       [0.06719693, 0.93280307],
       [0.31207365, 0.68792635],
       [0.28664053, 0.71335947],
       [0.08120082, 0.91879918],
       [0.80017608, 0.19982392],
       [0.74995457, 0.25004543],
       [0.46819712, 0.53180288],
       [0.18512014, 0.81487986],
       [0.04121567, 0.95878433],
       [0.75142701, 0.24857299],
       [0.23903853, 0.76096147],
       [0.53714554, 0.46285446],
       [0.53420084, 0.46579916],
       [0.52102189, 0.47897811],
       [0.40678406, 0.59321594],
       [0.02773414, 0.97226586],
       [0.70236673, 0.29763327],
       [0.78749486, 0.21250514],
       [0.40670612, 0.59329388],
       [0.40670612, 0.59329388],
       [0.17559234, 0.82440766],
       [0.75454373, 0.24545627],
       [0.48937739, 0.51062261],
       [0.87896092, 0.12103908],
       [0.13142638, 0.86857362],
       [0.78749486, 0.21250514],
       [0.

In [53]:
predicted_proba.shape

(140, 2)

Predicted proba assigns probability to the observation output being either a 0 or a 1. 

We are interested in the probability of getting 1, which is excessive absenteeism. 

We can just take the probabilities of an outcome getting 1.

In [54]:
predicted_proba[:,1]

array([0.29176044, 0.42706563, 0.60251776, 0.21250514, 0.93280307,
       0.68792635, 0.71335947, 0.91879918, 0.19982392, 0.25004543,
       0.53180288, 0.81487986, 0.95878433, 0.24857299, 0.76096147,
       0.46285446, 0.46579916, 0.47897811, 0.59321594, 0.97226586,
       0.29763327, 0.21250514, 0.59329388, 0.59329388, 0.82440766,
       0.24545627, 0.51062261, 0.12103908, 0.86857362, 0.21250514,
       0.37628326, 0.69395995, 0.71016857, 0.55802337, 0.21250514,
       0.56289228, 0.2084135 , 0.80857331, 0.41359763, 0.62953752,
       0.20379034, 0.4242538 , 0.22639945, 0.10462937, 0.85742011,
       0.6523714 , 0.70507958, 0.29176044, 0.21052579, 0.19534428,
       0.56865035, 0.07901966, 0.68792635, 0.2680619 , 0.8589383 ,
       0.45311982, 0.92991324, 0.21727659, 0.08573649, 0.08997337,
       0.70893813, 0.67646259, 0.28670381, 0.85947072, 0.18954376,
       0.27039693, 0.01391914, 0.2084135 , 0.81053586, 0.29013739,
       0.2084135 , 0.06812914, 0.92885154, 0.47883471, 0.63880

In reality, logistic regression models calculate these probabilities in the background. if the probability is below 0.5 it places a 0. if it is above 0.5 it places a 1.

## Save the model

In [55]:
import pickle

In [56]:
with open('model','wb') as file:
    pickle.dump(reg,file)

In [57]:
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler,file)