In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, PowerTransformer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

import scipy.stats as ss
from collections import Counter

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error,median_absolute_error

In [4]:
df_absen = pd.read_csv('Absenteeism.csv', sep = ';',)
df = df_absen.copy()
df.head()

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,2


We do Base Modeling by using all the features from the dataset just to make sure that te result is not too ugly.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Reason for absence               740 non-null    int64  
 1   Month of absence                 740 non-null    int64  
 2   Day of the week                  740 non-null    int64  
 3   Seasons                          740 non-null    int64  
 4   Transportation expense           740 non-null    int64  
 5   Distance from Residence to Work  740 non-null    int64  
 6   Service time                     740 non-null    int64  
 7   Age                              740 non-null    int64  
 8   Work load Average/day            740 non-null    float64
 9   Hit target                       740 non-null    int64  
 10  Disciplinary failure             740 non-null    int64  
 11  Education                        740 non-null    int64  
 12  Son                   

In [6]:
df.drop(columns = 'ID', inplace=True)

In [26]:
def evaluation_matrix(Model, X_train, X_test, y_train, y_test, name):
    y_pred_train = Model.predict(X_train)
    y_pred_test = Model.predict(X_test)
    
    r2_train = r2_score(y_train, y_pred_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mse_train = mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    
    r2_test = r2_score(y_test, y_pred_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    
    data = {
        f"Training {name}": [r2_train, mae_train, mse_train, rmse_train],
        f"Testing {name}": [r2_test, mae_test, mse_test, rmse_test]
    }
    
    df_eva = pd.DataFrame(data, index=['R2', 'MAE', 'MSE', 'RMSE'])
    return df_eva

### Start predicting by Splitting Data

In [8]:
X = df.drop(columns = 'Absenteeism time in hours')
y = df['Absenteeism time in hours']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

In [10]:
X_train

Unnamed: 0,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index
300,0,10,4,4,235,20,13,43,265.017,88,1,1,1,1,0,0,106,167,38
146,28,2,2,2,225,26,9,28,302.585,99,0,1,1,0,0,2,69,169,24
20,10,8,4,1,330,16,4,28,205.917,92,0,2,0,0,0,0,84,182,25
5,23,7,6,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31
642,23,3,5,2,248,25,14,47,222.196,99,0,1,2,0,0,1,86,165,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,11,11,4,4,118,10,10,37,284.031,97,0,1,0,0,0,0,83,172,28
406,0,3,5,3,246,25,16,41,244.387,98,1,1,0,1,0,0,67,170,23
268,8,8,3,1,361,52,3,28,265.615,94,0,1,1,1,0,4,80,172,27
344,28,12,6,4,260,50,11,36,236.629,93,0,1,4,1,0,0,65,168,23


In [11]:
X_test

Unnamed: 0,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index
634,10,3,2,2,179,51,18,38,222.196,99,0,1,0,1,0,0,89,170,31
42,23,9,3,1,179,51,18,38,241.476,92,0,1,0,1,0,0,89,170,31
533,28,11,4,4,118,10,10,37,268.519,93,0,1,0,0,0,0,83,172,28
734,13,7,2,1,369,17,12,31,264.604,93,0,1,3,1,0,0,70,169,25
44,23,9,4,1,155,12,14,34,241.476,92,0,1,2,1,0,0,95,196,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,1,7,2,1,260,50,11,36,239.554,97,0,1,4,1,0,0,65,168,23
265,23,8,3,1,235,11,14,37,265.615,94,0,3,1,0,0,1,88,172,29
273,0,9,5,1,235,20,13,43,294.217,81,1,1,1,1,0,0,106,167,38
561,14,12,5,4,361,52,3,28,280.549,98,0,1,1,1,0,4,80,172,27


In [14]:
LG_Base = LogisticRegression().fit(X_train, y_train)

In [15]:
SVM_Base = SVR().fit(X_train, y_train)

In [16]:
LR_Base = LinearRegression().fit(X_train, y_train)

In [17]:
LS_Base = Lasso().fit(X_train, y_train)

In [18]:
RD_Base = Ridge().fit(X_train, y_train)

In [19]:
KN_Base = KNeighborsRegressor().fit(X_train, y_train)

In [20]:
XG_Base = XGBRegressor().fit(X_train, y_train)

In [27]:
eva_lg_base = evaluation_matrix(LG_Base, X_train, X_test, y_train, y_test, "base lg")
eva_lg_base

Unnamed: 0,Training base lg,Testing base lg
R2,0.03291,0.064631
MAE,4.278716,4.695946
MSE,159.282095,213.790541
RMSE,12.620701,14.621578


In [28]:
eva_svm_base = evaluation_matrix(SVM_Base, X_train, X_test, y_train, y_test, "base svm")
eva_svm_base

Unnamed: 0,Training base svm,Testing base svm
R2,-0.060644,-0.01447
MAE,4.886234,5.334369
MSE,174.690609,231.870177
RMSE,13.217057,15.227284


In [29]:
eva_lr_base = evaluation_matrix(LR_Base, X_train, X_test, y_train, y_test, "base lr")
eva_lr_base

Unnamed: 0,Training base lr,Testing base lr
R2,0.169453,0.072011
MAE,5.741722,6.651987
MSE,136.79313,212.103741
RMSE,11.69586,14.563782


In [30]:
eva_ls_base = evaluation_matrix(LS_Base, X_train, X_test, y_train, y_test, "base ls")
eva_ls_base

Unnamed: 0,Training base ls,Testing base ls
R2,0.09078,-0.012006
MAE,5.712434,6.899649
MSE,149.750642,231.306975
RMSE,12.237264,15.20878


In [31]:
eva_rd_base = evaluation_matrix(RD_Base, X_train, X_test, y_train, y_test, "base rd")
eva_rd_base

Unnamed: 0,Training base rd,Testing base rd
R2,0.169319,0.069744
MAE,5.734652,6.650197
MSE,136.815095,212.621957
RMSE,11.696798,14.581562


In [32]:
eva_kn_base = evaluation_matrix(KN_Base, X_train, X_test, y_train, y_test, "base rd")
eva_kn_base

Unnamed: 0,Training base rd,Testing base rd
R2,0.349178,-0.190413
MAE,4.613851,7.274324
MSE,107.191959,272.084054
RMSE,10.353355,16.494971


In [33]:
eva_xg_base = evaluation_matrix(XG_Base, X_train, X_test, y_train, y_test, "base rd")
eva_xg_base

Unnamed: 0,Training base rd,Testing base rd
R2,0.948642,-0.425342
MAE,0.492411,6.827815
MSE,8.458725,325.780096
RMSE,2.908389,18.049379


------------------------------------------------------------------------------------------------------------------------------

# Base Modeling with Cleaned Data

In [37]:
df1 = pd.read_csv('absence_clean.csv')
df1.head()

Unnamed: 0.1,Unnamed: 0,Month of absence,Day of the week,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,...,Pet,Body mass index,Absenteeism time in hours,Reason for absence_0,Reason for absence_1,Reason for absence_2,Seasons_1,Seasons_2,Seasons_3,Seasons_4
0,0,7,3,289,36,13,33,239.554,97,0,...,1,30,4,0,0,1,1,0,0,0
1,1,7,3,118,13,18,50,239.554,97,1,...,0,31,0,1,0,0,1,0,0,0
2,2,7,4,179,51,18,38,239.554,97,0,...,0,31,2,0,0,1,1,0,0,0
3,3,7,5,279,5,14,39,239.554,97,0,...,0,24,4,0,1,0,1,0,0,0
4,4,7,5,289,36,13,33,239.554,97,0,...,1,30,2,0,0,1,1,0,0,0


In [38]:
X = df1.drop(columns = 'Absenteeism time in hours')
y = df1['Absenteeism time in hours']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

In [40]:
LG_Base = LogisticRegression().fit(X_train, y_train)

In [41]:
SVM_Base = SVR().fit(X_train, y_train)

In [42]:
LR_Base = LinearRegression().fit(X_train, y_train)

In [43]:
LS_Base = Lasso().fit(X_train, y_train)

In [44]:
RD_Base = Ridge().fit(X_train, y_train)

In [45]:
KN_Base = KNeighborsRegressor().fit(X_train, y_train)

In [46]:
XG_Base = XGBRegressor().fit(X_train, y_train)

In [47]:
eva_lg_base = evaluation_matrix(LG_Base, X_train, X_test, y_train, y_test, "base lg")
eva_lg_base

Unnamed: 0,Training base lg,Testing base lg
R2,-0.108172,-0.006377
MAE,5.413851,5.493243
MSE,182.518581,230.02027
RMSE,13.509944,15.166419


In [48]:
eva_svm_base = evaluation_matrix(SVM_Base, X_train, X_test, y_train, y_test, "base svm")
eva_svm_base

Unnamed: 0,Training base svm,Testing base svm
R2,-0.073144,-0.039225
MAE,4.989687,5.392375
MSE,176.749453,237.528212
RMSE,13.294715,15.41195


In [49]:
eva_lr_base = evaluation_matrix(LR_Base, X_train, X_test, y_train, y_test, "base lr")
eva_lr_base

Unnamed: 0,Training base lr,Testing base lr
R2,0.18376,0.077933
MAE,5.750247,6.896895
MSE,134.436672,210.750241
RMSE,11.594683,14.517239


In [50]:
eva_ls_base = evaluation_matrix(LS_Base, X_train, X_test, y_train, y_test, "base ls")
eva_ls_base

Unnamed: 0,Training base ls,Testing base ls
R2,0.131271,0.077097
MAE,5.454392,6.469503
MSE,143.081683,210.941421
RMSE,11.961676,14.523823


In [51]:
eva_rd_base = evaluation_matrix(RD_Base, X_train, X_test, y_train, y_test, "base rd")
eva_rd_base

Unnamed: 0,Training base rd,Testing base rd
R2,0.183707,0.081917
MAE,5.73844,6.861097
MSE,134.445348,209.839596
RMSE,11.595057,14.485841


In [52]:
eva_kn_base = evaluation_matrix(KN_Base, X_train, X_test, y_train, y_test, "base rd")
eva_kn_base

Unnamed: 0,Training base rd,Testing base rd
R2,0.266152,-0.041368
MAE,4.896622,6.45
MSE,120.866486,238.018108
RMSE,10.99393,15.427835


In [53]:
eva_xg_base = evaluation_matrix(XG_Base, X_train, X_test, y_train, y_test, "base rd")
eva_xg_base

Unnamed: 0,Training base rd,Testing base rd
R2,0.999195,-0.387315
MAE,0.246598,6.822421
MSE,0.132508,317.088683
RMSE,0.364016,17.806984


So from the result above, we will choose Linear Regression, Ridge and Lasso as our base model. Because by using that algo are quite balance the result that they given.