In [None]:
import config

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fancyimpute import KNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA


In [None]:
# Load the raw absenteeism records from the location configured in config.PATH.
absenteeism = pd.read_csv(config.PATH)

# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a rich table instead of print()'s plain text.
absenteeism.head()


In [None]:
# Summary statistics of the columns, as reported by DataFrame.describe().
absenteeism.describe()

In [None]:

# Print the column names, non-null counts and dtypes of the frame.
absenteeism.info()
# Observation: 'Age' and 'Work load Average/day ' are read as object and need to be converted to a numeric dtype.

In [None]:
# Show the dtype of every column.
absenteeism.dtypes

In [None]:
# Count the missing entries in each column.
absenteeism.isna().sum()

# Observation: 'Weight' and 'Hit target' contain missing values.

In [None]:
# Discard every row that contains at least one missing value.
absenteeism = absenteeism.dropna(axis="index", how="any")

# Re-count the missing entries per column to confirm none remain.
absenteeism.isna().sum()

In [None]:
# Clean the 'Age' column.
#
# One row carries the non-numeric placeholder 'R' in 'Age'. Replace it with 0
# in that column only: indexing .loc with just the row mask (no column label)
# would overwrite *every* column of the matching row with 0.
absenteeism.loc[absenteeism['Age'] == 'R', 'Age'] = 0

# Convert the column to a numeric dtype now that every entry is parseable.
absenteeism['Age'] = pd.to_numeric(absenteeism['Age'])



In [None]:
# 'Work load Average/day ' is read as text because its values contain commas.
# Strip the commas from this column only (a frame-wide regex replace would also
# rewrite any other text column) and convert it to numeric.
workload_col = 'Work load Average/day '
absenteeism[workload_col] = pd.to_numeric(
    # astype(str) keeps .str usable even if some entries are already numbers.
    absenteeism[workload_col].astype(str).str.replace(',', '', regex=False)
    # Default errors='raise': an unparseable value fails loudly here instead of
    # silently leaving the column as object, as errors='ignore' would.
)

# Rename the column to drop the trailing blank and the '/' from its name.
absenteeism = absenteeism.rename(columns={workload_col: 'Work_load_Average'})



In [None]:
# Box plots of 'Age' and 'Work_load_Average' before outlier treatment.
# Each variable gets its own axes: consecutive sns.boxplot calls in one cell
# draw onto the same current axes and overlay boxes with different scales.
fig, axes = plt.subplots(nrows=2, figsize=(8, 4))
for ax, col in zip(axes, ['Age', 'Work_load_Average']):
    sns.boxplot(data=absenteeism[col], orient="h", palette="Set2", ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()



In [None]:
# Box plots of the body-measurement columns before outlier treatment.
# Each variable gets its own axes: consecutive sns.boxplot calls in one cell
# draw onto the same current axes and overlay boxes with different scales.
fig, axes = plt.subplots(nrows=3, figsize=(8, 6))
for ax, col in zip(axes, ['Body mass index', 'Weight', 'Height']):
    sns.boxplot(data=absenteeism[col], orient="h", palette="Set2", ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Outlier treatment
#
# For each listed column, cap the values at the 1.5 * IQR fences: anything
# below q1 - 1.5*IQR is raised to that lower bound and anything above
# q3 + 1.5*IQR is lowered to that upper bound.

col_wth_outlier = ["Age", "Work_load_Average", "Body mass index", "Weight", "Height"]

for col in col_wth_outlier:
    q1 = absenteeism[col].quantile(.25)
    q3 = absenteeism[col].quantile(.75)
    iqr = q3 - q1
    lb = q1 - 1.5 * iqr
    ub = q3 + 1.5 * iqr
    # Cast to float first so the column dtype is float whether or not any
    # value is actually capped (the bounds themselves are floats).
    absenteeism[col] = absenteeism[col].astype(float).clip(lower=lb, upper=ub)



In [None]:
# Horizontal box plot of 'Age', re-drawn after the outlier capping step.
sns.boxplot(data = absenteeism['Age'], orient="h", palette="Set2")     


In [None]:
# Horizontal box plot of 'Work_load_Average', re-drawn after the outlier capping step.
sns.boxplot(data = absenteeism['Work_load_Average'], orient="h", palette="Set2") 


In [None]:
# Horizontal box plot of 'Body mass index', re-drawn after the outlier capping step.
sns.boxplot(data = absenteeism['Body mass index'], orient="h", palette="Set2") 

In [None]:
# Box plots of 'Weight' and 'Height', re-drawn after the outlier capping step.
# Each variable gets its own axes: consecutive sns.boxplot calls in one cell
# draw onto the same current axes and overlay boxes with different scales.
fig, axes = plt.subplots(nrows=2, figsize=(8, 4))
for ax, col in zip(axes, ['Weight', 'Height']):
    sns.boxplot(data=absenteeism[col], orient="h", palette="Set2", ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Check the pairwise correlation between the variables using a heatmap.
# Only numeric columns are passed to corr(): from pandas 2.0 on,
# DataFrame.corr() raises if the frame still holds a non-numeric column.

plt.figure(figsize = (20,6))
sns.heatmap(absenteeism.select_dtypes(include='number').corr(), annot = True)
plt.show()

# The correlation plot shows that 'Body mass index' and 'Weight' are highly
# correlated (close to 1), so one of the two can be dropped from the dataset.
# 'Service time' and 'Age' are also noticeably correlated.

In [None]:
# Bar charts showing the number of records per level of selected categorical
# variables. sns.catplot replaces the deprecated sns.factorplot, and `height`
# replaces factorplot's old `size` argument.
sns.set_style("whitegrid")
for col in ['Reason for absence', 'Seasons', 'Education', 'Disciplinary failure']:
    sns.catplot(data=absenteeism, x=col, kind='count', height=4, aspect=2)
plt.show()

In [None]:
# Histogram of the 'Weight' column; the bin count is chosen automatically.
plt.hist(absenteeism['Weight'], bins='auto', label='Weight')
plt.xlabel('Weight')
plt.title("Weight Distribution")

In [None]:
# Histogram of the 'Age' column to inspect the distribution of a numerical
# variable; the bin count is chosen automatically.
plt.hist(absenteeism['Age'], bins='auto', label='Age')
plt.xlabel('Age')
plt.title("Age Distribution")

In [None]:
# Scatter plot of absence hours against age, with a random colour per point.

# Fixing random state for reproducibility
np.random.seed(19680801)
x = absenteeism['Age']
y = absenteeism['Absenteeism time in hours']
# One colour value per plotted point. N is derived from the data rather than
# hard-coded: plt.scatter rejects a colour array whose length differs from the
# number of points, and the row count depends on the earlier dropna step.
N = len(x)
colors = np.random.rand(N)
plt.scatter(x, y, c=colors, alpha=0.2)
plt.xlabel("Age")
plt.ylabel("Absenteeism time in hours")
plt.title("Distribution of Absenteeism time by Age")
plt.show()

In [None]:
# Scatter plot of absence hours against the reason-for-absence code.
x, y = absenteeism['Reason for absence'], absenteeism['Absenteeism time in hours']
plt.scatter(x, y, alpha=0.5)
plt.title("Distribution of Absenteeism time by Reason of Absence")
plt.xlabel("Reason for Absence")
plt.ylabel("Absenteeism time in hours")
plt.show()

In [None]:
# Scatter plot of absence hours against day of the week, with a random colour
# per point.

# Fixing random state for reproducibility
np.random.seed(19680801)
x = absenteeism['Day of the week']
y = absenteeism['Absenteeism time in hours']
# One colour value per plotted point. N is derived from the data rather than
# hard-coded: plt.scatter rejects a colour array whose length differs from the
# number of points, and the row count depends on the earlier dropna step.
N = len(x)
colors = np.random.rand(N)
plt.scatter(x, y, c=colors, alpha=0.2)
plt.xlabel("Day of the week")
plt.ylabel("Absenteeism time in hours")
plt.title("Distribution of Absenteeism time by Day of the week")
plt.show()

In [None]:
# Blank out outliers in the continuous variables, then impute them with KNN.

# Columns treated as continuous; each one is screened for outliers below.
continuous_variables = [
    'Distance from Residence to Work',
    'Service time',
    'Age',
    'Work_load_Average',
    'Transportation expense',
    'Hit target',
    'Weight',
    'Height',
    'Body mass index',
    'Absenteeism time in hours',
]

# Columns treated as categorical (one-hot encoded later in the notebook).
categorical_variables = [
    'ID',
    'Reason for absence',
    'Month of absence',
    'Day of the week',
    'Seasons',
    'Disciplinary failure',
    'Education',
    'Social drinker',
    'Social smoker',
    'Pet',
    'Son',
]

for column in continuous_variables:
    # 75th and 25th percentiles of the current column
    upper_quartile, lower_quartile = np.percentile(absenteeism[column], [75, 25])

    # Interquartile range and the 1.5 * IQR fences around the quartiles
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - (spread * 1.5)
    upper_fence = upper_quartile + (spread * 1.5)

    # Replace every value outside the fences with NaN
    values = absenteeism[column]
    outside_fences = (values < lower_fence) | (values > upper_fence)
    absenteeism.loc[outside_fences, column] = np.nan


# Impute the blanked-out entries with fancyimpute's KNN (k = 3) and rebuild the
# frame with the original column names.
imputed_values = KNN(k=3).fit_transform(absenteeism)
absenteeism = pd.DataFrame(imputed_values, columns=absenteeism.columns)

# Count the missing values per column after imputation
absenteeism.isna().sum()

In [None]:
# Box plots of the target and the body-measurement columns, drawn side by side
# on one 8x8 inch figure to check for remaining outliers.
sns.boxplot(data=absenteeism.loc[:, ['Absenteeism time in hours', 'Body mass index', 'Height', 'Weight']])
fig = plt.gcf()
fig.set_size_inches(8, 8)

In [None]:
# Box plots of four more continuous columns, drawn side by side on one 8x8 inch
# figure to check for remaining outliers.
sns.boxplot(data=absenteeism.loc[:, ['Hit target', 'Service time', 'Age', 'Transportation expense']])
fig = plt.gcf()
fig.set_size_inches(8, 8)

## Feature Selection

In [None]:
# Keep only the continuous variables; this subset feeds the correlation matrix.
absenteeism_corr = absenteeism[continuous_variables]

In [None]:
# Check for multicollinearity among the continuous variables using a
# correlation heatmap.

# Set the width and height of the plot
f, ax = plt.subplots(figsize=(10, 10))

# Generate the correlation matrix
corr = absenteeism_corr.corr()

# Plot using seaborn. The mask is all False, so every cell is shown; the
# builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 50, as_cmap=True),
            square=True, ax=ax, annot = True)
plt.show()

In [None]:
# Variable reduction: remove the 'Weight' column from the frame.
to_drop = ['Weight']
absenteeism = absenteeism.drop(columns=to_drop)

In [None]:
# Keep the list of continuous variables in sync with the dropped column.
# Guarded so that re-running this cell does not raise ValueError once 'Weight'
# has already been removed.
if 'Weight' in continuous_variables:
    continuous_variables.remove('Weight')

In [None]:
# Keep an independent copy of the cleaned data (nothing is exported to a file in this cell).
clean_data = absenteeism.copy()

In [None]:
# Display the current list of continuous variables.
continuous_variables

## Feature Scaling

In [None]:
# Normality check: plot the distribution (density histogram plus KDE curve) of
# every continuous predictor. The target column is skipped.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
for i in continuous_variables:
    if i == 'Absenteeism time in hours':
        continue
    sns.histplot(absenteeism[i], bins='auto', kde=True, stat='density')
    plt.title("Checking Distribution for Variable "+str(i))
    plt.ylabel("Density")
    plt.show()

In [None]:
# Min-max normalization of the continuous predictors to the [0, 1] range.
# The target column is left on its original scale.
for i in continuous_variables:
    if i == 'Absenteeism time in hours':
        continue
    col_min = absenteeism[i].min()
    col_range = absenteeism[i].max() - col_min
    if col_range == 0:
        # A constant column would give 0/0 = NaN for every row; map it to 0.
        absenteeism[i] = 0.0
    else:
        absenteeism[i] = (absenteeism[i] - col_min) / col_range

## Machine Learning Models

In [None]:
# One-hot encode the categorical variables: each listed column is replaced by
# one indicator column per distinct value.
absenteeism = pd.get_dummies(absenteeism, columns=categorical_variables)

# Snapshot of the encoded frame
absenteeism_copy = absenteeism.copy()

In [None]:
# Number of rows and columns of the frame, as a (rows, columns) tuple.
absenteeism.shape

In [None]:
# Display the first row to inspect the columns produced by the encoding step.
absenteeism.head(1)

In [None]:
# Split the data into train (80%) and test (20%) sets.
# Predictors and target are both selected by column name; picking the target by
# a fixed position (iloc[:, 8]) would silently select the wrong column as soon
# as the column order changes.
target_column = 'Absenteeism time in hours'

X_train, X_test, y_train, y_test = train_test_split(
    absenteeism.drop(columns=target_column),
    absenteeism[target_column], test_size = 0.20, random_state = 1
)

## Decision Tree

In [None]:
# Fit a decision tree regressor on the training split
decision_tree_model = DecisionTreeRegressor(random_state=1)
decision_tree_model.fit(X_train, y_train)

# Predict the target for the test split
decision_tree_predictions = decision_tree_model.predict(X_test)

# Put actual and predicted values side by side and preview them
absenteeism_decision_tree = pd.DataFrame(
    {'actual': y_test, 'pred': decision_tree_predictions}
)
print(absenteeism_decision_tree.head())

# Helper used by every model cell to report the root mean squared error
def RMSE(y_actual, y_predicted):
    """Return the square root of the mean squared error.

    Parameters
    ----------
    y_actual : array-like
        Observed target values.
    y_predicted : array-like
        Predicted target values, aligned with ``y_actual``.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(y_actual, y_predicted))``.
    """
    mse = mean_squared_error(y_actual, y_predicted)
    return np.sqrt(mse)

# Report RMSE and R-squared for the decision tree on the test split.
# The label reads "R^2" (the stray '#' was a typo), matching the other model cells.
print("Root Mean Squared Error: {}".format(RMSE(y_test, decision_tree_predictions)))
print("R^2 Score(coefficient of determination): {}".format(r2_score(y_test, decision_tree_predictions)))

## Random Forest

In [None]:
# Build a random forest of 500 trees using RandomForestRegressor
random_forest_model = RandomForestRegressor(n_estimators = 500, random_state = 1).fit(X_train,y_train)

# Predict for the test cases
random_forest_model_predictions = random_forest_model.predict(X_test)

# Create a data frame of actual and predicted values and preview it
absenteeism_random_forest = pd.DataFrame({'actual': y_test, 'pred': random_forest_model_predictions})
print(absenteeism_random_forest.head())

# Report RMSE and R-squared on the test split.
# The label reads "R^2" (the stray '#' was a typo), matching the other model cells.
print("Root Mean Squared Error: {}".format(RMSE(y_test, random_forest_model_predictions)))
print("R^2 Score(coefficient of determination): {}".format(r2_score(y_test, random_forest_model_predictions)))

## Linear Regression

In [None]:
# Fit an ordinary least squares model on the training split
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Predict the target for the test split
linear_regression_predictions = linear_regression_model.predict(X_test)

# Put actual and predicted values side by side and preview them
absenteeism_linear_regression = pd.DataFrame(
    {'actual': y_test, 'pred': linear_regression_predictions}
)
print(absenteeism_linear_regression.head())

# Report RMSE and R-squared on the test split
print("Root Mean Squared Error: {}".format(RMSE(y_test, linear_regression_predictions)))
print("R^2 Score(coefficient of determination): {}".format(r2_score(y_test, linear_regression_predictions)))

## Dimension Reduction using PCA

In [None]:
# Extract the target variable (absence hours) as a Series.
target = absenteeism['Absenteeism time in hours']

In [None]:
# Number of rows and columns of the data, as a (rows, columns) tuple.
absenteeism.shape

In [None]:
# Build the PCA input from the predictors only. The target column must be
# excluded: leaving it in would leak the value being predicted into the
# components (and into any model trained on X).
X = absenteeism.drop(columns='Absenteeism time in hours').values.astype(float)

# Fit a PCA that keeps every available component; the count is derived from the
# data shape instead of being hard-coded.
pca = PCA(n_components=min(X.shape))
pca.fit(X)

# Proportion of variance explained by each component
explained_variance_ratio = pca.explained_variance_ratio_

# Cumulative explained variance, in percent, for the scree plot
cum_sum = np.cumsum(np.round(explained_variance_ratio, decimals=4)*100)

# Draw the plot
plt.plot(cum_sum)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance (%)")
plt.show()

In [None]:
# Reduce the predictors with PCA and produce the train/test sets used by the
# model cells below.
#
# Fitting a PCA alone does not change the data: the projection has to be
# applied with transform(), otherwise the models are trained on the raw
# columns. The predictors are rebuilt here without the target column so the
# target cannot leak into the features.
features = absenteeism.drop(columns='Absenteeism time in hours').values.astype(float)

# Splitting data into train and test data
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1
)

# Keep as many components as are needed to explain 95% of the variance (a
# fractional n_components requests exactly that). The PCA is fitted on the
# training split only and then applied to both splits, so no information from
# the test rows enters the projection.
pca = PCA(n_components=0.95, svd_solver='full')
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

## Decision Tree

In [None]:
# Fit a decision tree regressor on the current training split
decision_tree_model = DecisionTreeRegressor(random_state=1)
decision_tree_model.fit(X_train, y_train)

# Predict the target for the test split
decision_tree_predictions = decision_tree_model.predict(X_test)

# Put actual and predicted values side by side and preview them
absenteeism_decision_tree = pd.DataFrame(
    {'actual': y_test, 'pred': decision_tree_predictions}
)
print(absenteeism_decision_tree.head())

# Report RMSE and R-squared on the test split
print("Root Mean Squared Error: {}".format(RMSE(y_test, decision_tree_predictions)))
print("R^2 Score(coefficient of determination): {}".format(r2_score(y_test, decision_tree_predictions)))

## Random Forest

In [None]:
# Fit a random forest of 500 trees on the current training split
random_forest_model = RandomForestRegressor(n_estimators=500, random_state=1)
random_forest_model.fit(X_train, y_train)

# Predict the target for the test split
random_forest_predictions = random_forest_model.predict(X_test)

# Put actual and predicted values side by side and preview them
absenteeism_random_forest_predictions = pd.DataFrame(
    {'actual': y_test, 'pred': random_forest_predictions}
)
print(absenteeism_random_forest_predictions.head())

# Report RMSE and R-squared on the test split
print("Root Mean Squared Error: {}".format(RMSE(y_test, random_forest_predictions)))
print("R^2 Score(coefficient of determination): {}".format(r2_score(y_test, random_forest_predictions)))

## Linear Regression

In [None]:
# Fit an ordinary least squares model on the current training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict the target for the test split
lr_predictions = lr_model.predict(X_test)

# Put actual and predicted values side by side and preview them
df_lr = pd.DataFrame({'actual': y_test, 'pred': lr_predictions})
print(df_lr.head())

# Report RMSE and R-squared on the test split
print("Root Mean Squared Error: ", RMSE(y_test, lr_predictions), sep="")
print("R^2 Score(coefficient of determination) = ", r2_score(y_test, lr_predictions), sep="")