In [1]:
import sys
sys.executable  # Display the path to the Python executable to confirm the notebook runs in the intended environment.

'C:\\Users\\Adespotos\\anaconda3\\envs\\Absenteeism\\python.exe'

# Import Libraries and Read Data

In [2]:
import numpy as np  # For numerical operations and arrays.	
import pandas as pd  # For data manipulation and analysis.	
import matplotlib.pyplot as plt  # For basic plotting.	
import seaborn as sns  # For enhanced plotting.	
from python_scripts import range_monthdays 

In [3]:
# Load the unseen observations from the CSV file into a DataFrame.
new_raw_data = pd.read_csv(filepath_or_buffer='unseen_original.csv')

# Keep the loaded frame untouched and continue with an independent copy named
# `raw_data`, the name the subsequent preprocessing cells expect.
raw_data = new_raw_data.copy(deep=True)

In [4]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# New Data Preprocessing

We'll omit the explanatory text and comments here, as they are exactly the same as in 'absenteeism_data_cleaning.ipynb'.

## New Data Date Column

In [5]:
# Checkpoint: work on an independent copy so `raw_data` stays unchanged.
df_date = raw_data.copy(deep=True)

In [6]:
def _weekday_code(day_index):
    """Map pandas' weekday index (0 = Monday ... 6 = Sunday) to 1-5 for Mon-Fri and 6 for Sat/Sun."""
    if day_index < 5:
        return day_index + 1
    return 6


# Parse the day/month/year strings into Timestamps.
df_date['Date'] = pd.to_datetime(df_date['Date'], format="%d/%m/%Y")

# Derive the calendar features from the parsed dates; the insertion order
# (Weekday, Month, Monthday) is kept because later cells rename columns by position.
date_parts = df_date['Date'].dt
df_date['Weekday'] = date_parts.weekday.apply(_weekday_code)
df_date['Month'] = date_parts.month
# `range_monthdays` comes from the project's `python_scripts` module and is applied
# to each day-of-month value (presumably bucketing it into a range - confirm there).
df_date['Monthday'] = date_parts.day.apply(range_monthdays)

In [7]:
# The raw date is no longer needed once its features have been extracted.
df_date = df_date.drop(columns=['Date'])

## New Data ID Column

In [8]:
# Checkpoint: independent copy of the date-processed frame.
df_id = df_date.copy(deep=True)

In [9]:
# Remove the 'ID' column from the working frame.
df_id = df_id.drop(columns=['ID'])

## New Data Reason for Absence Column

In [10]:
# Checkpoint: independent copy of the frame without the 'ID' column.
df_reason_for_absence = df_id.copy(deep=True)

In [11]:
# One 0/1 indicator column per reason code that occurs in the data.
reason_codes = df_reason_for_absence['Reason for Absence']
reason_for_absence = pd.get_dummies(reason_codes).astype(int)

# Sanity check: the row totals of the indicators; a single value of 1 means
# every row has exactly one reason flagged.
reason_for_absence.sum(axis='columns').unique()

array([1])

In [12]:
# The original reason column is replaced by the grouped indicators built below.
df_reason_for_absence = df_reason_for_absence.drop(columns=['Reason for Absence'])

In [13]:
# Collapse the per-reason indicators into four group indicators by summing the
# dummy columns whose reason code falls in each (inclusive) label range.
# `.loc` slices by column label, so codes absent from this file are simply skipped.
disease, pregnancy, other_factors, not_major_health_issues = (
    reason_for_absence.loc[:, first_code:last_code].sum(axis=1)
    for first_code, last_code in [(0, 14), (15, 17), (18, 21), (22, 28)]
)

In [14]:
# Append the four grouped reason indicators as new columns, in this fixed order
# (the next cell assigns their names by position).
grouped_reason_flags = [disease, pregnancy, other_factors, not_major_health_issues]
df_reason_for_absence = pd.concat(
    [df_reason_for_absence, *grouped_reason_flags], axis='columns')

In [15]:
corrected_column_names = [
    # Columns remaining from the raw file (presumably already in this order - confirm).
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    # Date-derived columns added earlier as Weekday, Month, Monthday.
    'Weekday Absence Occurred',
    'Month Absence Occurred',
    'Monthday Range Absence Occurred',
    # Grouped reason indicators appended by the concat
    # (disease, pregnancy, other_factors, not_major_health_issues).
    'Disease Absence',
    'Pregnancy Absence',
    'Other Factor Absence',
    'Not-Major Issue Absence',
]
# Names are assigned purely by position, so the list must match the current column order.
df_reason_for_absence.columns = corrected_column_names

In [16]:
# Remove the pregnancy indicator (presumably to mirror the training pipeline - confirm
# against 'absenteeism_data_cleaning.ipynb').
df_reason_for_absence = df_reason_for_absence.drop(columns=['Pregnancy Absence'])

## New Data Education Column

In [17]:
# Checkpoint: independent copy of the reason-processed frame.
df_education = df_reason_for_absence.copy(deep=True)

In [18]:
# Binarise education: code 1 becomes 0, every other value becomes 1.
is_not_level_one = df_education['Education'] != 1
df_education['Education'] = is_not_level_one.astype('int64')

## New Data Children Column

In [19]:
# Checkpoint: independent copy of the education-processed frame.
df_children = df_education.copy(deep=True)

In [20]:
# Cap the child count at 3, so the categories are 0, 1, 2 and "more than 2".
df_children['Children'] = df_children['Children'].apply(lambda x: x if x <= 2 else 3)

# One-hot encode against the FIXED category set rather than whatever values happen
# to occur in this file. With `drop_first=True` the result depends on the data:
# if no row has e.g. 3+ children the column `3` is missing (KeyError in the next
# cell), and if no row has 0 children the column `1` is dropped instead of the
# baseline. Reindexing to [1, 2, 3] always drops the 0 baseline and adds an
# all-zero column for any category that does not occur.
children_dummies = (
    pd.get_dummies(df_children['Children'])
    .astype(int)
    .reindex(columns=[1, 2, 3], fill_value=0)
)

In [21]:
# Give the numeric dummy columns descriptive names. `errors='raise'` keeps the
# KeyError if one of the expected columns (1, 2, 3) is missing.
children_labels = {
    1: 'Has 1 Child',
    2: 'Has 2 Children',
    3: 'Has More than 2 Children',
}
children_dummies = children_dummies.rename(columns=children_labels, errors='raise')

In [22]:
# The original count column is replaced by the dummy columns built above.
df_children = df_children.drop(columns=['Children'])

## New Data Pet Column

In [23]:
# Checkpoint: independent copy of the children-processed frame.
df_pet = df_children.copy(deep=True)

In [24]:
# Cap the pet count at 3, so the categories are 0, 1, 2 and "more than 2".
df_pet['Pets'] = df_pet['Pets'].apply(lambda x: x if x <= 2 else 3)

# One-hot encode against the FIXED category set rather than whatever values happen
# to occur in this file. With `drop_first=True` the result depends on the data:
# a category absent from the file yields a missing column (KeyError in the next
# cell), and if no row has 0 pets the column `1` is dropped instead of the
# baseline. Reindexing to [1, 2, 3] always drops the 0 baseline and adds an
# all-zero column for any category that does not occur.
pet_dummies = (
    pd.get_dummies(df_pet['Pets'])
    .astype(int)
    .reindex(columns=[1, 2, 3], fill_value=0)
)

In [25]:
# Give the numeric dummy columns descriptive names. `errors='raise'` keeps the
# KeyError if one of the expected columns (1, 2, 3) is missing.
pet_labels = {
    1: 'Has 1 Pet',
    2: 'Has 2 Pets',
    3: 'Has More than 2 Pets',
}
pet_dummies = pet_dummies.rename(columns=pet_labels, errors='raise')

In [26]:
# The original count column is replaced by the dummy columns built above.
df_pet = df_pet.drop(columns=['Pets'])

# New Data Final DataFrame

In [27]:
# Assemble the final frame: the processed columns followed by the children and
# pet dummy columns, joined side by side on the row index.
final_parts = [df_pet, children_dummies, pet_dummies]
df_final = pd.concat(final_parts, axis='columns')

In [28]:
# Target column order for the cleaned data.
reordered_cols_list = [
    'Month Absence Occurred',
    'Monthday Range Absence Occurred',
    'Weekday Absence Occurred',
    'Disease Absence',
    'Other Factor Absence',
    'Not-Major Issue Absence',
    'Education',
    'Has 1 Child',
    'Has 2 Children',
    'Has More than 2 Children',
    'Has 1 Pet',
    'Has 2 Pets',
    'Has More than 2 Pets',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
]
# Selecting with the list reorders the columns; a missing name raises a KeyError.
df_final = df_final[reordered_cols_list]

In [29]:
# Drop the features that were removed by backward elimination in
# 'absenteeism_model.ipynb', so the cleaned data matches that feature set.
eliminated_features = [
    'Month Absence Occurred',
    'Monthday Range Absence Occurred',
    'Has 1 Child',
    'Has 1 Pet',
    'Daily Work Load Average',
    'Distance to Work',
]
df_final = df_final.drop(columns=eliminated_features)

# Export New Data Cleaned DataFrame

In [30]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df_final.to_csv(path_or_buf='unseen_cleaned.csv', index=False)