# In [4]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

# The custom scaler class
class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the rest as is.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. ``transform`` returns a DataFrame with the
    same column order as its input.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : numpy.ndarray or None
        Per-column mean and population variance (ddof=0) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keyword arguments: StandardScaler's parameters are keyword-only in
        # current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # BaseEstimator.get_params() reads every __init__ parameter back as
        # an attribute of the same name, so keep them on the instance.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis=0 keeps these per-column; np.mean/np.var on a
        # DataFrame may reduce over both axes depending on the pandas version.
        self.mean_ = np.asarray(selected.mean(axis=0))
        self.var_ = np.asarray(selected.var(axis=0, ddof=0))
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index so the scaled columns line up row-for-row with the
        # untouched ones in the concat below, whatever index X carries.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
    
# Create special class that we are going to use from here to predict new data

class absenteeism_model():
    """Load a pickled model and scaler, preprocess new data, and predict.

    Typical use: construct with the two pickle paths, call
    ``load_and_clean_data`` with a CSV, then call one of the ``predicted_*``
    methods. Those methods return ``None`` until data has been loaded.

    Attributes
    ----------
    reg : object
        Unpickled model; must provide ``predict`` and ``predict_proba``.
    scaler : object
        Unpickled scaler; must provide ``transform``.
    data : object or None
        Output of ``scaler.transform`` on the preprocessed frame.
    df_with_predictions : pandas.DataFrame
        Untouched copy of the CSV as read (set by ``load_and_clean_data``).
    preprocessed_data : pandas.DataFrame
        Cleaned, unscaled features (set by ``load_and_clean_data``).
    """

    # Column names assigned BY POSITION once 'ID' and 'Reason for Absence'
    # are removed and the four reason flags are appended.
    _RAW_COLUMN_NAMES = (
        'Date', 'Transportation Expense', 'Distance to Work', 'Age',
        'Daily Work Load Average', 'Body Mass Index', 'Education',
        'Children', 'Pets', 'Absenteeism Time in Hours',
        'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
    )

    # Final feature columns, in the order handed to the scaler and model.
    _FEATURE_COLUMNS = (
        'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
        'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
        'Children', 'Pets',
    )

    def __init__(self, model_file, scaler_file):
        """Unpickle the model from ``model_file`` and the scaler from ``scaler_file``.

        Both arguments are file paths opened in binary mode.
        """
        # SECURITY: pickle.load can execute arbitrary code while loading.
        # Only pass files that come from a trusted source.
        with open(model_file, 'rb') as model_handle, open(scaler_file, 'rb') as scaler_handle:
            self.reg = pickle.load(model_handle)
            self.scaler = pickle.load(scaler_handle)
        self.data = None

    def load_and_clean_data(self, data_file):
        """Read a CSV, rebuild the model's feature columns, and scale them.

        Sets ``df_with_predictions`` (raw copy), ``preprocessed_data``
        (cleaned, unscaled features) and ``data`` (scaled features).

        Parameters
        ----------
        data_file : str or file-like
            Anything ``pandas.read_csv`` accepts. Expected columns, in order:
            ID, Reason for Absence, Date (dd/mm/YYYY), Transportation Expense,
            Distance to Work, Age, Daily Work Load Average, Body Mass Index,
            Education, Children, Pets.

        Raises
        ------
        KeyError
            If 'ID' or 'Reason for Absence' is missing.
        ValueError
            If the column count does not match the expected layout, or a
            date does not match ``%d/%m/%Y``.
        """
        df = pd.read_csv(data_file, delimiter=',')

        # Keep the raw input around for later use.
        self.df_with_predictions = df.copy()

        df = df.drop(['ID'], axis=1)

        # Placeholder target column: keeps the layout identical to the one
        # used when the preprocessing was designed, so the positional rename
        # below lines up. It never reaches the final feature set.
        df['Absenteeism Time in Hours'] = 'NaN'

        # Collapse the reason code into four 0/1 group flags
        # (codes 1-14, 15-17, 18-21 and 22 or above; code 0 sets none).
        reasons = df['Reason for Absence']
        reason_flags = pd.DataFrame({
            'Reason_1': reasons.between(1, 14),
            'Reason_2': reasons.between(15, 17),
            'Reason_3': reasons.between(18, 21),
            'Reason_4': reasons >= 22,
        }).astype(int)

        # Drop the raw reason code to avoid multicollinearity with the flags.
        df = pd.concat([df.drop(['Reason for Absence'], axis=1), reason_flags], axis=1)
        df.columns = list(self._RAW_COLUMN_NAMES)

        # Only the month is kept from the date.
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        df['Month Value'] = df['Date'].dt.month

        # Binarize Education: level 1 -> 0, levels 2-4 -> 1 (others -> NaN).
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # Keep just the model features, in model order, and zero-fill gaps.
        df = df[list(self._FEATURE_COLUMNS)].fillna(value=0)

        # Checkpoint of the unscaled features.
        self.preprocessed_data = df.copy()

        # Scaled features consumed by the predicted_* methods.
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return column 1 of ``reg.predict_proba(self.data)``.

        Presumably the probability of class 1, assuming the model's classes
        are ordered [0, 1]. Returns ``None`` if no data has been loaded.
        """
        if self.data is None:
            return None
        return self.reg.predict_proba(self.data)[:, 1]

    def predicted_output_category(self):
        """Return ``reg.predict(self.data)``, or ``None`` if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict(self.data)

    def predicted_outputs(self):
        """Append 'Probability' and 'Prediction' columns to ``preprocessed_data``.

        Returns the updated ``preprocessed_data`` frame (modified in place),
        or ``None`` if no data has been loaded.
        """
        if self.data is None:
            return None
        self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
        self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
        return self.preprocessed_data