In [2]:
import numpy as np
import pandas as pd
from joblib import load
from processed_feature_mapping import mapping
from train_model import data_cleaning
from model_monitoring import ModelMonitoring

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [3]:
# 1a. Export the cleaned training data together with the model's predictions.
# Per the original note, the required processed data is already produced by
# train_model.py, but the dataset was jumbled up there, so the predictions for
# employee_train.csv are recomputed here against the rows written below.

# Fitted preprocessing artifacts and the trained classifier.
column_transformer = load('./preprocessor/column_transformer.pkl')
label_encoder = load('./preprocessor/label_encoder.pkl')
RF_clf = load('./model/RF_clf.joblib')

# Raw split -> cleaned frame -> features / target.
train_df = data_cleaning(pd.read_csv('../data/raw_split_data/employee_train.csv'))
X_train = train_df.drop(columns=['Attrition'])
y_train = train_df['Attrition']

# Predict on the transformed features and decode the labels back to their
# original values with the label encoder.
X_train_processed = column_transformer.transform(X_train)
y_train_pred = RF_clf.predict(X_train_processed)
y_train_pred_inverse = label_encoder.inverse_transform(y_train_pred)

# Append the decoded predictions, rename the label column to 'target',
# and write the result to disk.
train_df = train_df.assign(prediction=y_train_pred_inverse).rename(
    columns={'Attrition': 'target'}
)
train_df.to_csv('../data/cleaned_employee_train.csv', index=False)

# 1b. Export the cleaned test data (with predictions) and the processed test
# features. Uses column_transformer, label_encoder and RF_clf loaded in step 1a.
test_df = pd.read_csv("../data/raw_split_data/employee_test.csv")
test_df = data_cleaning(test_df)
X_test = test_df.drop(columns=['Attrition'])
y_test = test_df['Attrition']

# Transform the raw features, wrap the result in a DataFrame and pass it
# through mapping() before saving it for the monitoring checks.
# NOTE(review): mapping() presumably restores readable feature names — confirm.
X_test_processed = column_transformer.transform(X_test)
X_test_processed = pd.DataFrame.from_records(X_test_processed)
X_test_processed = mapping(X_test_processed, column_transformer)
X_test_processed.to_csv('../data/X_test_processed.csv', index=False)

y_test_pred = RF_clf.predict(X_test_processed)
y_test_pred_inverse = label_encoder.inverse_transform(y_test_pred)
# predict_proba returns one row per sample and one column per class, so the
# per-sample probability is a column slice. The previous `[:1]` kept only the
# first row (a single sample's class probabilities).
# NOTE(review): assumes the class of interest is at index 1 — confirm against
# RF_clf.classes_.
y_test_pred_prob = RF_clf.predict_proba(X_test_processed)[:, 1]

# Attach the decoded predictions, rename the label column to 'target',
# and write the cleaned test set to disk.
test_df['prediction'] = y_test_pred_inverse
test_df.rename(columns={'Attrition' : 'target'}, inplace=True)
test_df.to_csv('../data/cleaned_employee_test.csv', index=False)

In [6]:
# Reload the exported datasets from disk, then build the ModelMonitoring
# instance from the cleaned training data.
# NOTE(review): X_train_processed.csv is not written by the cells above —
# presumably train_model.py produces it; confirm before a fresh run.
train_df, test_df, processed_train_df, processed_test_df = map(
    pd.read_csv,
    (
        '../data/cleaned_employee_train.csv',
        '../data/cleaned_employee_test.csv',
        '../data/X_train_processed.csv',
        '../data/X_test_processed.csv',
    ),
)
model_monitoring = ModelMonitoring(train_df)

We first have to ensure that the incoming data has no issues:
for example, that the column names of the incoming data match the training data,
that there is no difference in data type for any column,
and that there are no '?' or '-' placeholder values within the dataset.

In [11]:
# Run the data_check method first. According to the author's notes, it:
#   1. fixes any column names that contain spaces by changing them to '_';
#   2. replaces bad data such as '?' or '-' with NaN;
#   3. checks whether the schema of the incoming data matches the training data;
#   4. checks the data type of each column of the incoming data;
#   5. checks that the processed incoming data has the same features as the
#      processed training data.
# Together these checks guard the data quality of the incoming data and
# confirm that the processing pipeline has no issues.
model_monitoring.data_check(train_df, test_df, processed_train_df, processed_test_df)

train and test dataset have the same features
The data types for all columns from both datasets are the same
train and test dataset have the same processed features
