In [1]:
import numpy as np
import pandas as pd
from joblib import load
from processed_feature_mapping import mapping
from train_model import data_cleaning
from model_monitoring import ModelMonitoring

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# 1a. Build the cleaned train dataset: cleaned features, target and model prediction.
# The processed train data itself is already produced by train_model.py.
train_df = data_cleaning(pd.read_csv('../data/raw_split_data/employee_train.csv'))
X_train = train_df.drop(columns=['Attrition'])

# Load the persisted preprocessor, label encoder and classifier.
column_transformer = load('./preprocessor/column_transformer.pkl')
label_encoder = load('./preprocessor/label_encoder.pkl')
RF_clf = load('./model/RF_clf.joblib')

y_train = train_df['Attrition']

# Predict on the transformed features, then decode the predictions back
# to the original label values with the label encoder.
X_train_processed = column_transformer.transform(X_train)
y_train_pred = RF_clf.predict(X_train_processed)
y_train_pred_inverse = label_encoder.inverse_transform(y_train_pred)

# Attach the decoded prediction and expose the label column as 'target'
# before writing the frame out.
train_df = (
    train_df
    .assign(prediction=y_train_pred_inverse)
    .rename(columns={'Attrition': 'target'})
)
train_df.to_csv('../data/cleaned_employee_train.csv', index=False)

# 1b. Output the required cleaned/processed data for test dataset
test_df = pd.read_csv("../data/raw_split_data/employee_test.csv")
test_df = data_cleaning(test_df)
X_test = test_df.drop(columns=['Attrition'])
y_test = test_df['Attrition']

# Transform the test features with the already-loaded column transformer,
# wrap the result in a DataFrame and pass it through `mapping`
# (presumably this restores readable feature names -- confirm in
# processed_feature_mapping.py), then persist the processed features.
X_test_processed = column_transformer.transform(X_test)
X_test_processed = pd.DataFrame.from_records(X_test_processed)
X_test_processed = mapping(X_test_processed, column_transformer)
X_test_processed.to_csv('../data/X_test_processed.csv', index=False)

y_test_pred = RF_clf.predict(X_test_processed)
y_test_pred_inverse = label_encoder.inverse_transform(y_test_pred)

# Fix: the previous `[:1]` sliced out only the FIRST ROW (one sample, all
# class columns). `[:, 1]` keeps every sample and selects the probability of
# the class at index 1 of `RF_clf.classes_`.
# NOTE(review): index 1 is assumed to be the positive ('Yes') attrition class
# -- confirm against label_encoder.classes_.
y_test_pred_prob = RF_clf.predict_proba(X_test_processed)[:, 1]

# Attach the decoded prediction and expose the label column as 'target'
# before writing the cleaned frame out.
test_df['prediction'] = y_test_pred_inverse
test_df = test_df.rename(columns={'Attrition': 'target'})
test_df.to_csv('../data/cleaned_employee_test.csv', index=False)

In [3]:
# With the train and test datasets prepared, create the ModelMonitoring
# instance. It is constructed from the train dataframe; all checks in the
# following cells are called on this object.
model_monitoring = ModelMonitoring(train_df)

In [4]:
# First check: compare the column data types of the train and test datasets.
# The method prints the outcome of the comparison.
model_monitoring.check_data_types(train_df, test_df)

The data types for all columns from both datasets are the same


In [5]:
# Next, replace any ' ' in the column names of test_df with '_'.
# This is not done for train_df because it should already have been handled
# before the model was trained for the first time.
# NOTE(review): the return value is not used, so this presumably renames the
# columns of test_df in place -- confirm in model_monitoring.py.
model_monitoring.replace_column_names(test_df)

In [6]:
# Then check that the column names of both datasets tally.
# The method prints the outcome and its return value (1 in this run) is
# displayed as the cell output.
model_monitoring.check_schema(train_df, test_df)

train and test dataset have the same features


1

In [7]:
# Next, check for bad data, i.e. cells that should be empty but are filled
# with '?' or '-', and replace them with np.nan.
# NOTE(review): the returned dataframe is only displayed, never assigned. If
# handle_bad_data does not modify test_df in place, the replacements are lost
# for the following cells -- confirm in model_monitoring.py.
model_monitoring.handle_bad_data(test_df)

Unnamed: 0,Age,target,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,prediction
0,36,No,2,429,Research & Development,2,4,Life Sciences,3,Female,...,4,0,18,2,3,16,14,5,12,No
1,36,No,3,1302,Research & Development,6,4,Life Sciences,1,Male,...,4,1,9,3,3,3,2,0,2,No
2,37,No,2,309,Sales,10,4,Life Sciences,4,Female,...,3,3,8,5,3,1,0,0,0,No
3,41,No,2,1283,Research & Development,5,5,Medical,2,Male,...,1,0,7,5,2,4,2,0,3,No
4,43,No,2,244,Human Resources,2,3,Life Sciences,2,Male,...,2,0,10,5,3,9,7,1,8,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730,34,No,2,943,Research & Development,9,3,Life Sciences,4,Male,...,4,1,10,0,2,9,7,1,6,No
731,38,No,2,1084,Research & Development,29,3,Technical Degree,4,Male,...,1,1,9,3,1,7,7,1,7,No
732,27,Yes,2,135,Research & Development,17,4,Life Sciences,4,Female,...,4,0,8,2,3,8,2,7,7,Yes
733,45,No,1,1238,Research & Development,1,1,Life Sciences,3,Male,...,4,1,25,3,2,23,15,14,4,No


In [8]:
# After cleaning up the incoming data, use evidentlyai to summarise the data
# quality and get more insight into the data. With 'json' the report comes
# back as a dictionary.
# The label and prediction columns are removed first so that only the
# feature columns are passed to the report.
train_features = train_df.drop(columns=['target', 'prediction'])
test_features = test_df.drop(columns=['target', 'prediction'])
data_quality_dict = model_monitoring.data_quality_report(
    train_features,
    test_features,
    'json',
)
data_quality_dict

{'metrics': [{'metric': 'DatasetSummaryMetric',
   'result': {'almost_duplicated_threshold': 0.95,
    'current': {'target': None,
     'prediction': None,
     'date_column': None,
     'id_column': None,
     'number_of_columns': 30,
     'number_of_rows': 735,
     'number_of_missing_values': 0,
     'number_of_categorical_columns': 6,
     'number_of_numeric_columns': 24,
     'number_of_text_columns': 0,
     'number_of_datetime_columns': 0,
     'number_of_constant_columns': 0,
     'number_of_almost_constant_columns': 0,
     'number_of_duplicated_columns': 0,
     'number_of_almost_duplicated_columns': 0,
     'number_of_empty_rows': 0,
     'number_of_empty_columns': 0,
     'number_of_duplicated_rows': 0,
     'nans_by_columns': {'Age': 0,
      'BusinessTravel': 0,
      'DailyRate': 0,
      'Department': 0,
      'DistanceFromHome': 0,
      'Education': 0,
      'EducationField': 0,
      'EnvironmentSatisfaction': 0,
      'Gender': 0,
      'HourlyRate': 0,
      'JobIn

In [9]:
# The data quality report is also available as a visual HTML report.
# The html file can be found in the html_reports folder, with the name:
# data_quality_report.html
# As for the json variant, the label and prediction columns are left out.
report_excluded_cols = ['target', 'prediction']
data_quality_report = model_monitoring.data_quality_report(
    train_df.drop(columns=report_excluded_cols),
    test_df.drop(columns=report_excluded_cols),
    'html',
)
data_quality_report

Based on this data_quality_dict, we can obtain some information about the incoming data.
For example:  
the number of duplicated columns, the number of duplicated rows, 
the number of empty rows, the number of empty columns, and 
the number of NaNs in each column.

We can therefore use the values in this dictionary as indicators: only when one of these numbers is greater than zero do we need to do something about it.

The next few functions will make use of this dictionary

In [10]:
# Handle the missing values of test_df first, driven by the data quality
# dictionary obtained above. The result replaces test_df.
# NOTE(review): 'delete' presumably drops the rows/columns with missing
# values -- confirm the supported strategies in model_monitoring.py.
test_df = model_monitoring.handle_missing_values(data_quality_dict, 'delete', test_df)

In [11]:
# Once the incoming dataset has been prepared, it goes through the
# preprocessing pipeline. Afterwards we need to check the validity of the
# preprocessing by comparing the processed train and test feature sets.
# Both names are re-bound here to the frames read back from CSV.
# NOTE(review): X_test_processed.csv was written in cell 2, before the
# cleaning steps of cells 5-10, so it does not reflect them;
# X_train_processed.csv is not written in this notebook (per the comment in
# cell 2 it comes from train_model.py) -- confirm both files are current.
X_train_processed = pd.read_csv("../data/X_train_processed.csv")
X_test_processed = pd.read_csv("../data/X_test_processed.csv")
model_monitoring.check_schema_postprocessing(X_train_processed, X_test_processed)

train and test dataset have the same processed features


1