In [1]:
import numpy as np
import pandas as pd
from joblib import load
from processed_feature_mapping import mapping
from train_model import data_cleaning
from model_monitoring import ModelMonitoring

In [2]:
# 1a. Export the cleaned train dataset (with model predictions attached).
# The fitted preprocessing artifacts and the classifier are produced by
# train_model.py; here they are only loaded and applied.
column_transformer = load('./preprocessor/column_transformer.pkl')
label_encoder = load('./preprocessor/label_encoder.pkl')
RF_clf = load('./model/RF_clf.joblib')

# Read the raw train split and run it through the shared cleaning step.
df = data_cleaning(pd.read_csv('../data/raw_split_data/employee_train.csv'))
y_train = df['Attrition']
X_train = df.drop(columns=['Attrition'])

# Transform the features, predict, and decode the predictions back through
# the label encoder so they are comparable with the 'Attrition' column.
X_train_processed = column_transformer.transform(X_train)
y_train_pred = RF_clf.predict(X_train_processed)
y_train_pred_inverse = label_encoder.inverse_transform(y_train_pred)

# Attach the decoded predictions and expose the label column as 'target'
# before writing the frame out.
df = (
    df
    .assign(prediction=y_train_pred_inverse)
    .rename(columns={'Attrition' : 'target'})
)
df.to_csv('../data/cleaned_employee_train.csv', index=False)

# 1b. Export the cleaned and the processed versions of the test dataset.
# Relies on column_transformer, label_encoder and RF_clf loaded in step 1a.
test_df = pd.read_csv("../data/raw_split_data/employee_test.csv")
test_df = data_cleaning(test_df)
X_test = test_df.drop(columns=['Attrition'])
y_test = test_df['Attrition']

# Transform the features, wrap the result in a DataFrame and pass it through
# `mapping` before persisting it.
# NOTE(review): `mapping` presumably restores readable feature names from the
# column transformer -- confirm against processed_feature_mapping.py.
X_test_processed = column_transformer.transform(X_test)
X_test_processed = pd.DataFrame.from_records(X_test_processed)
X_test_processed = mapping(X_test_processed, column_transformer)
X_test_processed.to_csv('../data/X_test_processed.csv', index=False)

y_test_pred = RF_clf.predict(X_test_processed)
y_test_pred_inverse = label_encoder.inverse_transform(y_test_pred)
# Keep the probability of the class in column 1 for EVERY test sample.
# The previous slice `[:1]` returned only the first sample's row (both class
# columns) instead of one probability per sample.
# NOTE(review): assumes a binary classifier whose positive class is column 1
# of predict_proba -- confirm against RF_clf.classes_.
y_test_pred_prob = RF_clf.predict_proba(X_test_processed)[:, 1]

# Attach decoded predictions and expose the label column as 'target'.
# Reassigning (rather than renaming in place) builds a new frame instead of
# mutating the existing object.
test_df['prediction'] = y_test_pred_inverse
test_df = test_df.rename(columns={'Attrition' : 'target'})
test_df.to_csv('../data/cleaned_employee_test.csv', index=False)

In [5]:
# With both datasets exported, create the ModelMonitoring instance from the
# cleaned train frame (`df` now carries the 'target' and 'prediction' columns).
# NOTE(review): presumably `df` serves as the reference dataset for later
# reports -- confirm against model_monitoring.py.
model_monitoring = ModelMonitoring(df)

In [7]:
# Check the data quality of the incoming data with the ModelMonitoring object.
# Only the input features are compared, so the label and prediction columns
# are dropped from both frames before the report is requested in 'json' form.
monitoring_only_columns = ['target', 'prediction']
data_quality_dict = model_monitoring.data_quality_report(
    df.drop(columns=monitoring_only_columns),
    test_df.drop(columns=monitoring_only_columns),
    'json',
)
data_quality_dict

{'metrics': [{'metric': 'DatasetSummaryMetric',
   'result': {'almost_duplicated_threshold': 0.95,
    'current': {'target': None,
     'prediction': None,
     'date_column': None,
     'id_column': None,
     'number_of_columns': 30,
     'number_of_rows': 735,
     'number_of_missing_values': 0,
     'number_of_categorical_columns': 6,
     'number_of_numeric_columns': 24,
     'number_of_text_columns': 0,
     'number_of_datetime_columns': 0,
     'number_of_constant_columns': 0,
     'number_of_almost_constant_columns': 0,
     'number_of_duplicated_columns': 0,
     'number_of_almost_duplicated_columns': 0,
     'number_of_empty_rows': 0,
     'number_of_empty_columns': 0,
     'number_of_duplicated_rows': 0,
     'nans_by_columns': {'Age': 0,
      'BusinessTravel': 0,
      'DailyRate': 0,
      'Department': 0,
      'DistanceFromHome': 0,
      'Education': 0,
      'EducationField': 0,
      'EnvironmentSatisfaction': 0,
      'Gender': 0,
      'HourlyRate': 0,
      'JobIn