In [78]:
import pandas as pd
from predictor import Predictor
from datetime import datetime
import mlflow
import numpy as np
from scipy import stats
import joblib

## Run predictions on test set simulation to simulate prediciton of new data

RUN this first
```bash
docker run -p 8002:8080 xgboost_dev
```



In [79]:
def convert_date(x):
    return datetime(x['year'], x['month'], 1)

def convert_remaining_lease(x):
    # example: "60 years 11 months"
    try:
        years = int(x.split(' ')[0])
        months = int(x.split(' ')[2])
    except IndexError:
        # print(x)
        return years * 12
    total_months = years * 12 + months
    return total_months

In [80]:
# take in a part of the actual data we have, run predictions for these then store everything
# (features, pred, actual, time) somewhere for me to use

# 1. get the portion of the actual data 
# simulate pulling from data wareshouse 
df = pd.read_csv("../../data/resale_flats_transformed.csv")

# 2. shape it to something like example.csv 
input_df = pd.read_csv("example.csv")
# need to get remaining_lease_years, remaining_lease_months
df['remaining_lease_months'] = df['remaining_lease'].apply(convert_remaining_lease)
df['remaining_lease_years'] = 0
# get sale date
df['sale_date'] = df[['year', 'month']].apply(convert_date, axis=1)
# separate to test set 
df = df[df['sale_date'] >= datetime(2024, 4, 1)]

# get the columns that are in the input_df (example.csv)
df = df[input_df.columns]

# 3. run predictions on this data
predictor = Predictor(port=8002)

# # 4. store the results in a csv file
# this takes quite long 
pred_df = predictor.predict_df(df[:20])
pred_df.to_csv("predictions.csv", index=False)

Predicted price: $312006.14
Predicted price: $319844.87
Predicted price: $298010.00
Predicted price: $340841.20
Predicted price: $302989.14
Predicted price: $360368.95
Predicted price: $360368.95
Predicted price: $396790.31
Predicted price: $368611.54
Predicted price: $552646.87
Predicted price: $243149.41
Predicted price: $356433.75
Predicted price: $335436.49
Predicted price: $350965.10
Predicted price: $313708.16
Predicted price: $329097.64
Predicted price: $356433.75
Predicted price: $350965.10
Predicted price: $350965.10
Predicted price: $352232.44


python -m mlflow server --host 127.0.0.1 --port 8000

## Simulate collection of ground truth some time in future

In [81]:
df = pd.read_csv("predictions.csv")
df

Unnamed: 0,year,month,floor_area_sqm,remaining_lease_years,remaining_lease_months,flat_type,storey_range,flat_model,district,predicted_price
0,2024,4,67.0,0,653,3 ROOM,01 TO 03,New Generation,20.0,312006.14418
1,2024,4,73.0,0,637,3 ROOM,10 TO 12,New Generation,20.0,319844.871643
2,2024,4,68.0,0,666,3 ROOM,01 TO 03,New Generation,20.0,298010.00395
3,2024,4,94.0,0,1050,4 ROOM,19 TO 21,Model A,20.0,340841.201072
4,2024,4,93.0,0,1119,4 ROOM,01 TO 03,Model A,20.0,302989.136479
5,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,360368.950333
6,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,360368.950333
7,2024,4,117.0,0,641,5 ROOM,04 TO 06,Improved,20.0,396790.312791
8,2024,4,110.0,0,973,5 ROOM,01 TO 03,Improved,20.0,368611.542996
9,2024,4,148.0,0,854,EXECUTIVE,04 TO 06,Maisonette,20.0,552646.870376


In [82]:
# randomly generate actual prices based on some formula of the predicted price
# this is to simulate the actual data we have
actual_prices = []
for i, row in df.iterrows():
    pred_price = row['predicted_price']
    actual_price = pred_price + np.random.normal(0, 20000)
    actual_prices.append(actual_price)

df['actual_price'] = actual_prices
df

Unnamed: 0,year,month,floor_area_sqm,remaining_lease_years,remaining_lease_months,flat_type,storey_range,flat_model,district,predicted_price,actual_price
0,2024,4,67.0,0,653,3 ROOM,01 TO 03,New Generation,20.0,312006.14418,339419.882106
1,2024,4,73.0,0,637,3 ROOM,10 TO 12,New Generation,20.0,319844.871643,309670.344524
2,2024,4,68.0,0,666,3 ROOM,01 TO 03,New Generation,20.0,298010.00395,303064.782889
3,2024,4,94.0,0,1050,4 ROOM,19 TO 21,Model A,20.0,340841.201072,367380.195963
4,2024,4,93.0,0,1119,4 ROOM,01 TO 03,Model A,20.0,302989.136479,311626.990904
5,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,360368.950333,361601.974941
6,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,360368.950333,387408.640134
7,2024,4,117.0,0,641,5 ROOM,04 TO 06,Improved,20.0,396790.312791,419281.180582
8,2024,4,110.0,0,973,5 ROOM,01 TO 03,Improved,20.0,368611.542996,370250.258597
9,2024,4,148.0,0,854,EXECUTIVE,04 TO 06,Maisonette,20.0,552646.870376,582057.519062


In [83]:
df.to_csv("predictions_with_ground_truth.csv", index=False)

## Monitoring

### ground truth evaluation

In [84]:
# check model history performance in mlflow through code
mlflow.set_tracking_uri('http://localhost:8000')
# Get the model history
model_history = mlflow.search_runs()
model_history

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.MSE,metrics.MAPE,metrics.MAE,params.reg_lambda,...,params.reg_alpha,params.max_depth,params.subsample_freq,tags.mlflow.log-model.history,tags.mlflow.source.name,tags.mlflow.runName,tags.mlflow.source.type,tags.model,tags.mlflow.user,tags.feature
0,95525df54dc94480badb2c2e398f602c,0,FINISHED,mlflow-artifacts:/0/95525df54dc94480badb2c2e39...,2024-04-11 07:04:45.180000+00:00,2024-04-11 07:04:57.013000+00:00,0.141749,0.873252,0.260327,0.0,...,0.0,-1.0,0.0,"[{""run_id"": ""95525df54dc94480badb2c2e398f602c""...",c:\Users\valen\anaconda3\lib\site-packages\ipy...,enchanting-dog-18,LOCAL,LightGBM,valen,Resale Price Prediction
1,bfa21874d08f4798a41d7ab63ae82e5c,0,FINISHED,mlflow-artifacts:/0/bfa21874d08f4798a41d7ab63a...,2024-04-11 07:04:20.464000+00:00,2024-04-11 07:04:45.131000+00:00,0.07237,0.790721,0.198472,,...,,,,"[{""run_id"": ""bfa21874d08f4798a41d7ab63ae82e5c""...",c:\Users\valen\anaconda3\lib\site-packages\ipy...,debonair-vole-478,LOCAL,XGBoost,valen,Resale Price Prediction
2,2cc3764d90ef4609b4e57fcff522004e,0,FINISHED,mlflow-artifacts:/0/2cc3764d90ef4609b4e57fcff5...,2024-03-29 15:37:34.169000+00:00,2024-03-29 15:37:39.549000+00:00,0.131307,0.833802,0.250773,0.0,...,0.0,-1.0,0.0,"[{""run_id"": ""2cc3764d90ef4609b4e57fcff522004e""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,chill-swan-395,LOCAL,LightGBM,ivankoh,Resale Price Prediction
3,08f5fe0400f649b283b222fa850dc091,0,FINISHED,mlflow-artifacts:/0/08f5fe0400f649b283b222fa85...,2024-03-29 15:37:22.144000+00:00,2024-03-29 15:37:34.144000+00:00,0.070854,0.782482,0.196303,,...,,,,"[{""run_id"": ""08f5fe0400f649b283b222fa850dc091""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,resilient-cub-967,LOCAL,XGBoost,ivankoh,Resale Price Prediction
4,a4e061e14928418cbce808a5b90462ec,0,FINISHED,mlflow-artifacts:/0/a4e061e14928418cbce808a5b9...,2024-03-29 13:42:44.513000+00:00,2024-03-29 13:42:49.864000+00:00,0.131307,0.833802,0.250773,0.0,...,0.0,-1.0,0.0,"[{""run_id"": ""a4e061e14928418cbce808a5b90462ec""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,omniscient-toad-552,LOCAL,LightGBM,ivankoh,Resale Price Prediction
5,7eba7d2fd2ae4f019ce155ba1338b95b,0,FINISHED,mlflow-artifacts:/0/7eba7d2fd2ae4f019ce155ba13...,2024-03-29 13:42:31.960000+00:00,2024-03-29 13:42:44.467000+00:00,0.070854,0.782482,0.196303,,...,,,,"[{""run_id"": ""7eba7d2fd2ae4f019ce155ba1338b95b""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,casual-mule-437,LOCAL,XGBoost,ivankoh,Resale Price Prediction
6,b44e97e862f145f19a1f1838f7a79610,0,FINISHED,mlflow-artifacts:/0/b44e97e862f145f19a1f1838f7...,2024-03-29 13:22:25.378000+00:00,2024-03-29 13:22:30.743000+00:00,0.131307,0.833802,0.250773,0.0,...,0.0,-1.0,0.0,"[{""run_id"": ""b44e97e862f145f19a1f1838f7a79610""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,industrious-carp-854,LOCAL,LightGBM,ivankoh,Resale Price Prediction
7,68279395fa044d798337f962a4600287,0,FINISHED,mlflow-artifacts:/0/68279395fa044d798337f962a4...,2024-03-29 13:22:14.933000+00:00,2024-03-29 13:22:25.325000+00:00,0.070854,0.782482,0.196303,,...,,,,"[{""run_id"": ""68279395fa044d798337f962a4600287""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,masked-bat-604,LOCAL,XGBoost,ivankoh,Resale Price Prediction
8,709239b403a44bef87a88a17ef5f4890,0,FINISHED,mlflow-artifacts:/0/709239b403a44bef87a88a17ef...,2024-03-29 13:20:52.053000+00:00,2024-03-29 13:21:05.995000+00:00,0.070854,0.782482,0.196303,,...,,,,"[{""run_id"": ""709239b403a44bef87a88a17ef5f4890""...",/Users/ivankoh/.local/lib/python3.9/site-packa...,popular-gnat-419,LOCAL,XGBoost,ivankoh,Resale Price Prediction
9,24f757752aa04c9380df54f2a0bd2695,0,FAILED,mlflow-artifacts:/0/24f757752aa04c9380df54f2a0...,2024-03-29 13:14:35.744000+00:00,2024-03-29 13:14:43.578000+00:00,0.070854,0.782482,0.196303,,...,,,,,/Users/ivankoh/.local/lib/python3.9/site-packa...,redolent-steed-375,LOCAL,XGBoost,ivankoh,Resale Price Prediction


In [85]:
def getmodelhist(model_name):
    model_history = mlflow.search_runs()
    model_history = model_history[model_history["tags.model"] == model_name]

    mean_mse = model_history["metrics.MSE"].mean()
    mean_mae = model_history["metrics.MAE"].mean()
    mean_mape = model_history["metrics.MAPE"].mean()

    return {
        "model_name": model_name,
        "metrics" : {
            "mean_mse": mean_mse,
            "mean_mae": mean_mae,
            "mean_mape": mean_mape
        }
    }



In [86]:
getmodelhist("XGBoost")

{'model_name': 'XGBoost',
 'metrics': {'mean_mse': 0.07110626937122871,
  'mean_mae': 0.19666476107049982,
  'mean_mape': 0.7838551752810231}}

In [87]:
getmodelhist("LightGBM")

{'model_name': 'LightGBM',
 'metrics': {'mean_mse': 0.1339175667494985,
  'mean_mae': 0.2531615249495469,
  'mean_mape': 0.8436645015752049}}

In [88]:
df_gt = pd.read_csv("predictions_with_ground_truth.csv")
df_gt

Unnamed: 0,year,month,floor_area_sqm,remaining_lease_years,remaining_lease_months,flat_type,storey_range,flat_model,district,predicted_price,actual_price
0,2024,4,67.0,0,653,3 ROOM,01 TO 03,New Generation,20.0,312006.14418,339419.882106
1,2024,4,73.0,0,637,3 ROOM,10 TO 12,New Generation,20.0,319844.871643,309670.344524
2,2024,4,68.0,0,666,3 ROOM,01 TO 03,New Generation,20.0,298010.00395,303064.782889
3,2024,4,94.0,0,1050,4 ROOM,19 TO 21,Model A,20.0,340841.201072,367380.195963
4,2024,4,93.0,0,1119,4 ROOM,01 TO 03,Model A,20.0,302989.136479,311626.990904
5,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,360368.950333,361601.974941
6,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,360368.950333,387408.640134
7,2024,4,117.0,0,641,5 ROOM,04 TO 06,Improved,20.0,396790.312791,419281.180582
8,2024,4,110.0,0,973,5 ROOM,01 TO 03,Improved,20.0,368611.542996,370250.258597
9,2024,4,148.0,0,854,EXECUTIVE,04 TO 06,Maisonette,20.0,552646.870376,582057.519062


In [89]:
# import scaler
TARGET_SCALER_PATH = '_scalers/target_scaler.save'
target_scaler = joblib.load(TARGET_SCALER_PATH)

In [90]:
df_gt["actual_price"] = target_scaler.transform(df_gt['actual_price'].values.reshape(-1, 1))
df_gt["predicted_price"] = target_scaler.transform(df_gt['predicted_price'].values.reshape(-1, 1))

In [91]:
df_gt

Unnamed: 0,year,month,floor_area_sqm,remaining_lease_years,remaining_lease_months,flat_type,storey_range,flat_model,district,predicted_price,actual_price
0,2024,4,67.0,0,653,3 ROOM,01 TO 03,New Generation,20.0,-0.996796,-0.830628
1,2024,4,73.0,0,637,3 ROOM,10 TO 12,New Generation,20.0,-0.949282,-1.010954
2,2024,4,68.0,0,666,3 ROOM,01 TO 03,New Generation,20.0,-1.081633,-1.050994
3,2024,4,94.0,0,1050,4 ROOM,19 TO 21,Model A,20.0,-0.822013,-0.661147
4,2024,4,93.0,0,1119,4 ROOM,01 TO 03,Model A,20.0,-1.051453,-0.999094
5,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,-0.703646,-0.696172
6,2024,4,93.0,0,1127,4 ROOM,19 TO 21,Model A,20.0,-0.703646,-0.539745
7,2024,4,117.0,0,641,5 ROOM,04 TO 06,Improved,20.0,-0.482878,-0.34655
8,2024,4,110.0,0,973,5 ROOM,01 TO 03,Improved,20.0,-0.653683,-0.64375
9,2024,4,148.0,0,854,EXECUTIVE,04 TO 06,Maisonette,20.0,0.461844,0.640116


In [92]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error


In [93]:
# Get performance metrics of new test data after ground truth collected
mape_new = mean_absolute_percentage_error(df_gt['actual_price'], df_gt['predicted_price'])
mae_new = mean_absolute_error(df_gt['actual_price'], df_gt['predicted_price'])
mse_new = mean_squared_error(df_gt['actual_price'], df_gt['predicted_price'])

# Get similar performance metrics during training
mape_train = getmodelhist("XGBoost")['metrics']['mean_mape']
mae_train = getmodelhist("XGBoost")['metrics']['mean_mae']
mse_train = getmodelhist("XGBoost")['metrics']['mean_mse']

In [94]:
# print Train and new, mae,mse,mape
print(f"Train MAPE: {mape_train}, New MAPE: {mape_new}")
print(f"Train MAE: {mae_train}, New MAE: {mae_new}")
print(f"Train MSE: {mse_train}, New MSE: {mse_new}")

Train MAPE: 0.7838551752810231, New MAPE: 0.12496984231085032
Train MAE: 0.19666476107049982, New MAE: 0.08772312329820715
Train MSE: 0.07110626937122871, New MSE: 0.011159759327197216


In [95]:
# check if deviation by more than 5 percentage points then notify the team
if mape_new > mape_train + 0.05:
    # notify team
    print(f"MAPE of new test data is {100* (mape_train - mape_new)} percentage points lower than training data")
# if 5% drop in mae
if (mae_new - mae_train)/ mae_train < -0.05:
    # notify team
    print(f"MAE of new test data is {mae_train - mae_new} lower than training data")
# if 5% drop in mse
if mse_new > mse_train + 0.05:
    # notify team
    print(f"MSE of new test data is {mse_train - mse_new} lower than training data")

MAE of new test data is 0.10894163777229267 lower than training data


### input drift - feature / label drift and concept drift

https://dataaspirant.com/kolmogorov-smirnov-test/
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html

In [31]:
rng = np.random.default_rng()
stats.kstest(stats.uniform.rvs(size=100, random_state=rng),
             stats.norm.cdf)

KstestResult(statistic=0.5105188890265542, pvalue=1.0631972440728511e-24, statistic_location=0.026370000514567615, statistic_sign=-1)

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html

In [32]:
f_exp = np.array([44, 24, 29, 3]) / 100 * 189
f_obs = np.array([43, 52, 54, 40])
stats.chisquare(f_obs=f_obs, f_exp=f_exp)

Power_divergenceResult(statistic=228.23515947653874, pvalue=3.3295585338846486e-49)

In [33]:
# for feature and label drift
# do ks test for continuous univariate, chi sq test for categorical feature/labels

## Feedback