In [1]:
import pandas as pd
from predictor import Predictor
from datetime import datetime

## Run predictions on test set simulation 

Run this first to start the model server that the predictions below are sent to (port 8002):
```bash
docker run -p 8002:8080 xgboost_dev
```



In [2]:
def convert_date(x):
    """Build a datetime for the first day of the row's year and month.

    Args:
        x: Mapping-like object (e.g. a DataFrame row) exposing 'year' and
            'month' entries.

    Returns:
        datetime: The first day of that month.
    """
    sale_year = x['year']
    sale_month = x['month']
    return datetime(year=sale_year, month=sale_month, day=1)

def convert_remaining_lease(x):
    """Convert a remaining-lease description into a total number of months.

    The expected layout is "<years> years <months> months", for example
    "60 years 11 months". The months part may be missing ("60 years"), in
    which case it counts as zero.

    The string is split on any run of whitespace, so leading, trailing or
    repeated spaces do not break the parsing.

    Args:
        x: Lease description string.

    Returns:
        int: years * 12 + months.

    Raises:
        ValueError: If the string is empty or the years/months token is not
            an integer.
    """
    # split() with no argument collapses repeated whitespace and never
    # yields empty tokens, unlike split(' ').
    tokens = x.split()
    if not tokens:
        raise ValueError(f"empty remaining lease value: {x!r}")

    years = int(tokens[0])
    # Token layout is [years, "years", months, "months"]; months is optional.
    months = int(tokens[2]) if len(tokens) > 2 else 0
    return years * 12 + months

In [3]:
# Simulated batch-scoring run: take part of the actual data, run predictions
# on it, then store everything (features, pred, actual, time) for later use.

# 1. Load the transformed resale data; this stands in for a pull from the
#    data warehouse.
df = pd.read_csv("../../data/resale_flats_transformed.csv")

# 2. Reshape it to look like example.csv.
input_df = pd.read_csv("example.csv")

# The whole remaining lease is expressed in months, so the years column is
# filled with zero.
df = df.assign(
    remaining_lease_months=df['remaining_lease'].apply(convert_remaining_lease),
    remaining_lease_years=0,
)

# Derive the sale date (first day of the sale month) from year and month.
sale_parts = df[['year', 'month']]
df['sale_date'] = sale_parts.apply(convert_date, axis=1)

# Keep only the rows sold on or after 2023-01-01 as the test set.
test_set_start = datetime(2023, 1, 1)
is_test_row = df['sale_date'] >= test_set_start
df = df[is_test_row]

# Keep exactly the columns present in example.csv, in that order.
expected_columns = input_df.columns
df = df[expected_columns]

# 3. Run predictions on this data against the service on port 8002.
predictor = Predictor(port=8002)

# 4. Store the results in a CSV file.
# NOTE: the prediction call takes quite long.
pred_df = predictor.predict_df(df)
pred_df.to_csv("predictions.csv", index=False)

Predicted price: $312006.14
Predicted price: $319844.87
Predicted price: $298010.00
Predicted price: $340841.20
Predicted price: $302989.14
Predicted price: $360368.95
Predicted price: $360368.95
Predicted price: $396790.31
Predicted price: $368611.54
Predicted price: $552646.87
Predicted price: $243149.41
Predicted price: $356433.75
Predicted price: $335436.49
Predicted price: $350965.10
Predicted price: $313708.16
Predicted price: $329097.64
Predicted price: $356433.75
Predicted price: $350965.10
Predicted price: $350965.10
Predicted price: $352232.44
Predicted price: $328215.28
Predicted price: $306261.32
Predicted price: $349207.71
Predicted price: $338278.44
Predicted price: $344760.02
Predicted price: $347995.26
Predicted price: $361181.82
Predicted price: $344191.43
Predicted price: $331038.17
Predicted price: $352140.65
Predicted price: $329535.27
Predicted price: $309671.72
Predicted price: $356907.93
Predicted price: $503646.44
Predicted price: $435770.64
Predicted price: $39

IndexError: index 0 is out of bounds for axis 0 with size 0

In [18]:
import mlflow
import numpy as np
from scipy import stats

Start the MLflow tracking server first:
```bash
python -m mlflow server --host 127.0.0.1 --port 8000
```

## Monitoring

### ground truth evaluation

In [14]:
# Inspect the model's historical performance in MLflow through code.

# Fetch the runs whose `model` tag is 'XGBoost'.
xgboost_run_filter = "tags.model='XGBoost'"
model_history = mlflow.search_runs(filter_string=xgboost_run_filter)
model_history

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.MAPE,metrics.MAE,metrics.MSE,params.n_estimators,params.objective,tags.mlflow.runName,tags.mlflow.user,tags.feature,tags.mlflow.source.type,tags.model,tags.mlflow.source.name,tags.mlflow.log-model.history
0,08f5fe0400f649b283b222fa850dc091,0,FINISHED,mlflow-artifacts:/0/08f5fe0400f649b283b222fa85...,2024-03-29 15:37:22.144000+00:00,2024-03-29 15:37:34.144000+00:00,0.782482,0.196303,0.070854,1000,reg:squarederror,resilient-cub-967,ivankoh,Resale Price Prediction,LOCAL,XGBoost,/Users/ivankoh/.local/lib/python3.9/site-packa...,"[{""run_id"": ""08f5fe0400f649b283b222fa850dc091""..."
1,7eba7d2fd2ae4f019ce155ba1338b95b,0,FINISHED,mlflow-artifacts:/0/7eba7d2fd2ae4f019ce155ba13...,2024-03-29 13:42:31.960000+00:00,2024-03-29 13:42:44.467000+00:00,0.782482,0.196303,0.070854,1000,reg:squarederror,casual-mule-437,ivankoh,Resale Price Prediction,LOCAL,XGBoost,/Users/ivankoh/.local/lib/python3.9/site-packa...,"[{""run_id"": ""7eba7d2fd2ae4f019ce155ba1338b95b""..."
2,68279395fa044d798337f962a4600287,0,FINISHED,mlflow-artifacts:/0/68279395fa044d798337f962a4...,2024-03-29 13:22:14.933000+00:00,2024-03-29 13:22:25.325000+00:00,0.782482,0.196303,0.070854,1000,reg:squarederror,masked-bat-604,ivankoh,Resale Price Prediction,LOCAL,XGBoost,/Users/ivankoh/.local/lib/python3.9/site-packa...,"[{""run_id"": ""68279395fa044d798337f962a4600287""..."
3,709239b403a44bef87a88a17ef5f4890,0,FINISHED,mlflow-artifacts:/0/709239b403a44bef87a88a17ef...,2024-03-29 13:20:52.053000+00:00,2024-03-29 13:21:05.995000+00:00,0.782482,0.196303,0.070854,1000,reg:squarederror,popular-gnat-419,ivankoh,Resale Price Prediction,LOCAL,XGBoost,/Users/ivankoh/.local/lib/python3.9/site-packa...,"[{""run_id"": ""709239b403a44bef87a88a17ef5f4890""..."
4,24f757752aa04c9380df54f2a0bd2695,0,FAILED,mlflow-artifacts:/0/24f757752aa04c9380df54f2a0...,2024-03-29 13:14:35.744000+00:00,2024-03-29 13:14:43.578000+00:00,0.782482,0.196303,0.070854,1000,reg:squarederror,redolent-steed-375,ivankoh,Resale Price Prediction,LOCAL,XGBoost,/Users/ivankoh/.local/lib/python3.9/site-packa...,


In [15]:
# Mean of each logged error metric across the fetched runs, in the order
# (MAE, MAPE, MSE).
metric_columns = ("metrics.MAE", "metrics.MAPE", "metrics.MSE")
tuple(model_history[column].mean() for column in metric_columns)

(0.19630337104889625, 0.7824820738738789, 0.07085355839569242)

In [None]:
# get the past 3 days of predictions from the predictions table 
# get the corresponding predictions and ground truths (assume ground truths collected instantly after predictions are made)
# calculate metrics and compare with training metrics, calculate difference and set threshold
# if exceed threshold then notify the team

### input drift - feature / label drift and concept drift

https://dataaspirant.com/kolmogorov-smirnov-test/
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html

In [19]:
# Kolmogorov-Smirnov demo: draw 100 uniform samples and test them against the
# standard normal CDF.
# The generator is seeded so the sample, and therefore the reported
# statistic and p-value, are the same on every Restart & Run All.
KS_DEMO_SEED = 42
rng = np.random.default_rng(KS_DEMO_SEED)

ks_sample = stats.uniform.rvs(size=100, random_state=rng)
ks_result = stats.kstest(ks_sample, stats.norm.cdf)
ks_result

KstestResult(statistic=0.5002783402085376, pvalue=1.1384334379677198e-23, statistic_location=0.0006976954932907953, statistic_sign=-1)

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html

In [21]:
# Chi-square goodness-of-fit demo.
# Expected counts: the four values are divided by 100 and scaled by 189,
# which equals the sum of the observed counts below.
expected_percentages = np.array([44, 24, 29, 3])
f_exp = expected_percentages / 100 * 189

# Observed counts for the same four categories.
f_obs = np.array([43, 52, 54, 40])

stats.chisquare(f_exp=f_exp, f_obs=f_obs)

Power_divergenceResult(statistic=228.23515947653874, pvalue=3.329558533884649e-49)

In [None]:
# for feature and label drift
# do ks test for continuous univariate, chi sq test for categorical feature/labels

## Feedback