In [1]:
print("test")

test


## Libraries and dependancies

In [60]:
import os
import json
import requests
import time
from datetime import datetime
import numpy as np
import pandas as pd
from dotenv import load_dotenv

import mlflow
import mlflow.pyfunc
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import mlflow.xgboost


load_dotenv()

True

In [None]:

# run in terminal

"""
$env:MLFLOW_S3_ENDPOINT_URL = "http://localhost:9002"
$env:AWS_ACCESS_KEY_ID = "minioadmin"
$env:AWS_SECRET_ACCESS_KEY = "minioadmin"
$env:AWS_DEFAULT_REGION = "us-east-1"
$env:MLFLOW_TRACKING_URI = "http://localhost:5000"

"""

In [19]:
mlflow.set_tracking_uri("http://localhost:5000")

In [61]:
MLFLOW_BID_PRICE_EXP = os.getenv("MLFLOW_BID_PRICE_EXP")
MLFLOW_VOLATILITY_EXP = os.getenv("MLFLOW_VOLATILITY_EXP")

## Model building

In [12]:
qdb_query = pd.read_csv("sample_data/querydata.csv")


In [13]:
qdb_query.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp          151 non-null    object 
 1   symbol             151 non-null    object 
 2   bid                151 non-null    float64
 3   bid_qty            151 non-null    float64
 4   ask                151 non-null    float64
 5   ask_qty            151 non-null    float64
 6   last               151 non-null    float64
 7   volume             151 non-null    float64
 8   vwap               151 non-null    float64
 9   low                151 non-null    float64
 10  high               151 non-null    float64
 11  change             151 non-null    float64
 12  change_pct         151 non-null    float64
 13  ma_5               151 non-null    float64
 14  ma_14              151 non-null    float64
 15  ema_5              151 non-null    float64
 16  std_14             151 non

In [14]:
qdb_query.head()

Unnamed: 0,timestamp,symbol,bid,bid_qty,ask,ask_qty,last,volume,vwap,low,...,min_14,vwap_diff,bid_ask_spread,log_return,momentum,volatility,cumulative_volume,volume_change,mean_bid_qty,mean_ask_qty
0,2025-04-22T06:30:00.000000Z,BTC/USD,88094.3,10.018319,88094.4,0.009884,88094.4,6664.60345,87578.1,86400.0,...,88085.5,516.3,0.1,0.0,5.725,80.740758,93376.232382,-0.000149,9.810157,1.258552
1,2025-04-22T06:31:00.000000Z,BTC/USD,88102.6,12.356732,88102.7,0.712864,88102.7,6664.802453,87578.1,86400.0,...,88085.5,524.075,1.675,9.421267e-05,7.775,2.192834,93304.274176,0.025018,9.864497,1.106044
2,2025-04-22T06:32:00.000000Z,BTC/USD,88102.9,0.190531,88103.0,1.548725,88103.0,4998.705475,87578.1,86400.0,...,88085.6,524.8,0.1,3.405111e-06,0.9,2.17604,69979.365814,-0.008036,9.369903,1.132442
3,2025-04-22T06:33:00.000000Z,BTC/USD,88102.9,0.037267,88103.0,5.851286,88103.0,6664.910012,87578.1,86400.0,...,88089.1,524.875,0.1,-3.4095190000000002e-18,0.125,2.094456,93307.028261,0.0001,8.149591,1.402839
4,2025-04-22T06:34:00.000000Z,BTC/USD,88102.9,1.301731,88103.0,4.117581,88103.0,9976.503686,87578.333333,86400.0,...,88094.4,524.616667,0.1,-3.4095190000000002e-18,-0.016667,1.715816,139889.464059,-0.574332,6.521099,2.042553


In [40]:
qdb_query.columns

Index(['timestamp', 'symbol', 'bid', 'bid_qty', 'ask', 'ask_qty', 'last',
       'volume', 'vwap', 'low', 'high', 'change', 'change_pct', 'ma_5',
       'ma_14', 'ema_5', 'std_14', 'price_change', 'price_change_pct',
       'max_14', 'min_14', 'vwap_diff', 'bid_ask_spread', 'log_return',
       'momentum', 'volatility', 'cumulative_volume', 'volume_change',
       'mean_bid_qty', 'mean_ask_qty', 'volatility_t+1', 'volatility_t+2',
       'volatility_t+3', 'volatility_t+4', 'volatility_t+5'],
      dtype='object')

In [54]:
def load_best_model(experiment_name):
    model = None
    try:
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            raise ValueError(f"Experiment '{experiment_name}' not found")

        runs = mlflow.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["metrics.mse_t_1 ASC"],  # or accuracy, r2, etc.
        )
        
        best_run_id = runs.iloc[0]["run_id"]
        model_uri = f"runs:/{best_run_id}/model"
        model = mlflow.pyfunc.load_model(model_uri)
        
        print(f"Loaded best model from experiment '{experiment_name}', run ID: {best_run_id}")
        return model

    except Exception as e:
        print(f"Error loading best model from '{experiment_name}': {e}")
    
    return model

### BTC-Price-model

In [41]:
# Define feature columns (excluding 'timestamp' and 'symbol')
feature_cols = ['bid', 'bid_qty', 'ask', 'ask_qty', 'last',
       'volume', 'vwap', 'low', 'high', 'change', 'change_pct', 'ma_5',
       'ma_14', 'ema_5', 'std_14', 'price_change', 'price_change_pct',
       'max_14', 'min_14', 'vwap_diff', 'bid_ask_spread', 'log_return',
       'momentum', 'volatility', 'cumulative_volume', 'volume_change',
       'mean_bid_qty', 'mean_ask_qty']

# Forecasting horizon (next 5 mins)
forecast_horizon = 5

# Create future target columns
for i in range(1, forecast_horizon + 1):
    qdb_query[f"bid_t+{i}"] = qdb_query["bid"].shift(-i)

# Drop rows with NaN (caused by shifting forward)
qdb_query.dropna(inplace=True)

# Define target columns
target_cols = [f"bid_t+{i}" for i in range(1, forecast_horizon + 1)]

# Split data (80% train, 20% test)
train_size = int(len(qdb_query) * 0.8)
train, test = qdb_query.iloc[:train_size], qdb_query.iloc[train_size:]

X_train, y_train = train[feature_cols], train[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]


In [42]:
X_train[:5]

Unnamed: 0,bid,bid_qty,ask,ask_qty,last,volume,vwap,low,high,change,...,min_14,vwap_diff,bid_ask_spread,log_return,momentum,volatility,cumulative_volume,volume_change,mean_bid_qty,mean_ask_qty
0,88094.3,10.018319,88094.4,0.009884,88094.4,6664.60345,87578.1,86400.0,88800.6,611.7,...,88085.5,516.3,0.1,0.0,5.725,80.740758,93376.232382,-0.000149,9.810157,1.258552
1,88102.6,12.356732,88102.7,0.712864,88102.7,6664.802453,87578.1,86400.0,88800.6,620.0,...,88085.5,524.075,1.675,9.421267e-05,7.775,2.192834,93304.274176,0.025018,9.864497,1.106044
2,88102.9,0.190531,88103.0,1.548725,88103.0,4998.705475,87578.1,86400.0,88800.6,620.3,...,88085.6,524.8,0.1,3.405111e-06,0.9,2.17604,69979.365814,-0.008036,9.369903,1.132442
3,88102.9,0.037267,88103.0,5.851286,88103.0,6664.910012,87578.1,86400.0,88800.6,620.4,...,88089.1,524.875,0.1,-3.4095190000000002e-18,0.125,2.094456,93307.028261,0.0001,8.149591,1.402839
4,88102.9,1.301731,88103.0,4.117581,88103.0,9976.503686,87578.333333,86400.0,88800.6,647.2,...,88094.4,524.616667,0.1,-3.4095190000000002e-18,-0.016667,1.715816,139889.464059,-0.574332,6.521099,2.042553


In [43]:
y_train[:5]

Unnamed: 0,bid_t+1,bid_t+2,bid_t+3,bid_t+4,bid_t+5
0,88102.6,88102.9,88102.9,88102.9,88100.0
1,88102.9,88102.9,88102.9,88100.0,88075.0
2,88102.9,88102.9,88100.0,88075.0,88046.3
3,88102.9,88100.0,88075.0,88046.3,88010.1
4,88100.0,88075.0,88046.3,88010.1,88002.8


In [None]:
mlflow.set_experiment(MLFLOW_BID_PRICE_EXP)
mlflow.autolog()

# Initialize XGBoost model for multi-output regression
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

# Train model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the mean absolute error for each forecast step
mae = mean_absolute_error(y_test, y_pred, multioutput="raw_values")
mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
r2 = r2_score(y_test, y_pred, multioutput="raw_values")

# Log model to MLflow
with mlflow.start_run():
    mlflow.log_param("model_type", "XGBoost")

    for i in range(forecast_horizon):
        mlflow.log_metric(f"mae_t_{i+1}", mae[i])
        mlflow.log_metric(f"mse_t_{i+1}", mse[i])
        mlflow.log_metric(f"r2_t_{i+1}", r2[i])

    # Log the model
    mlflow.xgboost.log_model(model, "model")

print("✅ XGBoost model logged in MLflow with additional metrics.")


2025/04/23 13:05:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/04/23 13:05:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/04/23 13:05:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e39bc436c1f84ffc91b5ac8d2bcc0af1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run unequaled-donkey-685 at: http://localhost:5000/#/experiments/7/runs/e39bc436c1f84ffc91b5ac8d2bcc0af1
🧪 View experiment at: http://localhost:5000/#/experiments/7


  self.get_booster().save_model(fname)


🏃 View run bouncy-boar-314 at: http://localhost:5000/#/experiments/7/runs/3b87be463ea948be998a44ce806b2491
🧪 View experiment at: http://localhost:5000/#/experiments/7
✅ XGBoost model logged in MLflow with additional metrics.


In [45]:
# predictions
y_pred[:10]

array([[88438.64 , 88448.87 , 88474.85 , 88485.44 , 88456.766],
       [88438.87 , 88447.375, 88485.03 , 88483.43 , 88456.93 ],
       [88437.734, 88445.24 , 88476.12 , 88489.16 , 88466.26 ],
       [88435.67 , 88448.38 , 88484.59 , 88487.15 , 88466.95 ],
       [88437.97 , 88447.28 , 88479.38 , 88473.555, 88466.805],
       [88434.01 , 88442.016, 88483.336, 88489.73 , 88466.875],
       [88412.266, 88443.74 , 88479.42 , 88463.82 , 88449.85 ],
       [88427.625, 88488.28 , 88484.19 , 88476.55 , 88456.08 ],
       [88403.76 , 88442.766, 88449.086, 88451.4  , 88446.03 ],
       [88418.79 , 88458.01 , 88507.266, 88485.16 , 88452.71 ]],
      dtype=float32)

In [63]:
model = load_best_model(MLFLOW_BID_PRICE_EXP)

Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00,  1.83it/s]


Loaded best model from experiment 'bid-price-model', run ID: 60f6a1a664bd479caf1ece2863f84c98


In [47]:
print(model)

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.xgboost
  run_id: 60f6a1a664bd479caf1ece2863f84c98



### Volatility model

In [48]:
qdb_query = pd.read_csv("sample_data/querydata.csv")

In [49]:
# Define feature columns (excluding 'timestamp' and 'symbol')
feature_cols = ['bid', 'bid_qty', 'ask', 'ask_qty', 'last',
       'volume', 'vwap', 'low', 'high', 'change', 'change_pct', 'ma_5',
       'ma_14', 'ema_5', 'std_14', 'price_change', 'price_change_pct',
       'max_14', 'min_14', 'vwap_diff', 'bid_ask_spread', 'log_return',
       'momentum', 'volatility', 'cumulative_volume', 'volume_change',
       'mean_bid_qty', 'mean_ask_qty']

# Forecasting horizon (next 5 mins)
forecast_horizon = 5

# Create future target columns
for i in range(1, forecast_horizon + 1):
    qdb_query[f"volatility_t+{i}"] = qdb_query["volatility"].shift(-i)

# Drop rows with NaN (caused by shifting forward)
qdb_query.dropna(inplace=True)

# Define target columns
target_cols = [f"volatility_t+{i}" for i in range(1, forecast_horizon + 1)]

# Split data (80% train, 20% test)
train_size = int(len(qdb_query) * 0.8)
train, test = qdb_query.iloc[:train_size], qdb_query.iloc[train_size:]

X_train, y_train = train[feature_cols], train[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]


In [50]:
y_train[:5]

Unnamed: 0,volatility_t+1,volatility_t+2,volatility_t+3,volatility_t+4,volatility_t+5
0,2.192834,2.17604,2.094456,1.715816,0.568607
1,2.17604,2.094456,1.715816,0.568607,3.615221
2,2.094456,1.715816,0.568607,3.615221,7.933682
3,1.715816,0.568607,3.615221,7.933682,10.649338
4,0.568607,3.615221,7.933682,10.649338,10.652977


In [None]:
mlflow.set_experiment(MLFLOW_VOLATILITY_EXP)
mlflow.autolog()

# Initialize XGBoost model for multi-output regression
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

# Train model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the mean absolute error for each forecast step
mae = mean_absolute_error(y_test, y_pred, multioutput="raw_values")
mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
r2 = r2_score(y_test, y_pred, multioutput="raw_values")

# Log model to MLflow
with mlflow.start_run():
    mlflow.log_param("model_type", "XGBoost")

    for i in range(forecast_horizon):
        mlflow.log_metric(f"mae_t_{i+1}", mae[i])
        mlflow.log_metric(f"mse_t_{i+1}", mse[i])
        mlflow.log_metric(f"r2_t_{i+1}", r2[i])

    # Log the model
    mlflow.xgboost.log_model(model, "model")

print("✅ XGBoost model logged in MLflow with additional metrics.")


2025/04/23 13:06:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/04/23 13:06:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/04/23 13:06:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd9c19458650e48e7be16fdda2915e80d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run zealous-rook-878 at: http://localhost:5000/#/experiments/8/runs/d9c19458650e48e7be16fdda2915e80d
🧪 View experiment at: http://localhost:5000/#/experiments/8


  self.get_booster().save_model(fname)


🏃 View run luminous-lynx-156 at: http://localhost:5000/#/experiments/8/runs/26bb138a2942468aa962cdfb3fe18fe2
🧪 View experiment at: http://localhost:5000/#/experiments/8
✅ XGBoost model logged in MLflow with additional metrics.


In [52]:
y_pred[:10]

array([[ 0.07208627,  1.7114697 ,  4.794424  ,  6.696764  ,  9.137858  ],
       [ 4.273512  , 11.095244  ,  5.7864704 ,  4.544905  ,  3.1567729 ],
       [ 9.565688  , 14.237567  ,  5.488217  ,  3.1873739 ,  3.2001863 ],
       [ 9.176313  ,  6.456727  ,  7.1342897 , 12.056192  , 11.511452  ],
       [11.107666  ,  7.4208627 ,  4.773424  ,  3.6282575 ,  4.5347204 ],
       [ 9.399484  ,  3.4167745 ,  4.1402893 ,  3.4696598 ,  6.9704    ],
       [ 6.5889134 ,  7.126628  ,  4.1746264 ,  3.3947904 ,  9.315539  ],
       [ 8.5348835 ,  6.364243  ,  6.2591496 , 10.857854  ,  5.8116775 ],
       [ 4.9401855 ,  5.1113796 ,  3.6493208 ,  3.1801138 ,  5.863999  ],
       [ 7.5273027 ,  6.874158  ,  4.8317933 ,  2.9718885 ,  3.3149648 ]],
      dtype=float32)

In [None]:
model = load_best_model(MLFLOW_VOLATILITY_EXP)

Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00,  2.08it/s]


Loaded best model from experiment 'volatility-model', run ID: 26bb138a2942468aa962cdfb3fe18fe2


In [64]:
X_test[:5]

Unnamed: 0,bid,bid_qty,ask,ask_qty,last,volume,vwap,low,high,change,...,min_14,vwap_diff,bid_ask_spread,log_return,momentum,volatility,cumulative_volume,volume_change,mean_bid_qty,mean_ask_qty
116,88452.0,0.150914,88452.1,18.981612,88452.1,6720.613142,87625.9,86400.0,88800.6,781.1,...,88452.0,826.15,0.1,1e-06,0.05,0.067937,94112.939267,-0.098321,0.557329,16.504075
117,88446.1,0.006443,88446.2,17.755684,88446.2,5040.536276,87626.0,86400.0,88800.6,803.2,...,88446.1,822.1,2.066667,-6.7e-05,-3.933333,1.576079,70573.971134,0.047103,0.179512,18.143715
118,88407.0,0.227306,88407.1,11.53898,88407.0,8401.464753,87626.06,86400.0,88800.6,764.0,...,88407.0,792.36,6.5,-0.000443,-27.26,5.903274,117618.77534,0.038343,0.161438,18.186869
119,88433.9,10.802446,88434.0,0.144398,88434.0,5046.068117,87626.9,86400.0,88800.6,791.0,...,88407.0,797.733333,8.666667,0.000305,6.933333,9.785583,70577.531171,0.817507,1.003774,16.200814
120,88380.7,0.363518,88380.8,14.665756,88380.7,23563.94276,87627.407143,86400.0,88800.6,737.8,...,88380.7,786.614286,3.921429,-0.000603,-10.042857,9.78999,329689.848916,0.033691,1.206917,15.632782


### Flask test

In [6]:
import requests

response = requests.post("http://localhost:5050/retrain")


In [5]:
import requests
import numpy as np

# Define your endpoint
url = "http://localhost:5050/predict_bid"  # or /predict_volatility

# Define your test input (random or example data)
feature_cols = ['bid', 'bid_qty', 'ask', 'ask_qty', 'last', 'volume', 'vwap', 'low', 'high', 'change', 'change_pct',
                'ma_5', 'ma_14', 'ema_5', 'std_14', 'price_change', 'price_change_pct', 'max_14', 'min_14',
                'vwap_diff', 'bid_ask_spread', 'log_return', 'momentum', 'volatility', 'cumulative_volume',
                'volume_change', 'mean_bid_qty', 'mean_ask_qty']

print("Length of feature_cols:", len(feature_cols))

# Create dummy values for the test (just for testing format)
dummy_data = [0.1 * i for i in range(len(feature_cols))]

# Payload must be in correct structure
payload = {
    "data": dummy_data  # no need to send columns if model expects array
}

# Send POST request
response = requests.post(url, json=payload)

# Print response
print("Status Code:", response.status_code)
print("Response JSON:", response.json())


Length of feature_cols: 28
Status Code: 200
Response JSON: {'prediction': [[88037.9765625, 88069.8515625, 88083.6328125, 88070.7109375, 88048.1171875]], 'task': 'bid-price-model'}
