In [1]:
# Only run this cell when in Google Colab
! git init
! git remote add origin https://github.com/hannamykula/ob-mts.git
! git fetch
! git checkout -t origin/main

Initialized empty Git repository in /content/.git/
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 80 (delta 43), reused 57 (delta 24), pack-reused 0[K
Unpacking objects: 100% (80/80), 662.01 KiB | 3.25 MiB/s, done.
From https://github.com/hannamykula/ob-mts
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


In [2]:
! pip install tslearn
! pip install scikit-multiflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tslearn
  Downloading tslearn-0.5.3.2-py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.2/358.2 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tslearn
Successfully installed tslearn-0.5.3.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-multiflow
  Downloading scikit_multiflow-0.5.3-cp38-cp38-manylinux2010_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-multiflow
Successfully installed scikit-multiflow-0.5.3


In [3]:
from src.process import read_and_split_data, subsample_features
from src.model import train_candidates, cluster_predictions, compute_cluster_representatives, root_mean_square_error, plot_clustering, get_best_num_of_clusters, save_validation_predictions
from src.drift import PageHinkley, DataDrift
from src.predict import predict_n_steps_for_ensemble, predict_one_step_for_ensemble, get_weights, final_prediction_ensemble
import os
import pandas as pd
from config import EXPERIMENT_NAME, VALIDATION_WINDOW_SIZE, WEIGHTS_WINDOW_SIZE, EVALUATION_WINDOW, TARGET_INDEX, SUBSET_SIZE, K, MODEL
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from statsmodels.tsa.api import VAR
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.utils import to_time_series
import numpy as np

%load_ext autoreload
%autoreload 2

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
! wget -q --show-progress -P data/{EXPERIMENT_NAME}/ https://www.dropbox.com/s/ckbcl6ztqclq57c/air-quality-2.csv



In [6]:
# Hyperparameters
target_index = TARGET_INDEX

In [7]:
train, val, test = read_and_split_data(EXPERIMENT_NAME + '/air-quality-2.csv', val_size=VALIDATION_WINDOW_SIZE)

In [8]:
y_train = train.iloc[:, target_index]
X_train = train.drop(train.columns[target_index], axis=1)
X_test = test.drop(test.columns[target_index], axis=1)
X_test = X_test.iloc[WEIGHTS_WINDOW_SIZE:, :]
y_test = test.iloc[WEIGHTS_WINDOW_SIZE:, target_index]

X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.fit_transform(X_test)

In [9]:
save_name = EXPERIMENT_NAME + '_Ground_truth.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(y_test).to_csv(path)

In [10]:
model_RF = RandomForestRegressor(max_depth=2, random_state=0)
model_RF.fit(X_train_scaled, y_train)
predictions_RF = model_RF.predict(X_test_scaled)

In [11]:
root_mean_square_error(y_test, predictions_RF)

4.1612229789169115

In [12]:
save_name = EXPERIMENT_NAME + '_Baseline_RF_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(predictions_RF).to_csv(path)

In [13]:
model_GBM = GradientBoostingRegressor()
model_GBM.fit(X_train_scaled, y_train)
predictions_GBM = model_GBM.predict(X_test_scaled)

In [14]:
root_mean_square_error(y_test, predictions_GBM)

3.0965644298785233

In [15]:
save_name = EXPERIMENT_NAME + '_Baseline_GBM_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(predictions_GBM).to_csv(path)

In [16]:
lag = 5
var_y_train = y_train.to_numpy()
var_y_train = np.reshape(var_y_train, (-1, 1))
var_train = np.concatenate((X_train_scaled, var_y_train), axis=1)
var = VAR(var_train)
model = var.fit(lag)

var_y_test = y_test.to_numpy()
var_y_test = np.reshape(var_y_test, (-1, 1))
var_test = np.concatenate((X_test_scaled, var_y_test), axis=1)

prediction_VAR = []
for row in var_test:
    new = np.concatenate((var_train, np.reshape(row, (1, -1))), axis = 0)
    pred_t = model.forecast(y=new[-lag:], steps=1)
    prediction_VAR.append(pred_t[:, pred_t.shape[1] - 1][0])

print(root_mean_square_error(y_test, prediction_VAR))

0.436054924937257


In [17]:
save_name = EXPERIMENT_NAME + '_Baseline_VAR_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_VAR).to_csv(path)

In [18]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
prediction_lasso = lasso.predict(X_test_scaled)

In [19]:
root_mean_square_error(y_test, prediction_lasso)

1.9220285477040229

In [20]:
save_name = EXPERIMENT_NAME + '_Baseline_Lasso_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_lasso).to_csv(path)

In [21]:
from src.oamts import select_relevant

# Is scaled
relevant_X_train, relevant_X_test = select_relevant(X_train_scaled, y_train, X_test_scaled, 10)
ph = PageHinkley(delta=0.005, threshold=0.025)
hoeffding = DataDrift(threshold=0.97)
X_full_scaled = np.concatenate((X_train_scaled, X_test_scaled), axis = 0)
y_full = np.concatenate((y_train.to_numpy(), y_test.to_numpy()), axis = 0)

X_train_last_i = X_train_scaled.shape[0] - 1


In [22]:
var_relevant_train = np.concatenate((relevant_X_train, var_y_train), axis=1)
var_relevant = VAR(var_relevant_train)
model_relevant = var_relevant.fit(lag)

var_relevant_test = np.concatenate((relevant_X_test, var_y_test), axis=1)

prediction_relevant_VAR = []
evaluation_sliding_window_start = 0
counter = 0
for row in var_relevant_test:
    new = np.concatenate((var_relevant_train, np.reshape(row, (1, -1))), axis = 0)
    pred_t = model_relevant.forecast(y=new[-lag:], steps=1)
    prediction_relevant_VAR.append(pred_t[:, pred_t.shape[1] - 1][0])
    if(len(prediction_relevant_VAR) >= EVALUATION_WINDOW):
        y = y_test[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        y_predicted = prediction_relevant_VAR[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        error = root_mean_square_error(y.to_numpy(), y_predicted)
        print(f'Error for window [{evaluation_sliding_window_start}:{evaluation_sliding_window_start+EVALUATION_WINDOW}] is {error}')
        ph.add_element(error)
        print(f'Difference: {ph.sum - ph.minimum}')

        glob_min = hoeffding.min
        hoeffding.add_element(pd.DataFrame(new[X_train_last_i+counter:,0:(row.shape[0] - 1)]), y)
        print(f'Hoeffding global minimum: {glob_min}. Latest minimum: {hoeffding.min2}. Sample count: {hoeffding.sample_count}')
        if ph.detected_change():
            print('Change in error detected.')
        if hoeffding.detected_change():
            print('Change in data detected')
        if ph.detected_change() | hoeffding.detected_change():
            relevant_X_train, relevant_X_test = select_relevant(X_full_scaled[:X_train_last_i+counter,], y_full[:X_train_last_i+counter,], X_full_scaled[X_train_last_i+counter+1:, ], 10)
            var_y_train = y_full[:X_train_last_i+counter,]
            var_y_train = np.reshape(var_y_train, (-1, 1))
            var_relevant_train = np.concatenate((relevant_X_train, var_y_train), axis=1)
            var_relevant = VAR(var_relevant_train)
            model_relevant = var_relevant.fit(5)
            print('Retrained')
        evaluation_sliding_window_start += 1
    counter += 1

Error for window [0:20] is 0.06459328038185069
Difference: 0.0
Hoeffding global minimum: None. Latest minimum: 0. Sample count: 1
Error for window [1:21] is 0.05575501929622416
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 2
Error for window [2:22] is 0.045913702954866485
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 3
Error for window [3:23] is 0.037465366383084904
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 4
Error for window [4:24] is 0.030624830944354503
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 5
Error for window [5:25] is 0.026085027424480288
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 6
Error for window [6:26] is 0.025503458753919954
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 7
Error for window [7:27] is 0.026590619790493493
Difference: 0.0
Hoeffding global minimum: 0. Latest min

In [23]:
print(root_mean_square_error(y_test, prediction_relevant_VAR))
save_name = EXPERIMENT_NAME + '_Baseline_Drift-aware_VAR_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_relevant_VAR).to_csv(path)

0.15304430265542845


In [24]:
from src.oamts import select_relevant
from src.model import cluster_predictions, compute_cluster_representatives
# Is scaled
relevant_X_train, relevant_X_test = select_relevant(X_train_scaled, y_train, X_test_scaled, 10)
labels, centers = cluster_predictions(relevant_X_train.transpose(), 3)
selected_ts = compute_cluster_representatives(labels, centers, relevant_X_train)

selected_ts_train = relevant_X_train.loc[:, selected_ts]
selected_ts_test = relevant_X_test.loc[:, selected_ts]

ph = PageHinkley(delta=0.005, threshold=0.025)
hoeffding = DataDrift(threshold=0.97)


In [25]:
var_y_train = y_train.to_numpy()
var_y_train = np.reshape(var_y_train, (-1, 1))

In [26]:
var_y_test = y_test.to_numpy()
var_y_test = np.reshape(var_y_test, (-1, 1))

In [27]:
var_relevant_train = np.concatenate((selected_ts_train, var_y_train), axis=1)
var_relevant = VAR(var_relevant_train)
model_relevant = var_relevant.fit(lag)

var_relevant_test = np.concatenate((selected_ts_test, var_y_test), axis=1)

prediction_selected_VAR = []
evaluation_sliding_window_start = 0
counter = 0
for row in var_relevant_test:
    new = np.concatenate((var_relevant_train, np.reshape(row, (1, -1))), axis = 0)
    pred_t = model_relevant.forecast(y=new[-lag:], steps=1)
    prediction_selected_VAR.append(pred_t[:, pred_t.shape[1] - 1][0])
    if(len(prediction_selected_VAR) >= EVALUATION_WINDOW):
        y = y_test[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        y_predicted = prediction_relevant_VAR[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        error = root_mean_square_error(y.to_numpy(), y_predicted)
        print(f'Error for window [{evaluation_sliding_window_start}:{evaluation_sliding_window_start+EVALUATION_WINDOW}] is {error}')
        ph.add_element(error)
        print(f'Difference: {ph.sum - ph.minimum}')

        glob_min = hoeffding.min
        hoeffding.add_element(pd.DataFrame(new[X_train_last_i+counter:,0:(row.shape[0] - 1)]), y)
        print(f'Hoeffding global minimum: {glob_min}. Latest minimum: {hoeffding.min2}. Sample count: {hoeffding.sample_count}')
        if ph.detected_change():
            print('Change in error detected.')
        if hoeffding.detected_change():
            print('Change in data detected')
        if ph.detected_change() | hoeffding.detected_change():
            relevant_X_train, relevant_X_test = select_relevant(X_full_scaled[:X_train_last_i+counter,], y_full[:X_train_last_i+counter,], X_full_scaled[X_train_last_i+counter+1:, ], 10)
            labels, centers = cluster_predictions(relevant_X_train.transpose(), 3)
            selected_ts = compute_cluster_representatives(labels, centers, relevant_X_train)

            selected_ts_train = relevant_X_train.loc[:, selected_ts]
            selected_ts_test = relevant_X_test.loc[:, selected_ts]

            var_y_train = y_full[:X_train_last_i+counter,]
            var_y_train = np.reshape(var_y_train, (-1, 1))
            var_relevant_train = np.concatenate((selected_ts_train, var_y_train), axis=1)
            var_y_test = y_full[X_train_last_i+counter+1:,]
            var_y_test = np.reshape(var_y_test, (-1, 1))
            var_relevant_test = np.concatenate([selected_ts_test, var_y_test], axis=1)
            var_relevant = VAR(var_relevant_train)
            model_relevant = var_relevant.fit(5)
            print('Retrained')
        evaluation_sliding_window_start += 1
    counter += 1

Error for window [0:20] is 0.06459328038185069
Difference: 0.0
Hoeffding global minimum: None. Latest minimum: 0. Sample count: 1
Error for window [1:21] is 0.05575501929622416
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 2
Error for window [2:22] is 0.045913702954866485
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 3
Error for window [3:23] is 0.037465366383084904
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 4
Error for window [4:24] is 0.030624830944354503
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 5
Error for window [5:25] is 0.026085027424480288
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 6
Error for window [6:26] is 0.025503458753919954
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 7
Error for window [7:27] is 0.026590619790493493
Difference: 0.0
Hoeffding global minimum: 0. Latest min

In [28]:
print(root_mean_square_error(y_test, prediction_selected_VAR))
save_name = EXPERIMENT_NAME + '_Baseline_OAMTS_VAR_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_selected_VAR).to_csv(path)

0.10962557338968598


In [None]:
# Hyperparameters
target_index = TARGET_INDEX
subset_size = SUBSET_SIZE
k = K
model = MODEL
sample_subsets = subsample_features(train, target_index, subset_size, k)
train_candidates(train, val, target_index, sample_subsets, model)
VALIDATION_FILENAME = os.path.join(os.getcwd(), 'experiments', EXPERIMENT_NAME, 'validation_predictions_init.csv')
validation_pred = pd.read_csv(VALIDATION_FILENAME)
ensemble = list(map(str, list(range(1, len(sample_subsets)+1))))
validation = pd.concat([val, test], axis=0)

start_index = 0
test_predictions = []
evaluation_sliding_window_start = 0
version = 1
while start_index < (len(test) - WEIGHTS_WINDOW_SIZE):
    end_index = start_index + WEIGHTS_WINDOW_SIZE
    X_window = test.iloc[start_index:end_index, ]
    y_window = test.iloc[start_index:end_index, target_index]
    X = test.iloc[end_index, ]
    print(start_index)
    weights = get_weights(ensemble, X_window, y_window, sample_subsets)
    pred_at_t = final_prediction_ensemble(ensemble, X, sample_subsets, weights)
    test_predictions.append(pred_at_t)
    start_index = start_index + 1

In [None]:
print(root_mean_square_error(y_test, test_predictions))
save_name = EXPERIMENT_NAME + '_Bagging' + MODEL + '_k' + str(K) + '_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(test_predictions).to_csv(path)

In [None]:
# Hyperparameters
target_index = TARGET_INDEX
subset_size = SUBSET_SIZE
k = K
model = MODEL
sample_subsets = subsample_features(train, target_index, subset_size, k)
train_candidates(train, val, target_index, sample_subsets, model)
VALIDATION_FILENAME = os.path.join(os.getcwd(), 'experiments', EXPERIMENT_NAME, 'validation_predictions_init.csv')
validation_pred = pd.read_csv(VALIDATION_FILENAME)
validation = pd.concat([val, test], axis=0)

num_clusters = get_best_num_of_clusters(validation_pred.transpose(), range(2,20))
cluster_result, cluster_centers = cluster_predictions(validation_pred.transpose(), num_clusters)
ensemble = compute_cluster_representatives(cluster_result, cluster_centers, validation_pred)

start_index = 0
test_predictions = []
evaluation_sliding_window_start = 0
version = 1
while start_index < (len(test) - WEIGHTS_WINDOW_SIZE):
    end_index = start_index + WEIGHTS_WINDOW_SIZE
    X_window = test.iloc[start_index:end_index, ]
    y_window = test.iloc[start_index:end_index, target_index]
    X = test.iloc[end_index, ]
    print(start_index)
    weights = get_weights(ensemble, X_window, y_window, sample_subsets)
    pred_at_t = final_prediction_ensemble(ensemble, X, sample_subsets, weights)
    test_predictions.append(pred_at_t)
    start_index = start_index + 1

In [None]:
print(root_mean_square_error(y_test, test_predictions))
save_name = EXPERIMENT_NAME + '_Bagging_Clustering' + MODEL + '_k' + str(K) + '_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(test_predictions).to_csv(path)