In [None]:
# Only run this cell when in Google Colab
! git init
! git remote add origin https://github.com/hannamykula/ob-mts.git
! git fetch
! git checkout -t origin/main

In [1]:
from src.process import read_and_split_data, subsample_features
from src.model import train_candidates, cluster_predictions, compute_cluster_representatives, root_mean_square_error, plot_clustering, get_best_num_of_clusters, save_validation_predictions
from src.drift import PageHinkley, DataDrift
from src.predict import predict_n_steps_for_ensemble, predict_one_step_for_ensemble, get_weights, final_prediction_ensemble
import os
import pandas as pd
from config import EXPERIMENT_NAME, VALIDATION_WINDOW_SIZE, WEIGHTS_WINDOW_SIZE, EVALUATION_WINDOW, TARGET_INDEX, SUBSET_SIZE, K, MODEL
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from statsmodels.tsa.api import VAR
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.utils import to_time_series
import numpy as np

%load_ext autoreload
%autoreload 2



In [None]:
! wget -q --show-progress -P data/{EXPERIMENT_NAME}/ https://www.dropbox.com/s/ga07hldz2rizkuu/NEW-DATA-1.T15-PREPROCESSED.csv

In [2]:
train, val, test = read_and_split_data(EXPERIMENT_NAME + '/NEW-DATA-1.T15-PREPROCESSED.csv', val_size=VALIDATION_WINDOW_SIZE)

In [3]:
# Hyperparameters
target_index = TARGET_INDEX

In [4]:
y_train = train.iloc[:, target_index]
X_train = train.drop(train.columns[target_index], axis=1)
X_test = test.drop(test.columns[target_index], axis=1)
X_test = X_test.iloc[WEIGHTS_WINDOW_SIZE:, :]
y_test = test.iloc[WEIGHTS_WINDOW_SIZE:, target_index]

X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.fit_transform(X_test)

In [5]:
model_RF = RandomForestRegressor(max_depth=2, random_state=0)
model_RF.fit(X_train_scaled, y_train)
predictions_RF = model_RF.predict(X_test_scaled)

In [6]:
root_mean_square_error(y_test, predictions_RF)

0.021006391451659192

In [10]:
save_name = EXPERIMENT_NAME + '_Baseline_RF_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(predictions_RF).to_csv(path)

In [7]:
model_GBM = GradientBoostingRegressor()
model_GBM.fit(X_train_scaled, y_train)
predictions_GBM = model_GBM.predict(X_test_scaled)

In [8]:
root_mean_square_error(y_test, predictions_GBM)

0.031993608099984354

In [17]:
save_name = EXPERIMENT_NAME + '_Baseline_GBM_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(predictions_GBM).to_csv(path)

In [47]:
lag = 5
var_y_train = y_train.to_numpy()
var_y_train = np.reshape(var_y_train, (-1, 1))
var_train = np.concatenate((X_train_scaled, var_y_train), axis=1)
var = VAR(var_train)
model = var.fit(lag)

var_y_test = y_test.to_numpy()
var_y_test = np.reshape(var_y_test, (-1, 1))
var_test = np.concatenate((X_test_scaled, var_y_test), axis=1)

prediction_VAR = []
for row in var_test:
    new = np.concatenate((var_train, np.reshape(row, (1, -1))), axis = 0)
    pred_t = model.forecast(y=new[-lag:], steps=1)
    prediction_VAR.append(pred_t[:, pred_t.shape[1] - 1][0])

print(root_mean_square_error(y_test, prediction_VAR))

0.037039440830899106


In [48]:
save_name = EXPERIMENT_NAME + '_Baseline_VAR_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_VAR).to_csv(path)

In [13]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
prediction_lasso = lasso.predict(X_test_scaled)

In [16]:
root_mean_square_error(y_test, prediction_lasso)

0.015416762049896858

In [15]:
save_name = EXPERIMENT_NAME + '_Baseline_Lasso_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_lasso).to_csv(path)

In [104]:
from src.oamts import select_relevant

# Is scaled
relevant_X_train, relevant_X_test = select_relevant(X_train_scaled, y_train, X_test_scaled, 10)
ph = PageHinkley(delta=0.005, threshold=0.025)
hoeffding = DataDrift(threshold=0.97)
X_full_scaled = np.concatenate((X_train_scaled, X_test_scaled), axis = 0)
y_full = np.concatenate((y_train.to_numpy(), y_test.to_numpy()), axis = 0)

X_train_last_i = X_train_scaled.shape[0] - 1


In [108]:
var_relevant_train = np.concatenate((relevant_X_train, var_y_train), axis=1)
var_relevant = VAR(var_relevant_train)
model_relevant = var_relevant.fit(lag)

var_relevant_test = np.concatenate((relevant_X_test, var_y_test), axis=1)

prediction_relevant_VAR = []
evaluation_sliding_window_start = 0
counter = 0
for row in var_relevant_test:
    new = np.concatenate((var_relevant_train, np.reshape(row, (1, -1))), axis = 0)
    pred_t = model_relevant.forecast(y=new[-lag:], steps=1)
    prediction_relevant_VAR.append(pred_t[:, pred_t.shape[1] - 1][0])
    if(len(prediction_relevant_VAR) >= EVALUATION_WINDOW):
        y = y_test[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        y_predicted = prediction_relevant_VAR[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        error = root_mean_square_error(y.to_numpy(), y_predicted)
        print(f'Error for window [{evaluation_sliding_window_start}:{evaluation_sliding_window_start+EVALUATION_WINDOW}] is {error}')
        ph.add_element(error)
        print(f'Difference: {ph.sum - ph.minimum}')

        glob_min = hoeffding.min
        hoeffding.add_element(pd.DataFrame(new[X_train_last_i+counter:,0:(row.shape[0] - 1)]), y)
        print(f'Hoeffding global minimum: {glob_min}. Latest minimum: {hoeffding.min2}. Sample count: {hoeffding.sample_count}')
        if ph.detected_change():
            print('Change in error detected.')
        if hoeffding.detected_change():
            print('Change in data detected')
        if ph.detected_change() | hoeffding.detected_change():
            relevant_X_train, relevant_X_test = select_relevant(X_full_scaled[:X_train_last_i+counter,], y_full[:X_train_last_i+counter,], X_full_scaled[X_train_last_i+counter+1:, ], 10)
            var_y_train = y_full[:X_train_last_i+counter,]
            var_y_train = np.reshape(var_y_train, (-1, 1))
            var_relevant_train = np.concatenate((relevant_X_train, var_y_train), axis=1)
            var_relevant = VAR(var_relevant_train)
            model_relevant = var_relevant.fit(5)
            print('Retrained')
        evaluation_sliding_window_start += 1
    counter += 1

Error for window [0:20] is 0.005942775684815141
Difference: 0.0
Hoeffding global minimum: None. Latest minimum: 0. Sample count: 1
Error for window [1:21] is 0.005471509662480653
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 2
Error for window [2:22] is 0.005040217394383233
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 3
Error for window [3:23] is 0.004605146676355193
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 4
Error for window [4:24] is 0.004171169890967894
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 5
Error for window [5:25] is 0.0037660153755282137
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 6
Error for window [6:26] is 0.0033554130163701316
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 7
Error for window [7:27] is 0.0030081100070798554
Difference: 0.0
Hoeffding global minimum: 0. Lates

In [109]:
print(root_mean_square_error(y_test, prediction_relevant_VAR))
save_name = EXPERIMENT_NAME + '_Baseline_Drift-aware_VAR_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_relevant_VAR).to_csv(path)

0.008655665161746809

In [121]:
from src.oamts import select_relevant
from src.model import cluster_predictions, compute_cluster_representatives
# Is scaled
relevant_X_train, relevant_X_test = select_relevant(X_train_scaled, y_train, X_test_scaled, 10)
labels, centers = cluster_predictions(relevant_X_train.transpose(), 5)
selected_ts = compute_cluster_representatives(labels, centers, relevant_X_train)

selected_ts_train = relevant_X_train.loc[:, selected_ts]
selected_ts_test = relevant_X_test.loc[:, selected_ts]

ph = PageHinkley(delta=0.005, threshold=0.025)
hoeffding = DataDrift(threshold=0.97)


In [123]:
var_y_train = y_train.to_numpy()
var_y_train = np.reshape(var_y_train, (-1, 1))

In [126]:
var_y_test = y_test.to_numpy()
var_y_test = np.reshape(var_y_test, (-1, 1))

In [127]:
var_relevant_train = np.concatenate((selected_ts_train, var_y_train), axis=1)
var_relevant = VAR(var_relevant_train)
model_relevant = var_relevant.fit(lag)

var_relevant_test = np.concatenate((selected_ts_test, var_y_test), axis=1)

prediction_selected_VAR = []
evaluation_sliding_window_start = 0
counter = 0
for row in var_relevant_test:
    new = np.concatenate((var_relevant_train, np.reshape(row, (1, -1))), axis = 0)
    pred_t = model_relevant.forecast(y=new[-lag:], steps=1)
    prediction_selected_VAR.append(pred_t[:, pred_t.shape[1] - 1][0])
    if(len(prediction_selected_VAR) >= EVALUATION_WINDOW):
        y = y_test[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        y_predicted = prediction_relevant_VAR[evaluation_sliding_window_start:evaluation_sliding_window_start+EVALUATION_WINDOW]
        error = root_mean_square_error(y.to_numpy(), y_predicted)
        print(f'Error for window [{evaluation_sliding_window_start}:{evaluation_sliding_window_start+EVALUATION_WINDOW}] is {error}')
        ph.add_element(error)
        print(f'Difference: {ph.sum - ph.minimum}')

        glob_min = hoeffding.min
        hoeffding.add_element(pd.DataFrame(new[X_train_last_i+counter:,0:(row.shape[0] - 1)]), y)
        print(f'Hoeffding global minimum: {glob_min}. Latest minimum: {hoeffding.min2}. Sample count: {hoeffding.sample_count}')
        if ph.detected_change():
            print('Change in error detected.')
        if hoeffding.detected_change():
            print('Change in data detected')
        if ph.detected_change() | hoeffding.detected_change():
            relevant_X_train, relevant_X_test = select_relevant(X_full_scaled[:X_train_last_i+counter,], y_full[:X_train_last_i+counter,], X_full_scaled[X_train_last_i+counter+1:, ], 10)
            labels, centers = cluster_predictions(relevant_X_train.transpose(), 5)
            selected_ts = compute_cluster_representatives(labels, centers, relevant_X_train)

            selected_ts_train = relevant_X_train.loc[:, selected_ts]
            selected_ts_test = relevant_X_test.loc[:, selected_ts]

            var_y_train = y_full[:X_train_last_i+counter,]
            var_y_train = np.reshape(var_y_train, (-1, 1))
            var_relevant_train = np.concatenate((selected_ts_train, var_y_train), axis=1)
            var_relevant = VAR(var_relevant_train)
            model_relevant = var_relevant.fit(5)
            print('Retrained')
        evaluation_sliding_window_start += 1
    counter += 1

Error for window [0:20] is 0.005942775684815141
Difference: 0.0
Hoeffding global minimum: None. Latest minimum: 0. Sample count: 1
Error for window [1:21] is 0.005471509662480653
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 2
Error for window [2:22] is 0.005040217394383233
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 3
Error for window [3:23] is 0.004605146676355193
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 4
Error for window [4:24] is 0.004171169890967894
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 5
Error for window [5:25] is 0.0037660153755282137
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 6
Error for window [6:26] is 0.0033554130163701316
Difference: 0.0
Hoeffding global minimum: 0. Latest minimum: 0. Sample count: 7
Error for window [7:27] is 0.0030081100070798554
Difference: 0.0
Hoeffding global minimum: 0. Lates

In [128]:
print(root_mean_square_error(y_test, prediction_selected_VAR))
save_name = EXPERIMENT_NAME + '_Baseline_OAMTS_VAR_results.csv'
path = F'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# path = os.path.join('results\sml10-dataset', save_name)

pd.DataFrame(prediction_selected_VAR).to_csv(path)

0.027245661334215035
