In [None]:
# Only run this cell when in Google Colab
# Turns the current working directory into a git repository, registers the
# project's GitHub remote, fetches it, and checks out a local `main` branch
# that tracks `origin/main` (`-t`).
# NOTE(review): presumably this is what makes the `src` package and `config`
# module imported further down available in the Colab runtime — confirm.
! git init
! git remote add origin https://github.com/hannamykula/oafbs.git
! git fetch
! git checkout -t origin/main

In [None]:
! pip install tslearn
! pip install scikit-multiflow

In [149]:
from src.process import read_and_split_data, subsample_features
from src.model import train_candidates, cluster_predictions, compute_cluster_representatives, root_mean_square_error, plot_clustering, get_best_num_of_clusters, save_validation_predictions
from src.drift import PageHinkley, DataDrift
from src.predict import predict_n_steps_for_ensemble, predict_one_step_for_ensemble, get_weights, final_prediction_ensemble
import os
import pandas as pd
from config import EXPERIMENT_NAME, VALIDATION_WINDOW_SIZE, WEIGHTS_WINDOW_SIZE, EVALUATION_WINDOW, TARGET_INDEX, SUBSET_SIZE, K, MODEL

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Download the preprocessed dataset into data/<EXPERIMENT_NAME>/ (`-P` sets the
# target directory; `{EXPERIMENT_NAME}` is interpolated from the Python namespace,
# so the import cell must have been run first). `-q --show-progress` silences
# wget's log output except for the progress bar.
! wget -q --show-progress -P data/{EXPERIMENT_NAME}/ https://www.dropbox.com/s/ga07hldz2rizkuu/NEW-DATA-1.T15-PREPROCESSED.csv

In [150]:
# Load the dataset and split it into train / validation / test parts; the
# validation part is sized via VALIDATION_WINDOW_SIZE.
# NOTE(review): the path is relative to EXPERIMENT_NAME only — presumably
# read_and_split_data prepends the `data/` directory used by the download cell; confirm.
train, val, test = read_and_split_data(EXPERIMENT_NAME + '/NEW-DATA-1.T15-PREPROCESSED.csv', val_size=VALIDATION_WINDOW_SIZE)

In [151]:
# Hyperparameters: lowercase working aliases for the values imported from `config`.
target_index, subset_size, k, model = TARGET_INDEX, SUBSET_SIZE, K, MODEL

In [152]:
# Draw the feature subsets for the candidate models. The number of candidates
# is later derived from len(sample_subsets).
# NOTE(review): presumably `k` is the number of subsets and `subset_size` the
# number of features per subset — confirm against src.process.
sample_subsets = subsample_features(train, target_index, subset_size, k)

In [153]:
# Train the candidate models on the sampled feature subsets; the call prints a
# validation error per model.
# NOTE(review): the return value is discarded — the validation predictions read
# from disk in the following cells are presumably written by this call; confirm.
train_candidates(train, val, target_index, sample_subsets, model)

Validation error for model 1 is 0.03315993508413894.
Validation error for model 2 is 0.035828810072875854.
Validation error for model 3 is 0.03580723946696871.
Validation error for model 4 is 0.007776070689147423.
Validation error for model 5 is 0.022095195883565538.
Validation error for model 6 is 0.01586920853677343.
Validation error for model 7 is 0.009109451261225658.
Validation error for model 8 is 0.02626317780374889.
Validation error for model 9 is 0.016199688278780263.
Validation error for model 10 is 0.00777607068914739.


In [155]:
# Location of the initial validation predictions for this experiment, resolved
# against the current working directory (so the notebook must be run from the
# directory that contains `experiments/`).
VALIDATION_FILENAME = os.path.join(os.getcwd(), 'experiments', EXPERIMENT_NAME, 'validation_predictions_init.csv')

In [156]:
# Load the stored validation predictions.
# NOTE(review): presumably one column per candidate model — confirm the CSV layout.
validation_pred = pd.read_csv(VALIDATION_FILENAME)

In [169]:
# Build the initial ensemble: pick a cluster count from the candidates 2..9,
# cluster the transposed validation predictions, then select the cluster
# representatives from the untransposed frame.
validation_pred_t = validation_pred.transpose()
num_clusters = get_best_num_of_clusters(validation_pred_t, range(2, 10))
cluster_result, cluster_centers = cluster_predictions(validation_pred_t, num_clusters)
ensemble = compute_cluster_representatives(cluster_result, cluster_centers, validation_pred)

In [158]:
# Drift detectors consulted by the online evaluation loop:
#   - `ph` is fed the windowed prediction error,
#   - `hoeffding` is fed the feature/target evaluation window.
# NOTE(review): delta / threshold values are hard-coded here rather than taken
# from `config` like the other hyperparameters.
ph = PageHinkley(delta=0.005, threshold=0.025)
hoeffding = DataDrift(threshold=0.97)

In [159]:
# String labels "1".."N" for the candidate models, one per sampled feature subset.
all_model_indices = [str(model_number) for model_number in range(1, len(sample_subsets) + 1)]

In [167]:
def compute_ensemble(X, running_version_name):
    """Re-select the ensemble members from fresh candidate predictions on ``X``.

    Every candidate model predicts over ``X``; the predictions are clustered
    into ``num_clusters`` groups and the cluster representatives are returned.

    Relies on the notebook-level names ``all_model_indices``, ``sample_subsets``
    and ``num_clusters``, so the cells defining them must have been run first.

    Args:
        X: data window handed to ``predict_n_steps_for_ensemble``.
        running_version_name: currently unused; kept so existing callers
            (which pass a version counter) keep working.

    Returns:
        list[str]: the selected representatives, converted to strings.

    NOTE(review): here the untransposed frame is clustered and the transposed
    one is passed to ``compute_cluster_representatives`` — the reverse of the
    initial-ensemble cell. Confirm the orientation returned by
    ``predict_n_steps_for_ensemble`` makes both call sites equivalent.
    """
    candidate_predictions = pd.DataFrame(
        predict_n_steps_for_ensemble(all_model_indices, X, sample_subsets)
    )
    # Relabel the columns 1..n instead of the default 0..n-1.
    candidate_predictions.columns = range(1, candidate_predictions.shape[1] + 1)

    labels, centers = cluster_predictions(candidate_predictions, num_clusters)
    representatives = compute_cluster_representatives(
        labels, centers, candidate_predictions.transpose()
    )

    return [str(member) for member in representatives]

In [161]:
# Validation rows followed by test rows in a single frame. The online loop
# slices this frame positionally to get the window used to rebuild the ensemble
# after a detected drift.
validation = pd.concat([val, test], axis=0)

In [168]:
# Online evaluation over the test set.
#
# For every position, the preceding WEIGHTS_WINDOW_SIZE rows are used to weight
# the current ensemble members, and the weighted ensemble predicts the next row.
# Once at least EVALUATION_WINDOW predictions exist, the RMSE over a sliding
# evaluation window is fed to the Page-Hinkley detector and the raw window to
# the data-drift detector; if either reports a change, the ensemble is rebuilt.
#
# NOTE(review): `ph`, `hoeffding` and `ensemble` are created in earlier cells and
# mutated here, so re-running this cell alone continues from their previous
# state — re-run those cells first for a clean experiment.
test_predictions = []
evaluation_sliding_window_start = 0
version = 1
for start_index in range(len(test) - WEIGHTS_WINDOW_SIZE):
    end_index = start_index + WEIGHTS_WINDOW_SIZE
    X_window = test.iloc[start_index:end_index, ]
    y_window = test.iloc[start_index:end_index, target_index]
    X = test.iloc[end_index, ]  # the single row to predict
    print(start_index)
    weights = get_weights(ensemble, X_window, y_window, sample_subsets)
    pred_at_t = final_prediction_ensemble(ensemble, X, sample_subsets, weights)
    test_predictions.append(pred_at_t)

    if len(test_predictions) >= EVALUATION_WINDOW:
        # test_predictions[i] is the prediction for test row WEIGHTS_WINDOW_SIZE + i,
        # hence the offset between the row window and the prediction window.
        eval_start = WEIGHTS_WINDOW_SIZE + evaluation_sliding_window_start
        eval_end = eval_start + EVALUATION_WINDOW
        is_feature_column = test.columns != test.columns[target_index]
        X_eval = test.iloc[eval_start:eval_end, is_feature_column]
        y_eval = test.iloc[eval_start:eval_end, target_index]
        y_predicted = test_predictions[
            evaluation_sliding_window_start:evaluation_sliding_window_start + EVALUATION_WINDOW
        ]

        error = root_mean_square_error(y_eval.to_numpy(), y_predicted)
        print(f'Error for window [{eval_start}:{eval_end}] is {error}')
        ph.add_element(error)
        print(f'Difference: {ph.sum - ph.minimum}')

        # Read the global minimum before the update so the log shows the value
        # the new window was compared against.
        glob_min = hoeffding.min
        hoeffding.add_element(X_eval, y_eval)
        print(f'Hoeffding global minimum: {glob_min}. Latest minimum: {hoeffding.min2}. Sample count: {hoeffding.sample_count}')

        # Query each detector exactly once and combine with logical `or`, so the
        # log messages and the rebuild decision are guaranteed to agree.
        error_drift = ph.detected_change()
        data_drift = hoeffding.detected_change()
        if error_drift:
            print('Change in error detected.')
        if data_drift:
            print('Change in data detected')
        if error_drift or data_drift:
            # `validation` is val followed by test; assuming val holds
            # VALIDATION_WINDOW_SIZE rows, this slice is the
            # VALIDATION_WINDOW_SIZE rows immediately preceding test row end_index.
            X_val = validation.iloc[end_index:end_index + VALIDATION_WINDOW_SIZE, ]
            ensemble = compute_ensemble(X_val, version)
            version += 1
        evaluation_sliding_window_start += 1

0


ZeroDivisionError: division by zero

In [86]:
# Overall RMSE of the one-step-ahead predictions. The first WEIGHTS_WINDOW_SIZE
# test rows are skipped because the first prediction is made for row
# WEIGHTS_WINDOW_SIZE (earlier rows only serve as the initial weighting window).
root_mean_square_error(test.iloc[WEIGHTS_WINDOW_SIZE:, target_index].to_numpy(), test_predictions)

0.01919480659591487

In [None]:
# Only run this cell when in Google Colab: persists the test predictions to
# Google Drive.
from google.colab import drive
drive.mount('/content/drive')

# Build the file name with an f-string so non-string config values (e.g. an
# integer K) are formatted instead of raising TypeError on `str + int`.
save_name = f'{EXPERIMENT_NAME}_{MODEL}_k{K}_results.csv'
results_dir = '/content/drive/MyDrive/Colab Notebooks/oafbs_results'
os.makedirs(results_dir, exist_ok=True)  # to_csv does not create missing directories
path = os.path.join(results_dir, save_name)

# `test_predictions` is a plain Python list and has no `.to_csv`; wrap it in a
# DataFrame before writing.
pd.DataFrame(test_predictions, columns=['prediction']).to_csv(path)