In [None]:
# Only run this cell when in Google Colab
# Initialises a git repository in the current working directory and checks out
# the `main` branch of the `origin` remote (presumably so that the `src` and
# `config` modules imported below become available — confirm).
! git init
# NOTE(review): no repository URL follows `origin` here; `git remote add`
# requires one, so fill in the project URL before running this cell.
! git remote add origin 
! git fetch
! git checkout -t origin/main

In [47]:
from src.process import read_and_split_data, subsample_features
from src.model import train_candidates, cluster_predictions, compute_cluster_representatives, root_mean_square_error, plot_clustering, get_best_num_of_clusters, save_validation_predictions
from src.drift import PageHinkley, DataDrift
from src.predict import predict_n_steps_for_ensemble, predict_one_step_for_ensemble, get_weights, final_prediction_ensemble
import os
import pandas as pd
from config import EXPERIMENT_NAME, VALIDATION_WINDOW_SIZE, WEIGHTS_WINDOW_SIZE, EVALUATION_WINDOW, TARGET_INDEX, SUBSET_SIZE, K, MODEL

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Download the preprocessed CSV into data/<EXPERIMENT_NAME>/ (IPython expands
# {EXPERIMENT_NAME} from the Python namespace before running the command).
# -q silences wget except for the progress bar enabled by --show-progress;
# -P sets the target directory.
! wget -q --show-progress -P data/{EXPERIMENT_NAME}/ https://www.dropbox.com/s/ga07hldz2rizkuu/NEW-DATA-1.T15-PREPROCESSED.csv

In [48]:
# Load the preprocessed dataset of this experiment and split it into the
# train / validation / test partitions used throughout the notebook.
dataset_file = EXPERIMENT_NAME + '/NEW-DATA-1.T15-PREPROCESSED.csv'
train, val, test = read_and_split_data(
    dataset_file,
    val_size=VALIDATION_WINDOW_SIZE,
)

In [49]:
# Hyperparameters: lower-case notebook aliases for the values imported from `config`.
target_index, subset_size, k, model = TARGET_INDEX, SUBSET_SIZE, K, MODEL

In [50]:
# Draw the feature subsets for the candidate models; later cells treat
# `len(sample_subsets)` as the number of candidate models.
# NOTE(review): presumably yields `k` subsets of `subset_size` features each,
# excluding the target column — confirm against src.process.
sample_subsets = subsample_features(train, target_index, subset_size, k)

In [141]:
# Train one candidate model per feature subset and report its validation error.
# NOTE(review): the stored output of this cell ends in a KeyboardInterrupt, so
# the recorded run did not train every candidate; later cells read
# `validation_predictions_init.csv` from disk — presumably produced by a
# previous complete run of this step (confirm).
train_candidates(train, val, target_index, sample_subsets, model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Validation error for model 1 is 0.10140538985129192.




INFO:tensorflow:Assets written to: ram://141d9500-d783-45d6-9aff-2fdf63bb3f19/assets


INFO:tensorflow:Assets written to: ram://141d9500-d783-45d6-9aff-2fdf63bb3f19/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Validation error for model 2 is 0.046742548150598646.




INFO:tensorflow:Assets written to: ram://f2be89ac-908c-49d3-9666-a8dd4f6538eb/assets


INFO:tensorflow:Assets written to: ram://f2be89ac-908c-49d3-9666-a8dd4f6538eb/assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 

In [52]:
# Absolute path (relative to the current working directory) of the CSV holding
# the initial validation predictions of this experiment.
VALIDATION_FILENAME = os.path.join(
    os.getcwd(),
    'experiments',
    EXPERIMENT_NAME,
    'validation_predictions_init.csv',
)

In [53]:
# Load the stored validation predictions from disk.
# NOTE(review): presumably one column per candidate model (the frame is
# transposed before clustering below) — confirm against the writer in src.model.
validation_pred = pd.read_csv(VALIDATION_FILENAME)

In [92]:
# Build the initial ensemble: choose the number of clusters, cluster the
# transposed validation predictions, then pick the cluster representatives.
validation_pred_t = validation_pred.transpose()
num_clusters = get_best_num_of_clusters(validation_pred_t)
cluster_result, cluster_centers = cluster_predictions(validation_pred_t, num_clusters)
ensemble = compute_cluster_representatives(
    cluster_result,
    cluster_centers,
    validation_pred,
)

Determined number of clusters: 9.0


In [97]:
# Drift detectors polled by the evaluation loop below:
# `ph` is fed the sliding-window prediction error, `hoeffding` the
# corresponding (features, target) window.
# NOTE(review): delta / threshold values are hard-coded here rather than taken
# from `config`; their tuning rationale is not visible in this notebook.
ph = PageHinkley(delta=0.005, threshold=0.025)
hoeffding = DataDrift(threshold=0.97)

In [98]:
# String identifiers "1" .. "N" of all candidate models, one per feature subset.
all_model_indices = [str(model_number) for model_number in range(1, len(sample_subsets) + 1)]

In [99]:
from src.model import save_validation_predictions

In [100]:
def compute_ensemble(X, running_version_name):
    """Rebuild the ensemble from fresh predictions of every candidate model.

    Reads the notebook-level names ``all_model_indices``, ``sample_subsets``
    and ``num_clusters``.

    Args:
        X: Data window handed to ``predict_n_steps_for_ensemble``.
        running_version_name: Accepted for the caller's bookkeeping; it is not
            used inside this function.

    Returns:
        list[str]: The cluster representatives, each converted with ``str``.
    """
    raw_predictions = predict_n_steps_for_ensemble(all_model_indices, X, sample_subsets)
    prediction_frame = pd.DataFrame(raw_predictions)
    # Relabel the columns 1..n instead of pandas' default 0..n-1.
    prediction_frame.columns = range(1, prediction_frame.shape[1] + 1)

    # NOTE(review): the initial ensemble clusters the *transposed* prediction
    # frame, whereas this call passes the frame as-is — confirm the orientation
    # returned by predict_n_steps_for_ensemble.
    clusters, centers = cluster_predictions(prediction_frame, num_clusters)
    representatives = compute_cluster_representatives(clusters, centers, prediction_frame)

    return [str(member) for member in representatives]

In [101]:
# Stack the validation rows on top of the test rows. The evaluation loop slices
# this frame with test-based positions, so position `i` here lies `len(val)`
# rows before test position `i`.
validation = pd.concat([val, test], axis=0)

In [102]:
# Online evaluation over the test set.
#
# For every step:
#   1. compute ensemble weights from the previous WEIGHTS_WINDOW_SIZE rows,
#   2. predict the next row with the weighted ensemble,
#   3. once EVALUATION_WINDOW predictions exist, score the latest window, feed
#      the error / data window to the drift detectors, and rebuild the ensemble
#      when either detector reports a change.
#
# NOTE: this cell rebinds `ensemble` and advances the state of `ph` and
# `hoeffding`, so re-running it without re-running the cells that create them
# does not start from the same state.
start_index = 0
test_predictions = []
evaluation_sliding_window_start = 0
version = 1
while start_index < (len(test) - WEIGHTS_WINDOW_SIZE):
    end_index = start_index + WEIGHTS_WINDOW_SIZE
    X_window = test.iloc[start_index:end_index, ]
    y_window = test.iloc[start_index:end_index, target_index]
    X = test.iloc[end_index, ]
    print(start_index)
    weights = get_weights(ensemble, X_window, y_window, sample_subsets)
    pred_at_t = final_prediction_ensemble(ensemble, X, sample_subsets, weights)
    test_predictions.append(pred_at_t)
    if len(test_predictions) >= EVALUATION_WINDOW:
        # Row bounds (in `test`) of the window whose predictions are scored now.
        eval_start = WEIGHTS_WINDOW_SIZE + evaluation_sliding_window_start
        eval_end = eval_start + EVALUATION_WINDOW
        # All columns except the target serve as features for the data-drift test.
        X = test.iloc[eval_start:eval_end, test.columns != test.columns[target_index]]
        y = test.iloc[eval_start:eval_end, target_index]
        y_predicted = test_predictions[evaluation_sliding_window_start:evaluation_sliding_window_start + EVALUATION_WINDOW]
        error = root_mean_square_error(y.to_numpy(), y_predicted)
        print(f'Error for window [{eval_start}:{eval_end}] is {error}')
        ph.add_element(error)
        print(f'Difference: {ph.sum - ph.minimum}')

        # Remember the minimum *before* adding the new window so both can be printed.
        glob_min = hoeffding.min
        hoeffding.add_element(X, y)
        print(f'Hoeffding global minimum: {glob_min}. Latest minimum: {hoeffding.min2}. Sample count: {hoeffding.sample_count}')

        # Poll each detector exactly once per step: the printed message and the
        # retraining decision then always agree, even if `detected_change()`
        # mutates or resets detector state when called.
        error_drift = ph.detected_change()
        data_drift = hoeffding.detected_change()
        if error_drift:
            print('Change in error detected.')
        if data_drift:
            print('Change in data detected')
        # Logical `or` instead of bitwise `|`: correct for any truthy return type.
        if error_drift or data_drift:
            X_val = validation.iloc[end_index:end_index + VALIDATION_WINDOW_SIZE, ]
            ensemble = compute_ensemble(X_val, version)
            version += 1
        evaluation_sliding_window_start += 1
    start_index = start_index + 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Error for window [8:28] is 0.0021491570942494187
Difference: 0.0
Hoeffding global minimum: None. Latest minimum: 0.44506516482639563. Sample count: 1
20
Error for window [9:29] is 0.0024175227060603635
Difference: 0.0
Hoeffding global minimum: 0.44506516482639563. Latest minimum: 0.43212374465918246. Sample count: 2
21
Error for window [10:30] is 0.002481800522189959
Difference: 0.0
Hoeffding global minimum: 0.44506516482639563. Latest minimum: 0.37966363522280117. Sample count: 3
22
Error for window [11:31] is 0.002426847872226473
Difference: 0.0
Hoeffding global minimum: 0.44506516482639563. Latest minimum: 0.25926848743362557. Sample count: 4
Change in data detected


TypeError: 'function' object cannot be interpreted as an integer

In [86]:
# Overall RMSE between the true target values after the initial weights window
# and the predictions collected by the evaluation loop.
# NOTE(review): requires the evaluation loop to have finished, so that
# `test_predictions` has one entry per row of `test.iloc[WEIGHTS_WINDOW_SIZE:]`;
# the stored output stems from an earlier execution (In [86]) than the loop
# cell above (In [102]) — re-run after the loop completes.
root_mean_square_error(test.iloc[WEIGHTS_WINDOW_SIZE:, target_index].to_numpy(), test_predictions)

0.01919480659591487

In [None]:
# Only run this cell when in Google Colab: stores the test predictions on the
# mounted Google Drive.
from google.colab import drive
drive.mount('/content/drive')

# f-string formatting instead of `+` concatenation, so non-string config values
# (e.g. a numeric K) do not raise a TypeError; for all-string values the
# resulting name is unchanged.
save_name = f'{EXPERIMENT_NAME}_{MODEL}_k{K}_results.csv'
path = f'/content/drive/MyDrive/Colab Notebooks/oafbs_results/{save_name}'
# `test_predictions` is a plain Python list (built with `.append`) and has no
# `.to_csv`; wrap it in a DataFrame before writing.
pd.DataFrame(test_predictions).to_csv(path)