In [17]:
import modules.data_preparation.v1.ETLData as dp
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
from menelaus.data_drift import KdqTreeStreaming
import warnings
warnings.filterwarnings('ignore')

from sktime import datasets
from sktime.forecasting import model_selection
from sktime.utils.plotting import plot_series
import plotly.graph_objects as go
import plotly.express as px

# KdqTree Drift Detection Model

## Data Cleaning:
- input > 0 AND output > 0

## Data:
- efficiency
- efficiency_over_input
- log(efficiency)
- log(efficiency_over_input)

## Notebook Goals:
- KdqTree hyperparameters (window and bootstrapping)
- KdqTree on efficiency
- KdqTree on efficiency_over_input
- Visualisation of Drift points Detected

### Prepare Dataset

In [9]:
dataset = dp.ETLData('../data/dataset1.csv')

In [10]:
efficiency_fn = (lambda e_in, e_out: e_out/e_in)
positive_fn = (lambda x: x>0)

In [11]:
(
dataset.to_numeric(['energy_input_in_mwh', 'energy_output_in_mwh'])
        .filter_by_column_value('energy_input_in_mwh', positive_fn)
        .filter_by_column_value('energy_output_in_mwh', positive_fn)
        .to_timeseries('timestamp_local')
        .compute_column(column_name='efficiency', fn_arg_col_names=['energy_input_in_mwh', 'energy_output_in_mwh'], fn_compute=efficiency_fn)
)

<modules.data_preparation.v1.ETLData.ETLData at 0x7fbb19504a90>

### Drift Detection KdqTree Algorithm

In [18]:
def drift_detection(data, window_size=10, alpha=0.05, bootstrap_samples=100, count_ubound=50):
    
    np.random.seed(1)
    det = KdqTreeStreaming(window_size=window_size, alpha=alpha, bootstrap_samples=bootstrap_samples, count_ubound=count_ubound)
    
    status = pd.DataFrame(columns=["timestamp", "energy_out_over_in_square", "drift_detected"])

    plot_data = {}
    
    for i in range(len(data)):
        det.update(data.iloc[[i]])
        #status.loc[i] = [i, data.iloc[i, 0], data.iloc[i, 1], det.drift_state]
        status.loc[i] = [data.index[i], data.iloc[i, 0], det.drift_state]
        if det.drift_state is not None:
            # capture the visualization data
            plot_data[i] = det.to_plotly_dataframe()
            
    return plot_data, status

### Test KdqTree Algorithm Hyperparameters on Efficiency

In [19]:
hyperparameters = []
drift_detection_status = []
for window_size in [10, 30, 90, 100, 150, 300]:
    for bootstrap_samples in [500]:
        plot_data, status = drift_detection(dataset.get_data()['efficiency'], window_size=window_size, bootstrap_samples=bootstrap_samples)
        drift_detection_status.append((window_size, bootstrap_samples, status))

IndexingError: Too many indexers