In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
# (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/eFormer/src

/content/drive/MyDrive/eFormer/src


In [None]:
!pip install GPUtil
!pip install memory_profiler

# standard
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from math import sqrt
import time

# reading data
import os
import json
from collections import defaultdict

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
from torch.nn import DataParallel
import torch.nn.functional as F
from torch.fft import rfft, irfft, fftn, ifftn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torch.optim import AdamW

# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# resources
import time
import psutil
import GPUtil
import threading
import gc

# eFormer
from eFormer.embeddings import Encoding, ProbEncoding, PositionalEncoding
from eFormer.sparse_attention import ProbSparseAttentionModule, DetSparseAttentionModule
from eFormer.loss_function import CRPS, weighted_CRPS
from eFormer.sparse_decoder import DetSparseDecoder, ProbSparseDecoder
from eFormer.Dataloader import TimeSeriesDataProcessor

# transformer Benchmarks
from Benchmarks.Benchmarks import VanillaTransformer, Informer

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7394 sha256=7e86948fb23b16bf66671ad5706ecb4cc7b38bdd61fdbda434156bf48a5e283c
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
class WindTurbineDataProcessor:
    """Load and clean the per-turbine CSV exports of one wind farm.

    Files are read from ``../data/Windturbinen/<turbine_directory>/``:
    ``Turbine_Data_*.csv`` holds the measurements and ``Status_*.csv`` holds
    status events. ``dependent_var`` names the column whose gaps are repaired
    by ``check_missing_sequences``.
    """

    # A run of at least this many consecutive missing target values is
    # removed from the frame instead of being interpolated.
    LONG_GAP_LENGTH = 19

    def __init__(self, turbine_directory, dependent_var='Power (kW)'):
        self.directory = f'../data/Windturbinen/{turbine_directory}/'
        self.dependent_var = dependent_var

    def safe_datetime_conversion(self, s):
        """Return ``pd.to_datetime(s)``, or ``pd.NaT`` when ``s`` cannot be parsed."""
        try:
            return pd.to_datetime(s)
        except Exception:
            # Best-effort parsing: any conversion error maps to NaT.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            return pd.NaT

    def days_since_last_maintenance(self, row_date, maintenance_dates):
        """Return whole days between ``row_date`` and the latest earlier maintenance date.

        ``None`` entries are skipped. Returns NaN when no maintenance date
        lies on or before ``row_date``.
        """
        preceding_maintenance_dates = [
            date for date in maintenance_dates
            if date is not None and date <= row_date
        ]
        if not preceding_maintenance_dates:
            return float('NaN')
        return (row_date - max(preceding_maintenance_dates)).days

    def check_missing_sequences(self, df):
        """Repair gaps in ``self.dependent_var``; ``df`` is modified in place and returned.

        Runs of fewer than ``LONG_GAP_LENGTH`` consecutive missing values are
        filled by linear interpolation. Rows belonging to longer runs are
        dropped. Infinite values are treated as missing when interpolating,
        as before.

        The rows to drop are identified by their actual index labels, so the
        index does not need to be a gap-free 10-minute grid (the previous
        implementation rebuilt the labels with ``pd.date_range`` and raised a
        ``KeyError`` whenever a timestamp was absent inside a gap).
        """
        missing = df[self.dependent_var].isna().to_numpy()
        if missing.size == 0:
            return df

        # Give every maximal run of equal "missing" flags its own id, then
        # look up the length of the run each row belongs to.
        run_starts = np.r_[True, missing[1:] != missing[:-1]]
        run_ids = np.cumsum(run_starts)
        run_lengths = np.bincount(run_ids)[run_ids]
        long_gap_labels = df.index[missing & (run_lengths >= self.LONG_GAP_LENGTH)]

        # Interpolate first so short gaps next to a long gap are filled from
        # the same neighbours as before; the long-gap rows are removed after.
        df[self.dependent_var] = (
            df[self.dependent_var].replace(np.inf, np.nan).interpolate(method='linear')
        )
        df.drop(long_gap_labels, inplace=True)
        return df

    def process_and_load_data(self):
        """Read all CSV files of the farm and return one cleaned frame per turbine.

        Returns:
            A ``defaultdict`` mapping the turbine number (a string taken from
            the file name) to a DataFrame indexed by ``'# Date and time'``
            with the columns ``'Wind speed (m/s)'`` and ``'Power (kW)'``,
            sorted by time and passed through ``check_missing_sequences``.

        Side effect:
            The 'Scheduled Maintenance' timestamps parsed from the status
            files are stored per turbine in ``self.maintenance_dates``
            (they were previously computed and then discarded).
        """
        turbine_dataframes = defaultdict(list)
        status_lists = defaultdict(list)

        columns_turbine = ['# Date and time', 'Wind speed (m/s)', 'Power (kW)']
        columns_status = ['Timestamp end', 'IEC category']

        # List the directory once and split it into the two file families.
        filenames = os.listdir(self.directory)
        turbine_files = [f for f in filenames if f.startswith("Turbine_Data_") and f.endswith(".csv")]
        status_files = [f for f in filenames if f.startswith("Status_") and f.endswith(".csv")]

        for filename in tqdm(status_files, desc='Processing status files'):
            # The turbine number is the third "_"-separated token of a status file name.
            turbine_number = filename.split("_")[2]
            filepath = os.path.join(self.directory, filename)
            df = pd.read_csv(filepath, skiprows=9, usecols=columns_status)
            df['Timestamp end'] = df['Timestamp end'].apply(self.safe_datetime_conversion)
            is_maintenance = df['IEC category'] == 'Scheduled Maintenance'
            status_lists[turbine_number].extend(df.loc[is_maintenance, 'Timestamp end'].unique())
        self.maintenance_dates = dict(status_lists)

        for filename in tqdm(turbine_files, desc='Processing turbine files'):
            # The turbine number is the fourth "_"-separated token of a data file name.
            turbine_number = filename.split("_")[3]
            filepath = os.path.join(self.directory, filename)
            df = pd.read_csv(filepath, skiprows=9, usecols=columns_turbine)
            df['# Date and time'] = pd.to_datetime(df['# Date and time'])
            turbine_dataframes[turbine_number].append(df)

        for turbine_number, dfs in turbine_dataframes.items():
            # Replace the list of per-file frames with one time-indexed frame.
            combined = pd.concat(dfs).sort_values('# Date and time').set_index('# Date and time')
            self.check_missing_sequences(combined)
            turbine_dataframes[turbine_number] = combined

        gc.collect()
        return turbine_dataframes

def process_wind_turbines(turbine_directory, dependent_var):
    """Create a WindTurbineDataProcessor for one farm directory and return its loaded data."""
    return WindTurbineDataProcessor(turbine_directory, dependent_var).process_and_load_data()

# Load and clean both wind farms, using 'Power (kW)' as the target column.
# Both results are read later as globals by run_experiment().
Kelmarsh_dict = process_wind_turbines('Kelmarsh', 'Power (kW)')
Penmanshiel_dict = process_wind_turbines('Penmanshiel', 'Power (kW)')

Processing status files: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Processing turbine files: 100%|██████████| 36/36 [02:11<00:00,  3.66s/it]
Processing status files: 0it [00:00, ?it/s]
Processing turbine files: 100%|██████████| 84/84 [05:11<00:00,  3.70s/it]


# Hyperparameters

In [None]:
# Global settings shared by all experiments in this notebook.
# run_experiment() wraps this dict in Config(...) and hands it to the Informer
# benchmark, so the keys double as attribute names on that config object.
# run_experiment() sets 'step_forecast' and 'seq_len' itself from the forecast
# horizon, so the values given here act only as defaults.
hyperparameters = {
    'n_heads': 4,
    'ProbabilisticModel': False,
    # embeddings
    'len_embedding': 32,  # also used in the names of the result CSV files
    'batch_size': 512,
    # general training settings
    'pred_len': 1,
    'seq_len': 72,
    'patience': 7,  # passed to EarlyStopping
    'dropout': 0.05,
    'learning_rate': 6e-4,  # AdamW lr
    'WeightDecay': 1e-1,  # AdamW weight_decay
    'train_epochs': 100,  # upper bound; early stopping may end training sooner
    'num_workers': 10,
    'step_forecast': 6,
    # benchmark model settings (presumably consumed by the Informer implementation — TODO confirm)
    'factor': 1,
    'output_attention': True,
    'd_model': 32,
    'c_out': 6,
    'e_layers': 2,
    'd_layers': 2,
    'activation': 'relu',
    'd_ff': 1,
    'distil': True,
    }


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on GPU" if device.type == "cuda" else "Running on CPU")

Running on GPU


# Experiment Setup

In [None]:
def check_system_conditions():
    """Take one snapshot of CPU, RAM and GPU-memory usage.

    Returns:
        dict with the keys 'CPU Usage' (value of ``psutil.cpu_percent()``),
        'Memory Usage (GB)' (used RAM in bytes divided by 1024**3) and
        'GPU Usage (GB)' (``memoryUsed`` of the first GPU reported by GPUtil
        divided by 1024, or None when GPUtil reports no GPU). All numbers
        are rounded to four decimals.
    """
    cpu_load = round(psutil.cpu_percent(), 4)
    ram_used_gb = round(psutil.virtual_memory().used / (1024 ** 3), 4)

    try:
        first_gpu = GPUtil.getGPUs()[0]
    except IndexError:
        # GPUtil returned an empty list: no GPU to report on.
        gpu_used_gb = None
    else:
        gpu_used_gb = round(first_gpu.memoryUsed / 1024, 4)

    return {
        'CPU Usage': cpu_load,
        'Memory Usage (GB)': ram_used_gb,
        'GPU Usage (GB)': gpu_used_gb,
    }

In [None]:
class EarlyStopping:
    """Signal that training should stop once the validation loss stops improving.

    Call the instance with the validation loss after every epoch and read
    ``early_stop`` afterwards.
    """
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): Number of consecutive non-improving calls after which
                            ``early_stop`` is set to True.
                            Default: 7
            verbose (bool): Stored on the instance; this class currently prints nothing.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, val_loss):
        """Record one validation loss and update ``counter`` / ``early_stop``."""
        # Scores are negated losses, so a higher score is better.
        score = -val_loss

        if self.best_score is not None and score < self.best_score + self.delta:
            # No sufficient improvement over the best score seen so far.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # First call or an improvement: remember it and restart the patience counter.
        # val_loss_min is now also set on the very first call (it used to stay at inf).
        self.best_score = score
        self.val_loss_min = val_loss
        self.counter = 0

In [None]:
class UnifiedDataLoader:
    """Combine the per-turbine frames of two wind farms into train/test/eval DataLoaders.

    ``df_dict_1`` and ``df_dict_2`` map a key to a DataFrame; ``hyperparameters``
    must provide 'seq_len' and 'batch_size'.
    """

    def __init__(self, df_dict_1, df_dict_2, hyperparameters):
        self.df_dict_1, self.df_dict_2 = df_dict_1, df_dict_2
        self.hyperparameters = hyperparameters
        # Per-frame datasets are collected here until they are concatenated.
        self.train_datasets, self.test_datasets, self.eval_datasets = [], [], []

    def process_datasets(self, dataframe_dict):
        """Split every frame in ``dataframe_dict`` and append the three parts to the internal lists."""
        buckets = (self.train_datasets, self.test_datasets, self.eval_datasets)
        for frame in dataframe_dict.values():
            splitter = TimeSeriesDataProcessor(
                dataframe=frame,
                forecast=1,
                look_back=self.hyperparameters['seq_len'],
                batch_size=self.hyperparameters['batch_size'],
            )
            train_part, test_part, eval_part = splitter.prepare_datasets()
            for bucket, part in zip(buckets, (train_part, test_part, eval_part)):
                bucket.append(part)
            # Collect garbage after each frame to keep peak memory down.
            gc.collect()

    def create_concat_datasets(self):
        """Process both dictionaries and build one ConcatDataset per split."""
        for frames in (self.df_dict_1, self.df_dict_2):
            self.process_datasets(frames)

        self.concat_train_dataset = ConcatDataset(self.train_datasets)
        self.concat_test_dataset = ConcatDataset(self.test_datasets)
        self.concat_eval_dataset = ConcatDataset(self.eval_datasets)

        # The temporary lists are no longer needed once the concatenated datasets exist.
        for bucket in (self.train_datasets, self.test_datasets, self.eval_datasets):
            bucket.clear()
        gc.collect()

    def create_dataloaders(self):
        """Return ``(train_loader, test_loader, eval_loader)``; only the training loader shuffles."""
        self.create_concat_datasets()

        def build(dataset, shuffle):
            return DataLoader(dataset, batch_size=self.hyperparameters['batch_size'], shuffle=shuffle)

        self.train_loader = build(self.concat_train_dataset, True)
        self.test_loader = build(self.concat_test_dataset, False)
        self.eval_loader = build(self.concat_eval_dataset, False)

        gc.collect()

        return self.train_loader, self.test_loader, self.eval_loader


## Informer

In [None]:
class Config:
    """Expose the entries of a dictionary as attributes of an object."""

    def __init__(self, dictionary):
        # One attribute per key, e.g. {'d_model': 32} becomes config.d_model == 32.
        for name in dictionary:
            setattr(self, name, dictionary[name])

In [None]:
def multi_step_Informer(model, model_2, initial_input, steps):
    """
    Perform a recurrent multi-step forecast with two models.

    The input is split column-wise into two halves. On every step ``model_2``
    forecasts the next value of the first half (the original comment calls this
    the wind forecast), the first half is shifted by one position, and ``model``
    then forecasts the next value of the second half from the updated input.

    Parameters:
    - model: Callable returning ``(prediction, weights)`` for the full input.
    - model_2: Callable returning ``(prediction, _)`` for the first half of the input.
    - initial_input: The initial input to start the forecast. This should be a tensor
                     of shape (batch_size, seq_len * 2) based on your model definition.
    - steps: The number of steps to forecast ahead; must be at least 1.

    Returns:
    - A tuple ``(prediction, weights)`` holding the output of ``model`` for the
      final step only.

    Raises:
    - ValueError: if ``steps`` is smaller than 1 (there would be nothing to return).
    """
    if steps < 1:
        raise ValueError(f"steps must be at least 1, got {steps}")

    current_input = initial_input
    prediction, weights = None, None

    with torch.no_grad():
        for _ in range(steps):
            # split tensor
            half = current_input.shape[1] // 2
            first_half = current_input[:, :half]
            second_half = current_input[:, half:]

            # forecast the first half one step ahead and slide its window
            first_half_pred, _ = model_2(first_half)
            updated_first_half = torch.cat((first_half[:, 1:], first_half_pred), dim=1)

            # forecast the output from the updated first half and the current second half
            current_input = torch.cat((updated_first_half, second_half), dim=1)
            prediction, weights = model(current_input)

            # slide the second half and assemble the input for the next step
            updated_second_half = torch.cat((second_half[:, 1:], prediction), dim=1)
            current_input = torch.cat((updated_first_half, updated_second_half), dim=1)

    # Return the tensor itself. Wrapping it in the legacy torch.Tensor(...)
    # constructor fails for tensors that are not float32 CPU tensors.
    return prediction, weights

In [None]:
def run_experiment(loss_function_name, embedding_size, hyperparameters):
    """Train an Informer on both wind farms and evaluate a recursive multi-step forecast.

    Parameters:
    - loss_function_name: 'CRPS', 'MSELoss', 'L1Loss' or 'CrossEntropyLoss'.
    - embedding_size: Despite its name, this is the forecast horizon in steps
                      (the notebook passes ``step_forecast`` here). Supported
                      values are 1, 6, 72 and 144. The name is kept so existing
                      calls keep working.
    - hyperparameters: Dict of settings. It is copied, so the caller's dict is
                       not modified; 'step_forecast' and 'seq_len' are set in
                       the copy from the horizon.

    Side effects:
    - Reads the globals ``Kelmarsh_dict``, ``Penmanshiel_dict`` and ``device``.
    - Writes three CSV files to ``../data/`` (system usage, evaluation results,
      epoch durations).

    Raises:
    - ValueError: for an unknown loss function name or forecast horizon.
    """
    loss_classes = {
        'CRPS': CRPS,
        'MSELoss': nn.MSELoss,
        'L1Loss': nn.L1Loss,
        'CrossEntropyLoss': nn.CrossEntropyLoss,
    }
    if loss_function_name not in loss_classes:
        raise ValueError(f"Unknown loss function: {loss_function_name}")
    loss_fn = loss_classes[loss_function_name]()

    # Use the argument instead of the global loop variable of the same meaning,
    # so the function no longer depends on how the calling cell names its loop.
    step_forecast = embedding_size

    # Input window length used for each supported forecast horizon.
    seq_len_by_horizon = {1: 72, 6: 72, 72: 288, 144: 576}
    if step_forecast not in seq_len_by_horizon:
        raise ValueError(f"Unknown forecast horizon: {step_forecast}")

    # Work on a copy so repeated calls do not leak settings into the shared dict.
    hyperparameters = dict(hyperparameters)
    hyperparameters['step_forecast'] = step_forecast
    hyperparameters['seq_len'] = seq_len_by_horizon[step_forecast]

    # initiate model etc
    early_stopping = EarlyStopping(
        patience=hyperparameters['patience'],
        verbose=True
        )
    # `model` sees both halves of the input, `model_2` only the first half.
    model = Informer(
        configs = Config(hyperparameters),
        seq_len = (hyperparameters['seq_len'] * 2)
        ).to(device)
    model_2 = Informer(
        configs = Config(hyperparameters),
        seq_len = (hyperparameters['seq_len'])
        ).to(device)
    # NOTE(review): only `model` is optimised. `model_2` is never trained in this
    # function, so the test phase uses it with its initial weights — confirm
    # whether that is intended.
    optimizer = AdamW(
        params = model.parameters(),
        lr=hyperparameters['learning_rate'],
        weight_decay=hyperparameters['WeightDecay']
        )

    num_epochs = hyperparameters['train_epochs']

    # Background sampling of system usage; the event both stops the loop and
    # cuts the waiting time short so join() returns promptly.
    stop_monitoring = threading.Event()

    def monitor_system_usage(every_n_seconds, results_list):
        while not stop_monitoring.is_set():
            results_list.append(check_system_conditions())
            stop_monitoring.wait(every_n_seconds)

    system_usage_results = []
    training_time = []

    monitor_thread = threading.Thread(
        target=monitor_system_usage,
        args=(5, system_usage_results),
        daemon=True,
        )
    monitor_thread.start()

    try:
        loader = UnifiedDataLoader(
            df_dict_1=Kelmarsh_dict,
            df_dict_2=Penmanshiel_dict,
            hyperparameters=hyperparameters)
        train_loader, test_loader, eval_loader = loader.create_dataloaders()

        for epoch in range(num_epochs):
            epoch_start_time = time.time()

            # Training phase
            model.train()
            train_losses = []
            for features, labels in train_loader:
                features, labels = features.to(device), labels.to(device)
                optimizer.zero_grad()
                predictions, _ = model(features)
                loss = loss_fn(predictions, labels)
                loss.backward()
                optimizer.step()
                train_losses.append(loss.item())

            train_loss_avg = np.mean(train_losses)
            print(f"Epoch {epoch + 1} / {num_epochs} with Loss: {round(train_loss_avg, 6)}")

            # Validation phase
            model.eval()
            validation_losses = []
            with torch.no_grad():
                for features, labels in eval_loader:
                    features, labels = features.to(device), labels.to(device)
                    predictions, _ = model(features)
                    validation_losses.append(loss_fn(predictions, labels).item())

            val_loss_avg = np.mean(validation_losses)

            training_time.append(time.time() - epoch_start_time)

            early_stopping(val_loss_avg)
            if early_stopping.early_stop:
                print("Early stopping")
                break
    finally:
        # Always stop the monitor, also when loading or training raises;
        # otherwise the thread would keep sampling for the rest of the session.
        stop_monitoring.set()
        monitor_thread.join()

    # Convert the results list to a DataFrame
    system_usage = pd.DataFrame(system_usage_results)

    print(f"start of test phase: {time.ctime()}")

    # test data set
    test_losses = []
    predictions_collected = []
    groundtruth_collected = []

    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
        model_2 = DataParallel(model_2)

    horizon = hyperparameters['step_forecast']

    model.eval()
    model_2.eval()
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)

            # Perform the multi-step forecast
            predictions, weights = multi_step_Informer(model, model_2, features, horizon)

            # The forecast for sample i is compared with the label of sample
            # i + horizon of the same batch. A batch with no more than `horizon`
            # samples has no such pair; it is skipped for the loss because the
            # empty slice would otherwise produce a NaN loss.
            if predictions.shape[0] > horizon:
                test_loss = loss_fn(predictions[:-horizon], labels[horizon:])
                test_losses.append(test_loss.item())

            predictions_collected.extend(predictions.tolist())
            groundtruth_collected.extend(labels.tolist())

        test_loss_avg = np.mean(test_losses)

    print(f"\n Model test loss: {round(test_loss_avg,6)}")

    # Save Results
    df_eval = pd.DataFrame({
        'Predictions':predictions_collected,
        'GroundTruth':groundtruth_collected,
        'TestLoss':round(test_loss_avg,6)})
    df_epoch = pd.DataFrame({
        'Epoch Duration':training_time})

    # The loss name is already known, so it is used directly in the file names
    # instead of being recovered from str(loss_fn) with a regular expression.
    file_suffix = (
        f"Informer_emb{hyperparameters['len_embedding']}"
        f"_pred{hyperparameters['step_forecast']}_{loss_function_name}.csv"
    )
    system_usage.to_csv(f"../data/df_SystemUsage_{file_suffix}")
    df_eval.to_csv(f"../data/df_eval_{file_suffix}")
    df_epoch.to_csv(f"../data/df_epoch_{file_suffix}")

# Experiment grid: every loss function is combined with every forecast horizon.
loss_functions = ['L1Loss', 'MSELoss']
step_forecasts = [72]

for loss_function in loss_functions:
    for step_forecast in step_forecasts:
        # Prints the current combination as a tuple, e.g. ('L1Loss', 72).
        print(f"{loss_function, step_forecast}")
        # The horizon is passed as the second positional argument,
        # which run_experiment names `embedding_size`.
        run_experiment(loss_function, step_forecast, hyperparameters)

# Release the Colab runtime once all experiments have finished
# (only reached if no experiment raised).
from google.colab import runtime
runtime.unassign()

('L1Loss', 72)
Epoch 1 / 100 with Loss: 438.902588
Epoch 2 / 100 with Loss: 170.61066
Epoch 3 / 100 with Loss: 106.747448
Epoch 4 / 100 with Loss: 99.33632
Epoch 5 / 100 with Loss: 98.113184
Epoch 6 / 100 with Loss: 98.267474
Epoch 7 / 100 with Loss: 98.822313
Epoch 8 / 100 with Loss: 99.545271
Epoch 9 / 100 with Loss: 100.216402
Epoch 10 / 100 with Loss: 100.886574
Epoch 11 / 100 with Loss: 101.459352
Epoch 12 / 100 with Loss: 101.893814
Early stopping
start of test phase: Thu Apr 25 14:28:22 2024
