In [28]:
# standard
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from math import sqrt
import time
import gc

# reading data
import os
import json
from collections import defaultdict

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.fft import rfft, irfft, fftn, ifftn
from torch.utils.data import ConcatDataset, Dataset, DataLoader
from torch.optim import AdamW

# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# measuring resources
import time
import psutil
import GPUtil
import threading
from memory_profiler import profile

# eFormer
from eFormer.embeddings import Encoding, ProbEncoding, PositionalEncoding
from eFormer.sparse_attention import ProbSparseAttentionModule, DetSparseAttentionModule
from eFormer.loss_function import CRPS, weighted_CRPS
from eFormer.sparse_decoder import DetSparseDecoder, ProbSparseDecoder
from eFormer.Dataloader import TimeSeriesDataProcessor

In [29]:
# Global configuration for this notebook. The keys are grouped by the
# section comments below; the data-loading cells read 'seq_len' and
# 'batch_size' from this dict.
hyperparameters = dict(
    n_heads=4,
    ProbabilisticModel=True,
    # embeddings
    len_embedding=64,
    batch_size=512,
    # general
    pred_len=1,
    seq_len=72,
    patience=7,
    dropout=0.05,
    learning_rate=6e-4,
    WeightDecay=1e-1,
    train_epochs=2,
    num_workers=10,
    step_forecast=6,
    # benchmarks
    factor=1,
    output_attention=True,
    d_model=64,
    c_out=6,
    e_layers=2,
    d_layers=2,
    activation='relu',
    d_ff=1,
    distil=True,
)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [30]:
class WindTurbineDataProcessor:
    """Load and clean the per-turbine CSV exports of one wind farm.

    Files are read from ``../data/Windturbinen/<turbine_directory>/``.
    ``Turbine_Data_*.csv`` files provide the measurements and
    ``Status_*.csv`` files provide the status log from which the
    scheduled-maintenance timestamps are taken.

    Attributes:
        directory: Folder that is scanned for CSV files.
        dependent_var: Name of the column whose gaps are cleaned.
        maintenance_dates: ``{turbine number: [timestamps]}`` of scheduled
            maintenance; empty until ``process_and_load_data`` has run.
    """

    def __init__(self, turbine_directory, dependent_var='Power (kW)'):
        self.directory = f'../data/Windturbinen/{turbine_directory}/'
        self.dependent_var = dependent_var
        self.maintenance_dates = {}

    def safe_datetime_conversion(self, s):
        """Parse ``s`` with ``pd.to_datetime``; return ``pd.NaT`` if parsing fails."""
        try:
            return pd.to_datetime(s)
        except Exception:
            # Best-effort parsing of messy log entries. Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
            return pd.NaT

    def days_since_last_maintenance(self, row_date, maintenance_dates):
        """Return the whole days between ``row_date`` and the latest earlier maintenance.

        Args:
            row_date: Timestamp of the observation.
            maintenance_dates: Iterable of maintenance timestamps; ``None``
                entries are ignored, and NaT entries never compare as earlier.

        Returns:
            Number of days as an int, or NaN when no maintenance date lies
            on or before ``row_date``.
        """
        preceding = [
            date for date in maintenance_dates
            if date is not None and date <= row_date
        ]
        if not preceding:
            return float('NaN')
        return (row_date - max(preceding)).days

    def check_missing_sequences(self, df, long_gap_threshold=19):
        """Interpolate short gaps in the dependent variable and drop long ones.

        A gap is a run of consecutive rows whose dependent variable is null.
        Runs of at least ``long_gap_threshold`` rows are removed from ``df``;
        all shorter runs are filled by linear interpolation. The rows of a
        long gap are identified through the frame's own index labels, so the
        index does not need to follow a regular sampling interval.

        Args:
            df: Frame containing the ``self.dependent_var`` column. It is
                modified in place.
            long_gap_threshold: Minimum run length (in rows) that is dropped
                instead of interpolated.

        Returns:
            The same ``df`` object, after cleaning.
        """
        target = self.dependent_var
        if len(df) == 0:
            return df

        missing = df[target].isna().to_numpy()
        # A new run starts wherever the "is missing" flag flips.
        run_starts = np.r_[True, missing[1:] != missing[:-1]]
        run_ids = np.cumsum(run_starts) - 1
        # Length of the run each row belongs to.
        run_lengths = np.bincount(run_ids)[run_ids]
        in_long_gap = missing & (run_lengths >= long_gap_threshold)

        # Interpolate over the complete series first (long gaps included) so
        # the values filled into short gaps do not depend on which rows are
        # removed afterwards. +inf readings are treated as missing here.
        df[target] = df[target].replace(np.inf, np.nan).interpolate(method='linear')
        df.drop(index=df.index[in_long_gap], inplace=True)
        return df

    def process_and_load_data(self):
        """Read all CSV files of the wind farm and return cleaned per-turbine frames.

        Returns:
            A ``defaultdict`` mapping the turbine number (a string taken from
            the file name) to a DataFrame indexed by ``'# Date and time'``,
            sorted chronologically and cleaned by ``check_missing_sequences``.

        Side effects:
            Stores the scheduled-maintenance timestamps per turbine in
            ``self.maintenance_dates``.
        """
        turbine_dataframes = defaultdict(list)
        status_lists = defaultdict(list)

        columns_turbine = ['# Date and time', 'Wind speed (m/s)', 'Power (kW)']
        if self.dependent_var not in columns_turbine:
            # A custom dependent variable has to be loaded as well, otherwise
            # the gap cleaning below cannot find its column.
            columns_turbine.append(self.dependent_var)
        columns_status = ['Timestamp end', 'IEC category']

        csv_files = [f for f in os.listdir(self.directory) if f.endswith(".csv")]
        turbine_files = [f for f in csv_files if f.startswith("Turbine_Data_")]
        status_files = [f for f in csv_files if f.startswith("Status_")]

        for filename in tqdm(status_files, desc='Processing status files'):
            # The turbine number is the third underscore-separated token.
            turbine_number = filename.split("_")[2]
            filepath = os.path.join(self.directory, filename)
            # The first 9 lines are skipped (presumably a metadata preamble — confirm).
            status = pd.read_csv(filepath, skiprows=9, usecols=columns_status)
            status['Timestamp end'] = status['Timestamp end'].apply(self.safe_datetime_conversion)
            is_maintenance = status['IEC category'] == 'Scheduled Maintenance'
            maintenance_dates = status.loc[is_maintenance, 'Timestamp end'].unique()
            status_lists[turbine_number].extend(maintenance_dates)

        for filename in tqdm(turbine_files, desc='Processing turbine files'):
            # The turbine number is the fourth underscore-separated token.
            turbine_number = filename.split("_")[3]
            filepath = os.path.join(self.directory, filename)
            df = pd.read_csv(filepath, skiprows=9, usecols=columns_turbine)
            df['# Date and time'] = pd.to_datetime(df['# Date and time'])
            turbine_dataframes[turbine_number].append(df)

        for turbine_number, dfs in turbine_dataframes.items():
            combined = (
                pd.concat(dfs)
                .sort_values('# Date and time')
                .set_index('# Date and time')
            )
            # Cleans `combined` in place.
            self.check_missing_sequences(combined)
            turbine_dataframes[turbine_number] = combined

        # Keep the maintenance dates available instead of discarding them.
        self.maintenance_dates = dict(status_lists)

        gc.collect()
        return turbine_dataframes

def process_wind_turbines(turbine_directory, dependent_var='Power (kW)'):
    """Load and clean every turbine of one wind farm.

    Args:
        turbine_directory: Sub-folder of ``../data/Windturbinen/`` holding
            the farm's CSV files.
        dependent_var: Column whose gaps are cleaned; defaults to the same
            column as ``WindTurbineDataProcessor``.

    Returns:
        The mapping returned by
        ``WindTurbineDataProcessor.process_and_load_data``.
    """
    processor = WindTurbineDataProcessor(turbine_directory, dependent_var)
    return processor.process_and_load_data()

# Load both wind farms, using the turbine power output as the dependent variable.
Kelmarsh_dict = process_wind_turbines(turbine_directory='Kelmarsh', dependent_var='Power (kW)')
Penmanshiel_dict = process_wind_turbines(turbine_directory='Penmanshiel', dependent_var='Power (kW)')

Processing status files:   0%|          | 0/1 [00:00<?, ?it/s]

Processing status files: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
Processing turbine files: 100%|██████████| 36/36 [00:13<00:00,  2.59it/s]
Processing status files: 0it [00:00, ?it/s]
Processing turbine files: 100%|██████████| 84/84 [00:33<00:00,  2.50it/s]


In [31]:
class UnifiedDataLoader:
    """Combine the per-turbine frames of two wind farms into shared data loaders.

    Every DataFrame in the two input dictionaries is turned into
    train/test/eval datasets by ``TimeSeriesDataProcessor``. The datasets of
    each split are concatenated and wrapped in a ``DataLoader``.
    """

    def __init__(self, df_dict_1, df_dict_2, hyperparameters):
        self.df_dict_1, self.df_dict_2 = df_dict_1, df_dict_2
        self.hyperparameters = hyperparameters
        # Staging lists: filled by process_datasets, emptied again once the
        # concatenated datasets exist.
        self.train_datasets, self.test_datasets, self.eval_datasets = [], [], []

    def process_datasets(self, dataframe_dict):
        """Append the train/test/eval datasets of every frame in ``dataframe_dict``."""
        for _turbine, frame in dataframe_dict.items():
            # NOTE(review): the forecast argument is fixed at 1 here rather
            # than read from the hyperparameters — confirm this is intended.
            splits = TimeSeriesDataProcessor(
                dataframe=frame,
                forecast=1,
                look_back=self.hyperparameters['seq_len'],
                batch_size=self.hyperparameters['batch_size'],
            ).prepare_datasets()
            train_split, test_split, eval_split = splits
            self.train_datasets.append(train_split)
            self.test_datasets.append(test_split)
            self.eval_datasets.append(eval_split)
            # Collect garbage after each frame.
            gc.collect()

    def create_concat_datasets(self):
        """Process both dictionaries and build one concatenated dataset per split."""
        for frames in (self.df_dict_1, self.df_dict_2):
            self.process_datasets(frames)

        self.concat_train_dataset = ConcatDataset(self.train_datasets)
        self.concat_test_dataset = ConcatDataset(self.test_datasets)
        self.concat_eval_dataset = ConcatDataset(self.eval_datasets)

        # Empty the staging lists now that the concatenated datasets exist.
        for staged in (self.train_datasets, self.test_datasets, self.eval_datasets):
            staged.clear()
        gc.collect()

    def create_dataloaders(self):
        """Build and return ``(train_loader, test_loader, eval_loader)``.

        Only the training loader shuffles; the test and eval loaders are
        created with ``shuffle=False``.
        """
        self.create_concat_datasets()

        batch_size = self.hyperparameters['batch_size']
        self.train_loader = DataLoader(self.concat_train_dataset, batch_size=batch_size, shuffle=True)
        self.test_loader = DataLoader(self.concat_test_dataset, batch_size=batch_size, shuffle=False)
        self.eval_loader = DataLoader(self.concat_eval_dataset, batch_size=batch_size, shuffle=False)
        gc.collect()

        return self.train_loader, self.test_loader, self.eval_loader


In [32]:
# Build one loader factory over both wind farms
# (positional order: df_dict_1, df_dict_2, hyperparameters).
loader = UnifiedDataLoader(Kelmarsh_dict, Penmanshiel_dict, hyperparameters)

# Process every turbine frame and obtain the three split loaders.
train_loader, test_loader, eval_loader = loader.create_dataloaders()

In [33]:
# Sanity check: pull a single batch from the training loader and print the
# shape of its feature tensor.
for batch in train_loader:
    features, labels = batch  # each batch unpacks into a (features, labels) pair
    print(features.shape)
    break  # one batch is enough to inspect the shape

torch.Size([512, 144])
