In [3]:
# libraries
import pandas as pd
import numpy as np
import os
from statsmodels.tsa.stattools import acf, adfuller
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.svm import SVR
from tqdm import tqdm
from collections import defaultdict

# read in data

data downloaded from:

 - https://zenodo.org/record/5946808#.ZGNpddbP23I
 - https://zenodo.org/record/5841834#.ZGNpTNbP23J

### Wind data

##### Status data

In [4]:
# read in files
turbine_status = pd.read_csv('../data/Windturbinen/Kelmarsh/Status_Kelmarsh_1_2016-01-03_-_2017-01-01_228.csv',
                             skiprows = 9)

In [5]:
print(turbine_status.columns)
turbine_status.head()

Index(['Timestamp start', 'Timestamp end', 'Duration', 'Status', 'Code',
       'Message', 'Comment', 'Service contract category', 'IEC category'],
      dtype='object')


Unnamed: 0,Timestamp start,Timestamp end,Duration,Status,Code,Message,Comment,Service contract category,IEC category
0,2016-01-14 19:28:03,2016-01-23 14:36:32,211:08:29,Stop,111,Emergency stop nacelle,,Emergency stop switch (Nacelle) (11),Forced outage
1,2016-01-14 19:28:03,2016-01-14 19:38:03,00:10:00,Warning,5720,Brake accumulator defect,,Warnings (27),
2,2016-01-14 19:28:05,2016-01-23 11:27:46,207:59:41,Informational,3835,Cable panel breaker open,,Warnings (27),
3,2016-01-14 19:28:05,2016-01-23 11:27:46,207:59:41,Informational,3830,Supply circuit breaker earthed,,Warnings (27),Full Performance
4,2016-01-14 19:28:05,2016-01-23 14:09:18,210:41:13,Warning,3870,Overload transformer fan outlet air,,Warnings (27),Full Performance


Status data is not relevant to this thesis !

##### Turbine Data

In [7]:
def reading_Windturbines (turbine_directory):
    # Columns to keep
    columns_to_keep = [
        '# Date and time',
        'Wind speed (m/s)',
        'Long Term Wind (m/s)',
        'Energy Export (kWh)'
    ]

    # Directory containing CSV files
    directory = f'../data/Windturbinen/{turbine_directory}/'

    # Dictionary to hold DataFrames for each turbine
    turbine_dataframes = defaultdict(list)

    # Get a list of CSV files in the directory
    csv_files = [f for f in os.listdir(directory) if f.startswith(f"Turbine_Data_{turbine_directory}_") and f.endswith(".csv")]

    # Iterate through the files in the directory with a tqdm progress bar
    for filename in tqdm(csv_files, desc='Processing files'):
        # Extract the turbine number from the filename
        turbine_number = filename.split("_")[3]  # Assuming the number is in this position

        # Read the CSV file, skipping the first 9 rows
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath, skiprows=9, usecols=columns_to_keep)

        # Convert the "Date and time" column to datetime
        df['# Date and time'] = pd.to_datetime(df['# Date and time'])

        # Append the DataFrame to the appropriate turbine's list
        turbine_dataframes[turbine_number].append(df)

    # Concatenate the DataFrames for each turbine
    for turbine_number, dfs in turbine_dataframes.items():
        turbine_dataframes[turbine_number] = pd.concat(dfs)
        turbine_dataframes[turbine_number].sort_values('# Date and time', inplace=True)

    # Print the keys for the dictionary
    print("\n dictionary keys:")
    print(turbine_dataframes.keys())
    # print descriptive stuff for exemplary key
    print('\n Information for exemplary key:')
    first_key = list(turbine_dataframes.keys())[0]
    print('shape')
    print(turbine_dataframes[first_key].shape)
    print('\n missing values')
    print(turbine_dataframes[first_key].isna().sum())

    return turbine_dataframes

In [8]:
Kelmarsh_df = reading_Windturbines('Kelmarsh')

Processing files: 100%|██████████| 36/36 [00:18<00:00,  1.94it/s]


 dictionary keys:
dict_keys(['6', '4', '1', '3', '2', '5'])

 Information for exemplary key:
shape
(288864, 4)

 missing values
# Date and time            0
Wind speed (m/s)        9223
Long Term Wind (m/s)       0
Energy Export (kWh)     5072
dtype: int64





In [9]:
Penmanshiel_df = reading_Windturbines('Penmanshiel')

Processing files: 100%|██████████| 84/84 [00:45<00:00,  1.84it/s]


 dictionary keys:
dict_keys(['07', '08', '02', '05', '06', '15', '10', '14', '01', '04', '11', '12', '13', '09'])

 Information for exemplary key:
shape
(267014, 4)

 missing values
# Date and time            0
Wind speed (m/s)        5881
Long Term Wind (m/s)       0
Energy Export (kWh)     1032
dtype: int64





### FlexGuide Data

In [10]:
# merge files
ENIT_df = pd.concat([ENIT_2022_1, ENIT_2022_2, ENIT_2023_1])

# rename columns
ENIT_df.columns = ENIT_df.columns.str.replace('Wirkarbeit (Bezug) ', '').str.strip()

# add residual columns
ENIT_df['1.8 - Residual'] = ENIT_df.iloc[:, 1] - ENIT_df.iloc[:, 2:8].sum(axis=1)
ENIT_df['2.7 - Residual'] = ENIT_df.iloc[:, 9] - ENIT_df.iloc[:, 10:15].sum(axis=1)
ENIT_df['0.1 - Residual'] = ENIT_df.iloc[:, 16] - ENIT_df.iloc[:, 1] - ENIT_df.iloc[:, 9]

NameError: name 'ENIT_2022_1' is not defined

In [None]:
# Calculate the mean and standard deviation for each column
mean = ENIT_df.iloc[:, 1:15].mean()
std_dev = ENIT_df.iloc[:, 1:15].std()

# Identify outliers using twice the standard deviation
outliers = (ENIT_df.iloc[:, 1:15] < (mean - 2 * std_dev)) | (ENIT_df.iloc[:, 1:15] > (mean + 2 * std_dev))

# Print the number of True values in each column
print("Number of outliers in each column:")
print(outliers.sum())

# Get the row and column indices of the True values
true_values_indices = outliers.where(outliers).stack().index

# Print the row and column indices of the True values
print("\nIndices of outliers:")
for row, col in true_values_indices:
    print(f"Row: {row}, Column: {col}")


Number of outliers in each column:
1.0 - Trafo 1 [Wh]                                   0
1.1 - Neubau, Wohnhaus, Holzplatz [Wh]               0
1.2 - Halle 3/1 Absaugung [Wh]                       0
1.3 - Halle 4/2 Maschinensaal/Tischfertigung [Wh]    0
1.4 - Halle 2/2 Verwaltung, Entwicklung [Wh]         0
1.5 - Halle 4/5 Lackieranlage [Wh]                   0
1.6 - Halle 1/6 Hausmeister [Wh]                     0
1.7 - Halle 3/3 Kompressor, Stuhlmontage [Wh]        0
2.0 - Trafo 2 [Wh]                                   0
2.1 - Halle 4/5 Schrankfertigung [Wh]                0
2.2 - Halle 2/4 Rilsan [Wh]                          0
2.3 - Halle 4/5 Stahlstuhl [Wh]                      0
2.4 - Halle 4/1 Schichtholz [Wh]                     0
2.5 - Halle 4/1 Absaugung Schichtholz [Wh]           0
dtype: int64

Indices of outliers:


In [None]:
print(ENIT_df.shape)
ENIT_df.head()

In [None]:
ENIT_df['Übergabezähler [Wh]'].unique()

array([    0.  , 20000.  , 40000.  , ..., 66666.67, 86666.67, 73333.33])

In [None]:
# NA values
print(ENIT_df.isna().sum())

In [None]:
# data as dict with each company as a key
FlexGuideData = {
    'MechTron': ENIT_df
}

# pre-processing pipeline

In [11]:
def outlier_detection(data, dependent_var, independent_var):
    # only keep demand and one other variable
    df = data[[f'{dependent_var}', f'{independent_var}']]

    # check for outliers
    # Calculate the mean and standard deviation for each column
    mean = df.mean()
    std_dev = df.std()

    # Identify outliers using twice the standard deviation
    outliers = (df < (mean - 3 * std_dev)) | (df > (mean + 3 * std_dev))

    # Print the number of True values in each column
    print("Number of outliers in each column:")
    print(outliers.sum())

    # Get the row and column indices of the True values
    true_values_indices = outliers.where(outliers).stack().index

    # Print the row and column indices of the True values
    print("\nIndices of outliers:")
    for row, col in true_values_indices:
        print(f"Row: {row}, Column: {col}")

    return df

In [12]:
# column names as strings
Kelmarsh_1 = outlier_detection(Kelmarsh_df['1'], 'Energy Export (kWh)', 'Long Term Wind (m/s)')

Number of outliers in each column:
Energy Export (kWh)     6
Long Term Wind (m/s)    0
dtype: int64

Indices of outliers:
Row: 27407, Column: Energy Export (kWh)
Row: 10456, Column: Energy Export (kWh)
Row: 17788, Column: Energy Export (kWh)
Row: 24557, Column: Energy Export (kWh)
Row: 26026, Column: Energy Export (kWh)
Row: 37678, Column: Energy Export (kWh)


In [23]:
%store Kelmarsh_df Penmanshiel_df

Stored 'Kelmarsh_df' (defaultdict)
Stored 'Penmanshiel_df' (defaultdict)


# Modelle

### data preprocessing for data loader and models

In [13]:
def model_preprocess(data, demand, temperature, n_lags):

    # Scale the input data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data[[f'{demand}', f'{temperature}']])

    # Create lag features
    def create_lag_features(data, n_lags):
        lag_features = []
        for i in range(1, n_lags + 1):
            lag_features.append(data.shift(i).rename(columns=lambda x: f'{x}_lag_{i}'))
        return pd.concat(lag_features, axis=1)

    lag_features = create_lag_features(pd.DataFrame(scaled_data, columns=[f'{demand}', f'{temperature}']), n_lags)
    data = pd.DataFrame(scaled_data, columns=[f'{demand}', f'{temperature}']).join(lag_features).dropna().values

    # Train-test split (80:20)
    train_size = int(len(data) * 0.8)
    train, test = data[:train_size], data[train_size:]

    # Separate features and target variable
    X_train, y_train = train[:, 1:], train[:, 0]
    X_test, y_test = test[:, 1:], test[:, 0]

    # Reshape input
    X_train = X_train.reshape(X_train.shape[0], 1, -1)
    X_test = X_test.reshape(X_test.shape[0], 1, -1)

    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float)
    y_train = torch.tensor(y_train, dtype=torch.float)
    X_test = torch.tensor(X_test, dtype=torch.float)
    y_test = torch.tensor(y_test, dtype=torch.float)

    return X_train, y_train, X_test, y_test, scaler

### linear data preprocess

In [14]:
def model_linear_data(data, demand, temperature, n_lags):

    data[f"{demand}_lag_{n_lags}"] = data[f'{demand}'].shift(n_lags)
    data = data.dropna()

    # Prepare data for modeling
    X = data[[f'{temperature}', f"{demand}_lag_{n_lags}"]]
    y = data[f"{demand}"]

    # Train-test split (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

### data loader

In [15]:
def model_dataloader(X_train, y_train, batch_size):
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float), torch.tensor(y_train, dtype=torch.float))
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

    return train_dataloader

### hyperparameters

In [16]:
def model_hyperparameter(X_train):
    hyperparameters_dict = dict({
        'input_dim' : X_train.shape[2],
        'hidden_dim' : 60,
        'num_layers' : 1,
        'output_dim' : 1,
        'learning_rate' : 0.001,
        'num_epochs' : 5,
        'batch_size' : 32,
        'device' : "cpu",
        'nhead' : 4
    })

    return hyperparameters_dict

### errors

In [17]:
def model_errors (results_df, model, y_true, y_pred):
    def mean_absolute_percentage_error(y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)

    results_df.loc[f'{model}', 'MSE'] = '{:.6f}'.format(mse)
    results_df.loc[f'{model}', 'RMSE'] = '{:.6f}'.format(rmse)
    results_df.loc[f'{model}', 'MAE'] = '{:.6f}'.format(mae)
    results_df.loc[f'{model}', 'MAPE'] = '{:.6f}'.format(mape)

    return results_df

### linear regression

In [18]:
def model_linear(X_train, y_train, X_test):
    # make linear prediction
    linear_reg = LinearRegression()
    linear_reg.fit(X_train, y_train)
    y_pred_linear = linear_reg.predict(X_test)

    return y_pred_linear

### LSTM

In [19]:
def model_LSTM(hyperparameters, dataloader, scaler, X_test, y_test):
    # Create the LSTM model
    class LSTMModel(nn.Module):
        def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
            super(LSTMModel, self).__init__()
            self.hidden_dim = hidden_dim
            self.num_layers = num_layers
            self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
            self.fc = nn.Linear(hidden_dim, output_dim)

        def forward(self, x):
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim)#.to(hyperparameters['device'])
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim)#.to(hyperparameters['device'])
            with torch.backends.cudnn.flags(enabled=False):
                out, _ = self.lstm(x, (h0, c0))
            out = self.fc(out[:, -1, :])
            return out

    # Initialize the model, loss function, and optimizer
    model = LSTMModel(
        hyperparameters['input_dim'],
        hyperparameters['hidden_dim'],
        hyperparameters['num_layers'],
        hyperparameters['output_dim']
        ).to(hyperparameters['device'])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=hyperparameters['learning_rate'])

    # Train the model
    model.train()
    for epoch in range(hyperparameters['num_epochs']):
        for x_batch, y_batch in dataloader:
            # move to GPU
            x_batch = x_batch.to(hyperparameters['device'])
            y_batch = y_batch.to(hyperparameters['device'])

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            y_pred = model(x_batch)

            # Calculate the loss
            loss = criterion(y_pred.squeeze(), y_batch)

            # Backward pass
            loss.backward()

            # Update the weights
            optimizer.step()

    # Make predictions
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test).detach().numpy().squeeze()

    # Invert scaling for test data
    test_unscaled = np.column_stack((y_test.numpy().reshape(-1, 1), X_test.numpy().squeeze()[:, :1]))
    test_unscaled = scaler.inverse_transform(test_unscaled)
    y_test_unscaled = test_unscaled[:, 0]

    # Invert scaling for predictions
    y_pred_scaled = np.column_stack((y_pred.reshape(-1, 1), X_test.numpy().squeeze()[:, :1]))
    y_pred_unscaled = scaler.inverse_transform(y_pred_scaled)[:, 0]

    return y_pred_unscaled, y_test_unscaled

### Transformer (Basic)

In [20]:
def model_Transformer(hyperparameters, dataloader, scaler, X_test, y_test):
    # Transformer Model
    class TransformerModel(nn.Module):
        def __init__(self, input_dim, d_model, nhead, num_layers, output_dim):
            super(TransformerModel, self).__init__()
            self.embedding = nn.Linear(input_dim, d_model)
            self.transformer_layer = nn.TransformerEncoderLayer(d_model, nhead)
            self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers)
            self.fc = nn.Linear(d_model, output_dim)

        def forward(self, x):
            x = self.embedding(x)
            x = self.transformer(x)
            x = self.fc(x)
            return x

    # Create the transformer model
    model = TransformerModel(
                input_dim = hyperparameters['input_dim'],
                d_model = hyperparameters['hidden_dim'],
                nhead = hyperparameters['nhead'],
                num_layers = hyperparameters['num_layers'],
                output_dim = hyperparameters['output_dim']
                ).to(hyperparameters['device'])

    # Loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=hyperparameters['learning_rate'])

    # Training loop
    for epoch in range(hyperparameters['num_epochs']):
        for x_batch, y_batch in dataloader:
            # Move data to the device (CPU or GPU)
            x_batch = x_batch.to(hyperparameters['device'])
            y_batch = y_batch.to(hyperparameters['device'])

            #print(y_batch.shape)
            #print(x_batch.shape)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            y_pred = model(x_batch)

            #print(y_pred.shape)

            # Calculate the loss
            loss = criterion(y_pred.squeeze(), y_batch)

            # Backward pass
            loss.backward()

            # Update the weights
            optimizer.step()

    # Make predictions
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test).detach().cpu().numpy().squeeze()

    # Invert scaling for test data
    test_unscaled = np.column_stack((y_test.cpu().numpy().reshape(-1, 1), X_test.cpu().numpy().squeeze()[:, :1]))
    test_unscaled = scaler.inverse_transform(test_unscaled)
    y_test_unscaled = test_unscaled[:, 0]

    # Invert scaling for predictions
    y_pred_scaled = np.column_stack((y_pred.reshape(-1, 1), X_test.cpu().numpy().squeeze()[:, :1]))
    y_pred_unscaled = scaler.inverse_transform(y_pred_scaled)[:, 0]

    return y_pred_unscaled, y_test_unscaled

# complete product

In [21]:
from sklearn import linear_model
def final_model(data, demand, temperature, n_lags):
    # create results df
    results_df = pd.DataFrame(
        index=['linear regression', 'LSTM', 'Transformer 1'],
        columns=['MSE', 'RMSE', 'MAE', 'MAPE']
        )

    # preprocess data
    X_train, y_train, X_test, y_test, scaler = model_preprocess(data, demand, temperature, n_lags)
    linear_x_train, linear_x_test, linear_y_train, linear_y_test = model_linear_data(data, demand, temperature, n_lags)

    # hyperparameters
    hyperparameters = model_hyperparameter(X_train)

    # dataloader
    dataloader = model_dataloader(X_train, y_train, hyperparameters['batch_size'])

    # predictions
    linear_pred = model_linear(linear_x_train, linear_y_train, linear_x_test)
    LSTM_pred, LSTM_test = model_LSTM(hyperparameters, dataloader, scaler, X_test, y_test)
    #Transformer_pred, Transformer_test = model_Transformer(hyperparameters, dataloader, scaler, X_test, y_test)

    # errors
    model_errors(results_df, 'linear regression', linear_y_test, linear_pred)
    model_errors(results_df, 'LSTM', LSTM_test, LSTM_pred)
    #model_errors(results_df, 'Transformer 1', Transformer_test, Transformer_pred)

    return results_df

In [22]:
final_model(Kelmarsh_1, 'Energy Export (kWh)', 'Long Term Wind (m/s)', 24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"{demand}_lag_{n_lags}"] = data[f'{demand}'].shift(n_lags)
  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float), torch.tensor(y_train, dtype=torch.float))
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


Unnamed: 0,MSE,RMSE,MAE,MAPE
linear regression,4652.08783,68.206215,49.674727,inf
LSTM,1201.843506,34.667614,23.500977,inf
Transformer 1,,,,


# Test Area