# Libraries

In [39]:
# standard
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from math import sqrt

# reading data
import os
import json
from collections import defaultdict

# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.model_selection import train_test_split


# Restore variables previously persisted with IPython's %store magic.
# NOTE(review): this only works if an earlier session ran `%store` on these
# three names; on a machine without that stored state the cells below that
# use Kelmarsh_df will fail.
%store -r Kelmarsh_df Penmanshiel_df test_df

# Training Data Set

Two approaches:
- naive n-steps-ahead forecast
- recurrent forecast: predict one step ahead, then shift the window and repeat

## recurrent forecast

Use a 12-hour look-back window (72 samples at 10-minute resolution) to predict the next value.

In [5]:
# global variables
# Look-back window length, counted in rows. The data is sampled every
# 10 minutes, so 72 rows correspond to 12 hours.
look_back = 72

In [4]:
# Preview the first rows of the frame stored under key '1'
# (presumably turbine 1 of the Kelmarsh site; verify against the loader).
Kelmarsh_df['1'].head()

Unnamed: 0,# Date and time,Wind speed (m/s),Long Term Wind (m/s),Energy Export (kWh)
0,2016-01-03 00:00:00,,7.1,
1,2016-01-03 00:10:00,,7.1,
2,2016-01-03 00:20:00,,7.1,
3,2016-01-03 00:30:00,,7.1,
4,2016-01-03 00:40:00,,7.1,


In [43]:
# Index the frame stored under key '1' by its timestamp column and keep only
# the wind-speed and energy-export columns.
df = (
    Kelmarsh_df['1']
    .set_index('# Date and time')
    .drop(columns=['Long Term Wind (m/s)'])
)
# Clear the index label so no name is shown above the timestamps.
df.index.name = None

In [44]:
# Check the result: timestamps as the index, the two remaining columns only.
df.head()


Unnamed: 0,Wind speed (m/s),Energy Export (kWh)
2016-01-03 00:00:00,,
2016-01-03 00:10:00,,
2016-01-03 00:20:00,,
2016-01-03 00:30:00,,
2016-01-03 00:40:00,,


### shifting data

In [47]:
def shifted_data(data: pd.DataFrame, forecast: int, look_back: int):
    """Append lagged copies of every column and drop incomplete rows.

    For each column of ``data`` and each lag ``k`` in
    ``forecast .. forecast + look_back - 1`` a new column named
    ``"<column> (lag k)"`` is added, holding the column shifted down by ``k``
    rows. Shifting is by row position, not by timestamp.

    Parameters
    ----------
    data : pd.DataFrame
        Source frame; it is not modified.
    forecast : int
        Smallest lag to generate (the forecast horizon in rows).
    look_back : int
        Number of consecutive lags generated per column.

    Returns
    -------
    pd.DataFrame
        The original columns followed by the lag columns (grouped by source
        column, ascending lag), with every row containing a NaN removed.
    """
    lagged_frames = [
        data[[name]].shift(lag).rename(columns={name: f"{name} (lag {lag})"})
        for name in data.columns
        for lag in range(forecast, look_back + forecast)
    ]

    # A single concat is much cheaper than inserting the lag columns one by one.
    combined = pd.concat([data, *lagged_frames], axis=1)

    # Removes the leading rows made incomplete by shifting, as well as any row
    # that has a missing value in the source data or in one of its lags.
    return combined.dropna()


In [48]:
# Build the lagged frame for one-step-ahead forecasting. The window length
# comes from the notebook-level `look_back` setting rather than a repeated
# literal, so changing it in one place keeps the whole notebook consistent.
s_df = shifted_data(data=df, forecast=1, look_back=look_back)
s_df.head()

Unnamed: 0,Wind speed (m/s),Energy Export (kWh),Wind speed (m/s) (lag 1),Wind speed (m/s) (lag 2),Wind speed (m/s) (lag 3),Wind speed (m/s) (lag 4),Wind speed (m/s) (lag 5),Wind speed (m/s) (lag 6),Wind speed (m/s) (lag 7),Wind speed (m/s) (lag 8),...,Energy Export (kWh) (lag 63),Energy Export (kWh) (lag 64),Energy Export (kWh) (lag 65),Energy Export (kWh) (lag 66),Energy Export (kWh) (lag 67),Energy Export (kWh) (lag 68),Energy Export (kWh) (lag 69),Energy Export (kWh) (lag 70),Energy Export (kWh) (lag 71),Energy Export (kWh) (lag 72)
2016-01-24 00:10:00,9.58,120.0,9.07,8.97,9.38,10.4,10.73,10.55,10.22,10.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-24 00:20:00,9.46,90.0,9.58,9.07,8.97,9.38,10.4,10.73,10.55,10.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-24 00:30:00,9.49,90.0,9.46,9.58,9.07,8.97,9.38,10.4,10.73,10.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-24 00:40:00,9.16,120.0,9.49,9.46,9.58,9.07,8.97,9.38,10.4,10.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-24 00:50:00,9.14,90.0,9.16,9.49,9.46,9.58,9.07,8.97,9.38,10.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### train-test split

In [49]:
# Features are the lagged columns only: both current-step columns are removed
# so the model never sees a value from the step it has to predict.
X = s_df.drop(columns=['Energy Export (kWh)', 'Wind speed (m/s)'])  # Features
y = s_df['Energy Export (kWh)']  # Target

# Chronological 70 / 15 / 15 split (train / validation / test).
# The rows are overlapping lag windows of a time series, so neighbouring rows
# share almost all of their feature values. A shuffled split would scatter
# those near-duplicates across train, validation and test and leak the
# evaluation data into training. shuffle=False keeps the row order instead.
# NOTE(review): assumes s_df is sorted by time, as the preview above suggests.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.7, shuffle=False)

# Split the held-out tail in half: earlier part for validation, later for test.
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, shuffle=False)

In [50]:
# Sanity check: (rows, features). 144 features = 2 source columns x 72 lags.
X_train.shape

(191209, 144)