# Libraries

In [62]:
# Standard
import pandas as pd

# Machine Learning
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

%store -r Kelmarsh_df Penmanshiel_df test_df

# Training Data Set

Two approaches:
- Naive forecast: predict n steps ahead directly.
- Recurrent forecast: predict 1 step ahead, then shift the window forward and repeat.

## recurrent forecast

A 12-hour look-back window (`look_back = 72` time steps) is used to predict the next value.

In [53]:
# global variables
# Number of past time steps used as model input. The notes above describe this
# as a 12-hour look-back, which would imply 10-minute samples — TODO confirm.
look_back = 72

In [54]:
# Take entry '1' of Kelmarsh_df, index its rows by the '# Date and time'
# column, discard the long-term wind column and clear the index label.
df = (
    Kelmarsh_df['1']
    .set_index('# Date and time')
    .drop(['Long Term Wind (m/s)'], axis=1)
)
df.index.names = [None]

### shifting data

In [55]:
def shifted_data(data: pd.DataFrame, forecast: int, look_back: int):
    """Append lagged copies of every column of *data*.

    For each column, one extra column is created per lag in
    ``forecast .. forecast + look_back - 1``; it is named
    ``"<column> (lag <k>)"`` and holds the column shifted down by ``k`` rows.

    Args:
        data: Frame whose columns are lagged. It is not modified.
        forecast: Smallest lag (in rows) that is generated.
        look_back: Number of consecutive lags generated per column.

    Returns:
        A new DataFrame with the original columns first, followed by the
        lagged columns grouped per source column. Every row containing a NaN
        (the leading rows produced by shifting, and any row that already had
        a NaN) is removed.
    """
    lagged_frames = [
        data[[name]].shift(lag).rename(columns={name: f"{name} (lag {lag})"})
        for name in data.columns
        for lag in range(forecast, look_back + forecast)
    ]

    # A single concat keeps the frame from being grown column by column.
    combined = pd.concat([data, *lagged_frames], axis=1)
    return combined.dropna()

# Build the lagged frame with the notebook-wide `look_back` setting (instead of
# repeating the literal 72), then drop the un-lagged wind speed column so that
# only its lagged copies remain in the frame.
s_df = shifted_data(data=df, forecast=1, look_back=look_back).drop(['Wind speed (m/s)'], axis=1)

### train-test split

In [56]:
# First split: 70 % of the rows become the training set, the rest is held back.
# NOTE(review): train_test_split shuffles rows by default; neighbouring rows of
# s_df share most of their lag columns, so a random split may put near-duplicate
# windows into different subsets — confirm this is intended.
df_train, df_rem = train_test_split(s_df, train_size=0.7, random_state=42)

# Second split: halve the held-back rows into validation (df_eval) and test
# (df_test) sets, i.e. roughly 15 % of the data each.
df_eval, df_test = train_test_split(df_rem, test_size=0.5, random_state=42)

### Data Loader

In [59]:
class TimeSeriesDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame as float tensors.

    The first column of the frame is treated as the target; all remaining
    columns are the model inputs.
    """

    def __init__(self, dataframe):
        """Split *dataframe* column-wise into ``labels`` and ``features``."""
        target_column = dataframe.iloc[:, 0]
        input_columns = dataframe.iloc[:, 1:]
        self.labels = target_column.values
        self.features = input_columns.values

    def __len__(self):
        """Return the number of rows (samples)."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row *idx* as float32 tensors."""
        sample = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample, target

# Batch size shared by all three loaders
batch_size = 64

# Wrap each split in a TimeSeriesDataset
train_dataset, test_dataset, eval_dataset = (
    TimeSeriesDataset(split) for split in (df_train, df_test, df_eval)
)

# Only the training loader shuffles; test and eval keep the row order of their split.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader, eval_loader = (
    DataLoader(held_out, batch_size=batch_size, shuffle=False)
    for held_out in (test_dataset, eval_dataset)
)


In [63]:
class TimeSeriesDataProcessor:
    """Turn a DataFrame into train / test / eval ``DataLoader`` objects.

    Pipeline: append lagged copies of every column, drop the columns listed
    in ``drop_columns``, split the rows into three subsets with
    ``train_test_split`` and wrap each subset in a ``TimeSeriesDataset``.

    Args:
        dataframe: Source data; each row is lagged against the rows above it.
        forecast: Smallest lag (in rows) that is generated.
        look_back: Number of consecutive lags generated per column.
        batch_size: Batch size used for all three loaders.
        train_size: ``train_size`` of the first split (training vs. rest).
        test_size: ``test_size`` of the second split, i.e. the share of the
            *remaining* rows that goes to the test set; the rest of the
            remainder becomes the eval set.
        random_state: Seed passed to both splits.
        drop_columns: Column name, or iterable of column names, removed from
            the lagged frame before splitting. The default reproduces the
            previously hard-coded ``'Wind speed (m/s)'``.
        shuffle_split: Forwarded as ``shuffle`` to both ``train_test_split``
            calls. The default ``True`` keeps the original random split;
            pass ``False`` for a chronological split, which avoids spreading
            overlapping lag windows across the subsets.
    """

    def __init__(self, dataframe, forecast, look_back, batch_size=64, train_size=0.7, test_size=0.5,
                 random_state=42, *, drop_columns=('Wind speed (m/s)',), shuffle_split=True):
        self.dataframe = dataframe
        self.forecast = forecast
        self.look_back = look_back
        self.batch_size = batch_size
        self.train_size = train_size
        self.test_size = test_size
        self.random_state = random_state
        # A bare string is one column name, not an iterable of characters.
        if isinstance(drop_columns, str):
            drop_columns = [drop_columns]
        self.drop_columns = list(drop_columns)
        self.shuffle_split = shuffle_split

    def shifted_data(self):
        """Return ``self.dataframe`` extended by lagged copies of each column.

        For every column, lags ``forecast .. forecast + look_back - 1`` are
        added as ``"<column> (lag <k>)"``. Rows containing any NaN are
        removed. ``self.dataframe`` itself is left untouched.
        """
        data = self.dataframe
        lags = range(self.forecast, self.look_back + self.forecast)

        lagged_frames = [
            data[[column]].shift(lag).rename(columns={column: f"{column} (lag {lag})"})
            for column in data.columns
            for lag in lags
        ]

        # One concat instead of growing the frame column by column.
        return pd.concat([data, *lagged_frames], axis=1).dropna()

    def _model_frame(self):
        """Return the lagged frame without the columns in ``drop_columns``."""
        return self.shifted_data().drop(columns=self.drop_columns)

    def prepare_datasets(self):
        """Split the lagged frame and store the three wrapped datasets.

        Sets ``train_dataset``, ``test_dataset`` and ``eval_dataset``.
        """
        s_df = self._model_frame()

        # Both splits share the seed and the shuffle setting.
        split_options = {"random_state": self.random_state, "shuffle": self.shuffle_split}
        df_train, df_rem = train_test_split(s_df, train_size=self.train_size, **split_options)
        df_eval, df_test = train_test_split(df_rem, test_size=self.test_size, **split_options)

        self.train_dataset = TimeSeriesDataset(df_train)
        self.test_dataset = TimeSeriesDataset(df_test)
        self.eval_dataset = TimeSeriesDataset(df_eval)

    def create_dataloaders(self):
        """Prepare the datasets and return ``(train, test, eval)`` loaders.

        Only the training loader shuffles its samples.
        """
        self.prepare_datasets()

        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)
        self.eval_loader = DataLoader(self.eval_dataset, batch_size=self.batch_size, shuffle=False)

        return self.train_loader, self.test_loader, self.eval_loader

# NOTE(review): an identical TimeSeriesDataset class is already defined in an
# earlier cell of this notebook; this definition replaces it when the cell runs.
class TimeSeriesDataset(Dataset):
    """Dataset yielding ``(features, label)`` float tensors per DataFrame row.

    Column 0 of the frame supplies the labels; columns 1 onward supply the
    features.
    """

    def __init__(self, dataframe):
        # Split once up front so that item access is plain array indexing.
        self.labels, self.features = (
            dataframe.iloc[:, 0].values,
            dataframe.iloc[:, 1:].values,
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        row, target = self.features[idx], self.labels[idx]
        return (
            torch.tensor(row, dtype=torch.float),
            torch.tensor(target, dtype=torch.float),
        )


In [64]:
# Build all three loaders from the prepared `df`, reusing the notebook-wide
# `look_back` setting rather than repeating the literal 72.
processor = TimeSeriesDataProcessor(dataframe=df, forecast=1, look_back=look_back, batch_size=64)
train_loader, test_loader, eval_loader = processor.create_dataloaders()
