# Fundamental functions for time series modeling using deep learning methods in pytorch

The objective of this notebook is to provide the basic building blocks needed to easily test different Deep Learning approaches on tabular time series using PyTorch. The notebook also includes a basic example of how to use these functions to train a NN model.


In [None]:
#| default_exp time_series_deepl

In [None]:
#| export
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

In [None]:
from pathlib import Path


DATA_PATH = Path("../testing_data")

## Data Preprocessing

We will first open the data

In [None]:
# Read the example series; the "time" column becomes the (date-parsed) index.
csv_path = DATA_PATH / "hydro_example.csv"
data = pd.read_csv(csv_path, index_col="time", parse_dates=True)
data.head(5)

Now we will split data into coherent groups

In [None]:
#| export
def split_by_date(
        data: pd.DataFrame, # Input dataframe containing time series data
        val_dates: tuple,   # Tuple of (start_date, end_date) for validation set
        test_dates: tuple   # Tuple of (start_date, end_date) for test set
        ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split time series data into train, validation and test sets based on date ranges.

    The validation and test sets are the rows whose index falls inside the given
    date ranges (both bounds included, as with any pandas label slice). The train
    set is every remaining row, so it may be made of several disjoint periods.

    Prints the share of rows assigned to each set and returns
    `(train_data, val_data, test_data)`.
    """
    val_data = data[val_dates[0]:val_dates[1]]
    test_data = data[test_dates[0]:test_dates[1]]

    # Training rows are whatever is left once both hold-out periods are removed.
    held_out = data.index.isin(val_data.index) | data.index.isin(test_data.index)
    train_data = data[~held_out]

    n_rows = data.shape[0]
    if n_rows:
        print(f"Approximate data repartition:\n"
              f"Train: {train_data.shape[0]/n_rows:.2%}\n"
              f"Validation: {val_data.shape[0]/n_rows:.2%}\n"
              f"Test: {test_data.shape[0]/n_rows:.2%}")
    else:
        # An empty frame has no meaningful ratios; report it instead of dividing by zero.
        print("Approximate data repartition: no rows to split")
    return train_data, val_data, test_data

In [None]:
# Hold out 2012 for validation and 2013-2014 for testing; all other rows go to training.
validation_period = ("2012-01-01", "2012-12-31")
test_period = ("2013-01-01", "2014-12-31")
train, valid, test = split_by_date(data, val_dates=validation_period, test_dates=test_period)

Now let's define the feature and target columns and split each data set into features and targets

In [None]:
x_cols = ["smoothed_rain", "Q_mgb"]
y_cols = ["Q_obs"]

# Cut every split into its feature frame and its target frame.
x_train, x_valid, x_test = (split[x_cols] for split in (train, valid, test))
y_train, y_valid, y_test = (split[y_cols] for split in (train, valid, test))

Now we will fit the scaler based only on train data. This ensures that:
1. No information from the validation/test data sets leaks into the scaling process
2. All data is scaled consistently using the same parameters
3. The model sees new data scaled in the same way as it was trained

In [None]:
#| hide
from sklearn.preprocessing import RobustScaler


In [None]:
# Fit each scaler on the training split only. `fit` is used instead of
# `fit_transform` because the transformed training arrays are not needed here.
feature_scaler = RobustScaler().fit(x_train)
target_scaler = RobustScaler().fit(y_train)

Finally, we'll create a custom dataset class to handle our time series data. This class will create sequences of input features (simulation discharge and rainfall) and target values (observed discharge).

In [None]:
#| export
class HydroDataset(Dataset):
    """Sliding-window dataset pairing a context of feature rows with the target rows that follow it.

    Sample `i` is made of the feature rows `i .. i + ctx_len - 1` and the target
    rows `i + ctx_len .. i + ctx_len + pred_len - 1`, both returned as float32 tensors.
    """
    def __init__(
            self,
            x: pd.DataFrame,  # feature rows, one per time step
            y: pd.DataFrame,  # target rows; assumed to be row-aligned with `x`
            ctx_len: int,  # number of feature rows in each input window
            pred_len: int = 10,  # number of target rows to return after the window
            x_transform: callable = None,  # optional callable applied once to `x` (e.g. a fitted scaler's `transform`)
            y_transform: callable = None,  # optional callable applied once to `y`
            ):

        # Transforms are applied once here; their output is wrapped back into a
        # DataFrame so the original columns and index are kept.
        if x_transform is None:
            self.features = x.copy()
        else:
            self.features = pd.DataFrame(x_transform(x), columns=x.columns, index=x.index)
        if y_transform is None:
            self.targets = y.copy()
        else:
            self.targets = pd.DataFrame(y_transform(y), columns=y.columns, index=y.index)

        self.context_length = ctx_len
        self.prediction_length = pred_len
        self.x_transform = x_transform
        self.y_transform = y_transform

    def __len__(self):
        """Number of complete (context, prediction) windows; 0 when the series is too short."""
        n_windows = self.features.shape[0] - self.context_length - self.prediction_length + 1
        # A DataLoader rejects negative lengths, so clamp short series to "no samples".
        return max(0, n_windows)

    def __getitem__(self, idx):
        """Return `(features, targets)` tensors for sample `idx` (negative indices count from the end).

        Raises `IndexError` when `idx` is out of range, which is also what lets a
        plain `for sample in dataset` loop terminate.
        """
        n_samples = len(self)
        idx = int(idx)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for dataset of length {n_samples}")

        ctx_end = idx + self.context_length
        # Positional slicing: windows are defined by row position, not by index label.
        features = self.features.iloc[idx:ctx_end]
        # Targets start right after the last context row.
        targets = self.targets.iloc[ctx_end:ctx_end + self.prediction_length]
        return (
            torch.tensor(features.to_numpy(), dtype=torch.float32),
            torch.tensor(targets.to_numpy(), dtype=torch.float32),
        )

    def get_t0(self, idx):
        """Get the t+0 in the sequence from where forecast is made.

        This is the index label of the last context row of sample `idx`
        (`idx` is the non-negative sample position).
        """
        return self.features.index[idx + self.context_length-1]

We can easily instantiate the dataset as follows

In [None]:
# One-step example: a single-row context window followed by a single target row.
train_dataset = HydroDataset(
    x_train,
    y_train,
    ctx_len=1,
    pred_len=1,
    x_transform=feature_scaler.transform,
    y_transform=target_scaler.transform,
)


The total training samples are

In [None]:
# Number of (context, prediction) windows the training frame yields.
len(train_dataset)

It is possible to easily get a training sample as follows:

In [None]:
# A sample is a (features, targets) pair of float tensors.
train_dataset[5]

And also to get the t+0 (the time from which the forecast is made) for any item

In [None]:
# Index label of the last context row of sample 1000, i.e. its forecast origin.
train_dataset.get_t0(1000)

## Model example

For the sake of example, we will define the simplest NN we possibly can in PyTorch, which is a simple linear model.

In [None]:
class SimpleNN(nn.Module):
    """Single linear layer applied to each sample's flattened input."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # One affine map from the flattened input to the output vector.
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        # Keep the leading (batch) axis and collapse every remaining axis into one.
        flattened = x.reshape(x.shape[0], -1)
        return self.linear(flattened)

## Model training

Now we will define a basic learner class to handle the training process. This class will be used to train the model and evaluate its performance.

In [None]:
#| export
class Learner:
    """Small training helper around a PyTorch model.

    Holds the model, its data loaders, the loss and the optimizer class, and
    offers `fit`, `evaluate`, `predict_values` and `predict`.
    """
    def __init__(self,
                 model: nn.Module, # model to train
                 train_loader: DataLoader, # data loader for training data
                 val_loader: DataLoader, # data loader for validation data
                 criterion: nn.Module = None, # loss function to optimize; a new nn.MSELoss() when None
                 optimizer: type[torch.optim.Optimizer] = torch.optim.Adam, # optimizer class to use for training
                 log_dir: str = None, # directory to save tensorboard logs,
                 verbose: bool = True # whether to print training progress
                 ) -> None:
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        # Build the default loss per instance rather than sharing one module
        # object created when the class was defined.
        self.criterion = nn.MSELoss() if criterion is None else criterion
        self.optimizer = optimizer
        self.writer = None if log_dir is None else SummaryWriter(log_dir)
        self.verbose = verbose

    @staticmethod
    def _align_targets(outputs, targets):
        """Give `targets` the shape of `outputs` so the loss never broadcasts by accident."""
        if targets.shape == outputs.shape:
            return targets
        if targets.numel() == outputs.numel():
            # e.g. (batch, pred_len, 1) targets against (batch, pred_len) outputs.
            # A blanket squeeze() would also drop a batch or horizon axis of size 1.
            return targets.reshape(outputs.shape)
        # Element counts differ: keep the previous squeeze() behaviour and let the criterion decide.
        return targets.squeeze()

    def evaluate(self, dl: DataLoader) -> float:
        """Return the criterion averaged over the batches of `dl` (NaN when `dl` yields no batch).

        The model is switched to eval mode and gradients are disabled.
        """
        self.model.eval()
        total_loss, n_batches = 0.0, 0
        with torch.no_grad():
            for batch_X, batch_y in dl:
                outputs = self.model(batch_X)
                total_loss += self.criterion(outputs, self._align_targets(outputs, batch_y)).item()
                n_batches += 1
        return total_loss / n_batches if n_batches else float("nan")

    def fit(self, lr=0.001, epochs=10):
        """Train the model for `epochs` epochs with learning rate `lr`.

        A new optimizer is created from the stored optimizer class on every call.
        After each epoch the mean training and validation losses are written to
        TensorBoard (when a log directory was given) and printed (when verbose).
        """
        optimizer = self.optimizer(self.model.parameters(), lr=lr)
        for epoch in tqdm(range(epochs), desc='Training epochs'):
            # Training
            self.model.train()
            epoch_loss, n_batches = 0.0, 0
            for batch_X, batch_y in self.train_loader:
                optimizer.zero_grad()
                outputs = self.model(batch_X)
                loss = self.criterion(outputs, self._align_targets(outputs, batch_y))
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                n_batches += 1

            # Average over the batches actually seen; NaN instead of a crash for an empty loader.
            avg_train_loss = epoch_loss / n_batches if n_batches else float("nan")
            if self.writer is not None:
                self.writer.add_scalar('Training Loss/epoch', avg_train_loss, epoch)

            # Validation
            avg_val_loss = self.evaluate(self.val_loader)
            if self.writer is not None:
                self.writer.add_scalar('Validation Loss/epoch', avg_val_loss, epoch)

            if self.verbose:
                print(f'Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

        if self.writer is not None:
            # Make sure the logged scalars reach disk once training is over.
            self.writer.flush()

    def predict_values(self, dl: DataLoader):
        """Run the model over `dl` and return the stacked batch outputs as a NumPy array."""
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch_X, _ in dl:
                predictions.append(self.model(batch_X).cpu().numpy())

        return np.vstack(predictions)

    def predict(self, dl: DataLoader, inverse_transform: callable=None) -> pd.DataFrame:
        """Make predictions and return them as a pandas DataFrame with proper indexing and column names.

        Rows follow the order in which `dl` yields samples, so `dl` must not shuffle.
        When the dataset exposes `get_t0`, row `i` is labelled with `get_t0(i)`;
        otherwise a default integer index is used. Columns are named `t+1`, `t+2`, ...
        `inverse_transform`, when given, is applied to the 2-D prediction array.
        """
        predictions = self.predict_values(dl)

        # One row per sample, one column per predicted value.
        predictions = predictions.reshape(len(predictions), -1)
        if inverse_transform is not None:
            predictions = np.asarray(inverse_transform(predictions))

        # Label rows with their forecast origin without iterating the loader a second time.
        dataset = dl.dataset
        if hasattr(dataset, 'get_t0'):
            indices = [dataset.get_t0(i) for i in range(len(predictions))]
        else:
            indices = None

        column_names = [f"t+{i+1}" for i in range(predictions.shape[1])]
        return pd.DataFrame(predictions, index=indices, columns=column_names)

## Model training example

Let's see a simple example of how we can train a neural network.

First we will create our Datasets and DataLoaders based on the data we split above

In [None]:
batch_size = 32

context_len = 3
prediction_len = 2
x_transform = feature_scaler.transform
y_transform = target_scaler.transform

# Every split uses the same window sizes and the same train-fitted scalers.
window_kwargs = dict(
    ctx_len=context_len,
    pred_len=prediction_len,
    x_transform=x_transform,
    y_transform=y_transform,
)
train_dataset = HydroDataset(x=x_train, y=y_train, **window_kwargs)
valid_dataset = HydroDataset(x=x_valid, y=y_valid, **window_kwargs)
test_dataset = HydroDataset(x=x_test, y=y_test, **window_kwargs)

# Only the training loader shuffles; validation and test keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

We can now instantiate the model

In [None]:
# SimpleNN flattens each (context_len, n_features) window, hence the product for input_dim;
# it emits one value per forecast horizon.
model = SimpleNN(input_dim=len(x_cols)*context_len, output_dim=prediction_len)

And finally we can instantiate the learner and fit our data

In [None]:
# Default loss (MSE) and optimizer (Adam); no TensorBoard logging since log_dir is not set.
learner = Learner(
    model=model,
    train_loader=train_loader,
    val_loader=valid_loader,
)
learner.fit(lr=0.001, epochs=3)

Let's now look at the predictions. There are two possible ways. The first is predicting only the values.

In [None]:
# Raw model outputs as a NumPy array; no inverse scaling is applied here.
y_pred = learner.predict_values(test_loader)

Alternatively, we can get the predictions with their timestamps and column names. This also allows us to scale them back to the original values.

In [None]:
# Labelled forecasts: one row per forecast origin (t+0), one column per horizon,
# passed through the target scaler's inverse transform.
y_pred = learner.predict(test_loader, inverse_transform=target_scaler.inverse_transform)
y_pred.head(4)

We will now add the observation and the mgb simulation so we can plot the result.

In [None]:
# Attach the observed discharge and the MGB simulation for each row of y_pred.
# NOTE(review): y_pred is indexed by the forecast origin (t+0), so "obs" and "mgb"
# are the values at t+0, not at the t+1/t+2 horizons they are plotted against.
y_pred["obs"] = y_test.loc[y_pred.index]
y_pred["mgb"] = x_test["Q_mgb"].loc[y_pred.index]
y_pred.plot()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()