In [11]:
# important  imports
import os
import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [12]:
# Query the CPU count once; it is used later to parallelize data loading
num_cpus = os.cpu_count()
print(f'{num_cpus} CPUs available')

16 CPUs available


In [13]:
############# UTILITIES ############

# Define Dataset
class EBCDataset(Dataset):
    """Index-aligned pairing of samples and labels.

    Args:
        data: Indexable, sized collection of samples.
        labels: Indexable collection of labels, addressed with the same
            index as ``data``.
        transform: Optional callable applied to a sample (not to the label)
            each time an item is fetched.
    """

    def __init__(self, data, labels, transform=None):
        self.data, self.labels, self.transform = data, labels, transform

    def __len__(self):
        # The dataset size is defined by the samples alone
        return len(self.data)

    def __getitem__(self, idx):
        item, target = self.data[idx], self.labels[idx]
        if not self.transform:
            return item, target
        return self.transform(item), target



# Data loader
def grab_data(num_cpus=1, data_dir=None, feature_columns=None, label_columns=None):
    """Load the 2023 and 2024 flux CSV files and wrap them as datasets.

    For each site (Botanical Garden "BoG" and Göttingen forest "GoeWa") the
    2023 file and the 2024 file are read, the requested columns are selected
    and the rows of both years are concatenated (2023 first) into a single
    EBCDataset.

    Args:
        num_cpus (int, optional): Currently unused; kept so existing calls
            keep working. Defaults to 1.
        data_dir (str, optional): Directory that contains the ``data_2023``
            and ``data_2024`` folders. Defaults to the current working
            directory.
        feature_columns (str or sequence of str): CSV column name(s) used as
            model inputs.
        label_columns (str or sequence of str): CSV column name(s) used as
            targets.

    Returns:
        Tuple ``(BoG, GoeWa)`` of EBCDataset objects whose ``data`` and
        ``labels`` are float32 tensors with one row per CSV row.

    Raises:
        ValueError: If ``feature_columns`` or ``label_columns`` is not given.
            The previous version always failed with a NameError because the
            returned datasets were never built.
        KeyError: Raised by pandas if a requested column is missing in one of
            the files.
    """
    # Fail before touching the disk: without the column names there is no way
    # to tell inputs from targets.
    if feature_columns is None or label_columns is None:
        raise ValueError(
            "grab_data needs feature_columns and label_columns to know which "
            "CSV columns are model inputs and which are targets"
        )

    def _as_list(columns):
        # Accept a single column name as well as a sequence of names
        return [columns] if isinstance(columns, str) else list(columns)

    feature_columns = _as_list(feature_columns)
    label_columns = _as_list(label_columns)

    base_dir = os.getcwd() if data_dir is None else data_dir

    # Relative locations of the 2023 and 2024 files for each site
    csv_files = {
        'BoG': (
            'data_2023/Fluxes_H_LE_CO2/BoG/FBG_fluxes_30min_20230101_20230801.csv',
            'data_2024/EddyCovarianceData/eng/FBG_fluxes_30min_20240401_20240608_eng.csv',
        ),
        'GoeWa': (
            'data_2023/Fluxes_H_LE_CO2/GoeWa/GoeW_fluxes_30min_20230101_20230801.csv',
            'data_2024/EddyCovarianceData/eng/GoeW_fluxes_30min_20240401_20240608_eng.csv',
        ),
    }

    def _select(frames, columns):
        # NOTE(review): assumes the 2023 and 2024 files share these column
        # names and that the columns are numeric; missing values are passed
        # through as NaN - confirm against the actual CSV headers.
        return torch.cat([
            torch.as_tensor(frame[columns].to_numpy(dtype=np.float32))
            for frame in frames
        ])

    def _load_site(relative_paths):
        frames = [pd.read_csv(os.path.join(base_dir, path)) for path in relative_paths]
        # Rows are converted to tensors here, so no per-sample transform is
        # needed (torchvision's ToTensor targets images, not table rows).
        return EBCDataset(_select(frames, feature_columns),
                          _select(frames, label_columns))

    BoG = _load_site(csv_files['BoG'])
    GoeWa = _load_site(csv_files['GoeWa'])

    return BoG, GoeWa



# dataset Splitter 
def train_val_test_splitter(dataset, split_seed=42, test_frac=0.2, val_frac = 0.2):
    """Split a dataset into train, validation and test subsets.

    The test subset is split off first; the validation subset is then taken
    from the remaining training samples. Both random splits use a fresh
    generator seeded with ``split_seed``, so the result is reproducible.

    Args:
        dataset: The dataset to split. Only ``len()`` and indexing are
            required, so plain datasets, ``ConcatDataset`` and ``Subset``
            objects all work.
        split_seed: The seed used for the random number generator.
        test_frac: Fraction of all samples used for testing (rounded up).
        val_frac: Fraction of the remaining training samples used for
            validation (rounded up).

    Returns:
        Tuple ``(trainset, valset, testset)`` of subsets of ``dataset``.
    """
    # Train / test split. len() is used instead of dataset.data.shape[0]
    # because not every dataset type exposes a ``data`` attribute.
    num_samples = len(dataset)
    num_test_samples = int(np.ceil(test_frac * num_samples))
    num_train_samples = num_samples - num_test_samples
    trainset, testset = torch.utils.data.random_split(
        dataset,
        [num_train_samples, num_test_samples],
        generator=torch.Generator().manual_seed(split_seed))

    # Train / validation split. ``trainset`` is a Subset here, which has no
    # ``data`` attribute, so its size must come from len().
    num_val_samples = int(np.ceil(val_frac * len(trainset)))
    num_train_samples = len(trainset) - num_val_samples
    trainset, valset = torch.utils.data.random_split(
        trainset,
        [num_train_samples, num_val_samples],
        generator=torch.Generator().manual_seed(split_seed))

    return trainset, valset, testset



# Dataloaders
def data_loaders(trainset, valset, testset, batch_size=64, num_cpus=1):
    """Build shuffled data loaders for the train, validation and test sets.

    Args:
        trainset: Dataset used for training.
        valset: Dataset used for validation.
        testset: Dataset used for testing.
        batch_size: Number of samples per batch, defaults to 64.
        num_cpus: Number of worker processes each loader uses to fetch
            batches. Defaults to 1.

    Returns:
        Tuple ``(trainloader, valloader, testloader)``.
    """
    # All three loaders share exactly the same configuration
    loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=num_cpus)
    return tuple(
        torch.utils.data.DataLoader(subset, **loader_options)
        for subset in (trainset, valset, testset)
    )

In [15]:
############# TRAINING FUNCTIONS ###############

# Define validation metric
def prediction_error(y, y_pred):
    """Return the absolute difference between target ``y`` and prediction ``y_pred``."""
    residual = y - y_pred
    return abs(residual)




def train(dataloader, optimizer, model, master_bar, loss_fn = nn.MSELoss()):
    """Run one training epoch.

    Args:
        dataloader: Iterable yielding ``(x, y)`` batches of training data.
        optimizer: Torch optimizer object that updates the model parameters.
        model: The model that is trained.
        master_bar: Parent progress bar handed to
            ``fastprogress.progress_bar`` to display batch progress.
        loss_fn: The loss function to be used, defaults to ``nn.MSELoss()``.

    Returns:
        Tuple ``(mean_loss, total_prediction_error)``: the mean of the
        per-batch losses and the absolute prediction error summed over every
        predicted element of the epoch, as a Python float.
    """
    # NOTE(review): `fastprogress` is not imported in the visible import
    # cell - confirm it is imported elsewhere in the notebook.
    batch_losses = []
    total_prediction_error = 0.0

    # Switching to training mode once is enough; nothing in the loop leaves it
    model.train()

    for x, y in fastprogress.progress_bar(dataloader, parent=master_bar):
        # Reset gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        y_pred = model(x)

        # Compute loss
        loss = loss_fn(y_pred, y)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Keep the per-batch loss in its own list; the loss tensor must not
        # shadow the list it is appended to.
        batch_losses.append(loss.item())

        # Reduce the element-wise error to a plain float. Detaching avoids
        # keeping every batch's autograd graph alive, and summing makes the
        # accumulation independent of the batch size (the last batch may be
        # smaller than the others).
        total_prediction_error += prediction_error(y, y_pred.detach()).sum().item()

    # Return the mean loss and the total prediction error of this epoch
    return np.mean(batch_losses), total_prediction_error





def validate(dataloader, model, master_bar, loss_fn=nn.MSELoss()):
    """Compute loss and total prediction error on the validation set.

    Args:
        dataloader: Iterable yielding ``(x, y)`` batches of validation data.
        model (nn.Module): The model to evaluate.
        master_bar: Parent progress bar handed to
            ``fastprogress.progress_bar`` to show validation progress.
        loss_fn: The loss function to be used, defaults to ``nn.MSELoss()``.

    Returns:
        Tuple ``(mean_loss, total_prediction_error)``: the mean of the
        per-batch losses and the absolute prediction error summed over every
        predicted element, as a Python float.
    """
    # NOTE(review): `fastprogress` is not imported in the visible import
    # cell - confirm it is imported elsewhere in the notebook.
    epoch_loss = []
    total_prediction_error = 0.0

    model.eval()
    with torch.no_grad():
        for x, y in fastprogress.progress_bar(dataloader, parent=master_bar):
            # Make a prediction on the validation batch
            y_pred = model(x)

            # Reduce the element-wise error to a plain float so that batches
            # of different sizes (e.g. a smaller last batch) can be added up
            # and the result can be formatted and plotted.
            total_prediction_error += prediction_error(y, y_pred).sum().item()

            # Compute loss and keep it for the epoch mean
            loss = loss_fn(y_pred, y)
            epoch_loss.append(loss.item())

    # Return the mean loss and the total prediction error
    return np.mean(epoch_loss), total_prediction_error





def plot(title, label, train_results, val_results, yscale='linear', save_path=None):
    """Draw training and validation curves over epochs.

    Args:
        title: Title of the plot.
        label: y-axis label; also used as the file name when saving.
        train_results: Sequence of training results, one per epoch.
        val_results: Sequence of validation results, one per epoch.
        yscale: Scale of the y-axis, passed to ``plt.yscale``.
        save_path: Optional directory; if given, the figure is written to
            ``<save_path>/<label>.png`` before it is shown.
    """
    # Epochs are numbered starting at 1
    epoch_axis = np.arange(1, len(train_results) + 1)

    sns.set(style='ticks')

    plt.plot(epoch_axis, train_results, epoch_axis, val_results,
             linestyle='dashed', marker='o')
    plt.legend(['Train results', 'Validation results'])

    plt.xlabel('Epoch')
    plt.ylabel(label)
    plt.yscale(yscale)

    sns.despine(trim=True, offset=5)
    plt.title(title, fontsize=15)

    if save_path:
        target_file = os.path.join(save_path, label + ".png")
        plt.savefig(str(target_file), bbox_inches='tight')
    plt.show()







def run_training(model, optimizer, num_epochs, train_dataloader, val_dataloader, 
                 loss_fn=nn.MSELoss(), verbose=False):
    """Run model training.

    Trains and validates the model for ``num_epochs`` epochs, prints the
    elapsed time and plots the loss and total prediction error (TPE) curves.

    Args:
        model: The model to be trained.
        optimizer: The optimizer used during training.
        num_epochs: How many epochs the model is trained for.
        train_dataloader: Dataloader containing training data.
        val_dataloader: Dataloader containing validation data.
        loss_fn: Torch loss function for training, defaults to
            ``nn.MSELoss()``.
        verbose: Whether to write per-epoch results to the progress bar.

    Returns:
        Four lists with one entry per epoch: training losses, validation
        losses, training TPEs and validation TPEs.
    """
    # NOTE(review): `fastprogress` is not imported in the visible import
    # cell - confirm it is imported elsewhere in the notebook.
    start_time = time.time()
    master_bar = fastprogress.master_bar(range(num_epochs))
    train_losses, val_losses, train_tpes, val_tpes = [], [], [], []

    for _ in master_bar:
        # train() and validate() take master_bar BEFORE loss_fn; passing both
        # by keyword keeps them from being swapped.
        epoch_train_loss, epoch_train_tpe = train(train_dataloader, optimizer, model,
                                                  master_bar=master_bar, loss_fn=loss_fn)
        epoch_val_loss, epoch_val_tpe = validate(val_dataloader, model,
                                                 master_bar=master_bar, loss_fn=loss_fn)

        # Save loss and TPE for plotting
        train_losses.append(epoch_train_loss)
        val_losses.append(epoch_val_loss)
        train_tpes.append(epoch_train_tpe)
        val_tpes.append(epoch_val_tpe)

        if verbose:
            # The second metric is the total prediction error, not an accuracy
            master_bar.write(f'Train loss: {epoch_train_loss:.2f}, val loss: {epoch_val_loss:.2f}, '
                             f'train TPE: {epoch_train_tpe:.3f}, val TPE: {epoch_val_tpe:.3f}')

    time_elapsed = int(round(time.time() - start_time))
    print(f'Finished training after {time_elapsed} seconds.')

    plot("Loss", "Loss", train_losses, val_losses)
    plot("TPE", "TPE", train_tpes, val_tpes)

    return train_losses, val_losses, train_tpes, val_tpes