In [11]:
# Imports
import os
import time

import numpy as np
import pandas as pd

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [12]:
# Get number of cpus to use for faster parallelized data loading.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to a single CPU to keep num_cpus a usable integer.
num_cpus = os.cpu_count() or 1
print(num_cpus, 'CPUs available')

16 CPUs available


In [13]:
# Define Dataset
class EBCDataset(Dataset):
    """Map-style dataset that pairs each sample with its label.

    Args:
        data: Indexable collection of samples; must support ``len()``.
        labels: Indexable collection of labels, read with the same index
            as ``data``.
        transform (callable, optional): Applied to the sample (never to the
            label) before it is returned. Skipped when falsy.
    """

    def __init__(self, data, labels, transform=None):
        self.data, self.labels, self.transform = data, labels, transform

    def __len__(self):
        # The sample collection alone determines the dataset size.
        return len(self.data)

    def __getitem__(self, idx):
        item, target = self.data[idx], self.labels[idx]
        if not self.transform:
            return item, target
        return self.transform(item), target



# Data loader
def grab_data(num_cpus=1, data_dir=None, feature_columns=None, label_columns=None):
    """Loads the 2023 and 2024 flux CSV files and wraps them as datasets.

    For each site the 2023 and 2024 files are read, the requested feature and
    label columns are converted to float32 tensors, and the two years are
    concatenated into one dataset.

    Args:
        num_cpus (int, optional): Currently unused; kept so existing calls
            keep working. Defaults to 1.
        data_dir (str, optional): Directory that contains the ``data_2023``
            and ``data_2024`` folders. Defaults to the current working
            directory.
        feature_columns (str or sequence of str): CSV column name(s) used as
            model input.
        label_columns (str or sequence of str): CSV column name(s) used as
            target.
            NOTE(review): assumes the selected columns exist under the same
            names in all four files and are numeric - TODO confirm.

    Returns:
        Tuple ``(BoG, GoeWa)`` of ``torch.utils.data.ConcatDataset`` objects:
        the Botanical Garden dataset first, then the Göttingen forest dataset.

    Raises:
        ValueError: If ``feature_columns`` or ``label_columns`` is not given.
            The data/label selection was never defined in this function, so
            it has to be supplied by the caller.
    """
    if feature_columns is None or label_columns is None:
        raise ValueError(
            "grab_data requires feature_columns and label_columns to select "
            "data and labels from the CSV files."
        )

    # A bare string would otherwise be split into single characters by list().
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    if isinstance(label_columns, str):
        label_columns = [label_columns]
    feature_columns = list(feature_columns)
    label_columns = list(label_columns)

    if data_dir is None:
        data_dir = os.getcwd()

    def _load(relative_path):
        # Reads one CSV file and wraps the selected columns in an EBCDataset.
        frame = pd.read_csv(os.path.join(data_dir, relative_path))
        # Rows are converted to tensors up front. torchvision's ToTensor is an
        # image transform and is not applicable to tabular rows.
        # Missing values are kept as NaN; no filtering happens here.
        features = torch.as_tensor(frame[feature_columns].to_numpy(dtype=np.float32))
        labels = torch.as_tensor(frame[label_columns].to_numpy(dtype=np.float32))
        return EBCDataset(features, labels)

    # Load the data from 2023 and 2024
    BoG23_set = _load('data_2023/Fluxes_H_LE_CO2/BoG/FBG_fluxes_30min_20230101_20230801.csv')
    GoeWa23_set = _load('data_2023/Fluxes_H_LE_CO2/GoeWa/GoeW_fluxes_30min_20230101_20230801.csv')
    BoG24_set = _load('data_2024/EddyCovarianceData/eng/FBG_fluxes_30min_20240401_20240608_eng.csv')
    GoeWa24_set = _load('data_2024/EddyCovarianceData/eng/GoeW_fluxes_30min_20240401_20240608_eng.csv')

    # Combine both years per site.
    BoG = torch.utils.data.ConcatDataset([BoG23_set, BoG24_set])
    GoeWa = torch.utils.data.ConcatDataset([GoeWa23_set, GoeWa24_set])

    return BoG, GoeWa



# dataset Splitter 
def train_val_test_splitter(dataset, split_seed=42, test_frac=0.2, val_frac = 0.2):
    """Splits given dataset into train, val and test datasets

    The test set is split off first; the validation set is then taken from
    the remaining training samples. Both sizes are rounded up.

    Args:
        dataset: the given dataset; must support ``len()`` and indexing
        split_seed: the seed used for the rng (the same seed is used for
            both splits)
        test_frac: fraction of data used for testing
        val_frac: fraction of training data used for validation

    Returns:
        Tuple ``(trainset, valset, testset)`` of ``torch.utils.data.Subset``
        objects.
    """
    # Sizes come from len() rather than a ``.data`` attribute: the Subset
    # returned by random_split has no ``.data``, and neither do wrappers such
    # as ConcatDataset.

    # Train Test Split
    num_samples = len(dataset)
    num_test_samples = int(np.ceil(test_frac * num_samples))
    num_train_samples = num_samples - num_test_samples
    trainset, testset = torch.utils.data.random_split(
        dataset,
        [num_train_samples, num_test_samples],
        generator=torch.Generator().manual_seed(split_seed))

    # Train Val Split
    num_val_samples = int(np.ceil(val_frac * len(trainset)))
    num_train_samples = len(trainset) - num_val_samples
    trainset, valset = torch.utils.data.random_split(
        trainset,
        [num_train_samples, num_val_samples],
        generator=torch.Generator().manual_seed(split_seed))

    return trainset, valset, testset
