In [16]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [17]:
torch.manual_seed(0)

<torch._C.Generator at 0x13850fc90>

In [20]:
class CustomParquetDataset(Dataset):
    """Map-style dataset that serves single rows from the Parquet files of one folder.

    All ``*.parquet`` files located directly in ``data_folder`` are read eagerly
    and concatenated into one in-memory DataFrame. Each row of that DataFrame
    is one sample.
    """

    def __init__(self, data_folder, transform=None):
        """
        Args:
            data_folder (string): Path to the folder containing the Parquet files.
            transform (callable, optional): Optional transform to be applied on a sample.

        Raises:
            ValueError: If ``data_folder`` contains no ``.parquet`` files.
        """
        self.transform = transform

        # List of Parquet files. os.listdir() returns entries in arbitrary
        # order, so sort them: this keeps the row order (and therefore any
        # seeded random_split on top of it) reproducible across runs/machines.
        self.file_paths = sorted(
            os.path.join(data_folder, f)
            for f in os.listdir(data_folder)
            if f.endswith('.parquet')
        )
        if not self.file_paths:
            # pd.concat([]) would raise a ValueError as well, but with a
            # message that does not point at the actual cause.
            raise ValueError(f"No .parquet files found in {data_folder!r}")
        self.folder_number = self.extract_folder_number(data_folder)

        # Read all files and combine them into one large DataFrame.
        self.data = pd.concat([pd.read_parquet(file) for file in self.file_paths])

    def __len__(self):
        """Return the total number of rows over all loaded files."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            tuple: ``(sample_tensor, timestamp, folder_number)`` where
            ``sample_tensor`` is the row's values as a float tensor (after the
            optional transform), ``timestamp`` is the row's index label
            converted to ``str`` and ``folder_number`` is the number derived
            from the folder name.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Extract the index label (a timestamp in the source data) as a string.
        timestamp = str(self.data.index[idx])

        # Values of the selected row as an array.
        sample = self.data.iloc[idx].values

        sample_tensor = torch.tensor(sample, dtype=torch.float)

        if self.transform:
            sample_tensor = self.transform(sample_tensor)

        return sample_tensor, timestamp, self.folder_number

    @staticmethod
    def extract_folder_number(folder_path):
        """
        Extract the number from the folder name, if there is one.

        Returns:
            int: The last path component parsed as an integer, or 0 when it
            is not a plain integer.
        """
        # normpath strips a trailing separator; without it basename('x/2/')
        # is '' and the folder number would silently fall back to 0.
        folder_name = os.path.basename(os.path.normpath(folder_path))
        try:
            # Try to convert the folder name into a number.
            return int(folder_name)
        except ValueError:
            # The folder name is not a number: fall back to 0.
            return 0



Loading data:

In [33]:
# Location of the folder holding the Parquet files.
data_folder_path = '../../data/raw/ecallisto_ng_unzipped/2'  # Adjust this path to your setup.

# Build the dataset from every Parquet file in that folder.
custom_dataset = CustomParquetDataset(data_folder=data_folder_path)

# 80 % of the rows go to training, the remainder to testing.
train_size = int(0.8 * len(custom_dataset))
test_size = len(custom_dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset=custom_dataset,
    lengths=[train_size, test_size],
)

# Wrap both subsets in DataLoaders (batches of 20, no reshuffling).
train_loader = DataLoader(dataset=train_dataset, batch_size=20, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=20)

Get the timestamps of the first training batch.
Find the one that is closest to a full minute.

In [34]:
# Fetch only the first batch from the training loader.
first_batch = next(iter(train_loader))

# A batch holds three aligned parts: data, timestamps, folder numbers.
batch_data = first_batch[0]
batch_timestamps = first_batch[1]
batch_folder_numbers = first_batch[2]

batch_timestamps

('2021-10-09 06:46:59.701000',
 '2021-11-02 02:22:10.032000',
 '2021-10-09 06:43:12.670000',
 '2021-10-09 06:41:59.670000',
 '2021-10-09 06:41:24.920000',
 '2021-09-28 06:28:30.927000',
 '2021-09-08 00:09:16.222000',
 '2021-08-28 05:11:07.017000',
 '2021-09-17 04:18:20.583000',
 '2021-10-09 06:36:55.670000',
 '2021-11-01 01:43:32.108000',
 '2021-05-22 03:06:04.473000',
 '2021-11-02 02:24:38.532000',
 '2021-10-09 06:37:43.170000',
 '2021-11-01 01:31:31.858000',
 '2021-10-09 06:50:27.201000',
 '2021-08-28 05:11:46.517000',
 '2021-11-02 02:24:10.782000',
 '2021-10-09 06:51:10.701000',
 '2021-10-09 06:41:18.170000')

Take the index of the timestamp closest to a full minute and get the corresponding data.

In [35]:
# Select entry 0 of every part of the batch in one step.
first_data_in_batch, first_timestamp_in_batch, first_folder_number_in_batch = (
    batch_data[0],
    batch_timestamps[0],
    batch_folder_numbers[0],
)

# Show the selected sample together with its metadata.
print("First data in batch:", first_data_in_batch)
print("Timestamp of first data:", first_timestamp_in_batch)
print("Folder number of  data:", first_folder_number_in_batch)

First data in batch: tensor([156., 162., 172., 169., 154., 149., 153., 150., 175., 160., 153., 158.,
        149., 154., 155., 151., 153., 153., 161., 180., 157., 153., 153., 151.,
        152., 152., 152., 152., 153., 152., 154., 155., 157., 158., 158., 157.,
        157., 159., 158., 158., 158., 158., 161., 160., 161., 162., 164., 164.,
        164., 164., 165., 165., 166., 167., 168., 170., 170., 168., 167., 167.,
        168., 167., 168., 166., 166., 165., 165., 165., 164., 165., 164., 164.,
        163., 164., 163., 162., 162., 164., 164., 164., 162., 164., 164., 163.,
        162., 161., 162., 162., 163., 163., 163., 163., 163., 163., 162., 161.,
        160., 159., 160., 159., 158., 158., 158., 157., 157., 156., 156., 155.,
        156., 157., 156., 156., 156., 156., 155., 155., 154., 155., 163., 157.,
        158., 157., 156., 156., 154., 152., 153., 153., 156., 153., 153., 155.,
        156., 153., 154., 154., 154., 154., 153., 151., 151., 149., 151., 150.,
        150., 151.,

Get the corresponding parquet file and check values

In [39]:
# Read the file whose name covers 06:46:00-06:47:00 and show its last two rows.
df = pd.read_parquet(
    "../../data/raw/ecallisto_ng_unzipped/2/australia_assa_02_2021-10-09 06-46-00_2021-10-09 06-47-00_None_None.parquet",
    engine='auto',
)
df.tail(2)

Unnamed: 0_level_0,15,15.312999725341797,15.687999725341797,16.062999725341797,16.437999725341797,16.812999725341797,17.187999725341797,17.562999725341797,17.937999725341797,18.312999725341797,...,83.56300354003906,83.93800354003906,84.31300354003906,84.68800354003906,85.06300354003906,85.43800354003906,85.81300354003906,86.18800354003906,86.56300354003906,86.93800354003906
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-09 06:46:59.701,156,162,172,169,154,149,153,150,175,160,...,148,148,147,148,146,145,146,145,143,143
2021-10-09 06:46:59.951,156,163,173,169,157,150,154,151,176,161,...,148,148,148,147,146,146,146,145,143,143


Count all observations (rows) in folder 2

In [40]:
directory_path = '../../data/raw/ecallisto_ng_unzipped/2'

# Names of all Parquet files directly inside the folder.
parquet_files = [name for name in os.listdir(directory_path) if name.endswith('.parquet')]

# Read each file in turn and accumulate its row count.
total_rows = 0
for file_name in parquet_files:
    df = pd.read_parquet(os.path.join(directory_path, file_name), engine='auto')
    total_rows += len(df)

print(f"Gesamtanzahl der Zeilen in allen Parquet-Dateien: {total_rows}")


Gesamtanzahl der Zeilen in allen Parquet-Dateien: 22550


Then compare this with the number of rows in the dataset.

In [41]:
# Number of samples in the dataset; expected to match the row total counted from the Parquet files.
len(custom_dataset)

22550