In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

# Load datasets
def load_datasets():
    audio_train_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_train.csv')
    audio_dev_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
    audio_test_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_test.csv')

    video_train_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_train.csv')
    video_dev_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_dev.csv')
    video_test_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_test.csv')

    return audio_train_df, audio_dev_df, audio_test_df, video_train_df, video_dev_df, video_test_df

# Define custom Dataset
class SynchronizedAVDataset(Dataset):
    def __init__(self, audio_df, video_df, window_size_sec=2, stride_sec=1, max_video_frames=6):
        self.audio_df = audio_df
        self.video_df = video_df
        self.window_size_sec = window_size_sec
        self.stride_sec = stride_sec
        self.max_video_frames = max_video_frames
        self.windows = self.prepare_windows()

    def __len__(self):
        return len(self.windows)

    def __getitem__(self, idx):
        window_info = self.windows[idx]
        audio_features, video_features, labels = self.get_window_data(window_info)
        return (audio_features, video_features), labels

    def prepare_windows(self):
        windows = []
        for video_id, video_group in self.video_df.groupby('path'):
            audio_group = self.audio_df[self.audio_df['filename'].str.contains(video_id)]
            max_time = video_group['timestamp'].max()
            window_starts = np.arange(0, max_time - self.window_size_sec + 1, self.stride_sec)

            for start in window_starts:
                end = start + self.window_size_sec
                window_audio = audio_group[(audio_group['start_timestep'] >= start) & (audio_group['end_timestep'] <= end)]
                window_video = video_group[(video_group['timestamp'] >= start) & (video_group['timestamp'] < end)]

                num_frames = len(window_video)
                if num_frames >= 2:  # Adjust this condition based on your requirement
                    # Adjust the window to include frames up to the max_video_frames
                    window_video = self.adjust_video_frames(window_video)
                    windows.append({'video_id': video_id, 'start': start, 'end': end})
                else:
                    print(f"Window rejected for video {video_id} at start={start} due to insufficient video frames.")

        return windows

    def adjust_video_frames(self, window_video):
        num_frames = len(window_video)
        if num_frames == self.max_video_frames:
            return window_video
        elif num_frames < self.max_video_frames:
            # Duplicate the last frame to reach max_video_frames
            num_duplicates = self.max_video_frames - num_frames
            duplicated_frames = pd.concat([window_video.iloc[-1]] * num_duplicates, ignore_index=True)
            window_video = pd.concat([window_video, duplicated_frames])
            return window_video

    def get_window_data(self, window_info):
        video_id = window_info['video_id']
        start = window_info['start']
        end = window_info['end']

        window_audio = self.audio_df[(self.audio_df['filename'].str.contains(video_id)) & (self.audio_df['start_timestep'] >= start) & (self.audio_df['end_timestep'] <= end)]
        window_video = self.video_df[(self.video_df['path'] == video_id) & (self.video_df['timestamp'] >= start) & (self.video_df['timestamp'] < end)]

        audio_features = window_audio.iloc[:, 5:].values
        video_features = window_video.iloc[:, 4:].values

        # Use last frame's arousal and valence as labels
        labels = window_video.iloc[-1, 2:4].values

        return audio_features, video_features, labels

# Load datasets
audio_train_df, audio_dev_df, audio_test_df, video_train_df, video_dev_df, video_test_df = load_datasets()

# Initialize DataLoader
custom_dataset = SynchronizedAVDataset(audio_train_df, video_train_df)
dataloader = DataLoader(custom_dataset, batch_size=32, shuffle=True)

# Example: Print first 20 rows of audio and video datasets for video 2117
print("First 20 rows of audio dataset for video 2117:")
print(audio_train_df[audio_train_df['filename'].str.contains('SEW2117')].head(20))
print("\nFirst 20 rows of video dataset for video 2117:")
print(video_train_df[video_train_df['path'].str.contains('SEW2117')].head(20))


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=100.0 due to insufficient video frames.
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=101.0 due to insufficient video frames.
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=102.0 due to insufficient video frames.
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=103.0 due to insufficient video frames.
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=104.0 due to insufficient video frames.
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=105.0 due to insufficient video frames.
Window rejected for video /work/home/dsu/Datasets/SEWA/preprocessed/SEW1101_111_86.png at start=106.0 d

KeyboardInterrupt: 

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None):
        """
        Initializes the dataset object.
        :param video_df: DataFrame containing video data.
        :param audio_df: DataFrame containing audio data.
        :param window_length: Length of the time window in seconds.
        :param step: Step size to move the window in seconds.
        :param max_frames: Maximum number of frames to include in a window. If None, it will be calculated.
        """
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()

    def _synchronize(self):
        """
        Synchronizes audio and video data based on the specified window_length and step.
        :return: List of tuples containing synchronized video and audio data along with labels for each window.
        """
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames()

        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id.split('_')[0])]

            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)]

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        # Repeat the last row to match max_frames
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)

                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values  # Excluding path, timestamp, arousal, valence
                    audio_features = window_audio_data.iloc[:, 5:].values  # Excluding filename, start_timestep, end_timestep, arousal, valence, features

                    synchronized_data.append((video_features, audio_features, labels))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data

    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_features, audio_features, labels = self.data[idx]
        return torch.tensor(video_features, dtype=torch.float), torch.tensor(audio_features, dtype=torch.float), torch.tensor(labels, dtype=torch.float)

# Load video and audio data
video_train_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # Set to 1 for demonstration purposes, can be increased as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display the first 10 entries from the train_loader to verify synchronization
for i, (video_features, audio_features, labels) in enumerate(train_loader):
    if i >= 10:
        break
    print(f"Sample {i+1}: Video features shape: {video_features.shape}, Audio features shape: {audio_features.shape}, Labels: {labels.numpy()}")


                                                path  timestamp   arousal  \
0  /work/home/dsu/Datasets/SEWA/preprocessed/SEW1...       2.04  0.294194   
1  /work/home/dsu/Datasets/SEWA/preprocessed/SEW1...       2.38  0.335762   
2  /work/home/dsu/Datasets/SEWA/preprocessed/SEW1...       2.72  0.395458   
3  /work/home/dsu/Datasets/SEWA/preprocessed/SEW1...       3.06  0.419050   
4  /work/home/dsu/Datasets/SEWA/preprocessed/SEW1...       3.40  0.420003   

    valence     emb_0     emb_1     emb_2     emb_3     emb_4     emb_5  ...  \
0  0.330165  0.000092 -0.000697  0.078171 -0.000050 -0.000146 -0.533671  ...   
1  0.408600  0.000097 -0.000782  0.025183  0.001174  0.000013 -0.032633  ...   
2  0.414567  0.000091 -0.000815  0.010308  0.000997  0.000059 -0.063469  ...   
3  0.415242  0.000097 -0.000853 -0.011648  0.000785  0.000045  0.147162  ...   
4  0.386110  0.000099 -0.000661  0.155788 -0.000450 -0.000002  0.000105  ...   

    emb_246   emb_247   emb_248   emb_249   emb_250   em

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Adjusted class to consider the path difference
class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None):
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        # Adjust the slicing as per your directory structure if needed.
        return path.split('/')[-1].split('_')[0]

    def _synchronize(self):
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames()

        video_ids = self.video_df['path'].apply(self._extract_video_id).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)]

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)

                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values
                    audio_features = window_audio_data.iloc[:, 5:].values

                    synchronized_data.append((video_features, audio_features, labels))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data

    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_features, audio_features, labels = self.data[idx]
        return torch.tensor(video_features, dtype=torch.float), torch.tensor(audio_features, dtype=torch.float), torch.tensor(labels, dtype=torch.float)

# Load video and audio data
video_train_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display the first 10 entries from the train_loader
for i, (video_features, audio_features, labels) in enumerate(train_loader):
    if i >= 10:
        break
    print(f"Sample {i+1}: Video features shape: {video_features.shape}, Audio features shape: {audio_features.shape}, Labels: {labels.numpy()}")


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Adjusted class to consider the path difference
class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None):
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        # Adjust the slicing as per your directory structure if needed.
        return path.split('/')[-1].split('_')[0]

    def _synchronize(self):
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames()

        video_ids = self.video_df['path'].apply(self._extract_video_id).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)]

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)

                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values
                    audio_features = window_audio_data.iloc[:, 5:].values

                    synchronized_data.append((video_features, audio_features, labels))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data

    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_features, audio_features, labels = self.data[idx]

        print(f"Video features type: {type(video_features)}, shape: {video_features.shape}")
        print(f"Audio features type: {type(audio_features)}, shape: {audio_features.shape}")
        print(f"Labels type: {type(labels)}, shape: {labels.shape}")

        # Ensure video_features and audio_features are numpy arrays of type float
        video_features = np.array(video_features, dtype=np.float32)
        audio_features = np.array(audio_features, dtype=np.float32)

        # Convert labels to numpy array if it's not already
        labels = np.array(labels, dtype=np.float32)

        return torch.tensor(video_features, dtype=torch.float), torch.tensor(audio_features, dtype=torch.float), torch.tensor(labels, dtype=torch.float)


# Load video and audio data
video_train_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display the first 10 entries from the train_loader
for i, (video_features, audio_features, labels) in enumerate(train_loader):
    if i >= 10:
        break
    print(f"Sample {i+1}: Video features shape: {video_features.shape}, Audio features shape: {audio_features.shape}, Labels: {labels.numpy()}")


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Adjusted class to consider the path difference
class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None):
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        # Adjust the slicing as per your directory structure if needed.
        return path.split('/')[-1].split('_')[0]

    def _synchronize(self):
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames()

        video_ids = self.video_df['path'].apply(self._extract_video_id).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)]

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)

                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values
                    audio_features = window_audio_data.iloc[:, 5:].values

                    synchronized_data.append((video_features, audio_features, labels))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data

    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_features, audio_features, labels = self.data[idx]

        print(f"Video features type: {type(video_features)}, shape: {video_features.shape}")
        print(f"Audio features type: {type(audio_features)}, shape: {audio_features.shape}")
        print(f"Labels type: {type(labels)}, shape: {labels.shape}")

        # Ensure video_features and audio_features are numpy arrays of type float
        video_features = np.array(video_features, dtype=np.float32)
        audio_features = np.array(audio_features, dtype=np.float32)

        # Convert labels to numpy array if it's not already
        labels = np.array(labels, dtype=np.float32)

        return torch.tensor(video_features, dtype=torch.float), torch.tensor(audio_features, dtype=torch.float), torch.tensor(labels, dtype=torch.float)


# Load video and audio data
video_train_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('/content/drive/MyDrive/SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('/content/drive/MyDrive/1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Assuming you have your DataLoader setup as before
for i, (video_features, audio_features, labels) in enumerate(train_loader):
    if i >= 10:  # Just to limit the output to the first 10 batches
        break

    # Convert tensors to numpy for easy slicing and displaying
    video_features_np = video_features.numpy()
    audio_features_np = audio_features.numpy()
    labels_np = labels.numpy()

    # Print the first few rows of each. Adjust the number of rows as needed
    num_rows_to_display = 3  # for example, to display the first 3 rows

    print(f"Batch {i+1}")
    print(f"Video Features (first {num_rows_to_display} rows):")
    print(video_features_np[:num_rows_to_display])

    print(f"Audio Features (first {num_rows_to_display} rows):")
    # Note: If the second dimension represents a time or sequence dimension, you may need to adjust this
    print(audio_features_np[:, :num_rows_to_display])  # Assuming the first dimension is batch

    print("Labels:")
    print(labels_np)  # Labels might be just one per sample, depending on your data structure
    print("\n----------\n")