In [1]:
import pandas as pd
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader

class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1): #3Frames per second, max amount frames = window_length*3
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = window_length * 3
        self.data = self._synchronize()
        #step всегда пол окна, кроме случая с 1 сек, потому что аудио извлечено с шагом в секунду и нет смысла брать шаг 1.5 для шага модальностей

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        return path.split('/')[-1].split('_')[0]
        
    def _synchronize(self):
        synchronized_data = []
        video_ids = self.video_df['path'].apply(self._extract_video_id).unique() #retrieves unique video IDs from the video dataframe's file paths.
        
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]#test
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            """
            - window of time defined by start_time and end_time (where end_time = start_time + self.window_length)
            - video segments are selected based on their timestamps being within this window. Audio segments are chosen if their start and end timesteps fall within the window
            - if selected video segment has less frames than self.max_frames, additional rows (copies of the last row) are appended to match self.max_frames
            - labels for arousal and valence are taken from the last row of the windowed video data
            """
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)] #here smth can be wrong, check
                #в большинстве случаев должно быть 6 фреймов в окне
                # shape video , shape audio, timestep audio/video, labels
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)] #print timesteps

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)
                
                #assert len(window_video_data)==int(self.window_length*3) вылетает ошибка сразу если не тру

                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 4:].values #убрать arousal valence из фич, потому что они в лейблах
                    audio_features = window_audio_data.iloc[:, 6:].values #выбросить пустую колонку features
                    #распечатать все что сформировала 
                    # какие у видео и аудио начало и конец таймстеп, сколько кадров и какие лейблы, первые 10 итераций

                    # Here: extract video_name and timestamp for each sample
                    video_name = video_id
                    timestamp = window_video_data.iloc[-1]['timestamp']

                    synchronized_data.append((video_features, audio_features, labels, video_name, timestamp))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data # contains tuples of synchronized video and audio features along with their labels, names, and timestamps

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieves a single item from the dataset by index. It ensures that video and audio features are converted to tensors of type float32 and
        labels are processed to ensure they're in a consistent format before returning them along with the video name and timestamp.
        """
        video_features, audio_features, labels, video_name, timestamp = self.data[idx]
    
        # Ensure labels are in a consistent format
        labels = np.array(labels, dtype=np.float32)

        video_features_tensor = torch.tensor(video_features, dtype=torch.float32)
        audio_features_tensor = torch.tensor(audio_features, dtype=torch.float32)
        labels_tensor = torch.tensor(labels, dtype=torch.float32)

        print(f"Video features type: {type(video_features)}, shape: {video_features.shape}")
        print(f"Audio features type: {type(audio_features)}, shape: {audio_features.shape}")
        print(f"Labels type: {type(labels)}, shape: {labels.shape}")

        return video_features_tensor, audio_features_tensor, labels_tensor, video_name, timestamp


# Load video and audio data
video_train_df = pd.read_csv('SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False) #убраить шафл и проверить вручную! /changed to False..
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Assuming you have your DataLoader setup as before
for i, (video_features, audio_features, labels, video_name, timestamp) in enumerate(train_loader):
    if i >= 10:  # Just to limit the output to the first 10 batches
        break
    print(f"Batch {i+1}")
    print(f"Video Name: {video_name}, Timestamp: {timestamp}")    
    # Convert tensors to numpy for easy slicing and displaying
    video_features_np = video_features.numpy()
    audio_features_np = audio_features.numpy()
    labels_np = labels.numpy()
    
    # Print the first few rows of each. Adjust the number of rows as needed
    num_rows_to_display = 3  # for example, to display the first 3 rows
    
    print(f"Batch {i+1}")
    print(f"Video Features (first {num_rows_to_display} rows):")
    print(video_features_np[:num_rows_to_display])
    
    print(f"Audio Features (first {num_rows_to_display} rows):")
    # Note: If the second dimension represents a time or sequence dimension, you may need to adjust this
    print(audio_features_np[:, :num_rows_to_display])  # Assuming the first dimension is batch
    
    print("Labels:")
    print(labels_np)  # Labels might be just one per sample, depending on your data structure
    print("\n----------\n")
    # why "shape: (384, 257) " 384 не может быть

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Video features type: <class 'numpy.ndarray'>, shape: (6, 256)
Audio features type: <class 'numpy.ndarray'>, shape: (2, 768)
Labels type: <class 'numpy.ndarray'>, shape: (2,)
Batch 1
Video Name: ('SEW1101',), Timestamp: tensor([2.7200], dtype=torch.float64)
Batch 1
Video Features (first 3 rows):
[[[ 9.2403534e-05 -6.9705996e-04  7.8170612e-02 ...  3.2085361e-05
   -6.8727362e-01 -5.8516918e-04]
  [ 9.6840100e-05 -7.8241242e-04  2.5183462e-02 ...  6.2751940e-05
   -8.3312464e-01 -7.5931585e-04]
  [ 9.1237474e-05 -8.1487821e-04  1.0307739e-02 ...  7.0110924e-05
   -7.6311010e-01 -7.3544384e-04]
  [ 9.1237474e-05 -8.1487821e-04  1.0307739e-02 ...  7.0110924e-05
   -7.6311010e-01 -7.3544384e-04]
  [ 9.1237474e-05 -8.1487821e-04  1.0307739e-02 ...  7.0110924e-05
   -7.6311010e-01 -7.3544384e-04]
  [ 9.1237474e-05 -8.1487821e-04  1.0307739e-02 ...  7.0110924e-05
   -7.6311010e-01 -7.3544384e-04]]]
Audio Features (first 3 rows):
[[[-0.0765605   0.01174777  0.03130768 ...  0.0145654  -0.0082252

In [8]:
import pandas as pd
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader

class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None): #3Frames per second, max amount frames = window_length*3
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()
        #step всегда пол окна, кроме случая с 1 сек, потому что аудио извлечено с шагом в секунду и нет смысла брать шаг 1.5 для шага модальностей

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        return path.split('/')[-1].split('_')[0]
        
    def _synchronize(self):
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames() #ensures that all video segments in the dataset have a uniform length.

        video_ids = self.video_df['path'].apply(self._extract_video_id).unique() #retrieves unique video IDs from the video dataframe's file paths.
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]#test
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            """
            - window of time defined by start_time and end_time (where end_time = start_time + self.window_length)
            - video segments are selected based on their timestamps being within this window. Audio segments are chosen if their start and end timesteps fall within the window
            - if selected video segment has less frames than self.max_frames, additional rows (copies of the last row) are appended to match self.max_frames
            - labels for arousal and valence are taken from the last row of the windowed video data
            """
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)] #here smth can be wrong, check
                #в большинстве случаев должно быть 6 фреймов в окне
                # shape video , shape audio, timestep audio/video, labels
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)] #print timesteps

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)
                
                #assert len(window_video_data)==int(self.window_length*3) вылетает ошибка сразу если не тру

                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values #убрать arousal valence из фич, потому что они в лейблах
                    audio_features = window_audio_data.iloc[:, 5:].values #выбросить пустую колонку features
                    #распечатать все что сформировала 
                    # какие у видео и аудио начало и конец таймстеп, сколько кадров и какие лейблы, первые 10 итераций

                    # Here: extract video_name and timestamp for each sample
                    video_name = video_id
                    timestamp = window_video_data.iloc[-1]['timestamp']

                    synchronized_data.append((video_features, audio_features, labels, video_name, timestamp))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data # contains tuples of synchronized video and audio features along with their labels, names, and timestamps


    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieves a single item from the dataset by index. It ensures that video and audio features are converted to tensors of type float32 and
        labels are processed to ensure they're in a consistent format before returning them along with the video name and timestamp.
        """
        video_features, audio_features, labels, video_name, timestamp = self.data[idx]
    
        # Ensure labels are in a consistent format
        labels = np.array(labels, dtype=np.float32)

        video_features_tensor = torch.tensor(video_features, dtype=torch.float32)
        audio_features_tensor = torch.tensor(audio_features, dtype=torch.float32)
        labels_tensor = torch.tensor(labels, dtype=torch.float32)

        print(f"Video features type: {type(video_features)}, shape: {video_features.shape}")
        print(f"Audio features type: {type(audio_features)}, shape: {audio_features.shape}")
        print(f"Labels type: {type(labels)}, shape: {labels.shape}")

        return video_features_tensor, audio_features_tensor, labels_tensor, video_name, timestamp


# Load video and audio data
video_train_df = pd.read_csv('SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) #убраить шафл и проверить вручную!
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Assuming you have your DataLoader setup as before
for i, (video_features, audio_features, labels, video_name, timestamp) in enumerate(train_loader):
    if i >= 10:  # Just to limit the output to the first 10 batches
        break
    print(f"Batch {i+1}")
    print(f"Video Name: {video_name}, Timestamp: {timestamp}")    
    # Convert tensors to numpy for easy slicing and displaying
    video_features_np = video_features.numpy()
    audio_features_np = audio_features.numpy()
    labels_np = labels.numpy()
    
    # Print the first few rows of each. Adjust the number of rows as needed
    num_rows_to_display = 3  # for example, to display the first 3 rows
    
    print(f"Batch {i+1}")
    print(f"Video Features (first {num_rows_to_display} rows):")
    print(video_features_np[:num_rows_to_display])
    
    print(f"Audio Features (first {num_rows_to_display} rows):")
    # Note: If the second dimension represents a time or sequence dimension, you may need to adjust this
    print(audio_features_np[:, :num_rows_to_display])  # Assuming the first dimension is batch
    
    print("Labels:")
    print(labels_np)  # Labels might be just one per sample, depending on your data structure
    print("\n----------\n")
    # why "shape: (384, 257) " 384 не может быть

Video features type: <class 'numpy.ndarray'>, shape: (384, 257)
Audio features type: <class 'numpy.ndarray'>, shape: (2, 769)
Labels type: <class 'numpy.ndarray'>, shape: (2,)
Batch 1
Video Name: ('SEW1117',), Timestamp: tensor([115.9400], dtype=torch.float64)
Batch 1
Video Features (first 3 rows):
[[[ 5.0885999e-03  8.2287894e-05 -1.3206836e-03 ...  4.3610100e-05
   -9.7518128e-01 -5.7298515e-04]
  [ 0.0000000e+00  8.0436737e-05 -1.3107845e-03 ...  4.6995563e-05
   -9.6503872e-01 -6.5678888e-04]
  [ 0.0000000e+00  7.4407202e-05 -1.1705542e-03 ...  7.4300915e-05
   -9.2931557e-01 -7.4286928e-04]
  ...
  [ 0.0000000e+00  7.9427788e-05 -1.2892334e-03 ...  8.6821208e-05
   -9.7242546e-01 -7.6293014e-04]
  [ 0.0000000e+00  7.9427788e-05 -1.2892334e-03 ...  8.6821208e-05
   -9.7242546e-01 -7.6293014e-04]
  [ 0.0000000e+00  7.9427788e-05 -1.2892334e-03 ...  8.6821208e-05
   -9.7242546e-01 -7.6293014e-04]]]
Audio Features (first 3 rows):
[[[        nan -0.12013236  0.00393401 ... -0.011756   

In [2]:
import pandas as pd
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader

# Adjusted class to consider the path difference
class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None):
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        # Adjust the slicing as per your directory structure if needed.
        return path.split('/')[-1].split('_')[0]

    def _synchronize(self):
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames()

        video_ids = self.video_df['path'].apply(self._extract_video_id).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)]

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)
                    
                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values
                    audio_features = window_audio_data.iloc[:, 5:].values
                    
                    synchronized_data.append((video_features, audio_features, labels))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data

    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_features, audio_features, labels = self.data[idx]
    
        print(f"Video features type: {type(video_features)}, shape: {video_features.shape}")
        print(f"Audio features type: {type(audio_features)}, shape: {audio_features.shape}")
        print(f"Labels type: {type(labels)}, shape: {labels.shape}")

        # Ensure video_features and audio_features are numpy arrays of type float
        video_features = np.array(video_features, dtype=np.float32)
        audio_features = np.array(audio_features, dtype=np.float32)
    
        # Convert labels to numpy array if it's not already
        labels = np.array(labels, dtype=np.float32)
    
        return torch.tensor(video_features, dtype=torch.float), torch.tensor(audio_features, dtype=torch.float), torch.tensor(labels, dtype=torch.float)


# Load video and audio data
video_train_df = pd.read_csv('SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display the first 10 entries from the train_loader
for i, (video_features, audio_features, labels) in enumerate(train_loader):
    if i >= 10:
        break
    print(f"Sample {i+1}: Video features shape: {video_features.shape}, Audio features shape: {audio_features.shape}, Labels: {labels.numpy()}")


Video features type: <class 'numpy.ndarray'>, shape: (384, 257)
Audio features type: <class 'numpy.ndarray'>, shape: (2, 769)
Labels type: <class 'numpy.ndarray'>, shape: (2,)
Sample 1: Video features shape: torch.Size([1, 384, 257]), Audio features shape: torch.Size([1, 2, 769]), Labels: [[ 0.0877588 -0.0935914]]
Video features type: <class 'numpy.ndarray'>, shape: (384, 257)
Audio features type: <class 'numpy.ndarray'>, shape: (2, 769)
Labels type: <class 'numpy.ndarray'>, shape: (2,)
Sample 2: Video features shape: torch.Size([1, 384, 257]), Audio features shape: torch.Size([1, 2, 769]), Labels: [[0.2031382 0.0109242]]
Video features type: <class 'numpy.ndarray'>, shape: (384, 257)
Audio features type: <class 'numpy.ndarray'>, shape: (2, 769)
Labels type: <class 'numpy.ndarray'>, shape: (2,)
Sample 3: Video features shape: torch.Size([1, 384, 257]), Audio features shape: torch.Size([1, 2, 769]), Labels: [[0.1340938 0.0669474]]
Video features type: <class 'numpy.ndarray'>, shape: (38

In [1]:
import pandas as pd
import numpy as np 
import torch
from torch.utils.data import Dataset, DataLoader

# Adjusted class to consider the path difference
class SynchronizedAudioVideoDataset(Dataset):
    def __init__(self, video_df, audio_df, window_length=2, step=1, max_frames=None):
        self.video_df = video_df
        self.audio_df = audio_df
        self.window_length = window_length
        self.step = step
        self.max_frames = max_frames
        self.data = self._synchronize()

    def _extract_video_id(self, path):
        # This function extracts the video ID from the video file path.
        # Adjust the slicing as per your directory structure if needed.
        return path.split('/')[-1].split('_')[0]

    def _synchronize(self):
        synchronized_data = []
        if self.max_frames is None:
            self.max_frames = self._calculate_max_frames()

        video_ids = self.video_df['path'].apply(self._extract_video_id).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].apply(lambda x: self._extract_video_id(x) == video_id)]
            audio_data = self.audio_df[self.audio_df['filename'].str.contains(video_id)]

            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                window_audio_data = audio_data[(audio_data['start_timestep'] >= start_time) & (audio_data['end_timestep'] <= end_time)]

                if len(window_video_data) > 0 and len(window_audio_data) > 0:
                    if len(window_video_data) < self.max_frames:
                        additional_rows = self.max_frames - len(window_video_data)
                        last_row = window_video_data.iloc[-1:].copy()
                        for _ in range(additional_rows):
                            window_video_data = pd.concat([window_video_data, last_row], ignore_index=True)
                    
                    labels = window_video_data.iloc[-1][['arousal', 'valence']].values
                    video_features = window_video_data.iloc[:, 3:].values
                    audio_features = window_audio_data.iloc[:, 5:].values
                    
                    synchronized_data.append((video_features, audio_features, labels))

                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break

        return synchronized_data

    def _calculate_max_frames(self):
        """
        Calculates the maximum number of frames within any window across all videos.
        :return: Maximum number of frames within a window.
        """
        max_frames = 0
        video_ids = self.video_df['path'].apply(lambda x: x.split('/')[-2]).unique()
        for video_id in video_ids:
            video_data = self.video_df[self.video_df['path'].str.contains(video_id)]
            start_time = 0
            while True:
                end_time = start_time + self.window_length
                window_video_data = video_data[(video_data['timestamp'] >= start_time) & (video_data['timestamp'] < end_time)]
                max_frames = max(max_frames, len(window_video_data))
                start_time += self.step
                if start_time + self.window_length > video_data['timestamp'].max():
                    break
        return max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_features, audio_features, labels = self.data[idx]
    
        print(f"Video features type: {type(video_features)}, shape: {video_features.shape}")
        print(f"Audio features type: {type(audio_features)}, shape: {audio_features.shape}")
        print(f"Labels type: {type(labels)}, shape: {labels.shape}")

        # Ensure video_features and audio_features are numpy arrays of type float
        video_features = np.array(video_features, dtype=np.float32)
        audio_features = np.array(audio_features, dtype=np.float32)
    
        # Convert labels to numpy array if it's not already
        labels = np.array(labels, dtype=np.float32)
    
        return torch.tensor(video_features, dtype=torch.float), torch.tensor(audio_features, dtype=torch.float), torch.tensor(labels, dtype=torch.float)


# Load video and audio data
video_train_df = pd.read_csv('SEWA_radiant_fog_160_train.csv')
video_dev_df = pd.read_csv('SEWA_radiant_fog_160_dev.csv')
video_test_df = pd.read_csv('SEWA_radiant_fog_160_test.csv')

audio_train_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_train.csv')
audio_dev_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_dev.csv')
audio_test_df = pd.read_csv('1sec/SEWA_features_wav2vec_1_seconds_test.csv')

# Create dataset instances
train_dataset = SynchronizedAudioVideoDataset(video_train_df, audio_train_df)
dev_dataset = SynchronizedAudioVideoDataset(video_dev_df, audio_dev_df)
test_dataset = SynchronizedAudioVideoDataset(video_test_df, audio_test_df)

# Create DataLoader instances
batch_size = 1  # For demonstration
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Assuming you have your DataLoader setup as before
for i, (video_features, audio_features, labels) in enumerate(train_loader):
    if i >= 10:  # Just to limit the output to the first 10 batches
        break
    
    # Convert tensors to numpy for easy slicing and displaying
    video_features_np = video_features.numpy()
    audio_features_np = audio_features.numpy()
    labels_np = labels.numpy()
    
    # Print the first few rows of each. Adjust the number of rows as needed
    num_rows_to_display = 3  # for example, to display the first 3 rows
    
    print(f"Batch {i+1}")
    print(f"Video Features (first {num_rows_to_display} rows):")
    print(video_features_np[:num_rows_to_display])
    
    print(f"Audio Features (first {num_rows_to_display} rows):")
    # Note: If the second dimension represents a time or sequence dimension, you may need to adjust this
    print(audio_features_np[:, :num_rows_to_display])  # Assuming the first dimension is batch
    
    print("Labels:")
    print(labels_np)  # Labels might be just one per sample, depending on your data structure
    print("\n----------\n")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Video features type: <class 'numpy.ndarray'>, shape: (384, 257)
Audio features type: <class 'numpy.ndarray'>, shape: (2, 769)
Labels type: <class 'numpy.ndarray'>, shape: (2,)
Batch 1
Video Features (first 3 rows):
[[[ 2.4889860e-01  8.8315610e-05 -6.2006363e-04 ...  3.9475199e-05
   -9.8073441e-01 -3.2977949e-04]
  [ 2.2260000e-01  9.4982752e-05 -9.3289296e-04 ...  2.4347770e-05
   -9.8099303e-01 -5.2966579e-04]
  [ 1.8751600e-01  8.8167995e-05 -8.6004921e-04 ...  4.7251844e-05
   -9.7213334e-01 -5.1216752e-04]
  ...
  [ 9.9471398e-02  7.8380086e-05 -9.7199652e-04 ...  6.0424798e-05
   -9.1462880e-01 -5.3763820e-04]
  [ 9.9471398e-02  7.8380086e-05 -9.7199652e-04 ...  6.0424798e-05
   -9.1462880e-01 -5.3763820e-04]
  [ 9.9471398e-02  7.8380086e-05 -9.7199652e-04 ...  6.0424798e-05
   -9.1462880e-01 -5.3763820e-04]]]
Audio Features (first 3 rows):
[[[        nan -0.02999436 -0.01172482 ... -0.09862999 -0.0093683
   -0.13289516]
  [        nan -0.0118871  -0.01024382 ... -0.17639372  0.

In [2]:
import pandas as pd

# Function to load a dataset
def load_dataset(audio_path, video_path):
    audio_df = pd.read_csv(audio_path)
    video_df = pd.read_csv(video_path)
    # Ensure the timestamp is rounded for proper matching
    video_df['matched_audio_start'] = video_df['timestamp'].apply(lambda x: int(x))
    return audio_df, video_df

# Function to ensure data types match and prevent many-to-many joins
def synchronize_and_merge(audio_df, video_df):
    # Convert timestamps in both dataframes to a consistent format and type
    audio_df['start_timestep'] = audio_df['start_timestep'].apply(lambda x: round(x, 2))
    video_df['matched_audio_start'] = video_df['timestamp'].apply(lambda x: round(x, 2))

    # Ensure both columns are of the same type
    audio_df['start_timestep'] = audio_df['start_timestep'].astype(float)
    video_df['matched_audio_start'] = video_df['matched_audio_start'].astype(float)

    # Merge using an inner join to ensure only matching rows are included
    merged_df = pd.merge(audio_df, video_df, left_on='start_timestep', right_on='matched_audio_start', how='inner')

    return merged_df

# Paths to your datasets
dataset_paths = {
    "train": ("1sec/SEWA_features_wav2vec_1_seconds_train.csv", "SEWA_radiant_fog_160_train.csv"),
    "dev": ("1sec/SEWA_features_wav2vec_1_seconds_dev.csv", "SEWA_radiant_fog_160_dev.csv"),
    "test": ("1sec/SEWA_features_wav2vec_1_seconds_test.csv", "SEWA_radiant_fog_160_test.csv")
}

# Process and save each dataset
for set_name, (audio_path, video_path) in dataset_paths.items():
    audio_df, video_df = load_dataset(audio_path, video_path)
    merged_df = synchronize_and_merge(audio_df, video_df)
    # Save the synchronized and merged dataset
    merged_df.to_csv(f'SynchronizedData-Fusion/synchronized_{set_name}.csv', index=False)

In [None]:
import pandas as pd

def synchronize_audio_video(audio_df, video_df):
    # Round video timestamps for easier matching
    video_df['rounded_timestamp'] = video_df['timestamp'].round()
    
    # Prepare the audio dataframe by creating a key for easier merging
    audio_df['merge_key'] = audio_df['start_timestep'].astype(int)
    
    # Merge based on the rounded timestamp and the new audio merge key
    merged_df = pd.merge(video_df, audio_df, left_on='rounded_timestamp', right_on='merge_key', how='inner')
    
    # Drop unnecessary columns if needed
    merged_df.drop(['rounded_timestamp', 'merge_key'], axis=1, inplace=True)
    
    return merged_df

# Paths to your datasets
audio_train_path = '1sec/SEWA_features_wav2vec_1_seconds_train.csv'
audio_dev_path = '1sec/SEWA_features_wav2vec_1_seconds_dev.csv'
audio_test_path = '1sec/SEWA_features_wav2vec_1_seconds_test.csv'

video_train_path = 'SEWA_radiant_fog_160_train.csv'
video_dev_path = 'SEWA_radiant_fog_160_dev.csv'
video_test_path = 'SEWA_radiant_fog_160_test.csv'

# Load the datasets
audio_train = pd.read_csv(audio_train_path)
audio_dev = pd.read_csv(audio_dev_path)
audio_test = pd.read_csv(audio_test_path)

video_train = pd.read_csv(video_train_path)
video_dev = pd.read_csv(video_dev_path)
video_test = pd.read_csv(video_test_path)

# Synchronize and merge the datasets
merged_train = synchronize_audio_video(audio_train, video_train)
merged_dev = synchronize_audio_video(audio_dev, video_dev)
merged_test = synchronize_audio_video(audio_test, video_test)

# Save the synchronized and merged datasets
merged_train.to_csv('SynchronizedData-Fusion/merged_train.csv', index=False)
merged_dev.to_csv('SynchronizedData-Fusion/merged_dev.csv', index=False)
merged_test.to_csv('SynchronizedData-Fusion/merged_test.csv', index=False)

print("Synchronization and merging complete. Files saved.")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
