# Setup

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import glob
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import multiprocessing as mp
import cv2
from scipy.signal import resample


In [None]:
# Directories that are searched for the "*.mp4" files of each split.
# NOTE(review): these are absolute, machine-specific paths; adjust them before running elsewhere.
TRAIN_VIDEOS_PATH = "/mnt/d/Thesis/Data/Video/Train"
TEST_VIDEOS_PATH = "/mnt/d/Thesis/Data/Video/Test"

# Output root: the preprocessed arrays are written to per-split sub-folders of this directory.
TO_PATH = "/mnt/d/Thesis/Prep"

In [None]:
# Rates at which frames and audio samples are read from each clip.
VIDEO_FPS = 1
AUDIO_FPS = 2000
# Clip duration (seconds) that the fixed-size outputs below are dimensioned for.
MAX_SECONDS = 60
# Target size handed to cv2.resize. The zero padding in transfrom_image is allocated as
# (target_size[1], target_size[0]) per frame, i.e. this tuple is (width, height).
TARGET_FRAME_SIZE = (480, 840)
# Fixed number of frames / audio samples each clip is brought to.
TARGET_N_FRAME = MAX_SECONDS * VIDEO_FPS
TARGET_AUDIO_LENGTH = MAX_SECONDS * AUDIO_FPS


# Fraction of the merged metadata rows that is sampled for preprocessing.
SUB_FRACTION = 0.1 

In [None]:
# Read the two tab-separated metadata files.
train_df = pd.read_csv("data/SnapUGC/train_out.txt", sep='\t')
test_df = pd.read_csv("data/SnapUGC/test_out.txt", sep='\t')

# Tag every row with the split it belongs to, then stack both splits into one table.
train_df['Set'], test_df['Set'] = 'train', 'test'
df = pd.concat((train_df, test_df))

display(df)
len(train_df), len(test_df), len(df)

# Keep only videos with a duration of 10-60 s

In [None]:
# Keep clips whose length lies in [10, MAX_SECONDS] seconds (both bounds inclusive).
# The upper bound reuses MAX_SECONDS so the filter stays in sync with the fixed output
# sizes derived from it. `.copy()` detaches the result from the unfiltered frame, so
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['Video_len'].between(10, MAX_SECONDS)].copy()
df

# Normalize the ECR and NAWP orderings to the range 0-1

In [None]:
mmscl = MinMaxScaler()

# The scaler is refit for each column, so both targets are rescaled independently.
# Selecting with a one-element list yields the 2-D (n_rows, 1) input the scaler expects.
df['ECR'] = mmscl.fit_transform(df[['order of ECR']].to_numpy()).ravel()
df['NAWP'] = mmscl.fit_transform(df[['order of NAWP']].to_numpy()).ravel()
df

# Select filtered videos

In [None]:
# Collect the paths of every "*.mp4" file found directly inside each split directory.
train_videos, test_videos = (
    glob.glob(os.path.join(split_dir, "*.mp4"))
    for split_dir in (TRAIN_VIDEOS_PATH, TEST_VIDEOS_PATH)
)

len(train_videos), len(test_videos)

In [None]:
def _video_id(path):
    """Return the file name of ``path`` without its directory and extension.

    Uses os.path helpers instead of searching for "/" and ".mp4" by hand, so it
    works with any path separator and never truncates a name when the extension
    is spelled differently (e.g. upper case).
    """
    return os.path.splitext(os.path.basename(path))[0]


def _videos_frame(paths, split):
    """Build a table with one row per video file: its Id, its path and its split name."""
    return pd.DataFrame({"Id": [_video_id(p) for p in paths],
                         "Video": paths,
                         "Set": split})


# NOTE: the names of the path lists are rebound to DataFrames here, so this cell
# must be preceded by the cell that globs the files whenever it is re-run.
train_videos = _videos_frame(train_videos, "train")
test_videos = _videos_frame(test_videos, "test")

videos_df = pd.concat([train_videos, test_videos])
videos_df

In [None]:
# Keep only metadata rows whose video file exists on disk (inner join on Id and split),
# then make the video Id the row index.
df = df.merge(videos_df, how='inner', on=['Id', 'Set']).set_index('Id')
df

# Data exploration

In [None]:
# Overlay the distributions of both normalized targets. The legend and axis labels
# make the two step curves distinguishable and let the figure stand on its own.
fig, ax = plt.subplots()
ax.hist(df['NAWP'], density=True, histtype='step', label='NAWP')
ax.hist(df['ECR'], density=True, histtype='step', label='ECR')
ax.set(xlabel='Normalized value', ylabel='Density', title='Distribution of NAWP and ECR')
ax.legend()
plt.show()

In [None]:
# NAWP against clip length; axis labels are set so the figure is readable on its own.
fig, ax = plt.subplots()
ax.scatter(df['Video_len'], df['NAWP'], alpha=0.1)
ax.set(xlabel='Video_len', ylabel='NAWP', title='NAWP vs. Video_len')
plt.show()

In [None]:
# ECR against clip length; axis labels are set so the figure is readable on its own.
fig, ax = plt.subplots()
ax.scatter(df['Video_len'], df['ECR'], alpha=0.1)
ax.set(xlabel='Video_len', ylabel='ECR', title='ECR vs. Video_len')
plt.show()

In [None]:
# Relationship between the two targets; axis labels are set so the figure is readable on its own.
fig, ax = plt.subplots()
ax.scatter(df['NAWP'], df['ECR'], alpha=0.1)
ax.set(xlabel='NAWP', ylabel='ECR', title='ECR vs. NAWP')
plt.show()

# Convert videos to tensors

In [14]:
# Draw one random row and collapse the single-row frame into a Series.
sample = df.sample(n=1).squeeze(axis=0)
sample

In [16]:
def transfrom_image(frames: np.ndarray, target_size=TARGET_FRAME_SIZE, target_n_frames=TARGET_N_FRAME):
    """Resize every frame and bring the sequence to exactly ``target_n_frames`` frames.

    Args:
        frames: Sequence of frames, indexed by frame along the first axis.
            Assumed to be 3-channel images, since the padding is allocated
            with 3 channels - TODO confirm for other inputs.
        target_size: ``(width, height)`` passed to ``cv2.resize``.
        target_n_frames: Number of frames in the result.

    Returns:
        Array of ``target_n_frames`` resized frames. Sequences that are too long
        are cut after ``target_n_frames`` frames; sequences that are too short
        are padded at the end with all-zero ``uint8`` frames.
    """
    width, height = target_size

    # Drop surplus frames before resizing so no work is spent on frames that
    # would be discarded anyway.
    kept = frames[:target_n_frames]
    resized = [cv2.resize(frame, (width, height), interpolation=cv2.INTER_LINEAR) for frame in kept]

    if resized:
        resized_frames = np.array(resized)
    else:
        # No input frames: start from an empty 4-D array so the padding below
        # can still be concatenated (np.array([]) would be 1-D and fail).
        resized_frames = np.zeros((0, height, width, 3), dtype=np.uint8)

    missing = target_n_frames - resized_frames.shape[0]
    if missing > 0:
        padding = np.zeros((missing, height, width, 3), dtype=np.uint8)
        resized_frames = np.concatenate((resized_frames, padding), axis=0)

    return resized_frames

In [None]:
def resample_audio(audio, fixed_length=TARGET_AUDIO_LENGTH):
    """Return a two-channel audio array with exactly ``fixed_length`` samples.

    Args:
        audio: Array indexed as ``audio[sample, channel]`` with at least two
            channels, or ``None`` / an empty array when there is no audio.
        fixed_length: Number of samples in the result.

    Returns:
        * ``audio`` itself when it already has ``fixed_length`` samples;
        * otherwise the first two channels, each resampled with
          ``scipy.signal.resample`` to ``fixed_length`` samples, as an array of
          shape ``(fixed_length, 2)``;
        * an all-zero ``(fixed_length, 2)`` array when there is no audio.

    NOTE(review): audio of a different length is resampled to ``fixed_length``
    (it is stretched or compressed, not zero-padded), unlike the frames in
    ``transfrom_image``, which are zero-padded - confirm that this is intended.
    The zero-padding branch that used to follow the resampling could never run,
    because the length always equals ``fixed_length`` at that point, and has
    been removed.
    """
    # Missing or empty audio used to raise; treat it as silence instead.
    if audio is None or len(audio) == 0:
        return np.zeros((fixed_length, 2))

    if audio.shape[0] == fixed_length:
        return audio

    return np.array([resample(audio[:, channel], fixed_length) for channel in range(2)]).T

In [None]:
from moviepy import VideoFileClip
import numpy as np


def video_to_tensor(path, video_fps=VIDEO_FPS, audio_fps=AUDIO_FPS):
    """Decode a video file into fixed-size frame and audio arrays.

    Args:
        path: Path of the video file to open with ``VideoFileClip``.
        video_fps: Rate at which frames are read from the clip.
        audio_fps: Rate at which audio samples are read from the clip.

    Returns:
        ``(frames, audio_samples)`` where ``frames`` is the output of
        ``transfrom_image`` (frame-first, channels-last layout) and
        ``audio_samples`` is the output of ``resample_audio``. For a clip
        without an audio track, ``audio_samples`` is an all-zero array of shape
        ``(TARGET_AUDIO_LENGTH, 2)``.

    NOTE(review): the target frame count and audio length come from the
    module-level defaults of the helpers, not from ``video_fps``/``audio_fps``;
    passing other rates changes how much of the clip is covered.
    """
    video_clip = VideoFileClip(path)
    try:
        # Shape: (num_frames, height, width, 3). The frames are kept in this layout:
        # transfrom_image iterates over the first axis and expects one image per
        # entry, so the channel axis must not be moved to the front here.
        frames = np.array(list(video_clip.iter_frames(fps=video_fps, dtype="uint8")))

        # Extract audio as a numpy array (None when the clip has no audio track).
        audio = video_clip.audio
        if audio is not None:
            audio_samples = np.array(list(audio.iter_frames(fps=audio_fps)))
        else:
            audio_samples = None
    finally:
        # Always release the reader, even if decoding fails.
        video_clip.close()

    frames = transfrom_image(frames)

    if audio_samples is None:
        # Silent placeholder so clips without audio produce the same output shape.
        audio_samples = np.zeros((TARGET_AUDIO_LENGTH, 2))
    else:
        audio_samples = resample_audio(audio_samples)

    return frames, audio_samples


# Decode the sampled clip and report the shapes of the resulting arrays.
frames, audio_samples = video_to_tensor(sample['Video'], video_fps=VIDEO_FPS, audio_fps=AUDIO_FPS)

print("Frames shape:", frames.shape)  # e.g., (num_frames, height, width, 3)
if audio_samples is not None:
    print("Audio shape:", audio_samples.shape)  # e.g., (num_audio_samples, num_channels)

In [17]:
# Display the last frame of the decoded sample. transfrom_image appends all-zero frames
# when a clip yields fewer than the target number of frames, so this may show a padding
# frame rather than video content.
plt.imshow(frames[-1])

# Optional audio playback check (disabled):
# import sounddevice as sd # remember to install libportaudio2
# audio_samples = audio_samples / np.max(np.abs(audio_samples))
# print("Playing audio...")
# sd.play(audio_samples, samplerate=AUDIO_FPS)
# sd.wait()

# Save tensors

In [19]:
def process_row(row):
    """Preprocess one video and write its arrays to disk.

    ``row`` is one row of the metadata table: its ``name`` is the video id and it
    provides the 'Video', 'ECR', 'NAWP' and 'Set' entries. Three compressed
    ``.npz`` files (frames, audio, label) are written to ``TO_PATH/<Set>/``.

    Returns:
        Always ``1``.
    """
    name = row.name  # video id
    # Regression targets stored next to the arrays: (ECR, NAWP).
    label = np.array((row['ECR'], row['NAWP']))

    frames, audio = video_to_tensor(row['Video'], VIDEO_FPS, AUDIO_FPS)
    # Move the last axis to position 1 (channels-first, torch-style layout).
    video = np.transpose(frames, (0, 3, 1, 2))

    save_dir = os.path.join(TO_PATH, row['Set'])
    frames_file = os.path.join(save_dir, f"{name}_fr.npz")
    audio_file = os.path.join(save_dir, f"{name}_audio.npz")
    label_file = os.path.join(save_dir, f"{name}_label.npz")

    np.savez_compressed(frames_file, video)
    np.savez_compressed(audio_file, audio)
    np.savez_compressed(label_file, label)

    return 1

def main(df: pd.DataFrame, num_processes=None):
    """Run ``process_row`` on every row of ``df`` in a pool of worker processes.

    Args:
        df: Table whose rows are passed to ``process_row`` one by one.
        num_processes: Number of worker processes. When ``None``, all but four
            CPUs are used, with a minimum of one worker.

    Returns:
        List with the return value of ``process_row`` for each row, in row order.
    """
    if num_processes is None:
        # cpu_count() - 4 is zero or negative on machines with four or fewer
        # cores, which mp.Pool rejects; always keep at least one worker.
        num_processes = max(1, mp.cpu_count() - 4)

    with mp.Pool(processes=num_processes) as pool:
        # imap yields results incrementally, so tqdm can show live progress.
        results = list(
            tqdm(
                pool.imap(process_row, (row for _, row in df.iterrows())),
                total=len(df),
                desc="Processing DF"
            )
        )
    return results

In [None]:
# Make sure the per-split output folders exist before anything is written to them.
for _split in ("train", "test"):
    os.makedirs(os.path.join(TO_PATH, _split), exist_ok=True)

In [None]:
# Fixed seed: the sampled subset determines which videos are written to disk, so it
# must be the same on every run for the preprocessed dataset to be reproducible.
sub_df = df.sample(frac=SUB_FRACTION, random_state=42)

In [None]:
# Keep the per-row results in a variable and show only their count, instead of
# echoing a list with one entry per processed video as the cell output.
results = main(sub_df, num_processes=4)
len(results)

# Create dataset

In [None]:
# import torch
# from torch.utils.data import Dataset
# from torchvision.transforms import ToTensor

# class CustomVideoDataset(Dataset):
#     def __init__(self, df: pd.DataFrame, transform=None, target_transform=None, video_fps=VIDEO_FPS, audio_fps=AUDIO_FPS):
#         self.df = df
#         self.transform = transform
#         self.video_fps = video_fps
#         self.audio_fps = audio_fps
#         self.target_transform = target_transform

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         try:
#             row = self.df.iloc[idx]
#         except KeyError:
#             return None        
        
#         video_path = row['Video']
#         ecr = row['ECR']
#         nawp = row['NAWP']
#         label = (ecr, nawp)
        
#         if self.transform:
#             video, audio = self.transform(video_path, self.video_fps, self.audio_fps)
#         if self.target_transform:
#             label = self.target_transform(ecr, nawp)
        
#         # permute frame
#         video = torch.permute(torch.tensor(video), (0, 3, 1, 2))
        
#         return (video, audio), label
    
# train_set = CustomVideoDataset(df[df['Set'] == 'train'], transform=video_to_tensor)
# test_set = CustomVideoDataset(df[df['Set'] == 'test'], transform=video_to_tensor)

# len(train_set), len(test_set)

In [None]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
# test_dataloader = DataLoader(test_set, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)