# Setup

In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import glob
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Directories holding the downloaded .mp4 files for each split.
# The defaults are the original locations; set SNAPUGC_TRAIN_VIDEOS /
# SNAPUGC_TEST_VIDEOS in the environment to run the notebook against a copy
# stored elsewhere without editing this cell.
TRAIN_VIDEOS_PATH = os.environ.get("SNAPUGC_TRAIN_VIDEOS", "/mnt/dat/thes/Train")
TEST_VIDEOS_PATH = os.environ.get("SNAPUGC_TEST_VIDEOS", "/mnt/dat/thes/Test")

In [None]:
# Load the tab-separated annotation tables and tag every row with the split
# it came from, so the splits stay distinguishable after concatenation.
train_df = pd.read_csv("data/SnapUGC/train_out.txt", sep='\t')
train_df['Set'] = 'train'

test_df = pd.read_csv("data/SnapUGC/test_out.txt", sep='\t')
test_df['Set'] = 'test'

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each split keeps its own labels, so index labels repeat and label-based
# lookups on df become ambiguous.
df = pd.concat([train_df, test_df], ignore_index=True)

display(df)
len(train_df), len(test_df), len(df)

# Only select videos with a duration of 10-60 s

In [None]:
# Keep only rows whose Video_len lies in the inclusive range [10, 60].
# .copy() detaches the result from the unfiltered frame, so adding columns to
# df afterwards writes to an independent frame instead of a slice (which
# would trigger pandas' SettingWithCopyWarning).
df = df[(df['Video_len'] >= 10) & (df['Video_len'] <= 60)].copy()
df

# Normalize the order columns (ECR, NAWP) to 0-1

In [None]:
# One scaler instance is reused: fit_transform re-fits it on every call, so
# each order column is rescaled on its own min/max.
mmscl = MinMaxScaler()

# The scaler expects a 2-D (n_samples, 1) input; flatten the result back to
# 1-D before storing it as a column.
df['ECR'] = mmscl.fit_transform(df['order of ECR'].to_numpy().reshape(-1, 1)).ravel()
df['NAWP'] = mmscl.fit_transform(df['order of NAWP'].to_numpy().reshape(-1, 1)).ravel()
df

# Select filtered videos

In [None]:
# Collect the .mp4 paths found directly inside each split's video directory.
train_videos, test_videos = (
    glob.glob(os.path.join(videos_dir, "*.mp4"))
    for videos_dir in (TRAIN_VIDEOS_PATH, TEST_VIDEOS_PATH)
)

len(train_videos), len(test_videos)

In [None]:
def _index_videos(paths, set_name):
    """Build a frame with one row per video file.

    Args:
        paths: List of video file paths.
        set_name: Split label stored in the ``Set`` column of every row.

    Returns:
        DataFrame with columns ``Id`` (file name without directory and
        extension), ``Video`` (the path as given) and ``Set``.
    """
    # basename/splitext handle the platform's path separator and any
    # extension casing, unlike slicing on "/" and ".mp4".
    ids = [os.path.splitext(os.path.basename(p))[0] for p in paths]
    return pd.DataFrame({"Id": ids, "Video": paths, "Set": set_name})


train_videos = _index_videos(train_videos, "train")
test_videos = _index_videos(test_videos, "test")

videos_df = pd.concat([train_videos, test_videos])
videos_df

In [None]:
# Inner join on (Id, Set): keeps only annotation rows that have a matching
# entry in videos_df, and attaches that entry's Video path.
df = df.merge(videos_df, on=['Id', 'Set'], how='inner')
df

# Data exploration

In [None]:
# Overlay the two normalized targets; the labels and legend make the two step
# curves distinguishable.
plt.hist(df['NAWP'], density=True, histtype='step', label='NAWP')
plt.hist(df['ECR'], density=True, histtype='step', label='ECR')
plt.xlabel('Normalized value')
plt.ylabel('Density')
plt.legend()
plt.show()

In [None]:
# Video length against NAWP; low alpha so dense regions remain readable.
plt.scatter(df['Video_len'], df['NAWP'], alpha=0.1)
plt.xlabel('Video_len')
plt.ylabel('NAWP')
plt.show()

In [None]:
# Video length against ECR; low alpha so dense regions remain readable.
plt.scatter(df['Video_len'], df['ECR'], alpha=0.1)
plt.xlabel('Video_len')
plt.ylabel('ECR')
plt.show()

In [None]:
# The two normalized targets against each other.
plt.scatter(df['NAWP'], df['ECR'], alpha=0.1)
plt.xlabel('NAWP')
plt.ylabel('ECR')
plt.show()

# Convert video to tensors

In [None]:
# Sampling rates used when decoding a clip.
VIDEO_FPS = 5      # frame sampling rate passed to the video reader
AUDIO_FPS = 8_000  # audio sampling rate passed to the audio reader

In [None]:
from moviepy import VideoFileClip
import numpy as np


def video_to_tensor(path, video_fps, audio_fps):
    """Decode a video file into a frame array and an audio-sample array.

    Args:
        path: Path of the video file, passed to ``VideoFileClip``.
        video_fps: Rate at which frames are sampled from the clip.
        audio_fps: Rate at which audio samples are read from the clip.

    Returns:
        A ``(frames, audio_samples)`` tuple. ``frames`` is an array built
        from the frames sampled as ``uint8`` (expected shape
        ``(num_frames, height, width, 3)``). ``audio_samples`` is an array of
        the sampled audio (expected shape ``(num_samples, num_channels)``),
        or ``None`` when the clip has no audio.
    """
    video_clip = VideoFileClip(path)
    try:
        # Materialise every sampled frame into one array.
        frames = np.array(list(video_clip.iter_frames(fps=video_fps, dtype="uint8")))

        audio = video_clip.audio
        if audio is None:
            audio_samples = None
        else:
            audio_samples = np.array(list(audio.iter_frames(fps=audio_fps)))
    finally:
        # Release the clip even when decoding raises, so a corrupt file in a
        # long batch does not leave the reader open.
        video_clip.close()

    return frames, audio_samples


# Decode the video referenced by the third row of df as a smoke test.
frames, audio_samples = video_to_tensor(df['Video'].iloc[2], VIDEO_FPS, AUDIO_FPS)

# Report the decoded array shapes; audio is skipped when the clip has none.
print("Frames shape:", frames.shape)
if audio_samples is not None:
    print("Audio shape:", audio_samples.shape)

In [None]:
import sounddevice as sd

# Play back the decoded audio as a listening check.
if audio_samples is None or audio_samples.size == 0:
    # video_to_tensor returns None for clips without audio; nothing to play.
    print("No audio found in the video.")
else:
    # Peak-normalise to [-1, 1]. An all-zero (silent) signal is left as is,
    # because dividing by a zero peak would fill the array with NaNs.
    peak = np.max(np.abs(audio_samples))
    if peak > 0:
        audio_samples = audio_samples / peak
    print("Playing audio...")
    sd.play(audio_samples, samplerate=AUDIO_FPS)
    sd.wait()  # block until playback has finished

In [None]:
# Show the first sampled frame; plt.show() matches the other plotting cells
# and keeps the AxesImage repr out of the cell output.
plt.imshow(frames[0])
plt.title("First sampled frame")
plt.show()

In [None]:
# import librosa
# import numpy as np
# import matplotlib.pyplot as plt
# import librosa.display

# # Assume audio_samples is from your `video_to_tensor()` function
# if audio_samples is not None:
#     # Ensure mono if audio has multiple channels
#     if len(audio_samples.shape) == 2:  # Stereo
#         audio_samples = np.mean(audio_samples, axis=1)  # Convert to mono

#     # Normalize audio to range [-1, 1]
#     audio_samples = audio_samples / np.max(np.abs(audio_samples))
    
#     # Sampling rate from AUDIO_FPS
#     sampling_rate = AUDIO_FPS

#     # Compute MFCCs
#     n_mfcc = 13  # Number of coefficients
#     mfccs = librosa.feature.mfcc(y=audio_samples, sr=sampling_rate, n_mfcc=n_mfcc)

#     # Print MFCC shape
#     print("MFCCs shape:", mfccs.shape)  # (n_mfcc, time_frames)

#     # Plot MFCCs
#     plt.figure(figsize=(10, 4))
#     librosa.display.specshow(mfccs, x_axis='time', sr=sampling_rate)
#     plt.colorbar(format='%+2.0f dB')
#     plt.title('MFCC')
#     plt.tight_layout()
#     plt.show()
# else:
#     print("No audio found in the video.")
