In [12]:
import torch
# Choose the `slow_r50` model 
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)
# Set to GPU or CPU
device = "cuda"
model = model.eval()
model = model.to(device)
# json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
# try: urllib.URLopener().retrieve(json_url, json_filename)
# except: urllib.request.urlretrieve(json_url, json_filename)
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)
# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 50


# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size=(crop_size, crop_size))
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = 10
print(clip_duration)
video_path = '/l/users/fathinah.izzati/Synchformer/output/tnj_test.mp4/segment_001.mp4'
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)

# # Move the inputs to the desired device
inputs = video_data["video"]
inputs = inputs.to(device)
# Dictionary to store the embeddings
embeddings = {}

def hook_fn(module, input, output):
    embeddings['feature'] = output.detach()

# Register the hook to the 'blocks[4]' layer
target_layer = model.blocks[4]
hook = target_layer.register_forward_hook(hook_fn)

# Forward pass to trigger the hook
with torch.no_grad():
    _ = model(inputs[None, ...])

hook.remove()

# Access the embeddings
feature_embeddings = embeddings['feature']
print("Embeddings shape:", feature_embeddings.shape)

Using cache found in /home/fathinah.izzati/.cache/torch/hub/facebookresearch_pytorchvideo_main


10
Embeddings shape: torch.Size([1, 2048, 50, 8, 8])


In [2]:
import torch
import json
import os
import urllib.request
import numpy as np
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import Compose, Lambda

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)
# ---------------------------- Setup and Configuration ----------------------------

# Choose the `slow_r50` model from PyTorch Hub
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)

# Set device to GPU if available, else CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.eval().to(device)


# Define transformation parameters
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 8#20        # Window size: current frame + next 7 frames
sampling_rate = 1      # Since we're sampling at 5 fps, a sampling rate of 1 maintains the rate
frames_per_second = 5  # Desired frame rate


# Define the transformation pipeline specific to the slow_r50 model
transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        UniformTemporalSubsample(num_frames),  # Ensure the clip has exactly `num_frames`
        Lambda(lambda x: x / 255.0),           # Normalize pixel values to [0, 1]
        NormalizeVideo(mean, std),             # Normalize using mean and std
        ShortSideScale(size=side_size),        # Scale the shorter side to `side_size`
        CenterCropVideo(crop_size=(crop_size, crop_size))  # Center crop to `crop_size`
    ])
)

# Path to your input directory containing videos
input_directory = '/l/users/fathinah.izzati/Synchformer/output/tnj_test.mp4/'
output_directory = '/l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/'

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Path to your input video
# video_path = '/l/users/fathinah.izzati/datasets/data-videomme-videochunked01/0ag_Qi5OEd0.mp4'


# Dictionary to store embeddings from the hook
embeddings_dict = {}

def hook_fn(module, input, output):
    """
    Hook function to capture the output of the target layer.
    """
    embeddings_dict['feature'] = output.detach()

# Register the hook to the 5th block (index 4) of the model
target_layer = model.blocks[4]
hook = target_layer.register_forward_hook(hook_fn)

# ---------------------------- Embedding Extraction Loop ----------------------------
# Iterate over each video file in the input directory
for video_file in os.listdir(input_directory):
    if video_file.endswith('.mp4'):
        video_path = os.path.join(input_directory, video_file)
        
        # Initialize the EncodedVideo helper class and load the video
        video = EncodedVideo.from_path(video_path)
        
        # Total duration and frame count
        total_seconds = 10
        total_frames = frames_per_second * total_seconds  # 50 frames for 10 seconds at 5 fps

        # List to store all embeddings
        embeddings = []

        # Iterate over each frame to extract embeddings
        for i in range(total_frames):
            # Calculate the start and end times for the current window
            start_sec = i / frames_per_second
            clip_duration = (num_frames * sampling_rate) / frames_per_second  # 20 frames / 5 fps = 4 seconds
            end_sec = start_sec + clip_duration

            # Ensure end_sec does not exceed the video's total duration
            end_sec = min(end_sec, total_seconds)

            # Extract the clip from the video
            video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

            # Apply the transformation pipeline
            video_data = transform(video_data)

            # Move the video tensor to the desired device
            inputs = video_data["video"].to(device)

            # Forward pass through the model to trigger the hook
            with torch.no_grad():
                _ = model(inputs[None, ...])  # Add batch dimension

            # Retrieve the embedding from the hook
            embedding = embeddings_dict.get('feature')

            if embedding is not None:
                # Move embedding to CPU and detach from the computation graph
                embeddings.append(embedding.cpu())
            else:
                print(f"Warning: No embedding extracted for frame {i} in video {video_file}.")

        # Concatenate all frame embeddings along the temporal dimension
        if len(embeddings) > 0:
            combined_embeddings = torch.cat(embeddings, dim=0)
            print('combined_embedding', combined_embeddings.shape)
            
            # Save the combined embeddings to a .npy file
            output_path = os.path.join(output_directory, f"{video_file.split('.')[0]}/embeddings2.npy")
            np.save(output_path, combined_embeddings.numpy())
            print(f"Saved embeddings for video {video_file} to {output_path}")
        else:
            print(f"Warning: No embeddings were collected for video {video_file}.")

# Remove the hook after extraction
hook.remove()

Using cache found in /home/fathinah.izzati/.cache/torch/hub/facebookresearch_pytorchvideo_main


combined_embedding torch.Size([50, 2048, 8, 8, 8])
Saved embeddings for video segment_014.mp4 to /l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/segment_014/embeddings2.npy
combined_embedding torch.Size([50, 2048, 8, 8, 8])
Saved embeddings for video segment_013.mp4 to /l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/segment_013/embeddings2.npy
combined_embedding torch.Size([50, 2048, 8, 8, 8])
Saved embeddings for video segment_026.mp4 to /l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/segment_026/embeddings2.npy
combined_embedding torch.Size([50, 2048, 8, 8, 8])
Saved embeddings for video segment_021.mp4 to /l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/segment_021/embeddings2.npy
combined_embedding torch.Size([50, 2048, 8, 8, 8])
Saved embeddings for video segment_028.mp4 to /l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/seg

In [12]:
loaded_embeddings = np.load(output_path)
combined_embeddings = torch.from_numpy(loaded_embeddings)
# Reshape the tensor to have 3 dimensions
# The new shape will be [50, 2048, 20 * 8 * 8], resulting in [50, 2048, 1280]
combined_embeddings_3d = combined_embeddings.view(50, 2048, -1)

# Print the shape of the new 3D tensor
print(f"Shape of the combined 3D embeddings tensor: {combined_embeddings_3d.shape}")

Shape of the combined 3D embeddings tensor: torch.Size([50, 2048, 512])


### Combining audio+video from inference result

In [1]:
[from moviepy.editor import VideoFileClip, AudioFileClip
import os

# Define the paths for audio and video directories
audio_dir = "/l/users/fathinah.izzati/coco-mulla-repo/demo/output/expe_tnj_2"
video_dir = "/l/users/fathinah.izzati/Synchformer/vis"
output_dir = "/l/users/fathinah.izzati/combined_output"  # Directory to save combined videos

# Create output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Loop through each segment and combine the audio and video
for i in range(1, 31):
    segment_num = f"{i:03d}"  # Format the segment number to 3 digits
    audio_path = os.path.join(audio_dir, f"segment_{segment_num}", "video-only.wav")
    video_path = os.path.join(video_dir, f"segment_{segment_num}_25fps_256side_16000hz.mp4")
    
    # Check if both audio and video files exist
    if os.path.exists(audio_path) and os.path.exists(video_path):
        # Load video and audio clips
        video_clip = VideoFileClip(video_path)
        audio_clip = AudioFileClip(audio_path)
        
        # Set audio of video to the audio clip
        video_with_audio = video_clip.set_audio(audio_clip)
        
        # Define output path
        output_path = os.path.join(output_dir, f"combined_segment_{segment_num}.mp4")
        
        # Write the result to a file
        video_with_audio.write_videofile(output_path, codec="libx264", audio_codec="aac", fps=25)
        
        print(f"Combined video and audio for segment {segment_num} saved to {output_path}")
    else:
        print(f"Audio or video file for segment {segment_num} not found.")]

Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_001.mp4.
MoviePy - Writing audio in combined_segment_001TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_001.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_001.mp4
Combined video and audio for segment 001 saved to /l/users/fathinah.izzati/combined_output/combined_segment_001.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_002.mp4.
MoviePy - Writing audio in combined_segment_002TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_002.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_002.mp4
Combined video and audio for segment 002 saved to /l/users/fathinah.izzati/combined_output/combined_segment_002.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_003.mp4.
MoviePy - Writing audio in combined_segment_003TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_003.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_003.mp4
Combined video and audio for segment 003 saved to /l/users/fathinah.izzati/combined_output/combined_segment_003.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_004.mp4.
MoviePy - Writing audio in combined_segment_004TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_004.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_004.mp4
Combined video and audio for segment 004 saved to /l/users/fathinah.izzati/combined_output/combined_segment_004.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_005.mp4.
MoviePy - Writing audio in combined_segment_005TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_005.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_005.mp4
Combined video and audio for segment 005 saved to /l/users/fathinah.izzati/combined_output/combined_segment_005.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_006.mp4.
MoviePy - Writing audio in combined_segment_006TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_006.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_006.mp4
Combined video and audio for segment 006 saved to /l/users/fathinah.izzati/combined_output/combined_segment_006.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_007.mp4.
MoviePy - Writing audio in combined_segment_007TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_007.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_007.mp4
Combined video and audio for segment 007 saved to /l/users/fathinah.izzati/combined_output/combined_segment_007.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_008.mp4.
MoviePy - Writing audio in combined_segment_008TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_008.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_008.mp4
Combined video and audio for segment 008 saved to /l/users/fathinah.izzati/combined_output/combined_segment_008.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_009.mp4.
MoviePy - Writing audio in combined_segment_009TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_009.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_009.mp4
Combined video and audio for segment 009 saved to /l/users/fathinah.izzati/combined_output/combined_segment_009.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_010.mp4.
MoviePy - Writing audio in combined_segment_010TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_010.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_010.mp4
Combined video and audio for segment 010 saved to /l/users/fathinah.izzati/combined_output/combined_segment_010.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_011.mp4.
MoviePy - Writing audio in combined_segment_011TEMP_MPY_wvf_snd.mp4


                                                                  

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_011.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_011.mp4
Combined video and audio for segment 011 saved to /l/users/fathinah.izzati/combined_output/combined_segment_011.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_012.mp4.
MoviePy - Writing audio in combined_segment_012TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_012.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_012.mp4
Combined video and audio for segment 012 saved to /l/users/fathinah.izzati/combined_output/combined_segment_012.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_013.mp4.
MoviePy - Writing audio in combined_segment_013TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_013.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_013.mp4
Combined video and audio for segment 013 saved to /l/users/fathinah.izzati/combined_output/combined_segment_013.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_014.mp4.
MoviePy - Writing audio in combined_segment_014TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_014.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_014.mp4
Combined video and audio for segment 014 saved to /l/users/fathinah.izzati/combined_output/combined_segment_014.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_015.mp4.
MoviePy - Writing audio in combined_segment_015TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_015.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_015.mp4
Combined video and audio for segment 015 saved to /l/users/fathinah.izzati/combined_output/combined_segment_015.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_016.mp4.
MoviePy - Writing audio in combined_segment_016TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_016.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_016.mp4
Combined video and audio for segment 016 saved to /l/users/fathinah.izzati/combined_output/combined_segment_016.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_017.mp4.
MoviePy - Writing audio in combined_segment_017TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_017.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_017.mp4
Combined video and audio for segment 017 saved to /l/users/fathinah.izzati/combined_output/combined_segment_017.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_018.mp4.
MoviePy - Writing audio in combined_segment_018TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_018.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_018.mp4
Combined video and audio for segment 018 saved to /l/users/fathinah.izzati/combined_output/combined_segment_018.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_019.mp4.
MoviePy - Writing audio in combined_segment_019TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_019.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_019.mp4
Combined video and audio for segment 019 saved to /l/users/fathinah.izzati/combined_output/combined_segment_019.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_020.mp4.
MoviePy - Writing audio in combined_segment_020TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_020.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_020.mp4
Combined video and audio for segment 020 saved to /l/users/fathinah.izzati/combined_output/combined_segment_020.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_021.mp4.
MoviePy - Writing audio in combined_segment_021TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_021.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_021.mp4
Combined video and audio for segment 021 saved to /l/users/fathinah.izzati/combined_output/combined_segment_021.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_022.mp4.
MoviePy - Writing audio in combined_segment_022TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_022.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_022.mp4
Combined video and audio for segment 022 saved to /l/users/fathinah.izzati/combined_output/combined_segment_022.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_023.mp4.
MoviePy - Writing audio in combined_segment_023TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_023.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_023.mp4
Combined video and audio for segment 023 saved to /l/users/fathinah.izzati/combined_output/combined_segment_023.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_024.mp4.
MoviePy - Writing audio in combined_segment_024TEMP_MPY_wvf_snd.mp4


                                                                  

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_024.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_024.mp4
Combined video and audio for segment 024 saved to /l/users/fathinah.izzati/combined_output/combined_segment_024.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_025.mp4.
MoviePy - Writing audio in combined_segment_025TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_025.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_025.mp4
Combined video and audio for segment 025 saved to /l/users/fathinah.izzati/combined_output/combined_segment_025.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_026.mp4.
MoviePy - Writing audio in combined_segment_026TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_026.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_026.mp4
Combined video and audio for segment 026 saved to /l/users/fathinah.izzati/combined_output/combined_segment_026.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_027.mp4.
MoviePy - Writing audio in combined_segment_027TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_027.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_027.mp4
Combined video and audio for segment 027 saved to /l/users/fathinah.izzati/combined_output/combined_segment_027.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_028.mp4.
MoviePy - Writing audio in combined_segment_028TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_028.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_028.mp4
Combined video and audio for segment 028 saved to /l/users/fathinah.izzati/combined_output/combined_segment_028.mp4
Moviepy - Building video /l/users/fathinah.izzati/combined_output/combined_segment_029.mp4.
MoviePy - Writing audio in combined_segment_029TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video /l/users/fathinah.izzati/combined_output/combined_segment_029.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /l/users/fathinah.izzati/combined_output/combined_segment_029.mp4
Combined video and audio for segment 029 saved to /l/users/fathinah.izzati/combined_output/combined_segment_029.mp4
Audio or video file for segment 030 not found.


### Training

In [1]:
# 4. create the npy of music and put it into training_input. 

import os

path = '/l/users/fathinah.izzati/Synchformer/vis'
filenames = [path+'/'+i for i in os.listdir(path) if i.endswith('.wav') ]
from coco_mulla.utilities.encodec_utils import extract_rvq
import librosa
import torch
import numpy as np
device='cuda'

mix_output_path = '/l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_train'
sr =16_000
for audio_path in filenames[]:
    print(audio_path)
    name = audio_path.split('/')[-1].split('.')[0]
    print(name)
    wav, _ = librosa.load(audio_path, sr=sr, mono=True)
    wav = torch.from_numpy(wav).to(device)[None, None, ...]
    mix_rvq = extract_rvq(wav, sr=sr)
    print(mix_rvq.shape)
    break
    # np.save(mix_output_path+f'/{name}/music.npy', mix_rvq.cpu().numpy())

  from .autonotebook import tqdm as notebook_tqdm


/l/users/fathinah.izzati/Synchformer/vis/segment_251_25fps_256side_16000hz.wav
segment_251_25fps_256side_16000hz
torch.Size([4, 252])


In [2]:
for audio_path in filenames[3:]:
    print(audio_path)
    name = audio_path.split('/')[-1].split('.')[0]
    print(name)
    wav, _ = librosa.load(audio_path, sr=sr, mono=True)
    wav = torch.from_numpy(wav).to(device)[None, None, ...]
    mix_rvq = extract_rvq(wav, sr=sr)
    print(mix_rvq.shape)
    break

/l/users/fathinah.izzati/Synchformer/vis/segment_089_25fps_256side_16000hz.wav
segment_089_25fps_256side_16000hz
torch.Size([4, 252])


In [14]:
audio_path = '/l/users/fathinah.izzati/datasets/swimming_125.wav'
wav, _ = librosa.load(audio_path, sr=sr, mono=True)
wav = torch.from_numpy(wav).to(device)[None, None, ...]
mix_rvq = extract_rvq(wav, sr=sr)
print(mix_rvq.shape)

torch.Size([4, 205])


In [8]:
205/8

25.625

In [9]:
90/3

30.0

In [10]:
252/10

25.2

In [15]:
# Define the base path and file to write
base_path = "/l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/segment_"
output_file = "test.lst"

# Open the file in write mode
with open(output_file, "w") as f:
    # Loop to generate 500 lines with the required format
    for i in range(1, 30):
        segment_path = f"{base_path}{i:03d}"
        line = f"{segment_path} 0 10\n"
        f.write(line)

In [1]:
## 6. fix train.py 

import argparse
from torch.utils.tensorboard import SummaryWriter

import torch.distributed as dist
from torch.multiprocessing import spawn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from coco_mulla.utilities.trainer_utils import Trainer

import torch
import torch.nn as nn
import os
from config import TrainCfg
import numpy as np

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from tqdm import tqdm

from coco_mulla.data_loader.dataset_sampler import Dataset, collate_fn
from coco_mulla.models import CoCoMulla

device = "cuda"
N_GPUS = 1


def _get_free_port():
    import socketserver
    with socketserver.TCPServer(('localhost', 0), None) as s:
        return s.server_address[1]



def get_dataset(dataset_split, sampling_strategy, sampling_prob):

    file_lst = ["data/text/musdb18_full.lst",
                "data/text/closed_dataset_fm_full.lst"]
    splits = [
        [1],
        [0],
        [0, 1],
    ]
    dataset = Dataset(
        rid=0, # No distributed rank needed
        path_lst=[dataset_split],
        sampling_prob=sampling_prob,
        sampling_strategy=sampling_strategy,
        cfg=TrainCfg)

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=TrainCfg.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        num_workers=0,
        # sampler=DistributedSampler(dataset),
        pin_memory=True,
        drop_last=True)

    return dataset, dataloader


def train_dist(replica_id, replica_count, port, model_dir, args):
    print('masuk sini')
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(port)
    torch.distributed.init_process_group('nccl', rank=replica_id, world_size=replica_count)
    device = torch.device('cuda', replica_id)
    print(device)
    torch.cuda.set_device(device)
    model = CoCoMulla(TrainCfg.sample_sec, num_layers=args.num_layers, latent_dim=args.latent_dim).to(device)
    model.set_training()
    model = DDP(model, [replica_id])
    dataset, dataloader = get_dataset(rid=replica_id, dataset_split=args.dataset,
                                      sampling_strategy=args.sampling_strategy,
                                      sampling_prob=[args.sampling_prob_a, args.sampling_prob_b])

    # train(replica_id, model, dataset, dataloader, device, model_dir,
    #       args.learning_rate)


def loss_fn(outputs, y):
    prob = outputs.logits
    mask = outputs.mask
    prob = prob[mask]
    y = y[mask]
    prob = prob.view(-1, 2048)
    return nn.CrossEntropyLoss()(prob, y)


def train(model, dataset, dataloader, device, model_dir, learning_rate):
    # optimizer and lr scheduler
    num_steps = len(dataloader)
    epochs = TrainCfg.epoch
    rng = np.random.RandomState(569)
    writer = SummaryWriter(model_dir, flush_secs=20)

    trainer = Trainer(params=model.parameters(), lr=learning_rate, num_epochs=epochs, num_steps=num_steps)

    model = model.to(device)
    step = 0
    for e in range(0, epochs):
        mean_loss = 0
        n_element = 0
        model.train()

        dl = tqdm(dataloader, desc=f"Epoch {e}")
        r = rng.randint(0, 233333)
        dataset.reset_random_seed(r, e)
        for i, batch in enumerate(dl):
            desc = batch["desc"]
            music = batch["music"].to(device).long()
            video = batch["video"].to(device).long()
            cond_mask = batch["cond_mask"].to(device).long()

            batch_1 = {
                "music": music,
                "video": video,
                "cond_mask": cond_mask,
                "desc": desc,

            }
            # with autocast:
            outputs = model(**batch_1)
            print("==========================================")
            print(outputs)
            
            r_loss = loss_fn(outputs, music)

            grad_1, lr_1 = trainer.step(r_loss, model.parameters())

            step += 1
            n_element += 1
            writer.add_scalar("r_loss", r_loss.item(), step)
            writer.add_scalar("grad_1", grad_1, step)
            writer.add_scalar("lr_1", lr_1, step)

            mean_loss += r_loss.item()

        mean_loss = mean_loss / n_element
        with torch.no_grad():
            writer.add_scalar('train/mean_loss', mean_loss, step)
            model.save_weights(os.path.join(model_dir, f"diff_{e}_end.pth"))


def main(args):
    experiment_folder = args.experiment_folder
    experiment_name = args.experiment_name

    if not os.path.exists(experiment_folder):
        os.mkdir(experiment_folder)
    model_dir = os.path.join(experiment_folder, experiment_name)
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    world_size = N_GPUS
    port = _get_free_port()
    spawn(train_dist, args=(world_size, port, model_dir, args), nprocs=world_size, join=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from types import SimpleNamespace
args = {
    "num_layers": 48,
    "latent_dim": 14,
    "experiment_folder": "/l/users/fathinah.izzati/coco-mulla-repo/expe",
    "experiment_name": "experiment_tnj_3",
    "prompt_path": "/l/users/fathinah.izzati/coco-mulla-repo/demo/input/tnj/tnj.prompt.txt",
    'sampling_strategy':'prob-based',
    "dataset": '/l/users/fathinah.izzati/coco-mulla-repo/test.lst',
    'learning_rate':0.05

}
args = SimpleNamespace(**args)

experiment_folder = args.experiment_folder
experiment_name = args.experiment_name
if not os.path.exists(experiment_folder):
    os.mkdir(experiment_folder)
model_dir = os.path.join(experiment_folder, experiment_name)
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
    
dataset, dataloader = get_dataset(
        dataset_split=args.dataset,
        sampling_strategy=args.sampling_strategy,
        sampling_prob=None
    )
# model = CoCoMulla(TrainCfg.sample_sec, num_layers=args.num_layers, latent_dim=args.latent_dim).to(device)
# model.set_training()
# train(model, dataset, dataloader, device, model_dir, args.learning_rate)


### original
### music shape
### drums before torch.Size([1, 4, 1001])
### drums after self.encodec_emb torch.Size([1, 1001, 12])
###  chords shape torch.Size([1, 1001, 37])
### cond concat on the three torch.Size([1, 1001, 61])
### mask embed per layer torch.Size([1, 1001, 61])
# ccond_mask torch.Size([1, 1001, 61])
# Inside CPTransfoermer forward
# torch.Size([1, 4, 1000])


## adapter
## video video.shape b4 torch.Size([1, 14, 251])
## video after self.encodec_emb torch.Size([1, 251, 14])
# cond_mask.shape torch.Size([1, 251, 14])
# mask embedding per layer torch.Size([1, 251, 14])
## music torch.Size([1, 4, 251])



0 /l/users/fathinah.izzati/coco-mulla-repo/test.lst
num of files 29
samling strategy prob-based [0.0, 0.8]


In [7]:
for i, batch in enumerate(dataloader):
        print(batch.keys())
        desc = batch["desc"]
        music = batch["music"]
        print('music', music.shape)
        video = batch["video"]
        print('video', video.shape)
        video = batch["videmb"]
        print('videmb', video.shape)
        break

NameError: name 'dataloader' is not defined

In [4]:
# 7. fix inference.py

import argparse
import librosa

from coco_mulla.models import CoCoMulla ## Change this one!!
from coco_mulla.utilities import *
from coco_mulla.utilities.encodec_utils import save_rvq

from coco_mulla.utilities.sep_utils import separate
from config import TrainCfg  ##change this one!!
import torch.nn.functional as F
import numpy as np 

device = get_device()

model = CoCoMulla(TrainCfg.sample_sec,           ## Change this one!!
                      num_layers=48,
                      latent_dim=14).to(device)
model.load_weights("/l/users/fathinah.izzati/coco-mulla-repo/expe/experiment_tnj_2/diff_4_end.pth",)
model.eval()
def generate(batch):
    print(batch)
    with torch.no_grad():
        gen_tokens = model(**batch)

    return gen_tokens


def generate_mask(xlen):
    names = ["video-only"]
    mask = torch.ones([1, 1, xlen]).to(device)
    # mask[1, 1] = 1
    # mask[2, 0] = 1
    # mask[3] += 1
    return mask, names


def load_data(video_path, offset):
    sr = TrainCfg.sample_rate
    res = TrainCfg.frame_res
    sample_sec = TrainCfg.sample_sec
    video_rvq = np.load(video_path)
    print('video', video_rvq.shape)
    drums_rvq = crop(video_rvq[None, ...], "video_rvq", sample_sec, res, offset=offset)
    video = torch.from_numpy(drums_rvq).to(device).long()
    print('video', video.shape)
    return video


def crop(x, mode, sample_sec, res, offset=0):
    xlen = x.shape[-1]
    st = offset * res
    ed = int((offset + sample_sec) * res) + 1
    return x[:, :, st: ed]


def save_pred(output_folder, tags, pred):
    mkdir(output_folder)
    output_list = [os.path.join(output_folder, tag) for tag in tags]
    save_rvq(output_list=output_list, tokens=pred)


def wrap_batch(video, cond_mask, prompt):
    num_samples = len(cond_mask)
    video = video.repeat(num_samples, 1, 1)
    prompt = [prompt] * num_samples
    batch = {
        "music": None,
        "desc": prompt,
        "video": video,
        "num_samples": num_samples,
        "cond_mask": cond_mask,
        "mode": "inference",
    }
    return batch


def inference(args):
    video = load_data(video_path=args.video_path,
                                       offset=args.offset)
    cond_mask, names = generate_mask(video.shape[-1])
    batch = wrap_batch(video, cond_mask, read_lst(args.prompt_path)[0])
    print(batch)
    pred = generate(
                    batch=batch)
    save_pred(output_folder=args.output_folder,
              tags=names,
              pred=pred)





OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacty of 23.62 GiB of which 8.94 MiB is free. Process 3896909 has 456.00 MiB memory in use. Process 3959383 has 3.96 GiB memory in use. Including non-PyTorch memory, this process has 19.03 GiB memory in use. Of the allocated memory 18.55 GiB is allocated by PyTorch, and 288.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from types import SimpleNamespace

In [None]:
for i in os.listdir('/l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test'):
    print(i)
    args = {
            # "model_path":"/l/users/fathinah.izzati/coco-mulla-repo/expe/experiment_tnj_2/diff_4_end.pth",
            "num_layers": 48,
            "latent_dim": 14,
            "output_folder": f"/l/users/fathinah.izzati/coco-mulla-repo/demo/output/expe_tnj_2/{i}",
            "prompt_path": "/l/users/fathinah.izzati/coco-mulla-repo/demo/input/tnj/tnj.prompt.txt",
            "video_path": f"/l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_test/{i}/video.npy",
            "offset": 0
        }
    args = SimpleNamespace(**args)
    inference(args)

segment_024
video (14, 768)
video torch.Size([1, 14, 251])
{'music': None, 'desc': ['A realistic and high quality soundtrack and sound effect for the video'], 'video': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], device='cuda:0'), 'num_samples': 1, 'cond_mask': tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1

TypeError: generate() got an unexpected keyword argument 'model_path'