# Installation and Setup

**Clone a copy of our project folder into Google Colab**

In [None]:
# Clone the project into /content/ict3104-team09-2023, the location every later cell uses.
# git refuses to clone into an existing non-empty directory, so the clone is skipped
# when the repository is already present; this keeps the cell safe to re-run.
![ -d /content/ict3104-team09-2023/.git ] || git clone https://github.com/jezsll/ict3104-team09-2023.git /content/ict3104-team09-2023

**Install the necessary dependencies and packages**

In [None]:
# Install the required packages.
# Work from the repository root so that `-r requirements.txt` further down resolves.
%cd /content/ict3104-team09-2023/
!pip install triton==2.0.0


# Earlier variant kept for reference (older bitsandbytes pin, xformers installed unpinned):
#!pip install -q diffusers==0.11.1 transformers==4.26.0 bitsandbytes==0.35.4 imageio-ffmpeg xformers
!pip install -q diffusers==0.11.1 transformers==4.26.0 bitsandbytes==0.41.1 imageio-ffmpeg #change bitsandbytes for mmpose

# Helper libraries, installed without a version pin.
!pip install accelerate
!pip install omegaconf
!pip install einops
!pip install av
!pip install decord

# Project requirements from the repository's own requirements.txt.
!pip install -r requirements.txt

# xformers is pinned separately, after the requirements file.
!pip install  xformers==0.0.19

# Replace whatever cuDNN wheel is present with the pinned build.
!pip uninstall nvidia-cudnn-cu11 -y
!pip install nvidia-cudnn-cu11==8.6.0.163


# Reinstall torch last so the pinned 2.0.0 build from the cu118 wheel index is the one left in place.
# NOTE(review): the order of these steps looks deliberate (later installs override earlier ones) - do not reorder without re-testing.
!pip uninstall torch torchvision -y
# Previous CUDA 11.1 variant kept for reference:
#!pip install torch torchvision -f https://download.pytorch.org/whl/cu111/torch_stable.html
!pip install torch==2.0.0 torchvision -f https://download.pytorch.org/whl/cu118/torch_stable.html #change to cuda 11.8 for mmpose



**Create checkpoints directory and clone into project**

The checkpoint files are too large to be included in our GitHub project repo, so we clone them separately from Hugging Face.

In [None]:
# Create the checkpoints directory and fill it from the FollowYourPose_v1 repository.
%cd /content/ict3104-team09-2023/
# -p: do not fail when the directory already exists (cell re-run).
%mkdir -p checkpoints
!git lfs install
# Remove any clone left behind by an interrupted run; git will not clone into a non-empty directory.
!rm -rf FollowYourPose_v1
!git clone https://huggingface.co/YueMafighting/FollowYourPose_v1
# Move the downloaded files into checkpoints/ and drop the now-empty clone directory.
!mv /content/ict3104-team09-2023/FollowYourPose_v1/* /content/ict3104-team09-2023/checkpoints/
!rm -rf FollowYourPose_v1

# Data Exploration Section

**This section allows you to view a playback of the videos.**

1. After loading the cell, click on the dropdown list
2. The video playback of the chosen video file will be shown in the output cell

The videos included in the selection can be found at this path: "/content/ict3104-team09-2023/Data/CharadesVideos2"

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output, Video

# Render the chosen video in place of whatever was shown before
def display_video(video_name):
    """Show an embedded player for one video.

    Args:
        video_name: Path of the video relative to the module-level
            ``folder_path``, which is looked up when the function is called.
    """
    # Clear the previous output first; wait=True is passed so the swap happens
    # when the new player is displayed.
    clear_output(wait=True)
    player = Video(os.path.join(folder_path, video_name), embed=True)
    display(player)

# File suffixes treated as videos (compared case-insensitively, dot included).
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov')


def list_video_files(base_dir, extensions=VIDEO_EXTENSIONS):
    """Return the video files found under ``base_dir`` and its subfolders.

    Args:
        base_dir: Folder to search recursively.
        extensions: Lower-case suffixes (with the leading dot) that count as video.

    Returns:
        Sorted list of paths relative to ``base_dir``. Empty when the folder
        does not exist or holds no matching file.
    """
    suffixes = tuple(extensions)
    found = []
    for root, _dirs, files in os.walk(base_dir):
        for file_name in files:
            # Every suffix carries its dot, so a name that merely ends in the
            # letters "mov" (no extension) is no longer mistaken for a video.
            if file_name.lower().endswith(suffixes):
                found.append(os.path.relpath(os.path.join(root, file_name), start=base_dir))
    # os.walk order is filesystem dependent; sort for a stable dropdown.
    return sorted(found)


# Path to the folder containing your videos
folder_path = '/content/ict3104-team09-2023/Data/CharadesVideos2'  # Adjust the path accordingly

# List all video files in the folder and its subfolders
video_files = list_video_files(folder_path)

# Create a dropdown widget for video selection.
# The widgets get cell-specific names: later cells create their own `dropdown`
# and `output_area`, and a callback that looks those names up when it fires
# would otherwise write into another cell's widget.
video_dropdown = widgets.Dropdown(
    options=video_files,
    description='Select Video:',
    disabled=False,
)

# Output area to display the video
video_output_area = widgets.Output()


def on_dropdown_change(change):
    """Show the newly selected video inside this cell's output area.

    Args:
        change: Change notification; only its ``'new'`` entry (the selected
            relative video path) is used.
    """
    with video_output_area:
        display_video(change['new'])


# names='value' restricts the callback to selection changes.
video_dropdown.observe(on_dropdown_change, names='value')
display(widgets.VBox([video_dropdown, video_output_area]))  # Display the dropdown and output area in a VBox

# The dropdown starts with its first option selected but fires no change event
# for it, so without this the first video could never be shown.
if video_dropdown.value is not None:
    on_dropdown_change({'new': video_dropdown.value})


# Video Cropping Section

**Steps to perform video cropping**
1. After loading the cell, click on the dropdown list
2. Select the desired subfolder
3. Press on the Crop Videos button, the videos in the subfolder will be cropped to 15 seconds, with the size of 512x512 pixels

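Note: Each cropped video is saved next to its source with `1` appended to the file name (for example, `0DJ6R.mp4` becomes `0DJ6R1.mp4`). An existing file with that name is replaced, so rename or move any file you want to keep before cropping.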

In [None]:
import os
import moviepy.editor as mp
import ipywidgets as widgets
from IPython.display import display, HTML

# Function to process, crop, and save the video
def process_video(video_path, output_path, max_duration=15, size=512):
    """Trim a video, centre-crop it to a square, resize it and save it.

    Args:
        video_path: Path of the source video.
        output_path: Path the processed video is written to.
        max_duration: Maximum length of the result in seconds (default 15).
        size: Width and height of the result in pixels (default 512).
    """
    clip = mp.VideoFileClip(video_path)
    try:
        # Clamp the end time: asking for more seconds than the source has
        # would give the sub-clip a duration that runs past the last frame.
        end_time = max_duration
        if clip.duration is not None:
            end_time = min(max_duration, clip.duration)
        trimmed = clip.subclip(0, end_time)

        # Centre crop the video to a square shape
        min_side = min(trimmed.w, trimmed.h)
        x1 = (trimmed.w - min_side) / 2
        y1 = (trimmed.h - min_side) / 2
        cropped = trimmed.crop(x1, y1, x1 + min_side, y1 + min_side)

        # Resize the square clip to size x size pixels
        resized = cropped.resize(newsize=(size, size))

        # Save the processed video
        resized.write_videofile(output_path, codec='libx264')
    finally:
        # Release the reader held by the source clip even when writing fails.
        clip.close()

# Remember the chosen folder and unlock the crop button
def process_selected_folder(change):
    """Store the selected folder and enable cropping.

    Args:
        change: Change notification from the folder dropdown; ``change['new']``
            is stored in the module-level ``selected_folder_path``.
    """
    global selected_folder_path
    selected_folder_path = change['new']

    notice = HTML(f"Selected folder: {selected_folder_path}")
    with output_area:
        output_area.clear_output()
        display(notice)

    # A folder is known now, so cropping may start
    crop_button.disabled = False

# Crop every video inside the sub-folders of the selected folder
def crop_videos(button):
    """Button callback: process each video one level below ``selected_folder_path``.

    Every processed video is written into the same sub-folder as its source,
    with ``1`` inserted before the file extension.

    Args:
        button: The clicked button (unused).
    """
    video_suffixes = ('.mp4', '.avi', '.mov')

    with output_area:
        output_area.clear_output()
        display(HTML("Processing videos..."))

        for entry in os.listdir(selected_folder_path):
            subfolder_path = os.path.join(selected_folder_path, entry)
            if not os.path.isdir(subfolder_path):
                continue

            # Take the file list before writing anything into this sub-folder
            file_names = [
                name for name in os.listdir(subfolder_path)
                if name.lower().endswith(video_suffixes)
            ]

            for file_name in file_names:
                stem, suffix = os.path.splitext(file_name)
                output_file = os.path.join(subfolder_path, f"{stem}1{suffix}")
                process_video(os.path.join(subfolder_path, file_name), output_file)

                display(HTML(f"Processed video saved as: {os.path.basename(output_file)}"))

# Path to the main folder containing subfolders with videos
main_folder_path = '/content/ict3104-team09-2023/Data'

# Create a dropdown widget for folder selection (sorted for a stable order)
folder_dropdown = widgets.Dropdown(
    options=sorted(
        os.path.join(main_folder_path, d)
        for d in os.listdir(main_folder_path)
        if os.path.isdir(os.path.join(main_folder_path, d))
    ),
    description='Select Folder:',
    disabled=False,
)

# Bind the folder selection to the processing function
folder_dropdown.observe(process_selected_folder, names='value')

# Create a button to trigger cropping
crop_button = widgets.Button(
    description="Crop Videos",
    disabled=True  # Disabled until a folder is known
)
crop_button.on_click(crop_videos)

# Output area to display processing status
output_area = widgets.Output()

# Display the folder dropdown, crop button, and output area in a VBox
display(widgets.VBox([folder_dropdown, crop_button, output_area]))

# The dropdown starts with its first folder selected but fires no change event
# for it. Register that initial selection explicitly; otherwise the button
# stays disabled until a different folder is picked, which is impossible when
# only one folder exists.
if folder_dropdown.value is not None:
    process_selected_folder({'new': folder_dropdown.value})


# MM Pose Generation Section

**Create mmpose directory and install required package**

In [None]:
# Create the mmpose working directory and install the required packages.
# -p: do not fail when the directory already exists (cell re-run).
%mkdir -p /content/mmpose
%cd /content/mmpose

# install virtual environment
!pip install virtualenv
# create virtual environment
# NOTE(review): myenv is created but never activated anywhere in this notebook,
# so the installs below go into the notebook's own environment.
!virtualenv myenv

# install required package
!pip install openmim
!mim install mmengine
!mim install "mmcv>=2.0.1"
!mim install "mmdet>=3.1.0"

In [None]:
# Clone MMPose into /content/mmpose/mmpose, the location the inference cell expects.
# Set the working directory explicitly instead of relying on the previous cell,
# and skip the clone when it already exists so the cell can be re-run.
%cd /content/mmpose
![ -d mmpose/.git ] || git clone https://github.com/open-mmlab/mmpose.git

# Install MMPose itself in editable mode from the cloned sources.
%cd /content/mmpose/mmpose
!pip install -e .

**Steps to run MMPose**
1. Select the input video path that you want to convert to skeleton
2. Click run cell to obtain the skeleton video output

In [None]:
# Generate a skeleton video from INPUT_PATH with the MMPose inferencer demo.
# INPUT_PATH is the source video; OUTPUT_PATH is handed to --vis-out-dir.

INPUT_PATH = '/content/ict3104-team09-2023/Data/CharadesVideos2/0DJ6R/0DJ6R1.mp4'
OUTPUT_PATH = '/content/ict3104-team09-2023/pose_example'

# {INPUT_PATH} and {OUTPUT_PATH} are filled in from the Python variables above.
# NOTE(review): --black-background presumably draws the skeleton on a black frame instead of the source video - confirm against the MMPose demo help.
!python /content/mmpose/mmpose/demo/inferencer_demo.py {INPUT_PATH} \
    --pose2d 'human'  \
    --vis-out-dir {OUTPUT_PATH} \
    --black-background


# FID (Fréchet Inception Distance)
A metric for evaluating the quality of generated images and, through it, the performance of a generative model.

A video that contains the skeleton generated by MMPose superimposed onto the resulting video will be saved in the /content/ict3104-team09-2023/pose_example/ directory.

In [None]:
import cv2
import os
import numpy as np
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from scipy.linalg import sqrtm

# Load pre-trained InceptionV3 model without top classification layers.
# It is used purely as a feature extractor: calculate_fid calls predict() on it
# and compares the statistics of the returned features.
inception_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')

def preprocess_frames(frames):
    """Turn a sequence of BGR frames into one batch ready for InceptionV3.

    Each frame is resized to 299x299, converted from BGR to RGB, given a
    leading batch axis and passed through ``preprocess_input``.

    Args:
        frames: Iterable of BGR image arrays.

    Returns:
        The per-frame results stacked along the first axis.
    """
    def _prepare(frame):
        resized = cv2.resize(frame, (299, 299))
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        return preprocess_input(np.expand_dims(rgb, axis=0))

    return np.vstack([_prepare(frame) for frame in frames])

def frechet_distance(real_features, generated_features):
    """Fréchet distance between two sets of feature vectors.

    Computes ``||mu_r - mu_g||^2 + Tr(S_r + S_g - 2 * sqrt(S_r @ S_g))`` where
    ``mu`` and ``S`` are the mean and covariance of each feature set.

    Args:
        real_features: Array of shape (n_samples, n_features).
        generated_features: Array of shape (m_samples, n_features).

    Returns:
        The distance as a Python float.

    Raises:
        ValueError: If either input is not 2-D or has fewer than two samples
            (a covariance cannot be estimated from a single sample).
    """
    real_features = np.asarray(real_features, dtype=np.float64)
    generated_features = np.asarray(generated_features, dtype=np.float64)
    for label, features in (("real", real_features), ("generated", generated_features)):
        if features.ndim != 2 or features.shape[0] < 2:
            raise ValueError(
                f"{label} features must be a 2-D array with at least two samples, "
                f"got shape {features.shape}"
            )

    # Mean and covariance statistics of both feature sets
    mu_real = real_features.mean(axis=0)
    mu_generated = generated_features.mean(axis=0)
    # atleast_2d keeps the matrix maths valid for one-dimensional features
    sigma_real = np.atleast_2d(np.cov(real_features, rowvar=False))
    sigma_generated = np.atleast_2d(np.cov(generated_features, rowvar=False))

    # The matrix square root can pick up a tiny imaginary part from numerical
    # error; only the real part is meaningful here.
    sqrt_term = sqrtm(sigma_real.dot(sigma_generated))
    if np.iscomplexobj(sqrt_term):
        sqrt_term = sqrt_term.real

    # The mean term is the SQUARED Euclidean distance between the means.
    mean_diff = mu_real - mu_generated
    squared_mean_distance = mean_diff.dot(mean_diff)
    return float(squared_mean_distance + np.trace(sigma_real + sigma_generated - 2.0 * sqrt_term))


def calculate_fid(real_frames, generated_frames):
    """FID between two batches of preprocessed frames.

    Args:
        real_frames: Batch of frames as produced by ``preprocess_frames``.
        generated_frames: Batch of frames as produced by ``preprocess_frames``.

    Returns:
        The Fréchet distance between the InceptionV3 features of both batches.
    """
    # Get InceptionV3 feature representations for real and generated frames
    real_features = inception_model.predict(real_frames)
    generated_features = inception_model.predict(generated_frames)
    return frechet_distance(real_features, generated_features)

# Indicate the path for real and generated video
real_video_path = "/content/ict3104-team09-2023/Data/CharadesVideos2/0DJ6R/0DJ6R1.mp4"
generated_video_path = "/content/ict3104-team09-2023/pose_example/0DJ6R1.mp4" #change to gif path

# Output video path for video overlay
output_video_path = "/content/ict3104-team09-2023/pose_example/overlay_skeleton.mp4"

# Frames collected from both videos, pair by pair
real_frames = []
generated_frames = []

cap_real = cv2.VideoCapture(real_video_path)
cap_generated = cv2.VideoCapture(generated_video_path)
output_video = None

try:
    if not cap_real.isOpened():
        raise IOError(f"Could not open real video: {real_video_path}")
    if not cap_generated.isOpened():
        raise IOError(f"Could not open generated video: {generated_video_path}")

    # Frame width, height and frame rate of the generated video define the output.
    frame_width = int(cap_generated.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap_generated.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the fractional part (e.g. 29.97); truncating it changes playback speed.
    frame_rate = cap_generated.get(cv2.CAP_PROP_FPS)

    # Define the codec for the output video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')

    # Create the output video writer
    output_video = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (frame_width, frame_height))

    while True:
        ret_real, frame_real = cap_real.read()
        ret_generated, frame_generated = cap_generated.read()

        # Stop as soon as EITHER video runs out; a failed read yields no frame.
        if not (ret_real and ret_generated):
            break

        real_frames.append(frame_real)
        generated_frames.append(frame_generated)

        # The writer and the generated frame share one size, so it is the real
        # frame that has to be brought to that size before blending.
        frame_real_resized = cv2.resize(frame_real, (frame_width, frame_height))

        # Overlay the generated (skeleton) frame onto the real frame
        combined_frame = cv2.addWeighted(frame_real_resized, 1, frame_generated, 0.5, 0)

        # Write the combined frame to the output video
        output_video.write(combined_frame)
finally:
    # Release the captures and finalise the output file even if something failed.
    cap_real.release()
    cap_generated.release()
    if output_video is not None:
        output_video.release()

if not real_frames:
    raise RuntimeError("No frame pairs could be read from the two videos.")

# Preprocess frames
real_frames_preprocessed = preprocess_frames(real_frames)
generated_frames_preprocessed = preprocess_frames(generated_frames)

# Calculate FID score
fid_score = calculate_fid(real_frames_preprocessed, generated_frames_preprocessed)
print(f"FID Score: {fid_score}")

# Inference Section

In [None]:
# asyncio is part of the Python standard library, so there is nothing to install here.
# The next cell imports it directly.

**This section allows you to choose an input video from the subfolder, and write in a text prompt as input into the genAI model.**

1. After loading the code, select the desired pre-trained model from the dropdown list and choose a video
2. Enter the prompts
3. Press on the Run button

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output, Markdown
import torch
import yaml
import subprocess
import asyncio

%cd /content/ict3104-team09-2023/

# Folder holding the model config files offered for inference
config_path = '/content/ict3104-team09-2023/configs/'

# Keep only the YAML files found directly in that folder
config_files = []
for entry in os.listdir(config_path):
    if entry.endswith('.yaml'):
        config_files.append(entry)

# Dropdown used to pick the config (pre-trained model) to run
dropdown_pretrained = widgets.Dropdown(
    description='Pre-trained:',
    options=config_files,
    disabled=False,
)

display(widgets.VBox([dropdown_pretrained]))

# Choose the input skeleton videos (skeleton_path) for inference.
# A dedicated variable name is used for the folder: the video playback cell
# reads the global `folder_path` when its dropdown fires, so it must not be
# overwritten here.
skeleton_folder_path = '/content/ict3104-team09-2023/pose_example/'

# List all video files in the folder. Matching is case-insensitive and every
# suffix carries its dot, so "clip.MP4" is found and a file merely ending in
# the letters "MOV" is not.
video_files = sorted(
    f for f in os.listdir(skeleton_folder_path)
    if f.lower().endswith(('.mp4', '.avi', '.mov'))
)

# Create a widget for video selection (allowing multiple selection)
dropdown = widgets.SelectMultiple(
    options=video_files,
    description='Select Videos:',
    disabled=False,
)

display(widgets.VBox([dropdown]))

# Create textarea widget for text prompts.
# continuous_update=False: the value is reported once editing is committed
# rather than on every keystroke, so observers (which rewrite the config file)
# do not run for each typed character.
text_prompt = widgets.Textarea(
    description='Prompts:',
    disabled=False,
    continuous_update=False,
    layout=widgets.Layout(width='100%', height='150px')
)

def modify_prompt(change):
    """Write the prompts typed into the textarea into the selected config file.

    Every non-empty line of the new text becomes one entry of
    ``validation_data.prompts`` in the YAML file chosen in
    ``dropdown_pretrained``; the rest of the file content is kept.

    Args:
        change: Change notification of the textarea; ``change.new`` is the text.
    """
    if dropdown_pretrained.value is None:
        # No config file is available, so there is nothing to update.
        return

    selected_file = os.path.join(config_path, dropdown_pretrained.value)
    with open(selected_file, 'r', encoding='utf-8') as yaml_file:
        # An empty file loads as None; treat it as an empty config.
        config = yaml.safe_load(yaml_file) or {}

    # One prompt per non-empty line
    prompts = [line.strip() for line in change.new.split('\n') if line.strip()]

    # Create the section when it is missing or empty instead of failing.
    validation_data = config.get('validation_data') or {}
    validation_data['prompts'] = prompts
    config['validation_data'] = validation_data

    # Save config file after modification
    with open(selected_file, 'w', encoding='utf-8') as yaml_file:
        yaml.dump(
            config,
            yaml_file,
            default_flow_style=False,
            indent=2,  # Use 2 spaces for indentation
            width=float("inf"),  # Disable line wrapping
            allow_unicode=True,  # Allow non-ASCII characters
            sort_keys=False  # Keep the file's key order instead of sorting alphabetically
        )

text_prompt.observe(modify_prompt, names='value')
display(widgets.VBox([text_prompt]))

# Run txt2video sequentially for each selected video
async def run_txt2video(_):
    """Launch txt2video.py once per selected skeleton video, one after another.

    The config comes from ``dropdown_pretrained`` and the videos from
    ``dropdown``. Each run happens in a worker thread; its stdout and stderr
    are shown when it finishes.

    Args:
        _: The clicked button (unused).
    """
    import shlex

    config_file = f"/content/ict3104-team09-2023/configs/{dropdown_pretrained.value}"
    selected_videos = [f"/content/ict3104-team09-2023/pose_example/{video}" for video in dropdown.value]

    # Pass the debug flag through the environment instead of a shell prefix.
    run_env = dict(os.environ, TORCH_DISTRIBUTED_DEBUG="DETAIL")

    for video_path in selected_videos:
        # Argument list and no shell: file names containing spaces or shell
        # metacharacters reach the script unchanged.
        command = [
            "accelerate", "launch",
            "/content/ict3104-team09-2023/txt2video.py",
            f"--config={config_file}",
            f"--skeleton_path={video_path}",
        ]

        print("Executing command:", shlex.join(command))

        try:
            # Run the command in a worker thread and capture its output
            result = await asyncio.to_thread(
                subprocess.run,
                command,
                env=run_env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except OSError as exc:
            # The program could not be started at all (e.g. it is not installed).
            display(Markdown(f"**Could not start command for {video_path}:** {exc}"))
            continue

        # Print the stdout and stderr
        display(Markdown(f"**Command Output for {video_path}:**\n```\n{result.stdout}\n{result.stderr}\n```"))
        if result.returncode != 0:
            display(Markdown(f"**Command failed for {video_path} (exit code {result.returncode}).**"))

run_button = widgets.Button(description="Run txt2video for selected videos")

# The event loop keeps only weak references to tasks, so a task nobody else
# references can be garbage-collected before it finishes. Hold each task here
# until it is done.
_txt2video_tasks = set()


def _start_txt2video(button):
    """Button callback: schedule run_txt2video on the running event loop."""
    task = asyncio.create_task(run_txt2video(button))
    _txt2video_tasks.add(task)
    task.add_done_callback(_txt2video_tasks.discard)


run_button.on_click(_start_txt2video)
display(widgets.VBox([run_button]))


In [None]:
from IPython.display import display, HTML, clear_output, Image

import os  # imported here so the cell does not depend on an earlier cell having done it

# Get a list of all subdirectories in the inference folder
inference_folder_path = '/content/ict3104-team09-2023/checkpoints/inference/'
if os.path.isdir(inference_folder_path):
    inference_folders = [
        f for f in os.listdir(inference_folder_path)
        if os.path.isdir(os.path.join(inference_folder_path, f))
    ]
else:
    # The folder only exists after inference has produced output.
    inference_folders = []
    print(f"No inference output found at {inference_folder_path}; run the inference cell first.")

# Display files in each subdirectory
for folder in inference_folders:
    print(f"Files in the {folder} folder:")
    # Cell-specific name: the video playback cell reads the global `folder_path`
    # when its dropdown fires, so it must not be overwritten here.
    inference_subfolder_path = os.path.join(inference_folder_path, folder)
    inference_files = [f for f in os.listdir(inference_subfolder_path) if f.endswith('.gif')]

    for file in inference_files:
        print(f"Contents of {file}:")
        file_path = os.path.join(inference_subfolder_path, file)
        display(Image(filename=file_path))
        print('\n')

In [None]:
#Output
import os
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output, Image

# Function to display the output GIF image
def display_gif(gif_name):
    """Show one generated GIF with its file name (without extension) as heading.

    Args:
        gif_name: Base name of the GIF, as listed in the output dropdown.
    """
    import html

    clear_output(wait=True)
    gif_path = get_selected_gif_path(gif_name)
    if gif_path is None:
        # The lookup found no file with this name; report it instead of crashing.
        display(HTML(f"<p>GIF not found: {html.escape(str(gif_name))}</p>"))
        return

    # splitext removes only the final extension, so a name that itself
    # contains '.' is shown in full.
    gif_text_prompt = os.path.splitext(gif_name)[0]
    # Display the text prompt above the GIF image (escaped, as it is placed into HTML)
    display(HTML(f"<p style='text-align:left; font-weight:bold; font-size:30px;'>{html.escape(gif_text_prompt)}</p>"))
    # Read the bytes inside a with-block so the file handle is closed again
    with open(gif_path, 'rb') as gif_file:
        display(Image(gif_file.read()))

# Folder that holds the inference results (one sub-folder per run).
# Cell-specific name: the video playback cell reads the global `folder_path`
# when its dropdown fires, so it must not be overwritten here.
inference_root = '/content/ict3104-team09-2023/checkpoints/inference'

# List all GIF files from subdirectories
gif_files = []
for root, dirs, files in os.walk(inference_root):
    if root != inference_root:  # Exclude the "inference" folder itself
        for file in files:
            if file.endswith('.gif'):
                gif_files.append(os.path.join(root, file))

# os.walk order depends on the filesystem; sort for a stable dropdown.
gif_files.sort()

# Look up the full path that belongs to a GIF file name
def get_selected_gif_path(selected_gif_name):
    """Return the first path in ``gif_files`` whose base name matches.

    Args:
        selected_gif_name: File name to look for (without directory).

    Returns:
        The matching path, or None when no entry matches.
    """
    return next(
        (candidate for candidate in gif_files
         if os.path.basename(candidate) == selected_gif_name),
        None,
    )

# Extract only the file names without the full path
gif_file_names = [os.path.basename(gif_path) for gif_path in gif_files]

# Create a dropdown widget for GIF image selection.
# The widgets get cell-specific names: the inference cell reads the global
# `dropdown` when its Run button is pressed and the cropping cell reads the
# global `output_area` in its callbacks, so neither may be rebound here.
gif_dropdown = widgets.Dropdown(
    options=gif_file_names,
    description='Output:',
    disabled=False,
)

# Output area to display the GIF image
gif_output_area = widgets.Output()


def on_gif_dropdown_change(change):
    """Show the newly selected GIF inside this cell's output area.

    Args:
        change: Change notification; only its ``'new'`` entry (the selected
            GIF file name) is used.
    """
    with gif_output_area:
        display_gif(change['new'])


# names='value' restricts the callback to selection changes.
gif_dropdown.observe(on_gif_dropdown_change, names='value')
display(widgets.VBox([gif_dropdown, gif_output_area]))

# The dropdown starts with its first option selected but fires no change event
# for it, so without this the first GIF could never be shown.
if gif_dropdown.value is not None:
    on_gif_dropdown_change({'new': gif_dropdown.value})

# Training Section

**Steps to train a model**
1. After loading the cell, enter all the config information for training the model and select a video
2. Press on the Save Config button

The config file will be saved under /content/ict3104-team09-2023/configs/

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display
from omegaconf import OmegaConf

%cd /content/ict3104-team09-2023/

# Directory the training config files are written to
MODEL_PATH = '/content/ict3104-team09-2023/configs'

# Root folder that holds the training video folders
data_path = '/content/ict3104-team09-2023/Data/'

# Names of the directories found directly inside data_path
folders = []
for entry in os.listdir(data_path):
    if os.path.isdir(os.path.join(data_path, entry)):
        folders.append(entry)

# Create widgets for train config parameters.
# 'initial' lets each label take its natural width; with the default fixed
# label width the longer descriptions below are cut off.
label_style = {'description_width': 'initial'}

config_name_widget = widgets.Text(value="filename.yaml", description="Config Name:", style=label_style)
train_video_path_widget = widgets.Dropdown(options=folders, description="Train Video Path:", style=label_style)
train_prompt_widget = widgets.Text(value="a man is skiing", description="Train Prompt:", style=label_style)
video_length_widget = widgets.IntText(value=8, description="Video Length(sec):", style=label_style)
max_train_steps_widget = widgets.IntText(value=50, description="Max Training Step:", style=label_style)
checkpointing_widget = widgets.IntText(value=25, description="Save Checkpoint every ? Step:", style=label_style)
output_widget = widgets.Text(value="output/model-1", description="Output Path:", style=label_style)

# Function to save the configuration
def save_config(button):
    """Build the training config from the widget values and save it as YAML.

    The file is written to ``MODEL_PATH`` under the name entered in
    ``config_name_widget``. A missing ``.yaml`` suffix is appended because the
    inference cell only lists files ending in ``.yaml``.

    Args:
        button: The clicked button (unused).
    """
    config_name = config_name_widget.value.strip()
    if not config_name:
        print("Config not saved: please enter a config name.")
        return
    if not config_name.endswith('.yaml'):
        config_name += '.yaml'

    output_path = output_widget.value
    config = {
        "pretrained_model_path": "./checkpoints/stable-diffusion-v1-4",
        "output_dir": output_path,
        "train_data": {
            "video_path": os.path.join(data_path, train_video_path_widget.value),
            "prompt": train_prompt_widget.value,
            "n_sample_frames": 4,
            "width": 512,
            "height": 512,
            "sample_start_idx": 0,
            "sample_frame_rate": 2,
            "dataset_set": "train"
        },
        "validation_data": {
            "prompts": [
                "A Iron man on the beach",
                "A Spider man on the snow",
                "A Superman on the street",
                "A boy on the forest"
            ],
            "video_length": video_length_widget.value,
            "width": 256,
            "height": 256,
            "num_inference_steps": 50,
            "guidance_scale": 12.5,
            "use_inv_latent": False,
            "num_inv_steps": 50,
            "dataset_set": "val"
        },
        "learning_rate": 3e-5,
        "train_batch_size": 1,
        "max_train_steps": max_train_steps_widget.value,
        "checkpointing_steps": checkpointing_widget.value,
        "validation_steps": 100,
        "trainable_modules": [
            "attn1.to_q",
            "attn2.to_q",
            "attn_temp",
            "conv_temporal"
        ],
        "skeleton_path": './pose_example/vis_kun_pose2.mov',
        "seed": 33,
        "mixed_precision": "no",
        "use_8bit_adam": False,
        "gradient_checkpointing": False,
        "enable_xformers_memory_efficient_attention": True
    }

    # Make sure the target directory exists, then build the full file path
    os.makedirs(MODEL_PATH, exist_ok=True)
    file_path = os.path.join(MODEL_PATH, config_name)

    with open(file_path, 'w', encoding='utf-8') as config_file:
        config_file.write(OmegaConf.to_yaml(config))

    print(f"Config saved to: {file_path}")

# Button that writes the config file when clicked
save_button = widgets.Button(description="Save Config")
save_button.on_click(save_config)

# Input fields in display order, with the save button at the bottom
form_children = [
    config_name_widget,
    output_widget,
    train_video_path_widget,
    train_prompt_widget,
    video_length_widget,
    max_train_steps_widget,
    checkpointing_widget,
    save_button,
]
widgets_container = widgets.VBox(children=form_children)

display(widgets_container)


In [None]:
# Config file used for training, relative to the repository root.
# Point this at the file written by the "Save Config" button above.
TRAIN_CONFIG = "configs/test_train.yaml"

# train_followyourpose.py and TRAIN_CONFIG are both relative to the repository
# root, so set the working directory explicitly instead of relying on an earlier cell.
%cd /content/ict3104-team09-2023/
! TORCH_DISTRIBUTED_DEBUG=DETAIL accelerate launch train_followyourpose.py --config="{TRAIN_CONFIG}"