# Initialization

In [None]:
!pip install diffusers transformers accelerate torch

Collecting diffusers
  Downloading diffusers-0.29.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

# Woman Sora Example

In [None]:
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import numpy as np
import cv2

# Load the pipeline in half precision and swap in the DPM-Solver multistep scheduler
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# Generate video frames
prompt = "A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about."
# Seed the sampler explicitly so that a fresh "Restart & Run All" starts from
# the same noise instead of producing a different clip on every run.
# (Bit-exact equality across different GPUs/drivers is not guaranteed.)
generator = torch.Generator(device="cpu").manual_seed(0)
video_frames = pipe(prompt, num_inference_steps=25, generator=generator).frames

# Check the shape of the first frame batch
print(video_frames[0].shape)

# Flatten the batch of frames into a single list of frames
flattened_frames = [frame for batch in video_frames for frame in batch]

# Ensure frames are in the correct format and convert to uint8
def convert_to_uint8(frames):
    """Return every frame as a ``uint8`` NumPy array.

    Args:
        frames: Iterable of array-likes (anything ``np.array`` accepts).
            Frames whose dtype is not ``uint8`` are assumed to hold
            intensities in ``[0, 1]``.

    Returns:
        list[np.ndarray]: One ``uint8`` array per input frame, in order.
        Frames that were already ``uint8`` keep their values; all others
        are clipped to ``[0, 1]``, multiplied by 255 and truncated.
    """
    converted_frames = []
    for frame in frames:
        frame = np.array(frame)
        if frame.dtype != np.uint8:
            # Clip before scaling: a value just outside [0, 1] would otherwise
            # overflow the uint8 range on the cast and show up as a wrong pixel.
            frame = (np.clip(frame, 0.0, 1.0) * 255).astype(np.uint8)
        converted_frames.append(frame)
    return converted_frames

# Bring the flattened frames to uint8 arrays before they are handed to the video export
converted_frames = convert_to_uint8(flattened_frames)

# Export video using OpenCV
def export_to_video(frames, output_video_path="woman2.mp4", fps=30):
    """Encode a sequence of RGB frames into a video file with OpenCV.

    Args:
        frames: Non-empty sequence of image arrays in RGB channel order.
            The output size is taken from the first frame.
        output_video_path: Destination file path.
        fps: Frame rate passed to ``cv2.VideoWriter``.

    Returns:
        The ``output_video_path`` that was written.

    Raises:
        ValueError: If ``frames`` is empty.
        OSError: If the video writer could not be opened.
    """
    if len(frames) == 0:
        raise ValueError("export_to_video() needs at least one frame")

    h, w = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (w, h))
    # cv2.VideoWriter does not raise when it cannot open the target; without
    # this check the function would "succeed" without a usable file.
    if not video_writer.isOpened():
        video_writer.release()
        raise OSError(f"Could not open a video writer for {output_video_path!r}")

    try:
        for frame in frames:
            # OpenCV expects BGR channel order, the frames are RGB.
            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file, even if a frame fails to convert.
        video_writer.release()
    return output_video_path

# Write the clip using the helper's defaults ("woman2.mp4", 30 fps) and report the path
video_path = export_to_video(converted_frames)
print(f"Video saved at {video_path}")


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

(16, 256, 256, 3)
Video saved at woman2.mp4


## Longer duration

In [None]:
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import numpy as np
import cv2

# Load pipeline in half precision and swap in the DPM-Solver multistep scheduler
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# Optimize for GPU memory
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()

# Generate
prompt = "A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about."
# Seed the sampler explicitly so that a fresh "Restart & Run All" starts from
# the same noise instead of producing a different clip on every run.
# (Bit-exact equality across different GPUs/drivers is not guaranteed.)
generator = torch.Generator(device="cpu").manual_seed(0)
video_frames = pipe(prompt, num_inference_steps=5, num_frames=200, generator=generator).frames

# Check the shape of the first frame batch
print(video_frames[0].shape)

# Flatten the batch of frames into a single list of frames
flattened_frames = [frame for batch in video_frames for frame in batch]

# Ensure frames are in the correct format and convert to uint8
def convert_to_uint8(frames):
    """Return every frame as a ``uint8`` NumPy array.

    Args:
        frames: Iterable of array-likes (anything ``np.array`` accepts).
            Frames whose dtype is not ``uint8`` are assumed to hold
            intensities in ``[0, 1]``.

    Returns:
        list[np.ndarray]: One ``uint8`` array per input frame, in order.
        Frames that were already ``uint8`` keep their values; all others
        are clipped to ``[0, 1]``, multiplied by 255 and truncated.
    """
    converted_frames = []
    for frame in frames:
        frame = np.array(frame)
        if frame.dtype != np.uint8:
            # Clip before scaling: a value just outside [0, 1] would otherwise
            # overflow the uint8 range on the cast and show up as a wrong pixel.
            frame = (np.clip(frame, 0.0, 1.0) * 255).astype(np.uint8)
        converted_frames.append(frame)
    return converted_frames

# Bring the flattened frames to uint8 arrays before they are handed to the video export
converted_frames = convert_to_uint8(flattened_frames)

# Export video using OpenCV
def export_to_video(frames, output_video_path="woman3.mp4", fps=20):
    """Encode a sequence of RGB frames into a video file with OpenCV.

    Args:
        frames: Non-empty sequence of image arrays in RGB channel order.
            The output size is taken from the first frame.
        output_video_path: Destination file path.
        fps: Frame rate passed to ``cv2.VideoWriter``.

    Returns:
        The ``output_video_path`` that was written.

    Raises:
        ValueError: If ``frames`` is empty.
        OSError: If the video writer could not be opened.
    """
    if len(frames) == 0:
        raise ValueError("export_to_video() needs at least one frame")

    h, w = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (w, h))
    # cv2.VideoWriter does not raise when it cannot open the target; without
    # this check the function would "succeed" without a usable file.
    if not video_writer.isOpened():
        video_writer.release()
        raise OSError(f"Could not open a video writer for {output_video_path!r}")

    try:
        for frame in frames:
            # OpenCV expects BGR channel order, the frames are RGB.
            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file, even if a frame fails to convert.
        video_writer.release()
    return output_video_path

# Write the clip using the helper's defaults ("woman3.mp4", 20 fps) and report the path
video_path = export_to_video(converted_frames)
print(f"Video saved at {video_path}")


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

(200, 256, 256, 3)
Video saved at woman3.mp4


# Man reading a book on a cloud


In [None]:
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import numpy as np
import cv2

# Load the pipeline in half precision and swap in the DPM-Solver multistep scheduler
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# Generate video frames
prompt = "A young man in his 20s is sitting on a piece of cloud in the sky, reading a book. The sky is bright blue with fluffy white clouds, and the man is wearing a light blue shirt."
num_inference_steps = 50  # Increased number of steps
guidance_scale = 7.5  # Adjusted guidance scale
# Seed the sampler explicitly so that a fresh "Restart & Run All" starts from
# the same noise instead of producing a different clip on every run.
# (Bit-exact equality across different GPUs/drivers is not guaranteed.)
generator = torch.Generator(device="cpu").manual_seed(0)
video_frames = pipe(
    prompt,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    generator=generator,
).frames

# Check the shape of the first frame batch
print(video_frames[0].shape)

# Flatten the batch of frames into a single list of frames
flattened_frames = [frame for batch in video_frames for frame in batch]

# Ensure frames are in the correct format and convert to uint8
def convert_to_uint8(frames):
    """Return every frame as a ``uint8`` NumPy array.

    Args:
        frames: Iterable of array-likes (anything ``np.array`` accepts).
            Frames whose dtype is not ``uint8`` are assumed to hold
            intensities in ``[0, 1]``.

    Returns:
        list[np.ndarray]: One ``uint8`` array per input frame, in order.
        Frames that were already ``uint8`` keep their values; all others
        are clipped to ``[0, 1]``, multiplied by 255 and truncated.
    """
    converted_frames = []
    for frame in frames:
        frame = np.array(frame)
        if frame.dtype != np.uint8:
            # Clip before scaling: a value just outside [0, 1] would otherwise
            # overflow the uint8 range on the cast and show up as a wrong pixel.
            frame = (np.clip(frame, 0.0, 1.0) * 255).astype(np.uint8)
        converted_frames.append(frame)
    return converted_frames

# Bring the flattened frames to uint8 arrays before they are handed to the video export
converted_frames = convert_to_uint8(flattened_frames)

# Export video using OpenCV
def export_to_video(frames, output_video_path="Man11.mp4", fps=30):
    """Encode a sequence of RGB frames into a video file with OpenCV.

    Args:
        frames: Non-empty sequence of image arrays in RGB channel order.
            The output size is taken from the first frame.
        output_video_path: Destination file path.
        fps: Frame rate passed to ``cv2.VideoWriter``.

    Returns:
        The ``output_video_path`` that was written.

    Raises:
        ValueError: If ``frames`` is empty.
        OSError: If the video writer could not be opened.
    """
    if len(frames) == 0:
        raise ValueError("export_to_video() needs at least one frame")

    h, w = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (w, h))
    # cv2.VideoWriter does not raise when it cannot open the target; without
    # this check the function would "succeed" without a usable file.
    if not video_writer.isOpened():
        video_writer.release()
        raise OSError(f"Could not open a video writer for {output_video_path!r}")

    try:
        for frame in frames:
            # OpenCV expects BGR channel order, the frames are RGB.
            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file, even if a frame fails to convert.
        video_writer.release()
    return output_video_path

# Write the clip using the helper's defaults ("Man11.mp4", 30 fps) and report the path
video_path = export_to_video(converted_frames)
print(f"Video saved at {video_path}")

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

(16, 256, 256, 3)
Video saved at Man11.mp4


## Longer duration

In [None]:
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import numpy as np
import cv2

# Load pipeline in half precision and swap in the DPM-Solver multistep scheduler
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# Optimize for GPU memory
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()

# Generate
prompt = "A young man in his 20s is sitting on a piece of cloud in the sky, reading a book."
num_inference_steps = 50  # Increased for better quality
num_frames = 200  # Generate 200 frames and then truncate to 100
# NOTE(review): only the first 100 of these 200 frames are kept further down.
# Sampling 100 frames directly would roughly halve the work but yields a
# different clip -- confirm whether the over-generation is intentional.
# Seed the sampler explicitly so that a fresh "Restart & Run All" starts from
# the same noise instead of producing a different clip on every run.
# (Bit-exact equality across different GPUs/drivers is not guaranteed.)
generator = torch.Generator(device="cpu").manual_seed(0)
video_frames = pipe(
    prompt,
    num_inference_steps=num_inference_steps,
    num_frames=num_frames,
    generator=generator,
).frames

# Check the shape of the first frame batch
print(video_frames[0].shape)

# Flatten the batch of frames into a single list of frames
flattened_frames = [frame for batch in video_frames for frame in batch]

# Take only the first 100 frames to ensure a 5-second video at 20 fps
flattened_frames = flattened_frames[:100]

# Ensure frames are in the correct format and convert to uint8
def convert_to_uint8(frames):
    """Return every frame as a ``uint8`` NumPy array.

    Args:
        frames: Iterable of array-likes (anything ``np.array`` accepts).
            Frames whose dtype is not ``uint8`` are assumed to hold
            intensities in ``[0, 1]``.

    Returns:
        list[np.ndarray]: One ``uint8`` array per input frame, in order.
        Frames that were already ``uint8`` keep their values; all others
        are clipped to ``[0, 1]``, multiplied by 255 and truncated.
    """
    converted_frames = []
    for frame in frames:
        frame = np.array(frame)
        if frame.dtype != np.uint8:
            # Clip before scaling: a value just outside [0, 1] would otherwise
            # overflow the uint8 range on the cast and show up as a wrong pixel.
            frame = (np.clip(frame, 0.0, 1.0) * 255).astype(np.uint8)
        converted_frames.append(frame)
    return converted_frames

# Bring the (truncated) frame list to uint8 arrays before it is handed to the video export
converted_frames = convert_to_uint8(flattened_frames)

# Export video using OpenCV
def export_to_video(frames, output_video_path="Man112.mp4", fps=20):
    """Encode a sequence of RGB frames into a video file with OpenCV.

    Args:
        frames: Non-empty sequence of image arrays in RGB channel order.
            The output size is taken from the first frame.
        output_video_path: Destination file path.
        fps: Frame rate passed to ``cv2.VideoWriter``.

    Returns:
        The ``output_video_path`` that was written.

    Raises:
        ValueError: If ``frames`` is empty.
        OSError: If the video writer could not be opened.
    """
    if len(frames) == 0:
        raise ValueError("export_to_video() needs at least one frame")

    h, w = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (w, h))
    # cv2.VideoWriter does not raise when it cannot open the target; without
    # this check the function would "succeed" without a usable file.
    if not video_writer.isOpened():
        video_writer.release()
        raise OSError(f"Could not open a video writer for {output_video_path!r}")

    try:
        for frame in frames:
            # OpenCV expects BGR channel order, the frames are RGB.
            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file, even if a frame fails to convert.
        video_writer.release()
    return output_video_path

# Write the clip using the helper's defaults ("Man112.mp4", 20 fps) and report the path
video_path = export_to_video(converted_frames)
print(f"Video saved at {video_path}")


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

(200, 256, 256, 3)
Video saved at Man112.mp4
