In [None]:
from diffusers import DiffusionPipeline
import torch
import numpy as np
import imageio
from PIL import Image

# Build the text-to-video pipeline. No torch_dtype is passed, so fp16 weights
# are not requested.
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
pipe = pipe.to("cpu")  # place the pipeline on the CPU explicitly

# Describe the clip and run the sampler for 50 inference steps
prompt = "train swimming"
video_batches = pipe(
    prompt,
    num_inference_steps=50,
).frames

# Upscale target handed to PIL's resize() as a (width, height) pair
target_resolution = (768, 768)


def _to_rgb_uint8(frame):
    """Return *frame* as a 3-channel uint8 array suitable for Image.fromarray.

    Args:
        frame: NumPy array holding one video frame, either (H, W) or
            (H, W, C). Non-uint8 input is assumed to hold values in [0, 1]
            -- TODO confirm against the pipeline's output type.

    Returns:
        A uint8 array. 2-D and single-channel inputs are replicated to three
        channels; inputs with more than three channels keep only the first
        three. Any other layout is returned with its shape unchanged.
    """
    if frame.dtype != np.uint8:
        # Round before casting: a bare astype() truncates toward zero, which
        # darkens every pixel by up to one level (e.g. 0.999 * 255 -> 254).
        frame = np.rint(frame * 255).clip(0, 255).astype(np.uint8)

    if frame.ndim == 2:
        # Grayscale without a channel axis
        return np.stack([frame] * 3, axis=-1)
    if frame.ndim == 3:
        channels = frame.shape[2]
        if channels == 1:
            return np.repeat(frame, 3, axis=2)
        if channels > 3:
            # Drop extra channels beyond the first three (e.g. alpha)
            return frame[:, :, :3]
    return frame


# Convert every generated frame to RGB uint8 and upscale it with Lanczos
processed_frames = []

for batch in video_batches:
    for frame in batch:
        image = Image.fromarray(_to_rgb_uint8(frame))
        image = image.resize(target_resolution, Image.LANCZOS)
        processed_frames.append(np.array(image))

print(f"Total processed frames: {len(processed_frames)}")

# Encode the processed frames to an H.264 MP4. Rate control is handed to
# FFmpeg's CRF mode (instead of a fixed bitrate) through ffmpeg_params.
output_path = "ultra_hd_video.mp4"

# The writer is used as a context manager so the file is finalised and the
# encoder is shut down even if appending a frame raises part-way through.
with imageio.get_writer(
    output_path,
    fps=8,
    codec='libx264',
    quality=None,        # disable imageio's own quality setting; CRF is set below
    ffmpeg_params=[
        "-crf", "17",     # visually lossless
        "-preset", "slow",
        "-pix_fmt", "yuv420p",  # ensures compatibility
    ],
) as writer:
    for frame in processed_frames:
        writer.append_data(frame)

print(f"Ultra-HD video saved at: {output_path}")

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Total processed frames: 16
Ultra-HD video saved at: ultra_hd_video.mp4
