In [None]:
!pip install diffusers transformers accelerate torch opencv-python

import torch
import numpy as np
import cv2
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Load the pipeline (fp16 weights)
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16,
    variant="fp16"
)

# Replace the default scheduler with DPM-Solver multistep, reusing its config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# Generate video frames
prompt = "Spiderman is surfing"
result = pipe(prompt, num_inference_steps=25)
video_frames_batches = result.frames  # This is a list of batches

# Flatten all frames from batches into a single list
all_frames = [frame for batch in video_frames_batches for frame in batch]

# Check frame format and convert if needed
corrected_frames = []
for frame in all_frames:
    if isinstance(frame, torch.Tensor):
        frame = frame.detach().cpu().numpy()
    frame = np.asarray(frame)  # no-op for arrays; also accepts array-like frames
    if frame.ndim == 2:  # Grayscale, expand to 3 channels
        frame = np.stack([frame] * 3, axis=-1)
    elif frame.ndim == 3 and frame.shape[2] == 1:  # Single channel
        frame = np.concatenate([frame] * 3, axis=2)
    elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
        pass  # Already correct format
    else:
        raise ValueError(f"Unsupported frame shape: {frame.shape}")
    if frame.dtype != np.uint8:
        # Float (and bool) frames are assumed to be in [0, 1] and are scaled to
        # 0-255; integer frames are taken to be 0-255 already and only clipped.
        if not np.issubdtype(frame.dtype, np.integer):
            frame = frame * 255
        frame = frame.clip(0, 255).astype(np.uint8)
    corrected_frames.append(frame)

if not corrected_frames:
    raise ValueError("No frames to create video")

# Define video parameters
height, width, layers = corrected_frames[0].shape
video_filename = 'output_video.mp4'
fps = 20

# Create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
if not video_writer.isOpened():
    # Without this check a missing codec yields an empty file with no error
    raise RuntimeError(f"Could not open video writer for {video_filename}")

# Write frames
try:
    for frame in corrected_frames:
        if frame.shape[2] == 4:  # RGBA -> RGB
            frame = np.ascontiguousarray(frame[:, :, :3])
        # Frames are RGB but OpenCV writes BGR; convert so colours are not swapped
        video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
finally:
    video_writer.release()

print(f"Video saved as {video_filename}")




Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


  0%|          | 0/25 [00:00<?, ?it/s]

Video saved as output_video.mp4


In [None]:
import torch
import numpy as np
import cv2
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Load the pipeline (fp16 weights)
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16,
    variant="fp16"
)

# Replace the default scheduler with DPM-Solver multistep, reusing its config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# List of prompts
prompts = [
    "A majestic waterfall in the mountains"
]

all_frames = []
for prompt in prompts:
    print(f"Generating video for prompt: {prompt}")
    # More denoising steps do NOT make the clip longer: this call still yields
    # 16 frames. Clip length is controlled by the pipeline's `num_frames`
    # argument; a longer video here comes from adding prompts.
    result = pipe(prompt, num_inference_steps=30)
    video_frames_batches = result.frames
    all_frames.extend(frame for batch in video_frames_batches for frame in batch)

print(f"Total frames generated: {len(all_frames)}")

# Process frames: normalise each one to an (H, W, 3|4) uint8 array
corrected_frames = []
for frame in all_frames:
    if isinstance(frame, torch.Tensor):
        frame = frame.detach().cpu().numpy()
    frame = np.asarray(frame)  # no-op for arrays; also accepts array-like frames
    if frame.ndim == 2:
        frame = np.stack([frame] * 3, axis=-1)
    elif frame.ndim == 3 and frame.shape[2] == 1:
        frame = np.concatenate([frame] * 3, axis=2)
    elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
        pass
    else:
        raise ValueError(f"Unsupported frame shape: {frame.shape}")
    if frame.dtype != np.uint8:
        # Float (and bool) frames are assumed to be in [0, 1] and are scaled to
        # 0-255; integer frames are taken to be 0-255 already and only clipped.
        if not np.issubdtype(frame.dtype, np.integer):
            frame = frame * 255
        frame = frame.clip(0, 255).astype(np.uint8)
    corrected_frames.append(frame)

if not corrected_frames:
    raise ValueError("No frames to create video")

# Create video
height, width, layers = corrected_frames[0].shape
video_filename = 'extended_output_video.mp4'
fps = 20

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
if not video_writer.isOpened():
    # Without this check a missing codec yields an empty file with no error
    raise RuntimeError(f"Could not open video writer for {video_filename}")

try:
    for frame in corrected_frames:
        if frame.shape[2] == 4:  # drop the alpha channel
            frame = np.ascontiguousarray(frame[:, :, :3])
        # Frames are RGB but OpenCV writes BGR; convert so colours are not swapped
        video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
finally:
    video_writer.release()

print(f"Extended video saved as {video_filename}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


Generating video for prompt: A majestic waterfall in the mountains


  0%|          | 0/30 [00:00<?, ?it/s]

Total frames generated: 16
Extended video saved as extended_output_video.mp4


In [None]:
import torch
import numpy as np
import cv2
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Load the pipeline (fp16 weights)
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16,
    variant="fp16"
)

# Replace the default scheduler with DPM-Solver multistep, reusing its config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# List of prompts
prompts = [
    "Spiderman is surfing",
    "A spaceship flying over a city",
    "A majestic waterfall in the mountains"
]

all_frames = []
for prompt in prompts:
    print(f"Generating video for prompt: {prompt}")
    # More denoising steps do NOT add frames (each prompt still yields 16);
    # the total length grows with the number of prompts. Per-clip length is
    # controlled by the pipeline's `num_frames` argument.
    result = pipe(prompt, num_inference_steps=40)
    video_frames_batches = result.frames
    all_frames.extend(frame for batch in video_frames_batches for frame in batch)

total_frames = len(all_frames)
print(f"Total frames generated: {total_frames}")

# Pick the fps that stretches the frames towards the target duration, but never
# below 5 fps. Because of that floor the clip can end up SHORTER than the target
# (e.g. 48 frames -> 5 fps -> 9.6 s), so report the real duration.
target_duration_sec = 10
fps = max(5, total_frames // target_duration_sec)

print(f"Setting fps = {fps} (~{total_frames / fps:.1f} s of video, target {target_duration_sec} s)")

# Process frames: normalise each one to an (H, W, 3|4) uint8 array
corrected_frames = []
for frame in all_frames:
    if isinstance(frame, torch.Tensor):
        frame = frame.detach().cpu().numpy()
    frame = np.asarray(frame)  # no-op for arrays; also accepts array-like frames
    if frame.ndim == 2:
        frame = np.stack([frame] * 3, axis=-1)
    elif frame.ndim == 3 and frame.shape[2] == 1:
        frame = np.concatenate([frame] * 3, axis=2)
    elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
        pass
    else:
        raise ValueError(f"Unsupported frame shape: {frame.shape}")
    if frame.dtype != np.uint8:
        # Float (and bool) frames are assumed to be in [0, 1] and are scaled to
        # 0-255; integer frames are taken to be 0-255 already and only clipped.
        if not np.issubdtype(frame.dtype, np.integer):
            frame = frame * 255
        frame = frame.clip(0, 255).astype(np.uint8)
    corrected_frames.append(frame)

if not corrected_frames:
    raise ValueError("No frames to create video")

# Create video
height, width, layers = corrected_frames[0].shape
video_filename = 'extended_output_video_10sec.mp4'

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
if not video_writer.isOpened():
    # Without this check a missing codec yields an empty file with no error
    raise RuntimeError(f"Could not open video writer for {video_filename}")

try:
    for frame in corrected_frames:
        if frame.shape[2] == 4:  # drop the alpha channel
            frame = np.ascontiguousarray(frame[:, :, :3])
        # Frames are RGB but OpenCV writes BGR; convert so colours are not swapped
        video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
finally:
    video_writer.release()

print(f"Video saved as {video_filename}")


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


Generating video for prompt: Spiderman is surfing


  0%|          | 0/40 [00:00<?, ?it/s]

Generating video for prompt: A spaceship flying over a city


  0%|          | 0/40 [00:00<?, ?it/s]

Generating video for prompt: A majestic waterfall in the mountains


  0%|          | 0/40 [00:00<?, ?it/s]

Total frames generated: 48
Setting fps = 5 for at least 10 seconds duration
Video saved as extended_output_video_10sec.mp4


In [None]:
!pip install diffusers transformers accelerate torch opencv-python

import torch
import numpy as np
import cv2
import os
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Directory to save videos
os.makedirs("generated_videos", exist_ok=True)

# Models to run (Hugging Face repo ids); commented entries are disabled
models = [
    #"damo-vilab/text-to-video-ms-1.7b",      # Model 1
    "cerspense/zeroscope_v2_576w"  # Model 2 (different)
]

# Prompts related to student/college; commented entries are disabled
prompts = [
    "A group of students studying together in a library",
    #"Students walking across a modern college campus on a sunny day",
    #"A teacher explaining a concept in a classroom with a smart board",
    #"College students playing football on the ground",
    #"A graduation ceremony with students throwing caps in the air"
]

# Target video duration
target_duration_sec = 10

video_counter = 1

for model_id in models:
    print(f"\nLoading model: {model_id}")
    if "zeroscope" in model_id:
        # zeroscope has no fp16 variant
        pipe = DiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16   # still okay to run in half precision
        )
    else:
        # models like damo-vilab support fp16 variant
        pipe = DiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            variant="fp16"
        )

    # Scheduler swap and CPU offload only need to be configured once per pipeline
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.enable_model_cpu_offload()

    # The first model in `models` gets the first three prompts; every other
    # model gets whatever prompts remain after those (possibly none).
    if model_id == models[0]:
        model_prompts = prompts[:3]
    else:
        model_prompts = prompts[3:]

    for prompt in model_prompts:
        print(f"Generating video {video_counter} for prompt: {prompt}")

        result = pipe(prompt, num_inference_steps=40)
        video_frames_batches = result.frames

        all_frames = [frame for batch in video_frames_batches for frame in batch]

        # fps is floored at 5, so a short clip plays for less than the target
        # duration rather than being slowed to a slideshow.
        total_frames = len(all_frames)
        fps = max(5, total_frames // target_duration_sec)

        # Normalise each frame to an (H, W, 3|4) uint8 array
        corrected_frames = []
        for frame in all_frames:
            if isinstance(frame, torch.Tensor):
                frame = frame.detach().cpu().numpy()
            frame = np.asarray(frame)  # no-op for arrays; also accepts array-like frames
            if frame.ndim == 2:
                frame = np.stack([frame] * 3, axis=-1)
            elif frame.ndim == 3 and frame.shape[2] == 1:
                frame = np.concatenate([frame] * 3, axis=2)
            elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
                pass
            else:
                raise ValueError(f"Unsupported frame shape: {frame.shape}")
            if frame.dtype != np.uint8:
                # Float (and bool) frames are assumed to be in [0, 1] and are
                # scaled to 0-255; integer frames are only clipped.
                if not np.issubdtype(frame.dtype, np.integer):
                    frame = frame * 255
                frame = frame.clip(0, 255).astype(np.uint8)
            corrected_frames.append(frame)

        if not corrected_frames:
            raise ValueError("No frames to create video")

        # Save video
        height, width, layers = corrected_frames[0].shape
        video_filename = f"generated_videos/video_{video_counter}.mp4"
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
        if not video_writer.isOpened():
            # Without this check a missing codec yields an empty file with no error
            raise RuntimeError(f"Could not open video writer for {video_filename}")

        try:
            for frame in corrected_frames:
                if frame.shape[2] == 4:  # drop the alpha channel
                    frame = np.ascontiguousarray(frame[:, :, :3])
                # Frames are RGB but OpenCV writes BGR; convert so colours are not swapped
                video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        finally:
            video_writer.release()
        print(f"✅ Saved {video_filename}\n")

        video_counter += 1

print("🎥 All videos generated and stored in 'generated_videos/' folder")



Loading model: cerspense/zeroscope_v2_576w


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

text_encoder/pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.bin:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

An error occurred while trying to fetch /root/.cache/huggingface/hub/models--cerspense--zeroscope_v2_576w/snapshots/6963642a64dbefa93663d1ecebb4ceda2d9ecb28/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /root/.cache/huggingface/hub/models--cerspense--zeroscope_v2_576w/snapshots/6963642a64dbefa93663d1ecebb4ceda2d9ecb28/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
An error occurred while trying to fetch /root/.cache/huggingface/hub/models--cerspense--zeroscope_v2_576w/snapshots/6963642a64dbefa93663d1ecebb4ceda2d9ecb28/vae: Error no file named diffusion_pytorch_model.safetensors found in directory /root/.cache/huggingface/hub/models--cerspense--zeroscope_v2_576w/snapshots/6963642a64dbefa93663d1ecebb4ceda2d9ecb28/vae.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after 

Generating video 1 for prompt: A group of students studying together in a library


  0%|          | 0/40 [00:00<?, ?it/s]

✅ Saved generated_videos/video_1.mp4

🎥 All videos generated and stored in 'generated_videos/' folder


In [None]:
import torch
import numpy as np
import cv2
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# Load the pipeline (fp16 weights)
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16,
    variant="fp16"
)

# Replace the default scheduler with DPM-Solver multistep, reusing its config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# List of prompts
prompts = [
    "Early school kids playing football during break time",
    "A person scrolling on a phone in cafe with coffee",
    "Old lady watering garden grass in the courtyard",
    "A playful puppy running across a grassy field, chasing butterflies.",
    "A 3D animation showing how the human heart pumps blood through the body."
]

all_frames = []
for prompt in prompts:
    print(f"Generating video for prompt: {prompt}")
    # More denoising steps do NOT add frames; the total length grows with the
    # number of prompts. Per-clip length is controlled by the pipeline's
    # `num_frames` argument.
    result = pipe(prompt, num_inference_steps=40)
    video_frames_batches = result.frames
    all_frames.extend(frame for batch in video_frames_batches for frame in batch)

total_frames = len(all_frames)
print(f"Total frames generated: {total_frames}")

# Pick the fps that stretches the frames towards the target duration, but never
# below 5 fps. Because of that floor the clip can end up SHORTER than the
# target, so report the real duration.
target_duration_sec = 10
fps = max(5, total_frames // target_duration_sec)

print(f"Setting fps = {fps} (~{total_frames / fps:.1f} s of video, target {target_duration_sec} s)")

# Process frames: normalise each one to an (H, W, 3|4) uint8 array
corrected_frames = []
for frame in all_frames:
    if isinstance(frame, torch.Tensor):
        frame = frame.detach().cpu().numpy()
    frame = np.asarray(frame)  # no-op for arrays; also accepts array-like frames
    if frame.ndim == 2:
        frame = np.stack([frame] * 3, axis=-1)
    elif frame.ndim == 3 and frame.shape[2] == 1:
        frame = np.concatenate([frame] * 3, axis=2)
    elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
        pass
    else:
        raise ValueError(f"Unsupported frame shape: {frame.shape}")
    if frame.dtype != np.uint8:
        # Float (and bool) frames are assumed to be in [0, 1] and are scaled to
        # 0-255; integer frames are taken to be 0-255 already and only clipped.
        if not np.issubdtype(frame.dtype, np.integer):
            frame = frame * 255
        frame = frame.clip(0, 255).astype(np.uint8)
    corrected_frames.append(frame)

if not corrected_frames:
    raise ValueError("No frames to create video")

# Create video
height, width, layers = corrected_frames[0].shape
video_filename = 'extended_output_video_10sec.mp4'

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
if not video_writer.isOpened():
    # Without this check a missing codec yields an empty file with no error
    raise RuntimeError(f"Could not open video writer for {video_filename}")

try:
    for frame in corrected_frames:
        if frame.shape[2] == 4:  # drop the alpha channel
            frame = np.ascontiguousarray(frame[:, :, :3])
        # Frames are RGB but OpenCV writes BGR; convert so colours are not swapped
        video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
finally:
    video_writer.release()

print(f"Video saved as {video_filename}")


In [6]:
import torch
import numpy as np
import cv2
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import ipywidgets as widgets
from IPython.display import display, HTML, Video
import os
from google.colab import files
import threading
import time

class TextToVideoGUI:
    """ipywidgets front end for the text-to-video diffusion pipeline.

    The widgets are built in ``setup_gui`` and laid out by ``display_gui``.
    The model is loaded on demand from the "Load Model" button, generation runs
    on a background thread so the widgets keep updating, and the finished file
    can be fetched through ``google.colab.files.download``.

    Attributes set by ``__init__``:
        pipe: the loaded diffusion pipeline, or None before loading.
        is_model_loaded: True once ``load_model`` has succeeded.
        is_generating: True while a generation thread is running.
    """

    def __init__(self):
        self.pipe = None
        self.is_model_loaded = False
        self.is_generating = False
        self.setup_gui()

    def setup_gui(self):
        """Create every widget and wire the button callbacks."""
        # Title
        self.title = widgets.HTML("<h2>🎬 Text-to-Video Generator</h2>")

        # Model loading section
        self.load_button = widgets.Button(
            description="Load Model",
            button_style='info',
            icon='download'
        )
        self.load_status = widgets.HTML("Model not loaded")

        # Prompt input
        self.prompt_text = widgets.Textarea(
            placeholder="Enter your video prompt here...\nExample: A playful puppy running across a grassy field",
            description="Prompt:",
            layout=widgets.Layout(width='100%', height='100px')
        )

        # Multiple prompts section
        self.multi_prompt_text = widgets.Textarea(
            placeholder="Enter multiple prompts (one per line):\nEarly school kids playing football\nA person scrolling on phone in cafe\nOld lady watering garden",
            description="Multiple Prompts:",
            layout=widgets.Layout(width='100%', height='120px')
        )

        # Settings
        self.inference_steps = widgets.IntSlider(
            value=25,
            min=10,
            max=50,
            description="Inference Steps:",
            style={'description_width': 'initial'}
        )

        self.target_duration = widgets.IntSlider(
            value=10,
            min=5,
            max=30,
            description="Target Duration (sec):",
            style={'description_width': 'initial'}
        )

        # Generation controls (enabled once the model has loaded)
        self.generate_single_btn = widgets.Button(
            description="Generate Single Video",
            button_style='success',
            icon='play',
            disabled=True
        )

        self.generate_multi_btn = widgets.Button(
            description="Generate Multi-Prompt Video",
            button_style='success',
            icon='film',
            disabled=True
        )

        # Progress and status
        self.progress = widgets.IntProgress(
            value=0,
            min=0,
            max=100,
            description='Progress:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='100%')
        )

        self.status_text = widgets.HTML("Ready to load model")

        # Output section
        self.output_area = widgets.Output()

        # Download section (enabled once a video has been produced)
        self.download_btn = widgets.Button(
            description="Download Video",
            button_style='warning',
            icon='download',
            disabled=True
        )

        # Path of the most recently generated video, if any
        self.current_video_path = None

        # Bind events
        self.load_button.on_click(self.load_model)
        self.generate_single_btn.on_click(self.generate_single_video)
        self.generate_multi_btn.on_click(self.generate_multi_video)
        self.download_btn.on_click(self.download_video)

    def display_gui(self):
        """Display the complete GUI"""
        model_section = widgets.VBox([
            widgets.HTML("<h3>📥 Model Loading</h3>"),
            widgets.HBox([self.load_button, self.load_status]),
        ])

        prompt_section = widgets.VBox([
            widgets.HTML("<h3>✏️ Prompts</h3>"),
            self.prompt_text,
            self.multi_prompt_text,
        ])

        settings_section = widgets.VBox([
            widgets.HTML("<h3>⚙️ Settings</h3>"),
            self.inference_steps,
            self.target_duration,
        ])

        generation_section = widgets.VBox([
            widgets.HTML("<h3>🎬 Generation</h3>"),
            widgets.HBox([self.generate_single_btn, self.generate_multi_btn]),
            self.progress,
            self.status_text,
        ])

        output_section = widgets.VBox([
            widgets.HTML("<h3>📺 Output</h3>"),
            self.output_area,
            self.download_btn,
        ])

        main_gui = widgets.VBox([
            self.title,
            model_section,
            prompt_section,
            settings_section,
            generation_section,
            output_section,
        ])

        display(main_gui)

    def load_model(self, b):
        """Button callback: load the diffusion pipeline and enable generation.

        Args:
            b: the clicked button (unused; required by ``on_click``).

        Any exception raised while loading is reported in the status widgets
        and the load button is re-enabled so the user can retry.
        """
        if self.is_model_loaded:
            self.status_text.value = "Model already loaded!"
            return

        self.load_button.disabled = True
        self.load_button.description = "Loading..."
        self.load_status.value = "Loading model... This may take a few minutes."

        try:
            # Load the pipeline
            self.pipe = DiffusionPipeline.from_pretrained(
                "damo-vilab/text-to-video-ms-1.7b",
                torch_dtype=torch.float16,
                variant="fp16"
            )

            self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config)
            self.pipe.enable_model_cpu_offload()

            self.is_model_loaded = True
            self.load_status.value = "✅ Model loaded successfully!"
            self.load_button.description = "Model Loaded"
            self.load_button.button_style = 'success'

            # Enable generation buttons
            self.generate_single_btn.disabled = False
            self.generate_multi_btn.disabled = False
            self.status_text.value = "Model ready! Enter your prompt and generate video."

        except Exception as e:
            self.load_status.value = f"❌ Error loading model: {str(e)}"
            self.load_button.disabled = False
            self.load_button.description = "Load Model"
            self.status_text.value = "Failed to load model. Please try again."

    def process_frames(self, all_frames):
        """Normalise raw frames to uint8 arrays with 3 or 4 channels.

        Args:
            all_frames: iterable of frames; each is a ``torch.Tensor`` or an
                array-like shaped (H, W), (H, W, 1), (H, W, 3) or (H, W, 4).

        Returns:
            list of ``numpy.uint8`` arrays shaped (H, W, 3) or (H, W, 4).

        Raises:
            ValueError: if a frame has any other shape.
        """
        corrected_frames = []
        for frame in all_frames:
            if isinstance(frame, torch.Tensor):
                frame = frame.detach().cpu().numpy()
            frame = np.asarray(frame)  # no-op for arrays; also accepts array-like frames
            if frame.ndim == 2:
                frame = np.stack([frame] * 3, axis=-1)
            elif frame.ndim == 3 and frame.shape[2] == 1:
                frame = np.concatenate([frame] * 3, axis=2)
            elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
                pass
            else:
                raise ValueError(f"Unsupported frame shape: {frame.shape}")
            if frame.dtype != np.uint8:
                # Float (and bool) frames are assumed to be in [0, 1] and are
                # scaled to 0-255; integer frames are taken to be 0-255 already,
                # so multiplying them by 255 would saturate the image.
                if not np.issubdtype(frame.dtype, np.integer):
                    frame = frame * 255
                frame = frame.clip(0, 255).astype(np.uint8)
            corrected_frames.append(frame)
        return corrected_frames

    def create_video(self, frames, filename):
        """Encode ``frames`` as an mp4v video at ``filename``.

        The frame rate is ``max(5, len(frames) // target_duration)``, where the
        target duration comes from the slider. Because of the 5 fps floor, a
        short frame list plays for less than the target duration.

        Args:
            frames: non-empty list of uint8 RGB/RGBA arrays of identical size.
            filename: output path.

        Returns:
            The fps the video was written with.

        Raises:
            ValueError: if ``frames`` is empty.
            RuntimeError: if OpenCV cannot open the output file for writing.
        """
        if not frames:
            raise ValueError("No frames to create video")

        # Calculate FPS for target duration
        total_frames = len(frames)
        target_duration_sec = self.target_duration.value
        fps = max(5, total_frames // target_duration_sec)

        height, width = frames[0].shape[:2]

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video_writer = cv2.VideoWriter(filename, fourcc, fps, (width, height))
        if not video_writer.isOpened():
            # Without this check a missing codec yields an empty file with no error
            raise RuntimeError(f"Could not open video writer for {filename}")

        try:
            for frame in frames:
                if frame.shape[2] == 4:  # drop the alpha channel
                    frame = np.ascontiguousarray(frame[:, :, :3])
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                video_writer.write(frame_bgr)
        finally:
            # Always finalise the file, even if a frame fails to encode
            video_writer.release()
        return fps

    def generate_single_video(self, b):
        """Button callback: generate a video from the single-prompt box."""
        if not self.is_model_loaded:
            self.status_text.value = "Please load the model first!"
            return

        prompt = self.prompt_text.value.strip()
        if not prompt:
            self.status_text.value = "Please enter a prompt!"
            return

        self.run_generation([prompt], "single_video.mp4")

    def generate_multi_video(self, b):
        """Button callback: generate one video from all non-blank prompt lines."""
        if not self.is_model_loaded:
            self.status_text.value = "Please load the model first!"
            return

        prompts_text = self.multi_prompt_text.value.strip()
        if not prompts_text:
            self.status_text.value = "Please enter prompts!"
            return

        prompts = [p.strip() for p in prompts_text.split('\n') if p.strip()]
        if not prompts:
            self.status_text.value = "Please enter valid prompts!"
            return

        self.run_generation(prompts, "multi_video.mp4")

    def run_generation(self, prompts, filename):
        """Start generation for ``prompts`` on a background thread.

        Does nothing except update the status text if a generation is already
        running. Buttons are disabled here and re-enabled by the worker.
        """
        if self.is_generating:
            self.status_text.value = "Generation already in progress!"
            return

        # Disable buttons during generation
        self.generate_single_btn.disabled = True
        self.generate_multi_btn.disabled = True
        self.download_btn.disabled = True
        self.is_generating = True

        # Clear previous output
        with self.output_area:
            self.output_area.clear_output()

        # Start generation thread
        thread = threading.Thread(target=self._generate_video_thread, args=(prompts, filename))
        thread.start()

    def _generate_video_thread(self, prompts, filename):
        """Worker: run the pipeline per prompt, then encode all frames into one file.

        Progress goes to 80% across the prompts, 90% while encoding and 100% on
        success; it is reset to 0 when the worker finishes. Errors are shown in
        the status text and output area instead of being raised.
        """
        try:
            all_frames = []
            total_prompts = len(prompts)

            for i, prompt in enumerate(prompts):
                self.status_text.value = f"Generating video for prompt {i+1}/{total_prompts}: {prompt[:50]}..."
                self.progress.value = int((i / total_prompts) * 80)  # Up to 80% for generation

                result = self.pipe(prompt, num_inference_steps=self.inference_steps.value)
                video_frames_batches = result.frames

                for batch in video_frames_batches:
                    all_frames.extend(batch)

            self.status_text.value = "Processing frames and creating video..."
            self.progress.value = 90

            # Process frames
            corrected_frames = self.process_frames(all_frames)

            # Create video
            fps = self.create_video(corrected_frames, filename)

            self.progress.value = 100
            self.status_text.value = f"✅ Video generated successfully! ({len(corrected_frames)} frames, {fps} FPS)"
            self.current_video_path = filename

            # Display video
            with self.output_area:
                self.output_area.clear_output()
                if os.path.exists(filename):
                    print(f"Video saved as: {filename}")
                    print(f"Total frames: {len(corrected_frames)}")
                    print(f"FPS: {fps}")
                    print(f"Duration: ~{len(corrected_frames)/fps:.1f} seconds")

                    # Try to display video preview; the preview is best-effort,
                    # but do not swallow KeyboardInterrupt/SystemExit.
                    try:
                        display(Video(filename, width=400, height=300))
                    except Exception:
                        print("Video preview not available, but file was created successfully.")

            # Enable download button
            self.download_btn.disabled = False

        except Exception as e:
            self.status_text.value = f"❌ Error: {str(e)}"
            with self.output_area:
                self.output_area.clear_output()
                print(f"Error during generation: {str(e)}")

        finally:
            # Re-enable buttons
            self.generate_single_btn.disabled = False
            self.generate_multi_btn.disabled = False
            self.is_generating = False
            self.progress.value = 0

    def download_video(self, b):
        """Button callback: download the last generated video via Colab."""
        if not self.current_video_path or not os.path.exists(self.current_video_path):
            self.status_text.value = "No video to download!"
            return

        try:
            files.download(self.current_video_path)
            self.status_text.value = "✅ Video download started!"
        except Exception as e:
            self.status_text.value = f"❌ Download error: {str(e)}"

# Create and display the GUI
def create_video_gui():
    """Build a TextToVideoGUI, render it in the notebook, and return it."""
    app = TextToVideoGUI()
    app.display_gui()
    return app

# Usage instructions, emitted as one banner framed by 50-character rules
print("\n".join([
    "🎬 Text-to-Video Generator GUI",
    "=" * 50,
    "Instructions:",
    "1. Run: gui = create_video_gui()",
    "2. Click 'Load Model' and wait for it to complete",
    "3. Enter your prompt(s)",
    "4. Adjust settings if needed",
    "5. Click generate button",
    "6. Download your video when ready!",
    "=" * 50,
]))

# Uncomment the line below to automatically create the GUI
# gui = create_video_gui()

🎬 Text-to-Video Generator GUI
Instructions:
1. Run: gui = create_video_gui()
2. Click 'Load Model' and wait for it to complete
3. Enter your prompt(s)
4. Adjust settings if needed
5. Click generate button
6. Download your video when ready!


In [7]:
# Build the widget UI, display it, and keep a handle to the GUI object.
gui = create_video_gui()

VBox(children=(HTML(value='<h2>🎬 Text-to-Video Generator</h2>'), VBox(children=(HTML(value='<h3>📥 Model Loadin…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/2.82G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


In [14]:
import torch
import numpy as np
import cv2
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import gradio as gr
import os
import tempfile
from typing import List, Optional
import threading
import time

class TextToVideoGenerator:
    """Wrap the text-to-video diffusion pipeline behind UI-friendly methods.

    The pipeline is loaded on demand by ``load_model``. The ``generate_*``
    methods return ``(video_path, status_message, download_path)`` tuples and
    report failures through the status message instead of raising, so they can
    be used directly as event handlers.
    """

    # Lower bound applied to the frame rate computed in create_video().
    MIN_FPS = 5

    def __init__(self):
        # Diffusers pipeline; stays None until load_model() succeeds.
        self.pipe = None
        # Guards the generate_* methods and makes load_model() idempotent.
        self.is_model_loaded = False

    def load_model(self):
        """Load the diffusion model.

        Returns:
            str: A human-readable status message. Errors are reported in the
            message rather than raised.
        """
        if self.is_model_loaded:
            return "✅ Model already loaded!"

        try:
            # Build and configure the pipeline in a local variable so that a
            # failure in a later step never leaves a half-configured pipeline
            # on the instance.
            pipe = DiffusionPipeline.from_pretrained(
                "damo-vilab/text-to-video-ms-1.7b",
                torch_dtype=torch.float16,
                variant="fp16"
            )
            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
            pipe.enable_model_cpu_offload()
        except Exception as e:
            return f"❌ Error loading model: {str(e)}"

        self.pipe = pipe
        self.is_model_loaded = True
        return "✅ Model loaded successfully! You can now generate videos."

    def _generate_frames(self, prompt, inference_steps):
        """Run the pipeline for one prompt and return its frames as a flat list."""
        # UI sliders can hand the step count over as a float; cast it so the
        # pipeline always receives an integer.
        result = self.pipe(prompt, num_inference_steps=int(inference_steps))
        # result.frames is iterated as batches of frames; flatten it.
        return [frame for batch in result.frames for frame in batch]

    def process_frames(self, all_frames):
        """Normalise frames to uint8 arrays with 3 or 4 channels.

        Args:
            all_frames: Iterable of frames. Each frame may be a torch tensor or
                anything ``np.asarray`` can convert, shaped (H, W), (H, W, 1),
                (H, W, 3) or (H, W, 4).

        Returns:
            list[np.ndarray]: One uint8 array per input frame. Grayscale input
            is replicated to 3 channels; 3/4-channel input keeps its channels.

        Raises:
            ValueError: If a frame has an unsupported shape.
        """
        corrected_frames = []
        for frame in all_frames:
            if isinstance(frame, torch.Tensor):
                frame = frame.detach().cpu().numpy()
            else:
                # No-op for ndarrays; also accepts array-likes such as images.
                frame = np.asarray(frame)

            if frame.ndim == 2:
                frame = np.stack([frame] * 3, axis=-1)
            elif frame.ndim == 3 and frame.shape[2] == 1:
                frame = np.concatenate([frame] * 3, axis=2)
            elif frame.ndim == 3 and frame.shape[2] in [3, 4]:
                pass
            else:
                raise ValueError(f"Unsupported frame shape: {frame.shape}")

            if frame.dtype != np.uint8:
                if np.issubdtype(frame.dtype, np.integer):
                    # Integer pixel data is already on the 0-255 scale;
                    # multiplying by 255 would saturate almost every pixel.
                    frame = frame.clip(0, 255).astype(np.uint8)
                else:
                    # Remaining dtypes are treated as normalised [0, 1] data.
                    frame = (frame * 255).clip(0, 255).astype(np.uint8)
            corrected_frames.append(frame)
        return corrected_frames

    def create_video(self, frames, target_duration_sec=10):
        """Encode frames into a temporary ``.mp4`` file.

        Args:
            frames: Non-empty sequence of uint8 RGB/RGBA arrays shaped
                (H, W, 3) or (H, W, 4); the alpha channel is dropped.
            target_duration_sec: Desired clip length in seconds. The frame rate
                is ``len(frames) // target_duration_sec`` but never below
                ``MIN_FPS``; a non-positive duration falls back to ``MIN_FPS``.

        Returns:
            tuple: ``(filename, fps, frame_count)``. The file is created with
            ``delete=False``; the caller owns it.

        Raises:
            ValueError: If ``frames`` is empty.
            RuntimeError: If the video writer cannot be opened.
        """
        if not frames:
            raise ValueError("No frames to create video")

        # Calculate FPS for target duration
        total_frames = len(frames)
        if target_duration_sec <= 0:
            # Avoid dividing by zero; the minimum rate is the sensible fallback.
            fps = self.MIN_FPS
        else:
            # int() keeps fps an integer even when the duration is a float.
            fps = max(self.MIN_FPS, int(total_frames // target_duration_sec))

        height, width = frames[0].shape[:2]

        # Reserve a path only; the writer below opens the file itself.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
        filename = temp_file.name
        temp_file.close()

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video_writer = cv2.VideoWriter(filename, fourcc, fps, (width, height))

        completed = False
        try:
            # Without this check an unusable codec/path would silently produce
            # an empty file that is then reported as a success.
            if not video_writer.isOpened():
                raise RuntimeError(f"Could not open video writer for {filename}")

            for frame in frames:
                if frame.shape[2] == 4:
                    frame = frame[:, :, :3]
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                video_writer.write(frame_bgr)
            completed = True
        finally:
            video_writer.release()
            # Do not leave a broken temporary file behind on failure.
            if not completed and os.path.exists(filename):
                os.remove(filename)

        return filename, fps, total_frames

    def generate_video(self, prompt: str, inference_steps: int = 25, target_duration: int = 10, progress=gr.Progress()):
        """Generate video from single prompt.

        Args:
            prompt: Text description of the video.
            inference_steps: Number of denoising steps passed to the pipeline.
            target_duration: Target clip length in seconds (see create_video).
            progress: Progress callback, called with a fraction and ``desc``.

        Returns:
            tuple: ``(video_path, status_message, download_path)``; both paths
            are ``None`` when generation did not succeed.
        """
        if not self.is_model_loaded:
            return None, "❌ Please load the model first!", None

        # Also guards against a missing (None) prompt.
        if not prompt or not prompt.strip():
            return None, "❌ Please enter a prompt!", None

        # Same normalisation generate_multi_video applies to each line.
        prompt = prompt.strip()

        try:
            progress(0.1, desc="Starting generation...")

            # Generate video
            progress(0.3, desc=f"Generating video for: {prompt[:50]}...")
            all_frames = self._generate_frames(prompt, inference_steps)

            progress(0.7, desc="Processing frames...")
            corrected_frames = self.process_frames(all_frames)

            progress(0.9, desc="Creating video file...")
            video_path, fps, total_frames = self.create_video(corrected_frames, target_duration)

            progress(1.0, desc="Complete!")

            status_msg = f"✅ Video generated successfully!\n📊 Stats: {total_frames} frames, {fps} FPS, ~{total_frames/fps:.1f}s duration"

            return video_path, status_msg, video_path

        except Exception as e:
            # Surface the failure in the status field instead of raising.
            return None, f"❌ Error during generation: {str(e)}", None

    def generate_multi_video(self, prompts_text: str, inference_steps: int = 25, target_duration: int = 10, progress=gr.Progress()):
        """Generate video from multiple prompts.

        Each non-blank line of ``prompts_text`` is generated separately and the
        resulting frames are concatenated, in order, into a single video.

        Args:
            prompts_text: Prompts separated by newlines.
            inference_steps: Number of denoising steps for every prompt.
            target_duration: Target length in seconds of the combined clip.
            progress: Progress callback, called with a fraction and ``desc``.

        Returns:
            tuple: ``(video_path, status_message, download_path)``; both paths
            are ``None`` when generation did not succeed.
        """
        if not self.is_model_loaded:
            return None, "❌ Please load the model first!", None

        # Also guards against a missing (None) value.
        if not prompts_text or not prompts_text.strip():
            return None, "❌ Please enter prompts!", None

        try:
            # Parse prompts
            prompts = [p.strip() for p in prompts_text.split('\n') if p.strip()]
            if not prompts:
                return None, "❌ Please enter valid prompts!", None

            progress(0.1, desc="Starting multi-prompt generation...")

            all_frames = []
            total_prompts = len(prompts)

            for i, prompt in enumerate(prompts):
                # Generation spans the 0.1 - 0.8 part of the progress bar.
                progress_val = 0.1 + (i / total_prompts) * 0.7
                progress(progress_val, desc=f"Generating {i+1}/{total_prompts}: {prompt[:30]}...")
                all_frames.extend(self._generate_frames(prompt, inference_steps))

            progress(0.8, desc="Processing all frames...")
            corrected_frames = self.process_frames(all_frames)

            progress(0.95, desc="Creating final video...")
            video_path, fps, total_frames = self.create_video(corrected_frames, target_duration)

            progress(1.0, desc="Complete!")

            status_msg = f"✅ Multi-prompt video generated!\n📊 Stats: {len(prompts)} prompts, {total_frames} frames, {fps} FPS, ~{total_frames/fps:.1f}s duration"

            return video_path, status_msg, video_path

        except Exception as e:
            # Surface the failure in the status field instead of raising.
            return None, f"❌ Error during generation: {str(e)}", None

# Initialize the generator
# A single module-level instance: its bound methods (load_model,
# generate_video, generate_multi_video) are used as the event handlers in
# create_interface().
generator = TextToVideoGenerator()

# Custom CSS for better styling
# Passed to gr.Blocks(css=css) in create_interface(). In the code visible here
# only .main-header, .section-header and .video-container are referenced;
# .status-box, .success and .error are not attached to any component.
# NOTE(review): the header rules use white text, which presumably assumes a
# dark theme - confirm the headings stay readable on a light background.
css = """
.main-header {
    color: white;
}

.section-header {
    color: white;
}

.status-box {
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
    font-family: monospace;
}

.success {
    background-color: #d4edda;
    border: 1px solid #c3e6cb;
    color: #155724;
}

.error {
    background-color: #f8d7da;
    border: 1px solid #f5c6cb;
    color: #721c24;
}

.video-container {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 10px;
    margin: 10px 0;
}
"""

def _build_generation_tab(tab_label, heading, prompt_label, prompt_placeholder,
                          prompt_lines, button_label, status_lines, handler):
    """Add one prompt -> video tab to the enclosing ``gr.Tabs`` context.

    Args:
        tab_label: Text shown on the tab itself.
        heading: Section heading rendered at the top of the tab.
        prompt_label: Label of the prompt textbox.
        prompt_placeholder: Placeholder text of the prompt textbox.
        prompt_lines: Height (in lines) of the prompt textbox.
        button_label: Text of the generate button.
        status_lines: Height (in lines) of the status textbox.
        handler: Callable taking ``(prompt, steps, duration)`` and returning
            ``(video_path, status_message, download_path)``.
    """
    with gr.TabItem(tab_label):
        gr.HTML(f'<div class="section-header"><h3>{heading}</h3></div>')

        with gr.Row():
            with gr.Column(scale=2):
                prompt_box = gr.Textbox(
                    label=prompt_label,
                    placeholder=prompt_placeholder,
                    lines=prompt_lines
                )

                with gr.Row():
                    steps = gr.Slider(10, 50, value=25, step=1, label="Inference Steps")
                    duration = gr.Slider(5, 30, value=10, step=1, label="Target Duration (seconds)")

                generate_btn = gr.Button(button_label, variant="primary", size="lg")

            with gr.Column(scale=1):
                status = gr.Textbox(
                    label="Generation Status",
                    value="Ready to generate",
                    interactive=False,
                    lines=status_lines
                )

        with gr.Row():
            video_output = gr.Video(label="Generated Video", elem_classes="video-container")
            # Kept visible: the handlers only return a file path, and a plain
            # value never un-hides a component created with visible=False, so a
            # hidden download box would stay hidden forever.
            download = gr.File(label="Download Video")

        generate_btn.click(
            fn=handler,
            inputs=[prompt_box, steps, duration],
            outputs=[video_output, status, download]
        )


def create_interface():
    """Build the Gradio Blocks app and return it without launching.

    The layout has a model-loading row followed by two tabs (single prompt and
    multiple prompts) that are wired to the module-level ``generator``.

    Returns:
        gr.Blocks: The assembled app; call ``.launch()`` on it to serve it.
    """
    with gr.Blocks(css=css, title="Text-to-Video Generator") as demo:
        gr.HTML("""
        <div class="main-header">
            <h1>Text-to-Video Generator</h1>
        </div>
        """)

        # Model loading section
        with gr.Row():
            with gr.Column():
                gr.HTML('<div class="section-header"><h3>📥 Model Management</h3></div>')
                load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
                load_status = gr.Textbox(
                    label="Model Status",
                    value="Model not loaded. Click 'Load Model' to start.",
                    interactive=False
                )

        load_btn.click(
            fn=generator.load_model,
            outputs=[load_status]
        )

        # Main generation interface: both tabs share one layout.
        with gr.Tabs():
            _build_generation_tab(
                tab_label="🎯 Single Prompt",
                heading="Generate from Single Prompt",
                prompt_label="Enter Your Prompt",
                prompt_placeholder="Example: A playful puppy running across a grassy field, chasing butterflies",
                prompt_lines=3,
                button_label="🎬 Generate Video",
                status_lines=4,
                handler=generator.generate_video,
            )
            _build_generation_tab(
                tab_label="🎭 Multiple Prompts",
                heading="Generate Extended Video from Multiple Prompts",
                prompt_label="Enter Multiple Prompts (one per line)",
                prompt_placeholder="Early school kids playing football during break time\nA person scrolling on a phone in cafe with coffee\nOld lady watering garden grass in the courtyard\nA playful puppy running across a grassy field",
                prompt_lines=6,
                button_label="🎬 Generate Extended Video",
                status_lines=6,
                handler=generator.generate_multi_video,
            )

    return demo

# Function to launch the web interface
def launch_web_gui(share=True, debug=False):
    """
    Build the interface with create_interface() and start serving it.

    Args:
        share (bool): Forwarded to launch(); True requests a public,
            shareable URL.
        debug (bool): Forwarded to launch(); True enables debug mode.

    Returns:
        The app object returned by create_interface(), after launch() has
        been called on it.
    """
    print("🚀 Starting Text-to-Video Web Interface...")
    print("=" * 50)

    app = create_interface()

    # Collected in one place so the launch configuration is easy to scan.
    launch_options = dict(
        share=share,
        debug=debug,
        height=800,
        show_error=True,
        quiet=False,
    )
    app.launch(**launch_options)

    return app

# Usage banner for Colab users, written to stdout when this cell runs.
# One print call with a newline separator emits exactly one line per entry.
print(
    "🎬 Web-based Text-to-Video Generator",
    "=" * 50,
    "Setup Instructions for Google Colab:",
    "1. Install dependencies:",
    "   !pip install gradio diffusers transformers accelerate torch",
    "   !pip install opencv-python-headless",
    "",
    "2. Run the web interface:",
    "   demo = launch_web_gui()",
    "",
    "3. The interface will provide:",
    "   - A local URL for Colab",
    "   - A public shareable URL",
    "   - Full web interface accessible from any device!",
    "=" * 50,
    sep="\n",
)

# Uncomment to auto-launch (remove the # below)
# demo = launch_web_gui()

🎬 Web-based Text-to-Video Generator
Setup Instructions for Google Colab:
1. Install dependencies:
   !pip install gradio diffusers transformers accelerate torch
   !pip install opencv-python-headless

2. Run the web interface:
   demo = launch_web_gui()

3. The interface will provide:
   - A local URL for Colab
   - A public shareable URL
   - Full web interface accessible from any device!


In [15]:
# Start the web interface with launch_web_gui()'s defaults (share=True,
# debug=False) and keep the returned app object in `demo`.
demo = launch_web_gui()

🚀 Starting Text-to-Video Web Interface...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1d92d6ab397981060f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
