# **WAN Fusion X TEXT TO VIDEO WITH FAST Q3 GGUF MODEL**
- QuantStack/Wan2.1_T2V_14B_FusionX-GGUF
- The videos are generated at 16fps. You can use the `Frame Interpolation` notebook in this GitHub repository (https://github.com/Isi-dev/Google-Colab_Notebooks) to increase the frame rate.

In [None]:
# @title Prepare Environment
# Installs the Python and system dependencies, clones ComfyUI and the GGUF
# loader node, downloads the model weights, and makes ComfyUI importable.
import sys, os

# PyTorch from the CUDA 11.8 wheel index, then the other Python packages.
!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q torchsde einops diffusers accelerate xformers
!pip install av

# aria2 is the downloader used below; ffmpeg is needed for video encoding.
!apt -y install -qq aria2 ffmpeg

# 1. Clone the OFFICIAL ComfyUI repository for maximum stability
!git clone https://github.com/comfyanonymous/ComfyUI.git /content/ComfyUI
!pip install -r /content/ComfyUI/requirements.txt

# 2. Install the community-recommended GGUF custom node from city96
# The folder is named ComfyUI_GGUF (underscore) so that it can be imported as
# a Python package by the "Logic" cell.
!git clone https://github.com/city96/ComfyUI-GGUF.git /content/ComfyUI/custom_nodes/ComfyUI_GGUF
!pip install -r /content/ComfyUI/custom_nodes/ComfyUI_GGUF/requirements.txt


# The *_name variables below are globals that generate_video() reads later.
# aria2c flags: -c resumes partial downloads, -x/-s use up to 16 connections,
# -k sets the minimum split size.

# Diffusion model (GGUF, Q3_K_L quantization).
model_name = "Wan2.1_T2V_14B_FusionX-Q3_K_L.gguf"
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/QuantStack/Wan2.1_T2V_14B_FusionX-GGUF/resolve/main/{model_name} -d /content/ComfyUI/models/unet -o {model_name}

# Text encoder (safetensors).
encoder_name = "umt5_xxl_fp8_e4m3fn_scaled.safetensors"
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/{encoder_name} -d /content/ComfyUI/models/text_encoders -o {encoder_name}
# Alternative GGUF text encoder, disabled.
# NOTE(review): presumably this would have to be loaded with CLIPLoaderGGUF - confirm before enabling.
#encoder_name = "umt5-xxl-encoder-Q8_0.gguf"
#!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/umt5-xxl-encoder-gguf/resolve/main/{encoder_name} -d /content/ComfyUI/models/text_encoders -o {encoder_name}

# VAE used to decode the sampled latents into frames.
vae_name = "wan_2.1_vae.safetensors"
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/{vae_name} -d /content/ComfyUI/models/vae -o {vae_name}

# Allocator setting for PyTorch's CUDA caching allocator; it is set here,
# before the later cell imports torch.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Lets later cells import ComfyUI's modules (comfy, nodes, custom_nodes, ...).
sys.path.insert(0, '/content/ComfyUI')

print("✅ Environment Setup Complete!")

In [None]:
# @title LoRAs Config

drive.mount('/content/drive')
!mkdir -p /content/ComfyUI/models/loras

lora_file_root = "/content/drive/MyDrive/AI/LoRAs/Wan2.1"

loras_config = [
  {
    "filename": "sample-wan-lora.safetensors",
    "strength": 1.0
  }
]

for lora_config in loras_config:
  lora_filename = lora_config["filename"]
  !cp "{lora_file_root}/{lora_filename}" "/content/ComfyUI/models/loras/"

print("✅ LoRAs Setup Complete!")

In [None]:
# @title Logic
import gc
import os
import pdb
import random

import torch
import numpy as np
from PIL import Image
import imageio
from IPython.display import display, Video as IPVideo, Image as IPImage

from comfy import model_management

from nodes import (
    CLIPLoader,
    CLIPTextEncode,
    VAEDecode,
    VAELoader,
    KSampler,
    LoraLoader
)

from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF, CLIPLoaderGGUF
from comfy_extras.nodes_model_advanced import ModelSamplingSD3
from comfy_extras.nodes_hunyuan import EmptyHunyuanLatentVideo

def clear_memory():
    """Release as much GPU and host memory as possible between runs.

    Unloads every model tracked by ComfyUI's ``model_management``, removes
    global names bound to tensors (or to objects whose ``.data`` attribute is
    a tensor), and empties the CUDA caches when CUDA is available.
    """
    model_management.unload_all_models()

    gc.collect()

    # ``del`` on a loop variable only drops the loop's own reference, so the
    # entries are removed from the namespace by name. Iterate over a snapshot
    # because the dict is modified inside the loop.
    namespace = globals()
    for name, obj in list(namespace.items()):
        if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
            namespace.pop(name, None)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()

def save_as_mp4(images, filename_prefix, fps, crf=16, preset='slow', output_dir="/content/ComfyUI/output"):
    """
    Save frames as an H.265 MP4 video using imageio's FFMPEG backend.

    Args:
        images: Iterable of PyTorch tensors, one per frame. Presumably each is
            (height, width, channels) with float values in [0, 1] - confirm
            against the caller.
        filename_prefix: Base name of the output file (".mp4" is appended).
        fps: Frames per second for the video (converted with ``int``).
        crf: Constant Rate Factor passed to ffmpeg (lower is better quality).
        preset: ffmpeg encoding preset (e.g., 'ultrafast', 'medium', 'slow').
        output_dir: Directory to save the video in; created if missing.

    Returns:
        The path of the written MP4 file.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{filename_prefix}.mp4"

    # Writer settings for the ffmpeg backend.
    writer_kwargs = {
        'format': 'FFMPEG',          # Explicitly use the FFMPEG backend
        'mode': 'I',                 # 'I' for a sequence of images
        'fps': int(fps),
        'codec': 'libx265',          # H.265 codec
        'output_params': [
            '-preset', str(preset),  # Encoding preset
            '-crf', str(int(crf)),   # Constant Rate Factor for quality
            '-pix_fmt', 'yuv420p'    # Pixel format for compatibility
        ]
    }

    print(f"Saving high-quality MP4 to: {output_path}")
    with imageio.get_writer(output_path, **writer_kwargs) as writer:
        # Convert one frame at a time so the whole clip is never held twice.
        for img in images:
            scaled = img.cpu().numpy() * 255
            # Clamp before the cast: out-of-range values would otherwise wrap
            # around in uint8 and show up as speckles.
            writer.append_data(np.clip(scaled, 0, 255).astype(np.uint8))

    print("Video saved successfully.")
    return output_path

def save_as_image_sequence(images, filename_prefix, output_dir="/content/ComfyUI/output"):
    """Save each frame as a numbered PNG file.

    Args:
        images: Iterable of PyTorch tensors, one per frame. Presumably each is
            (height, width, channels) with float values in [0, 1] - confirm
            against the caller.
        filename_prefix: Base name; files are named
            ``{filename_prefix}_0001.png``, ``{filename_prefix}_0002.png``, ...
        output_dir: Directory to save the files in; created if missing.

    Returns:
        The list of written file paths, in frame order.
    """
    os.makedirs(output_dir, exist_ok=True)

    saved_paths = []

    for idx, image in enumerate(images, start=1):
        output_path = f"{output_dir}/{filename_prefix}_{idx:04d}.png"
        # Clamp before the cast so out-of-range values do not wrap in uint8.
        frame = np.clip(image.cpu().numpy() * 255, 0, 255).astype(np.uint8)
        Image.fromarray(frame).save(output_path)
        saved_paths.append(output_path)

    return saved_paths

def save_as_image(image, filename_prefix, output_dir="/content/ComfyUI/output"):
    """Save a single frame as a PNG file.

    Args:
        image: PyTorch tensor holding one frame. Presumably
            (height, width, channels) with float values in [0, 1] - confirm
            against the caller.
        filename_prefix: Base name of the output file (".png" is appended).
        output_dir: Directory to save the file in; created if missing.

    Returns:
        The path of the written PNG file.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{filename_prefix}.png"

    # Clamp before the cast so out-of-range values do not wrap in uint8.
    frame = np.clip(image.cpu().numpy() * 255, 0, 255).astype(np.uint8)

    Image.fromarray(frame).save(output_path)

    return output_path

def generate_video(
    positive_prompt: str = "a cute cat playing with a ball of yarn",
    negative_prompt: str = ("overexposed, blurred details, subtitles, "
       "desaturated, low contrast, washed out, dull colors, flat lighting, "
       "worst quality, low quality, JPEG compression artifacts, "
       "incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
       "deformed, disfigured, misshapen limbs, fused fingers, still picture, "
       "three legs, walking backwards, watermark, text, signature"),
    width: int = 832,
    height: int = 480,
    seed: int = 82628696717253,
    steps: int = 4,
    cfg_scale: float = 1.0,
    sampler_name: str = "uni_pc",
    scheduler: str = "simple",
    frames: int = 33,
    fps: int = 16,
    output_format: str = "mp4"
):
    """Generate a video (or a single image) from a text prompt and display it.

    Loads the text encoder, the GGUF diffusion model and the VAE named by the
    globals ``encoder_name``, ``model_name`` and ``vae_name``, applies every
    entry of the global ``loras_config``, samples a latent video, decodes it,
    and saves the result with the file prefix ``"ComfyUI"`` in the save
    helpers' default output directory.

    Args:
        positive_prompt: Text describing what should appear.
        negative_prompt: Text describing what should be avoided.
        width: Output width in pixels.
        height: Output height in pixels.
        seed: Sampler seed.
        steps: Number of sampling steps.
        cfg_scale: Classifier-free guidance scale passed to the sampler.
        sampler_name: Sampler name understood by ``KSampler``.
        scheduler: Scheduler name understood by ``KSampler``.
        frames: Number of frames; ``1`` saves a single PNG instead of a video.
        fps: Frame rate used when writing an MP4.
        output_format: ``"mp4"`` (case-insensitive) writes a video; any other
            value writes a PNG sequence and displays its first frame.

    Returns:
        None. The result is written to disk and displayed in the notebook.
    """
    # Colab "number" form fields may supply floats; the latent size must be
    # built from integers.
    width, height, frames = int(width), int(height), int(frames)

    try:
        with torch.inference_mode():

            print(f"Loading...{encoder_name}")
            text_encoder = CLIPLoader().load_clip(encoder_name, "wan")[0]
            print(f"Loading...{model_name}")
            model = UnetLoaderGGUF().load_unet(model_name)[0]

            for lora_config in loras_config:
                lora_name = lora_config.get("filename")
                strength = float(lora_config.get("strength", 1.0))

                print(f"Applying LoRA '{lora_name}'")
                # Rebinding both names drops this function's references to the
                # unpatched objects, so no separate `del` is needed.
                model, text_encoder = LoraLoader().load_lora(
                    model=model,
                    clip=text_encoder,
                    lora_name=lora_name,
                    strength_model=strength,
                    strength_clip=strength
                )

            positive = CLIPTextEncode().encode(text_encoder, positive_prompt)[0]
            negative = CLIPTextEncode().encode(text_encoder, negative_prompt)[0]

            #pdb.set_trace() #for checking VRAM usage at this point

            # The encoder is no longer needed once both prompts are encoded.
            del text_encoder

            empty_latent_video = EmptyHunyuanLatentVideo().generate(
                width, height, frames, batch_size=1)[0]

            print(f"Loading...{vae_name}")
            vae = VAELoader().load_vae(vae_name)[0]

            # 8.0 is the second positional argument of ModelSamplingSD3.patch
            # (presumably the sampling shift - confirm against ComfyUI).
            final_model = ModelSamplingSD3().patch(model, 8.0)[0]

            del model

            #pdb.set_trace() #for checking VRAM usage at this point

            print(f"Sampling...")
            sampled = KSampler().sample(
                model=final_model,
                seed=seed,
                steps=steps,
                cfg=cfg_scale,
                sampler_name=sampler_name,
                scheduler=scheduler,
                positive=positive,
                negative=negative,
                latent_image=empty_latent_video
            )[0]

            # Drop the sampling inputs before decoding to lower peak memory.
            del positive, negative
            del final_model

            decoded = VAEDecode().decode(vae, sampled)[0]
            del sampled

            if frames == 1:
                output_path = save_as_image(decoded[0], "ComfyUI")
                display(IPImage(filename=output_path))
            elif output_format.lower() == "mp4":
                output_path = save_as_mp4(decoded, "ComfyUI", fps)
                display(IPVideo(output_path, embed=True))
            else:  # Defaulting to image sequence
                output_images = save_as_image_sequence(decoded, "ComfyUI")
                display(IPImage(filename=output_images[0]))

            del vae, decoded
    finally:
        # Runs on failure too, so loaded models are unloaded even when
        # loading or sampling raises (for example on out-of-memory).
        clear_memory()

# Printed once every definition in this cell has executed.
print("✅ Logic Complete!")

In [None]:
# @title Generate Video
# Form inputs for one generation run; all of them are passed to
# generate_video() below.
# NOTE(review): the notebook header says clips are generated at 16fps, while
# the fps default here is 12 - confirm the slower playback is intended.

positive_prompt = "a black kitten playing with a ball of yarn" # @param {"type":"string"}
negative_prompt = "overexposed, blurred details, subtitles, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, fused fingers, still picture, three legs, walking backwards, watermark, text, signature" # @param {"type":"string"}
width = 480 # @param {"type":"number"}
height = 832 # @param {"type":"number"}
seed = 42 # @param {"type":"integer"}
steps = 4 # @param {"type":"integer", "min":1, "max":100}
cfg_scale = 1.0 # @param {"type":"number", "min":1, "max":20}
sampler_name = "uni_pc" # @param ["uni_pc", "euler", "dpmpp_2m", "ddim", "lms"]
scheduler = "simple" # @param ["simple", "normal", "karras", "exponential"]
frames = 33 # @param {"type":"integer", "min":1, "max":121}
fps = 12 # @param {"type":"integer", "min":1, "max":60}
output_format = "mp4" # @param ["mp4", "image_sequence"]

# A falsy seed (0 or empty) is replaced by a random 32-bit seed.
import random
seed = seed if seed else random.randint(0, 2**32 - 1)
print(f"Using seed: {seed}")

generate_video(
    positive_prompt=positive_prompt,
    negative_prompt=negative_prompt,
    width=width,
    height=height,
    seed=seed,
    steps=steps,
    cfg_scale=cfg_scale,
    sampler_name=sampler_name,
    scheduler=scheduler,
    frames=frames,
    fps=fps,
    output_format=output_format
)
# Unload models and empty the CUDA caches once the run has finished.
clear_memory()

In [None]:
# @title Clear Memory in case of stopping execution
# Run this cell manually after interrupting a generation to unload models and
# empty the CUDA caches.
clear_memory()