In [None]:
%cd /content

# install WAN2.1 code
!git clone https://github.com/Wan-Video/Wan2.1.git

# Commits on Mar 7, 2025
%cd /content/Wan2.1
!git checkout b58b7c573776b76b6fe8d36086590e033173f9b1


In [None]:
%cd /content/Wan2.1

# !pip install -r requirements.txt
!pip install ftfy dashscope
!pip uninstall diffusers -y
!pip install 'git+https://github.com/huggingface/diffusers.git@26149c0ecda67587ffd51f1a91c888388f83253b'


In [None]:
# check versions
!python --version

!pip list | egrep 'torch|ftfy|dashscope|diffusers|transformers'


In [None]:
%cd /content/Wan2.1

from IPython.display import Video
import gc

import torch
import numpy as np
from huggingface_hub import snapshot_download

from diffusers.utils import export_to_video, load_image
from diffusers import AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

from transformers import CLIPVisionModel

# set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device is', device)

!mkdir outputs


# Text-to-Video

In [None]:
%%time

# Wan-AI/Wan2.1-T2V-14B-Diffusers または Wan-AI/Wan2.1-T2V-1.3B-Diffusers のモデルIDを指定
model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"

# VAE（変分オートエンコーダー）の初期化
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float16)

# フローシフトの設定：720Pの場合は5.0、480Pの場合は3.0
flow_shift = 5.0

# スケジューラの設定：フロー予測を行うUniPCMultistepSchedulerを使用
scheduler = UniPCMultistepScheduler(
    prediction_type='flow_prediction', 
    use_flow_sigmas=True, 
    num_train_timesteps=1000, 
    flow_shift=flow_shift)

# パイプラインの初期化
pipe = WanPipeline.from_pretrained(
    model_id, 
    vae=vae, 
    torch_dtype=torch.bfloat16)
pipe.scheduler = scheduler
pipe.to(device)


In [None]:
%%time

prompt = "Beautiful robot man walking, High Definition HD, High Detail, UHD Pen and Ink Art, Perfect Composition, Detailed Structure, Crazy Octane Rendering, Photorealism Concept Art,3D Cinematography, Perfect Light, 3D -rendering, famous outstanding typography, 3d render, cinematic."
negative_prompt = "Low quality, blurry, pixelated, distorted, deformed, unrealistic, unnatural lighting, poor resolution, artifacts, glitches, low detail."
seed = 12
generator = torch.Generator(device="cpu").manual_seed(seed)

output = pipe(
     prompt           = prompt,
     negative_prompt  = negative_prompt,
     height           = 720,
     width            = 1280,
     num_frames       = 81,
     guidance_scale   = 5.0,
     generator = generator
    ).frames[0]

export_to_video(output, "./outputs/t2v_output.mp4", fps=16)


In [None]:
# Embed the generated clip in the notebook output.
Video(data="./outputs/t2v_output.mp4", height=420, embed=True)


In [None]:
%%time

prompt = "桜の花びらが舞い散る姫路城"
negative_prompt = "低画質"
seed = 12
generator = torch.Generator(device="cpu").manual_seed(seed)

output = pipe(
     prompt           = prompt,
     negative_prompt  = negative_prompt,
     height           = 720,
     width            = 1280,
     num_frames       = 81,
     guidance_scale   = 5.0,
     generator = generator
    ).frames[0]

export_to_video(output, "./outputs/t2v_output_jp.mp4", fps=16)


In [None]:
# Embed the clip generated from the Japanese prompt in the notebook output.
Video(data="./outputs/t2v_output_jp.mp4", height=420, embed=True)


# Image-to-Video

In [None]:
# Release the text-to-video objects before the next model is loaded.
# pop() is used instead of `del` so the cell can be re-run without raising
# NameError once the names are already gone.
for _name in ("pipe", "scheduler", "vae"):
    globals().pop(_name, None)

# Run the garbage collector first: objects that are only kept alive by
# reference cycles must be destroyed before the CUDA caching allocator can
# hand their memory back.
gc.collect()

# The CUDA calls are skipped on CPU-only runtimes, which the device selection
# earlier in this notebook explicitly allows.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


In [None]:
%%time

# model id
# Wan-AI/Wan2.1-I2V-14B-480P-Diffusers or Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"

# 画像エンコーダのロード
image_encoder = CLIPVisionModel.from_pretrained(
    model_id, 
    subfolder="image_encoder", 
    torch_dtype=torch.float32)

# VAE(Variational Autoencoder)のロード
vae = AutoencoderKLWan.from_pretrained(
    model_id, 
    subfolder="vae", 
    torch_dtype=torch.float32)

# image-to-video pipelineのセットアップ
pipe = WanImageToVideoPipeline.from_pretrained(
    model_id, 
    vae=vae, 
    image_encoder=image_encoder, 
    torch_dtype=torch.bfloat16)


# CPUオフロードを有効化(メモリ節約のため)
pipe.enable_model_cpu_offload()


In [None]:
# Text conditioning and seed for the image-to-video run.
prompt = "Plane taking off. Cinematic, 4K"
negative_prompt = "low quality"
seed = 12
generator = torch.Generator(device="cpu")
generator.manual_seed(seed)

# Fetch the source image from the web.
image = load_image("https://user0514.cdnw.net/shared/img/thumb/yuta_240513_031_TP_V4.jpg")

# Pixel budget for the resized image (832 x 480).
max_area = 832*480
# Height / width of the source image, used to preserve its proportions.
aspect_ratio = image.height / image.width
# Both sides are floored to a multiple of this value
# (VAE spatial scale factor x transformer patch size).
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]

# Height comes from area * ratio and width from area / ratio; each is rounded
# and then floored to a multiple of mod_value.
height, width = (
    round(np.sqrt(side_squared)) // mod_value * mod_value
    for side_squared in (max_area * aspect_ratio, max_area / aspect_ratio)
)

# Resize to the computed size (PIL expects (width, height)).
image = image.resize((width, height))

# Show the resized input image.
image


In [None]:
%%time

output = pipe(
    image           = image,
    prompt          = prompt,
    negative_prompt = negative_prompt,
    height          = height,
    width           = width,
    num_frames      = 81,
    guidance_scale  = 5.0,
    generator = generator,
).frames[0]

export_to_video(output, "./outputs/i2v_output.mp4", fps=16)


In [None]:
# Embed the image-to-video result in the notebook output.
Video(data="./outputs/i2v_output.mp4", height=420, embed=True)
