Merge pull request #506 from hpcaitech/docs-fix

Docs fix
hpcaitech · Jun 20, 2024 · cdaa1d6 · cdaa1d6
2 parents aacad8d + b07388a
commit cdaa1d6
Show file tree

Hide file tree

Showing 8 changed files with 78 additions and 9 deletions.
diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py
@@ -19,12 +19,14 @@
     qk_norm=True,
     enable_flash_attn=True,
     enable_layernorm_kernel=True,
+    force_huggingface=True,
 )
 vae = dict(
     type="OpenSoraVAE_V1_2",
     from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
     micro_frame_size=17,
     micro_batch_size=4,
+    force_huggingface=True,
 )
 text_encoder = dict(
     type="t5",

diff --git a/configs/opensora-v1-2/train/adapt.py b/configs/opensora-v1-2/train/adapt.py
@@ -46,7 +46,6 @@
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=300,
     shardformer=True,
-    local_files_only=True,
 )
 scheduler = dict(
     type="rflow",

diff --git a/configs/opensora-v1-2/train/stage1.py b/configs/opensora-v1-2/train/stage1.py
@@ -72,7 +72,6 @@
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=300,
     shardformer=True,
-    local_files_only=True,
 )
 scheduler = dict(
     type="rflow",

diff --git a/configs/opensora-v1-2/train/stage2.py b/configs/opensora-v1-2/train/stage2.py
@@ -52,7 +52,6 @@
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=300,
     shardformer=True,
-    local_files_only=True,
 )
 scheduler = dict(
     type="rflow",

diff --git a/configs/opensora-v1-2/train/stage3.py b/configs/opensora-v1-2/train/stage3.py
@@ -52,7 +52,6 @@
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=300,
     shardformer=True,
-    local_files_only=True,
 )
 scheduler = dict(
     type="rflow",

diff --git a/configs/opensora-v1-2/train/stage3_480p.py b/configs/opensora-v1-2/train/stage3_480p.py
@@ -0,0 +1,73 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+# webvid
+bucket_config = {"480p": {51: (0.5, 5)}}
+grad_checkpoint = True
+
+# Acceleration settings
+num_workers = 0
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    freeze_y_embedder=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+# 25% in total: the ratios below sum to 0.25
+mask_ratios = {
+    "random": 0.01,
+    "intepolate": 0.002,
+    "quarter_random": 0.002,
+    "quarter_head": 0.002,
+    "quarter_tail": 0.002,
+    "quarter_head_tail": 0.002,
+    "image_random": 0.0,
+    "image_head": 0.22,
+    "image_tail": 0.005,
+    "image_head_tail": 0.005,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 200
+
+# Optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
+warmup_steps = 1000
diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py
@@ -447,7 +447,7 @@ def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
 
 @MODELS.register_module("STDiT3-XL/2")
 def STDiT3_XL_2(from_pretrained=None, **kwargs):
-    force_huggingface = kwargs.pop("force_huggingface", True)
+    force_huggingface = kwargs.pop("force_huggingface", False)
     if force_huggingface or from_pretrained is not None and not os.path.isdir(from_pretrained):
         model = STDiT3.from_pretrained(from_pretrained, **kwargs)
     else:
@@ -460,9 +460,7 @@ def STDiT3_XL_2(from_pretrained=None, **kwargs):
 
 @MODELS.register_module("STDiT3-3B/2")
 def STDiT3_3B_2(from_pretrained=None, **kwargs):
-    # check if from_pretrained is a path
-    force_huggingface = kwargs.pop("force_huggingface", True)
-    if force_huggingface or (from_pretrained is not None and not os.path.isdir(from_pretrained)):
+    if from_pretrained is not None and not os.path.isdir(from_pretrained):
         model = STDiT3.from_pretrained(from_pretrained, **kwargs)
     else:
         config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)

diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py
@@ -252,7 +252,7 @@ def OpenSoraVAE_V1_2(
     local_files_only=False,
     freeze_vae_2d=False,
     cal_loss=False,
-    force_huggingface=True,
+    force_huggingface=False,
 ):
     vae_2d = dict(
         type="VideoAutoencoderKL",