In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import json
import logging
import math
import shutil
import time
from pathlib import Path
from typing import Union

import numpy as np
from PIL import Image
from omegaconf import OmegaConf
import wandb
import torch
from torch.optim import AdamW
#from lightning.pytorch.utilities import CombinedLoader
from pytorch_lightning.utilities import CombinedLoader

from transformers import AutoTokenizer
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import DistributedType, set_seed

#from training.data import Text2ImageDataset
#from training.imagenet_dataset import ImageNetDataset
#from parquet import RefinedWebDataset
from fashionrec_dataset import FashionRecDataset
from fashion_image_generation_dataset import FashionImageGenerationDataset

from models import Showo, MAGVITv2, get_mask_chedule
from training.prompting_utils import UniversalPrompting, create_attention_mask_predict_next, create_attention_mask_for_mmu
from models.lr_schedulers import get_scheduler
from models.logging import set_verbosity_info, set_verbosity_error

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from llava.llava_data_vq_unified import get_instruct_data_loader
from training.utils import get_config, flatten_omega_conf, mask_or_random_replace_tokens, AverageMeter

SYSTEM_PROMPT_LEN = 28
logger = get_logger(__name__, log_level="INFO")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from omegaconf import OmegaConf
config = OmegaConf.load('configs/fashionm3_training.yaml')

# Enable TF32 on Ampere GPUs
if config.training.enable_tf32:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

config.experiment.logging_dir = str(Path(config.experiment.output_dir) / "logs")

# Initialize accelerator
accelerator = Accelerator(
    gradient_accumulation_steps=config.training.gradient_accumulation_steps,
    mixed_precision=config.training.mixed_precision,
    log_with="wandb",
    project_dir=config.experiment.logging_dir,
    split_batches=True,
)

# Calculate batch sizes
total_batch_size_per_gpu = (config.training.batch_size_t2i +
config.training.batch_size_mmu)
total_batch_size = (
    (config.training.batch_size_t2i + config.training.batch_size_mmu)
    * accelerator.num_processes * config.training.gradient_accumulation_steps
)

print(f"Total batch size: {total_batch_size}")
print(f"Per GPU batch size: {total_batch_size_per_gpu}")

Total batch size: 6
Per GPU batch size: 6


In [3]:
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

logger.info(accelerator.state, main_process_only=False)

if accelerator.is_local_main_process:
    set_verbosity_info()
else:
    set_verbosity_error()

# Initialize experiment tracking
if accelerator.is_main_process:
    os.makedirs(config.experiment.output_dir, exist_ok=True)
    OmegaConf.save(config, Path(config.experiment.output_dir) / "config.yaml")

    # Convert OmegaConf to dict for wandb
    config_dict = OmegaConf.to_container(config, resolve=True)

    accelerator.init_trackers(
        config.experiment.project,
        config=config_dict,  
        init_kwargs={"wandb": {"name": config.experiment.name}},
    )

# Set seed for reproducibility
set_seed(42)

08/04/2025 13:41:59 - INFO - __main__ - Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: fp16

[34m[1mwandb[0m: Currently logged in as: [33mjonathanhuang781[0m ([33muhhidk[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.model.showo.llm_model_path,
padding_side="left")
uni_prompting = UniversalPrompting(tokenizer,
max_text_len=config.dataset.preprocessing.max_seq_length,
                                special_tokens=("<|soi|>", "<|eoi|>",
"<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),
                                ignore_id=-100,
cond_dropout_prob=config.training.cond_dropout_prob)

# Load VQ model
def get_vq_model_class(model_type):
    if model_type == "magvitv2":
        return MAGVITv2
    else:
        raise ValueError(f"model_type {model_type} not supported.")

vq_model_class = get_vq_model_class(config.model.vq_model.type)
vq_model = vq_model_class.from_pretrained(config.model.vq_model.vq_model_name)
vq_model.requires_grad_(False)
vq_model.eval()

# Load main model
model = Showo.from_pretrained(config.model.showo.pretrained_model_path, **config.model.showo)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"VQ Model parameters: {sum(p.numel() for p in vq_model.parameters()):,}")

The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.


Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192




attention implementation:  sdpa


  if self.w_clip_vit:
All model checkpoint weights were used when initializing Showo.

Some weights of Showo were not initialized from the model checkpoint at C:\Users\jonat\Desktop\StyleAI\backend\fashionm3\Show-o\models\show-o-512x512-wo-llava-tuning and are newly initialized: ['mm_projector.2.weight', 'mm_projector.2.bias', 'mm_projector.0.bias', 'mm_projector.0.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model parameters: 1,454,472,322
VQ Model parameters: 95,387,004


In [5]:
# Setup dataset configurations
dataset_config = config.dataset.params
preproc_config = config.dataset.preprocessing

# Calculate total batch sizes
total_batch_size_t2i = config.training.batch_size_t2i * accelerator.num_processes * config.training.gradient_accumulation_steps
total_batch_size_mmu = config.training.batch_size_mmu * accelerator.num_processes * config.training.gradient_accumulation_steps

print(f"T2I batch size: {total_batch_size_t2i}")
print(f"MMU batch size: {total_batch_size_mmu}")

# Setup Fashion Image Generation Dataset (T2I flow)
if config.dataset.gen_type == "fashion_image_generation":
    print("Setting up Fashion Image Generation dataset...")

    dataset_fashion_img = FashionImageGenerationDataset(
        data_root=dataset_config.train_t2i_shards_path_or_url,
        split="train"
    )

    print(f"Fashion Image Generation dataset size: {len(dataset_fashion_img)}")

    # Create dataloader
    if accelerator.num_processes > 1:
        sampler = DistributedSampler(dataset_fashion_img,
                                    num_replicas=accelerator.num_processes,
                                    rank=accelerator.process_index,
                                    shuffle=True)
        shuffle = False
    else:
        sampler = None
        shuffle = True

    train_dataloader_t2i = DataLoader(
        dataset_fashion_img,
        batch_size=config.training.batch_size_t2i,
        sampler=sampler,
        shuffle=shuffle,
        num_workers=dataset_config.num_workers
    )

    num_update_steps_per_epoch = math.ceil(len(dataset_fashion_img) / total_batch_size_t2i)
    num_train_epochs = math.ceil(config.training.max_train_steps / num_update_steps_per_epoch)

    print(f"Steps per epoch: {num_update_steps_per_epoch}")
    print(f"Estimated epochs: {num_train_epochs}")

else:
    raise ValueError(f"Unsupported gen_type: {config.dataset.gen_type}")

T2I batch size: 2
MMU batch size: 4
Setting up Fashion Image Generation dataset...
Found 12 tar files
Total samples loaded: 120000
FashionImageGenerationDataset loaded 120000 samples
Fashion Image Generation dataset size: 120000
Steps per epoch: 60000
Estimated epochs: 1


In [6]:
# Setup FashionRec Dataset (MMU flow)
if config.dataset.und_type == "fashionrec":
    print("Setting up FashionRec dataset...")

    # Create dataset with only the parameters it expects
    dataset_fashionrec = FashionRecDataset(
        data_root=dataset_config.fashionrec_data_root,
        split="train",
        task_weights=dataset_config.task_weights
    )

    print(f"FashionRec dataset size: {len(dataset_fashionrec)}")

    # Create dataloader
    if accelerator.num_processes > 1:
        sampler_mmu = DistributedSampler(dataset_fashionrec,
                                        num_replicas=accelerator.num_processes,
                                        rank=accelerator.process_index,
                                        shuffle=True)
        shuffle_mmu = False
    else:
        sampler_mmu = None
        shuffle_mmu = True

    train_dataloader_mmu = DataLoader(
        dataset_fashionrec,
        batch_size=config.training.batch_size_mmu,
        sampler=sampler_mmu,
        shuffle=shuffle_mmu,
        num_workers=dataset_config.num_workers
        # Remove collate_fn for now - test if it's needed
    )

    print(f"MMU dataloader created with batch size: {config.training.batch_size_mmu}")

else:
    raise ValueError(f"Unsupported und_type: {config.dataset.und_type}")

Setting up FashionRec dataset...
FashionRec dataset size: 303937
MMU dataloader created with batch size: 4


In [8]:
# Test both datasets with small batches
print("Testing datasets...")

# Test T2I dataset
try:
    t2i_batch = next(iter(train_dataloader_t2i))
    print(f"T2I batch keys: {t2i_batch.keys()}")
    print(f"T2I pixel_values shape: {t2i_batch['pixel_values'].shape}")
    print(f"T2I pixel_values dtype: {t2i_batch['pixel_values'].dtype}")
    print(f"T2I input_ids type: {type(t2i_batch['input_ids'])}")  
    print(f"T2I input_ids length: {len(t2i_batch['input_ids'])}")  
    print(f"T2I first description: {t2i_batch['input_ids'][0][:100]}...") 
except Exception as e:
    print(f"T2I dataset error: {e}")

# Test MMU dataset
try:
    mmu_batch = next(iter(train_dataloader_mmu))
    print(f"MMU batch keys: {mmu_batch.keys()}")
    print(f"MMU pixel_values shape: {mmu_batch['pixel_values'].shape}")
    print(f"MMU pixel_values dtype: {mmu_batch['pixel_values'].dtype}")
    print(f"MMU input_ids type: {type(mmu_batch['input_ids'])}") 
    print(f"MMU input_ids length: {len(mmu_batch['input_ids'])}")
    print(f"MMU tasks: {mmu_batch['task']}")  # List of task names
    print(f"MMU first conversation: {mmu_batch['input_ids'][0][:100]}...") 
except Exception as e:
    print(f"MMU dataset error: {e}")

print("\nDataset setup complete!")

Testing datasets...
✅ T2I batch keys: dict_keys(['pixel_values', 'input_ids'])
✅ T2I pixel_values shape: torch.Size([2, 3, 512, 512])
✅ T2I pixel_values dtype: torch.float32
✅ T2I input_ids type: <class 'list'>
✅ T2I input_ids length: 2
✅ T2I first description: A loafer. The shoes are made of dark blue velvet with gold embroidery at the top, featuring a slip-o...
✅ MMU batch keys: dict_keys(['pixel_values', 'input_ids', 'task'])
✅ MMU pixel_values shape: torch.Size([4, 3, 512, 512])
✅ MMU pixel_values dtype: torch.float32
✅ MMU input_ids type: <class 'list'>
✅ MMU input_ids length: 4
✅ MMU tasks: ['basic_recommendation', 'personalized_recommendation', 'personalized_recommendation', 'alternative_recommendation']
✅ MMU first conversation: human: I uploaded a picture of my outfit. What kind of jeans would go well with my plaid wool coat a...

Dataset setup complete! ✅


In [10]:
# Setup optimizer and scheduler
from torch.optim import AdamW
from models.lr_schedulers import get_scheduler

# Prepare model for training
if config.model.gradient_checkpointing:
    model._set_gradient_checkpointing(model, value=True)

# Create optimizer
optimizer = AdamW(
    model.parameters(),
    lr=config.optimizer.params.learning_rate,
    weight_decay=config.optimizer.params.weight_decay,
    betas=(config.optimizer.params.beta1, config.optimizer.params.beta2),
    eps=config.optimizer.params.epsilon
)

# Create learning rate scheduler
lr_scheduler = get_scheduler(
    config.lr_scheduler.scheduler,
    optimizer,
    num_warmup_steps=config.lr_scheduler.params.warmup_steps,
    num_training_steps=config.training.max_train_steps
)

print(f"Optimizer: {type(optimizer).__name__}")
print(f"LR Scheduler: {config.lr_scheduler.scheduler}")
print(f"Learning rate: {config.optimizer.params.learning_rate}")

Optimizer: AdamW
LR Scheduler: constant_with_warmup
Learning rate: 0.0001


In [12]:
# Prepare model, optimizer, scheduler, and dataloaders with accelerator
print("Preparing components with accelerator...")

# Prepare everything for distributed training
model, optimizer, lr_scheduler, train_dataloader_t2i, train_dataloader_mmu = accelerator.prepare(model, optimizer, lr_scheduler, train_dataloader_t2i, train_dataloader_mmu)

# Move VQ model to the same device
vq_model = vq_model.to(accelerator.device)

print(f"Model on device: {accelerator.device}")
print(f"VQ model on device: {next(vq_model.parameters()).device}")
print(f"Mixed precision: {accelerator.mixed_precision}")
print(f"Ready for training!")

Preparing components with accelerator...
Model on device: cuda
VQ model on device: cuda:0
Mixed precision: fp16
Ready for training!


In [13]:
# Training loop setup
from training.utils import AverageMeter
import time

# Calculate training parameters
num_update_steps_per_epoch = math.ceil(len(dataset_fashion_img) / (config.training.batch_size_t2i * accelerator.num_processes))
max_train_steps = config.training.max_train_steps
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)

print(f"Training Configuration:")
print(f"Dataset sizes: T2I={len(dataset_fashion_img)}, MMU={len(dataset_fashionrec)}")
print(f"Steps per epoch: {num_update_steps_per_epoch}")
print(f"Max training steps: {max_train_steps}")
print(f"Estimated epochs: {num_train_epochs}")
print(f"Gradient accumulation: {config.training.gradient_accumulation_steps}")

# Initialize metrics tracking
loss_meter_t2i = AverageMeter()
loss_meter_mmu = AverageMeter()
loss_meter_total = AverageMeter()

# Set models to training mode
model.train()
vq_model.eval()  # VQ model stays in eval mode

print(f"Training setup complete!")

Training Configuration:
Dataset sizes: T2I=120000, MMU=303937
Steps per epoch: 60000
Max training steps: 50000
Estimated epochs: 1
Gradient accumulation: 1
Training setup complete!


In [14]:
# Test a single training step to make sure everything works
print("Testing single training step...")

try:
    # Get single batches from both dataloaders
    t2i_batch = next(iter(train_dataloader_t2i))
    mmu_batch = next(iter(train_dataloader_mmu))

    print(f"T2I batch: pixel_values {t2i_batch['pixel_values'].shape}, {len(t2i_batch['input_ids'])} texts")
    print(f"MMU batch: pixel_values {mmu_batch['pixel_values'].shape}, {len(mmu_batch['input_ids'])} conversations")

    # Test tokenization and processing
    with torch.no_grad():
        # Process T2I batch
        t2i_images = t2i_batch['pixel_values']
        t2i_image_tokens = vq_model.get_code(t2i_images)
        t2i_image_tokens = t2i_image_tokens + len(uni_prompting.text_tokenizer)

        print(f"T2I image tokens shape: {t2i_image_tokens.shape}")
        print(f"T2I image tokens range: {t2i_image_tokens.min().item()} to {t2i_image_tokens.max().item()}")

        # Process MMU batch
        mmu_images = mmu_batch['pixel_values']
        mmu_image_tokens = vq_model.get_code(mmu_images)
        mmu_image_tokens = mmu_image_tokens + len(uni_prompting.text_tokenizer)

        print(f"MMU image tokens shape: {mmu_image_tokens.shape}")
        print(f"MMU image tokens range: {mmu_image_tokens.min().item()} to {mmu_image_tokens.max().item()}")

    print("Single step test passed! Ready for training loop.")

except Exception as e:
    print(f"Single step test failed: {e}")
    import traceback
    traceback.print_exc()

Testing single training step...
T2I batch: pixel_values torch.Size([2, 3, 512, 512]), 2 texts
MMU batch: pixel_values torch.Size([4, 3, 512, 512]), 4 conversations
Single step test failed: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 7.06 GiB is allocated by PyTorch, and 48.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Traceback (most recent call last):
  File "C:\Users\jonat\AppData\Local\Temp\ipykernel_40340\3826167698.py", line 16, in <module>
    t2i_image_tokens = vq_model.get_code(t2i_images)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jonat\Desktop\StyleAI\backend\fashionm3\Show-o\models\modeling_magvitv2.py", line 424, in get_code
    hidden_states = self.encoder(pixel_values)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jonat\Desktop\StyleAI\venv_312\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jonat\Desktop\StyleAI\venv_312\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jonat\Desktop\StyleAI\backend\fashionm3\Show-o\models\modeling_magvitv2.py", line 151, in forward
    h = self.down[i_l