In [1]:
!pip install datasets
!pip install diffusers transformers
!pip install bitsandbytes

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is 

In [5]:
%%writefile train_naruto_lora.py
import os
import torch
import random
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler, StableDiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.optimization import get_scheduler
from tqdm import tqdm
import gc
from datetime import timedelta
import numpy as np

# Import LoRA modules
from peft import LoraConfig, get_peft_model, PeftModel
from bitsandbytes.optim import AdamW8bit

# Force PyTorch to use more aggressive memory release
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# ---------------------- Memory Optimization Functions ----------------------
def free_memory():
    """Force release of unused memory"""
    gc.collect()
    torch.cuda.empty_cache()

# ---------------------- Dataset Preparation ----------------------
class NarutoDataset(Dataset):
    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Extract image and caption
        image = item["image"].convert("RGB")
        caption = item["text"]
        
        # Apply image transformations if specified
        if self.transform:
            image = self.transform(image)
        
        return {
            "pixel_values": image,
            "caption": caption
        }

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    captions = [example["caption"] for example in examples]
    
    # Tokenize captions
    inputs = tokenizer(
        captions, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
    )
    
    return {
        "pixel_values": pixel_values,
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
    }

# Function to generate sample images during training
def generate_sample_images(unet, vae, text_encoder, tokenizer, scheduler, device, prompts, output_dir, step):
    # Create a pipeline for inference - fixed to include all required components
    pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        torch_dtype=torch.float16
    )
    
    # Then replace components with our trained ones
    pipe.unet = unet
    pipe.vae = vae
    pipe.text_encoder = text_encoder
    pipe.tokenizer = tokenizer
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.to(device)
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate images
    with torch.no_grad():
        for i, prompt in enumerate(prompts):
            try:
                # Add negative prompt for better quality
                negative_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality, low quality, blurry"
                
                image = pipe(
                    prompt, 
                    negative_prompt=negative_prompt,
                    num_inference_steps=50,
                    guidance_scale=7.5
                ).images[0]
                
                # Save image
                save_path = os.path.join(output_dir, f"sample_{step}_{i}.png")
                image.save(save_path)
                print(f"Generated sample image: {save_path}")
            except Exception as e:
                print(f"Error generating image for prompt '{prompt}': {e}")

# ---------------------- Training Function ----------------------
def train(image_size=128, progressive_training=True):
    # Check if distributed training environment
    is_distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1
    
    if is_distributed:
        # Initialize process group
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        torch.cuda.set_device(local_rank)
        torch.distributed.init_process_group(backend="nccl")
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        is_main_process = rank == 0
        print(f"Distributed training initialized successfully: Rank {rank}/{world_size}, Local Rank: {local_rank}")
    else:
        # Single GPU training
        local_rank = 0
        is_main_process = True
        print("Single GPU training mode")
    
    device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
    
    # Free memory before loading components
    free_memory()
    
    model_name = "CompVis/stable-diffusion-v1-4"
    global tokenizer
    
    if is_main_process:
        print("Loading tokenizer...")
    tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
    
    if is_main_process:
        print("Loading text encoder...")
    text_encoder = CLIPTextModel.from_pretrained(
        model_name, 
        subfolder="text_encoder"
    ).to(device)
    
    if is_main_process:
        print("Loading VAE...")
    vae = AutoencoderKL.from_pretrained(
        model_name, 
        subfolder="vae"
    ).to(device)
    
    if is_main_process:
        print("Loading UNet...")
    # Load UNet and enable gradient checkpointing to save memory
    unet = UNet2DConditionModel.from_pretrained(
        model_name, 
        subfolder="unet"
    ).to(device)
    
    # Enable gradient checkpointing, significantly reducing memory usage
    unet.enable_gradient_checkpointing()
    
    if is_main_process:
        print("Loading scheduler...")
    noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder="scheduler")

    # Freeze all parameters
    text_encoder.requires_grad_(False)
    vae.requires_grad_(False)
    unet.requires_grad_(False)
    
    # Add LoRA adapters to UNet
    if is_main_process:
        print("Adding LoRA adapters to UNet...")
    
    # LoRA configuration - carefully tuned to avoid degradation
    lora_config = LoraConfig(
        r=16,  # Higher rank for better quality
        lora_alpha=32,  # Alpha is typically 2x the rank
        target_modules=[
            "to_q", "to_k", "to_v", "to_out.0",  # Attention modules
            "conv1", "conv2"  # Convolutional layers
        ],
        lora_dropout=0.05,
        bias="none"
    )
    
    # Apply LoRA to UNet
    unet = get_peft_model(unet, lora_config)
    
    # Print LoRA trainable parameters
    if is_main_process:
        trainable_params = sum(p.numel() for p in unet.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in unet.parameters())
        print(f"LoRA trainable parameters: {trainable_params:,} / {all_params:,} ({100 * trainable_params / all_params:.2f}%)")

    # Ensure UNet is in training mode
    unet.train()

    # Wrap model for distributed training
    if is_distributed:
        unet = torch.nn.parallel.DistributedDataParallel(
            unet,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True
        )
    
    # Load the Naruto dataset
    naruto_dataset = load_dataset("lambdalabs/naruto-blip-captions")
    
    # Set up image transform
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ])
    
    # Create custom dataset
    dataset = NarutoDataset(naruto_dataset["train"], transform=transform)
    
    # Batch settings - adjusted for LoRA which needs less memory
    batch_size = 2  # Can use larger batch size with LoRA
    gradient_accumulation_steps = 4  # Can use fewer accumulation steps
    
    # Set up distributed sampler (if using distributed training)
    if is_distributed:
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    else:
        sampler = None
    
    train_dataloader = DataLoader(
        dataset, 
        batch_size=batch_size,
        sampler=sampler,
        shuffle=(sampler is None),
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=True
    )

    # Training settings - more epochs for LoRA
    num_epochs = 10
    
    # Calculate total training steps
    num_update_steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
    if len(train_dataloader) % gradient_accumulation_steps != 0:
        num_update_steps_per_epoch += 1
    max_train_steps = num_epochs * num_update_steps_per_epoch
    
    # Use 8-bit Adam for memory efficiency 
    if is_distributed:
        optimizer = AdamW8bit(
            filter(lambda p: p.requires_grad, unet.module.parameters()),
            lr=1e-4,  # Higher learning rate for LoRA is fine
            weight_decay=1e-2  # Add weight decay to reduce overfitting
        )
    else:
        optimizer = AdamW8bit(
            filter(lambda p: p.requires_grad, unet.parameters()),
            lr=1e-4,  # Higher learning rate for LoRA is fine
            weight_decay=1e-2  # Add weight decay to reduce overfitting
        )
    
    # Use linear warmup and cosine decay learning rate schedule
    lr_scheduler = get_scheduler(
        "cosine",  # Cosine decay
        optimizer=optimizer,
        num_warmup_steps=max(100, int(0.05 * max_train_steps)),  # 5% warmup steps
        num_training_steps=max_train_steps
    )

    # Free memory before training
    free_memory()
    
    if is_main_process:
        print(f"Starting training with LoRA, device: {device}")
        print(f"Image size: {image_size}x{image_size}")
        print(f"Batch size: {batch_size}, Gradient accumulation: {gradient_accumulation_steps}")
        print(f"Updates per epoch: {num_update_steps_per_epoch}")
        print(f"Total training steps: {max_train_steps}")
        print(f"Number of epochs: {num_epochs}")
    
    # Create checkpoint directory
    checkpoint_dir = "./naruto_lora_checkpoints"
    if is_main_process:
        os.makedirs(checkpoint_dir, exist_ok=True)
    
    # Define sample prompts for progress monitoring
    sample_prompts = [
        "Naruto Uzumaki using Rasengan, high quality anime",
        "Sasuke Uchiha with Sharingan, detailed anime style"
    ]
    
    # Setup for progressive training (if enabled)
    if progressive_training:
        # Define a schedule of resolutions
        image_sizes = [128, 256, 384, 512]
        size_schedule = []
        
        # Create a distribution of image sizes with more steps at higher resolutions
        epochs_per_size = [int(num_epochs * x) for x in [0.2, 0.3, 0.2, 0.3]]  # 20%, 30%, 20%, 30%
        
        for size, num_size_epochs in zip(image_sizes, epochs_per_size):
            size_schedule.extend([size] * num_size_epochs)
            
        # Ensure we have enough sizes for all epochs (in case of rounding)
        while len(size_schedule) < num_epochs:
            size_schedule.append(image_sizes[-1])
    
    global_step = 0
    
    for epoch in range(num_epochs):
        # Update image size for progressive training
        if progressive_training and epoch > 0:
            current_size = size_schedule[epoch]
            if current_size != image_size:
                image_size = current_size
                if is_main_process:
                    print(f"Changing image size to {image_size}x{image_size}")
                
                # Update the transform
                transform = transforms.Compose([
                    transforms.Resize((image_size, image_size)),
                    transforms.ToTensor(),
                    transforms.Normalize([0.5], [0.5]),
                ])
                
                # Recreate dataset and dataloader with new image size
                dataset = NarutoDataset(naruto_dataset["train"], transform=transform)
                
                if is_distributed:
                    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
                else:
                    sampler = None
                
                train_dataloader = DataLoader(
                    dataset, 
                    batch_size=batch_size,
                    sampler=sampler,
                    shuffle=(sampler is None),
                    collate_fn=collate_fn,
                    num_workers=0,
                    pin_memory=True
                )
                
                # Recalculate steps per epoch
                num_update_steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
                if len(train_dataloader) % gradient_accumulation_steps != 0:
                    num_update_steps_per_epoch += 1
        
        # Set epoch for sampler in distributed training
        if is_distributed:
            sampler.set_epoch(epoch)
        
        unet.train()
        if is_main_process:
            progress_bar = tqdm(total=num_update_steps_per_epoch)
            progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs} - Size {image_size}x{image_size}")
        
        optimizer.zero_grad()  # Ensure gradients are zero at start
        
        epoch_loss = 0.0
        steps_this_epoch = 0
        
        for step, batch in enumerate(train_dataloader):
            # Periodically free memory
            if step % 20 == 0:  
                free_memory()
                
            # Move batch data to appropriate device
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)

            with torch.no_grad():
                # Get text embeddings
                text_embeddings = text_encoder(input_ids)[0]
                
                # Get image latent representations
                latents = vae.encode(pixel_values).latent_dist.sample() * 0.18215
                
                # Add noise
                noise = torch.randn_like(latents)
                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],), device=device)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
            
            try:
                # Predict noise with LoRA-augmented UNet
                if is_distributed:
                    noise_pred = unet.module(noisy_latents, timesteps, text_embeddings).sample
                else:
                    noise_pred = unet(noisy_latents, timesteps, text_embeddings).sample
                
                # Calculate loss (scaled by gradient accumulation steps)
                loss = torch.nn.functional.mse_loss(noise_pred, noise) / gradient_accumulation_steps
                
                # Standard backpropagation
                loss.backward()
                
                # Track loss
                epoch_loss += loss.item() * gradient_accumulation_steps
                steps_this_epoch += 1
                
                # Apply gradient accumulation
                if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                    # Clip gradients for stability
                    if is_distributed:
                        torch.nn.utils.clip_grad_norm_(unet.module.parameters(), max_norm=1.0)
                    else:
                        torch.nn.utils.clip_grad_norm_(unet.parameters(), max_norm=1.0)
                    
                    # Update weights
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
                    
                    # Synchronize all devices (distributed training)
                    if is_distributed:
                        torch.distributed.barrier()
                    
                    # Update progress bar
                    if is_main_process:
                        progress_bar.update(1)
                        progress_bar.set_postfix(loss=loss.detach().item() * gradient_accumulation_steps)
                    
                    global_step += 1
                    
                    # Save checkpoint and generate samples periodically
                    if is_main_process and global_step % 200 == 0:  
                        ckpt_path = os.path.join(checkpoint_dir, f"checkpoint-{global_step}")
                        os.makedirs(ckpt_path, exist_ok=True)
                        
                        # Save LoRA weights
                        if is_distributed:
                            unet_to_save = unet.module
                        else:
                            unet_to_save = unet
                            
                        unet_to_save.save_pretrained(ckpt_path)
                        print(f"Saved LoRA checkpoint to {ckpt_path}")
                        
                        # Generate and save sample images to monitor progress
                        # First switch model to eval mode
                        unet_eval = unet_to_save
                        unet_eval.eval()
                        
                        # Generate samples
                        generate_sample_images(
                            unet_eval, 
                            vae, 
                            text_encoder, 
                            tokenizer, 
                            noise_scheduler, 
                            device,
                            sample_prompts,
                            os.path.join(checkpoint_dir, "samples"),
                            global_step
                        )
                        
                        # Return to training mode
                        unet_eval.train()
            except Exception as e:
                if is_main_process:
                    print(f"Training step error: {e}")
                free_memory()
                continue
        
        # Calculate average loss for this epoch
        if steps_this_epoch > 0:
            avg_epoch_loss = epoch_loss / steps_this_epoch
            if is_main_process:
                print(f"Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_epoch_loss:.6f}")
        
        # Save model after each epoch
        if is_main_process:
            epoch_dir = os.path.join(checkpoint_dir, f"epoch-{epoch}")
            os.makedirs(epoch_dir, exist_ok=True)
            
            if is_distributed:
                unet_to_save = unet.module
            else:
                unet_to_save = unet
                
            unet_to_save.save_pretrained(epoch_dir)
            print(f"Saved LoRA model for Epoch {epoch+1} to {epoch_dir}")
            
            # Generate sample images at the end of each epoch
            unet_eval = unet_to_save
            unet_eval.eval()
            
            generate_sample_images(
                unet_eval, 
                vae, 
                text_encoder, 
                tokenizer, 
                noise_scheduler, 
                device,
                sample_prompts,
                os.path.join(checkpoint_dir, "epoch_samples"),
                f"epoch_{epoch+1}"
            )
            
            # Return to training mode
            unet_eval.train()

    # Save final model
    if is_main_process:
        output_dir = "./naruto_lora_final"
        os.makedirs(output_dir, exist_ok=True)
        
        if is_distributed:
            unet_to_save = unet.module
        else:
            unet_to_save = unet
            
        unet_to_save.save_pretrained(output_dir)
        print(f"Final LoRA model saved to {output_dir}")
        
        # Generate final samples with more prompts
        final_prompts = [
            "Naruto Uzumaki using Rasengan, detailed anime style",
            "Sasuke Uchiha with Sharingan, high quality anime art",
            "Kakashi Hatake using Chidori, vibrant anime style",
            "Hinata Hyuga with Byakugan activated, detailed anime art",
            "Gaara controlling sand, official Naruto art style"
        ]
        
        unet_eval = unet_to_save
        unet_eval.eval()
        
        generate_sample_images(
            unet_eval, 
            vae, 
            text_encoder, 
            tokenizer, 
            noise_scheduler, 
            device,
            final_prompts,
            os.path.join(output_dir, "final_samples"),
            "final"
        )

    # Clean up distributed environment
    if is_distributed:
        torch.distributed.destroy_process_group()

    # Complete training
    if is_main_process:
        print("LoRA training completed!")
        
    return unet, vae, text_encoder, tokenizer, noise_scheduler

# Function to load and merge LoRA weights with base model
def load_lora_weights(base_model_path, lora_model_path, device):
    """Load and merge LoRA weights with the base model for inference"""
    # Load base UNet
    unet = UNet2DConditionModel.from_pretrained(
        base_model_path,
        subfolder="unet"
    )
    
    # Load LoRA model
    unet = PeftModel.from_pretrained(unet, lora_model_path)
    
    # Merge weights for faster inference
    unet = unet.merge_and_unload()
    
    # Move to specified device
    unet = unet.to(device)
    
    return unet

# Function to use the trained model for inference
def generate_image(prompt, unet, vae, text_encoder, tokenizer, scheduler, device, 
                   output_path=None, height=512, width=512, num_inference_steps=50, guidance_scale=7.5):
    # Create a pipeline for inference
    pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        torch_dtype=torch.float16
    )
    
    # Replace with our trained components
    pipe.unet = unet
    pipe.vae = vae
    pipe.text_encoder = text_encoder
    pipe.tokenizer = tokenizer
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.to(device)
    
    # Add negative prompt for better quality
    negative_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality, low quality, blurry"
    
    # Generate image with high quality settings
    with torch.no_grad():
        image = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale
        ).images[0]
    
    # Save image if path specified
    if output_path:
        image.save(output_path)
        print(f"Image saved to {output_path}")
    
    return image

# ---------------------- Main Entry ----------------------
if __name__ == "__main__":
    # Set random seeds for reproducibility
    random.seed(42)
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    # Run training with progressive image resizing
    unet, vae, text_encoder, tokenizer, scheduler = train(image_size=128, progressive_training=True)
    
    # Test the model with some prompts
    prompts = [
        "Naruto using rasengan against Sasuke, high quality anime style",
        "Sasuke using chidori, detailed anime art",
        "Kakashi with Sharingan exposed, professional anime illustration",
        "Team 7 posing together, high resolution anime"
    ]
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    for i, prompt in enumerate(prompts):
        generate_image(
            prompt=prompt,
            unet=unet,
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            scheduler=scheduler,
            device=device,
            output_path=f"./lora_generated_samples/final_test_{i}.png"
        )

Writing train_naruto_lora.py


In [6]:
!PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128 python -m torch.distributed.launch --nproc_per_node=2 train_naruto_lora.py

and will be removed in future. Use torchrun.
Note that --use-env is set by default in torchrun.
If your script expects `--local-rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See 
https://pytorch.org/docs/stable/distributed.html#launch-utility for 
further instructions

  main()
W0418 17:32:49.927000 113 torch/distributed/run.py:793] 
W0418 17:32:49.927000 113 torch/distributed/run.py:793] *****************************************
W0418 17:32:49.927000 113 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0418 17:32:49.927000 113 torch/distributed/run.py:793] *****************************************
2025-04-18 17:33:03.553041: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory 

In [8]:
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from peft import PeftModel
from PIL import Image
import os

# Create output directory
output_dir = "generated_images"
os.makedirs(output_dir, exist_ok=True)

def load_lora_model(lora_model_path):
    """
    Load the base model and apply LoRA weights
    
    Args:
        lora_model_path: Path to the trained LoRA weights
        
    Returns:
        StableDiffusionPipeline: The loaded pipeline with LoRA weights
    """
    print(f"Loading LoRA weights from: {lora_model_path}")
    
    # Load base model
    base_model_id = "CompVis/stable-diffusion-v1-4"
    pipe = StableDiffusionPipeline.from_pretrained(
        base_model_id, 
        torch_dtype=torch.float16,
        safety_checker=None,
        requires_safety_checker=False
    )
    
    # Load LoRA weights
    pipe.unet = PeftModel.from_pretrained(pipe.unet, lora_model_path)
    
    # Merge weights for faster inference
    print("Merging LoRA weights with base model...")
    pipe.unet = pipe.unet.merge_and_unload()
    
    # Use faster scheduler
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    
    # Move to GPU
    pipe = pipe.to("cuda")
    
    return pipe

2025-04-18 19:23:23.982863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745004204.006030      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745004204.013020      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
def generate_images(pipe, prompts, num_images=1, guidance_scale=7.5, num_inference_steps=50, seed=None):
    """
    Generate images using the provided pipeline and prompts
    
    Args:
        pipe: StableDiffusionPipeline with LoRA weights
        prompts: List of text prompts
        num_images: Number of images to generate per prompt
        guidance_scale: Classifier free guidance scale
        num_inference_steps: Number of denoising steps
        seed: Random seed for reproducibility
        
    Returns:
        List of generated images
    """
    # Set seed for reproducibility if provided
    if seed is not None:
        torch.manual_seed(seed)
    
    # Define high-quality negative prompt
    negative_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality, low quality, blurry, mutated, deformed, disfigured, extra limbs"
    
    # Generate images for each prompt
    all_images = []
    for i, prompt in enumerate(prompts):
        print(f"Generating image {i+1}/{len(prompts)} for prompt: {prompt}")
        
        # Generate multiple images per prompt if requested
        for j in range(num_images):
            # Generate image
            image = pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                height=512,
                width=512
            ).images[0]
            
            # Save image
            filename = f"{output_dir}/prompt_{i}_img_{j}.png"
            image.save(filename)
            print(f"Saved image to {filename}")
            
            all_images.append(image)
    
    return all_images

In [None]:
# Path to your trained LoRA model
lora_model_path = "/kaggle/working/naruto_lora_checkpoints/epoch-9"  # or use "./naruto_lora_checkpoints/epoch-9"

# First load the model
pipe = load_lora_model(lora_model_path)

In [15]:
# List of prompts to generate images for
prompts = [
    "naruto and sasuke shaking hands, detailed anime style"
]

# Then generate images using the loaded model
images = generate_images(
    pipe=pipe,
    prompts=prompts,
    num_images=2,
    guidance_scale=8.0,
    num_inference_steps=75,
    seed=42
)

print(f"Successfully generated {len(images)} images!")

Generating image 1/1 for prompt: naruto and sasuke shaking hands, detailed anime style


  0%|          | 0/75 [00:00<?, ?it/s]

Saved image to generated_images/prompt_0_img_0.png


  0%|          | 0/75 [00:00<?, ?it/s]

Saved image to generated_images/prompt_0_img_1.png
Successfully generated 2 images!


In [17]:
import shutil

# Zip your generated images
shutil.make_archive('naruto_images', 'zip', 'generated_images')

# Zip your trained model
shutil.make_archive('naruto_model', 'zip', '/kaggle/working/naruto_lora_checkpoints')

print("Files are ready for download in the Output tab")

Files are ready for download in the Output tab
