In [None]:
# Import libraries
import os
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPTokenizer
from datasets import load_dataset
import torch
from diffusers import UNet2DModel, DDPMScheduler
import wandb

In [None]:
# Start the Weights & Biases run that the training cell logs its metrics to.
wandb.init(
    project="NatalieDiffusion",
)


In [None]:
# Fetch the image dataset that the rest of the notebook trains on.
dataset = load_dataset(
    'NevskyCollective/nataliaXton',
)

In [None]:
# Sanity check: show the dataset summary first, then the first record of the
# 'train' split so the available fields are visible.
print(dataset)
print(dataset["train"][0])


In [None]:
# CLIP tokenizer used by ImageCaptionDataset to encode each caption.
tokenizer = CLIPTokenizer.from_pretrained(
    "openai/clip-vit-base-patch32",
)

class ImageCaptionDataset(Dataset):
    """Map-style dataset yielding a transformed image plus its tokenized caption.

    Each record of the wrapped dataset is read through the keys ``'image'``
    (an object exposing ``convert``, e.g. a PIL image) and ``'label'``.

    Args:
        dataset: Indexable collection of records that supports ``len()``.
        transform: Optional callable applied to the RGB-converted image.
        tokenizer: Optional callable used to encode the caption. When omitted,
            the module-level ``tokenizer`` is looked up at access time, so
            existing two-argument callers behave exactly as before.
    """

    def __init__(self, dataset, transform=None, tokenizer=None):
        self.dataset = dataset
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Build one training sample.

        Returns:
            dict with ``pixel_values`` (the image after the optional
            transform), ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch dimension removed).
        """
        item = self.dataset[idx]
        image = item['image'].convert("RGB")  # Ensure the image is in RGB format
        caption = f"label {item['label']}"  # Adjust based on actual label usage
        if self.transform is not None:
            image = self.transform(image)

        # Prefer the injected tokenizer; fall back to the notebook-level one.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        tokenized_caption = encode(caption, padding="max_length", truncation=True, return_tensors="pt")

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence to a 0-d tensor.
        return {
            "pixel_values": image,
            "input_ids": tokenized_caption["input_ids"].squeeze(0),
            "attention_mask": tokenized_caption["attention_mask"].squeeze(0),
        }


In [None]:
# Define image transformations
# ToTensor yields channel values in [0, 1]; Normalize with mean 0.5 / std 0.5
# rescales them to [-1, 1] so the images are zero-centred like the Gaussian
# noise they are mixed with during training.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
])


In [None]:
# Wrap the 'train' split so every sample is transformed and tokenized on access.
custom_dataset = ImageCaptionDataset(
    dataset['train'],
    transform=transform,
)

# Serve the wrapped samples as shuffled mini-batches of 8.
dataloader = DataLoader(
    custom_dataset,
    shuffle=True,
    batch_size=8,
)


In [None]:
# Load the pretrained UNet and the noise scheduler from the same checkpoint.
# The scheduler is loaded from its own saved config rather than from
# model.config: the UNet config describes the network, not the noise schedule,
# so building the scheduler from it cannot supply the schedule settings.
pretrained_id = "google/ddpm-cifar10-32"
model = UNet2DModel.from_pretrained(pretrained_id)
scheduler = DDPMScheduler.from_pretrained(pretrained_id)


In [None]:
# Training loop: for every batch, add scheduler noise to the images at random
# timesteps and train the model to predict that noise (MSE loss).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 5

try:
    for epoch in range(num_epochs):
        model.train()
        # Stays NaN only when the dataloader yields no batches, so the epoch
        # summary below never hits an unbound `loss`.
        last_loss = float("nan")
        for batch in dataloader:
            optimizer.zero_grad()
            # Only the images are moved to the device: the model is called with
            # (noisy_images, timesteps), so the caption tensors are not consumed.
            pixel_values = batch["pixel_values"].to(device)

            # Forward pass
            noise = torch.randn_like(pixel_values)
            # One random timestep per image, created directly on the target device.
            timesteps = torch.randint(
                0,
                scheduler.config.num_train_timesteps,
                (pixel_values.shape[0],),
                device=device,
            )
            noisy_images = scheduler.add_noise(pixel_values, noise, timesteps)
            outputs = model(noisy_images, timesteps)
            loss = torch.nn.functional.mse_loss(outputs.sample, noise)

            loss.backward()
            optimizer.step()

            # Log metrics to W&B
            last_loss = loss.item()
            wandb.log({"loss": last_loss, "epoch": epoch})

        # Reports the loss of the final batch of the epoch.
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {last_loss}")
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so execution stops here; otherwise the following cells would
    # save and upload a model whose training did not complete.
    raise
finally:
    # Always close the W&B run, whether training succeeded or failed.
    wandb.finish()


In [None]:
# Write the model to a local directory first.
model.save_pretrained("./trained_model")

# Log in to the Hugging Face Hub, then publish the model and the tokenizer to
# the same repository.
# NOTE(review): the repository id looks like a placeholder — replace before running.
from huggingface_hub import notebook_login

notebook_login()
for artifact in (model, tokenizer):
    artifact.push_to_hub("your-username/your-model-name")


In [None]:
# Close the W&B run. The training cell already calls wandb.finish() in its
# `finally` block, so this presumably only matters when that cell did not
# run — TODO confirm it is still needed.
wandb.finish()