In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from tqdm import tqdm
import pandas as pd
from diffusers import DiffusionPipeline
import torch
import os
import argparse

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [4]:
!pip install accelerate --quiet

In [None]:
# !pip install diffusers==0.25.0 transformers accelerate torch --quiet

In [5]:
# Default keyword arguments forwarded to every pipeline call in generate_dataset.
SD_KWARGS = {
    'guidance_scale': 10, # Controls how much the generation should follow the input prompt.
    'num_inference_steps': 40, # Number of steps in the diffusion process (higher means better quality but slower).
    # Words to guide the model away from undesirable outputs.
    # ("disfigured" was previously misspelled, so the intended term never reached the text encoder.)
    'negative_prompt': "unclear, deformed, out of image, disfigured, body out of frame"
}

In [6]:
def generate_dataset(args):
    """Generate one image per caption of a CSV file and save them as JPEGs.

    Reads ``args.data_file`` (a CSV with a ``Captions`` column), feeds the
    captions to the diffusion pipeline in batches of ``args.batch_size`` and
    writes each image to ``args.output_dir`` as ``<row position>.jpg`` with the
    position zero-padded to six digits.

    Args:
        args: Namespace providing ``data_file``, ``model_id``, ``cache_dir``,
            ``batch_size``, ``seed`` and ``output_dir``.

    Raises:
        ValueError: If ``args.batch_size`` is smaller than 1.
    """
    # Local import: accelerate is installed by a notebook cell that comes
    # after the top-level import cell.
    from accelerate import Accelerator

    batch_size = args.batch_size
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    data = pd.read_csv(args.data_file)
    captions = data["Captions"].tolist()

    # Make sure the target folder exists before the first (expensive) batch is generated.
    os.makedirs(args.output_dir, exist_ok=True)

    accelerator = Accelerator()
    pipe = DiffusionPipeline.from_pretrained(
        args.model_id,
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
        cache_dir=args.cache_dir,
    )
    pipe.to(accelerator.device)
    pipe.set_progress_bar_config(disable=True)

    # The negative prompt is expanded per batch (see below), so keep it apart
    # from the remaining shared keyword arguments.
    negative_prompt = SD_KWARGS["negative_prompt"]
    sd_kwargs = {key: value for key, value in SD_KWARGS.items() if key != "negative_prompt"}

    for start in tqdm(range(0, len(captions), batch_size)):
        prompts = captions[start:start + batch_size]

        # Seeding goes through `generator`; a bare `seed=` keyword is not a
        # pipeline call argument. One generator per image, seeded from the
        # image's row position, keeps the result independent of the batch size.
        generators = [
            torch.Generator(device=accelerator.device).manual_seed(args.seed + start + offset)
            for offset in range(len(prompts))
        ]

        out = pipe(
            prompts,
            # Sized from the actual batch so a shorter final batch still matches.
            negative_prompt=[negative_prompt] * len(prompts),
            generator=generators,
            **sd_kwargs,
        )
        for offset, img in enumerate(out.images):
            fn = os.path.join(args.output_dir, f"{start + offset:06d}.jpg")
            img.save(fn)

        del out
        torch.cuda.empty_cache()  # Free memory after each batch

In [7]:
if __name__ == "__main__":
    """
    Generates the images from input CSV with captions
    """
    import sys

    # Jupyter/Colab start the kernel with their own command-line arguments;
    # keep only the program name so argparse falls back to the defaults below.
    sys.argv = [sys.argv[0]]

    folder_name = '/content/drive/My Drive/Colab Notebooks/Deep Learning Final'

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-id",
        type=str,
        default='stabilityai/stable-diffusion-xl-base-1.0',
    )
    parser.add_argument("--device", type=str, default='cuda')
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    # Model cache directory (model takes up ~7-8GB in location)
    parser.add_argument(
        "--cache-dir",
        type=str,
        default=os.path.join(folder_name, 'cache_dir'),
    )
    # Alternative caption file kept for reference:
    # parser.add_argument("--data-file", type=str, default=os.path.join(folder_name, 'data', 'filtered_verbs_llm.csv'))
    parser.add_argument(
        "--data-file",
        type=str,
        default=os.path.join(folder_name, 'coco_adj_seed.csv'),
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=os.path.join(folder_name, 'images/'),
    )
    args = parser.parse_args()

    generate_dataset(args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

scheduler/scheduler_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

text_encoder_2/config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

tokenizer_2/tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

tokenizer_2/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/5.14G [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 40/40 [25:14<00:00, 37.85s/it]


# Upload image-caption pairs to the Hugging Face Hub

In [9]:
!pip install datasets
!pip install python-dotenv



In [10]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from PIL import Image
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()

False

In [11]:
# Step 1: Log in to Hugging Face
def huggingface_login(token):
    """Log in to the Hugging Face Hub.

    Args:
        token: Access token, passed unchanged to ``huggingface_hub.login``.
    """
    login(token=token)


# Step 2: Load the CSV file
def load_csv(csv_path):
    """Read the CSV file at ``csv_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(csv_path)
    return frame


# Step 3: Load images based on the DataFrame index (loc)
def load_image(example, folder_name, idx):
    """Open the image file that belongs to dataset row ``idx``.

    The file is looked up at ``<folder_name>/images/<idx zero-padded to 6>.jpg``.

    Args:
        example: The dataset row; not used by this function.
        folder_name: Folder that contains the ``images`` sub-folder.
        idx: Integer row index used to build the file name.

    Returns:
        A dict with the opened PIL image under the key ``"image"``.

    Raises:
        FileNotFoundError: If no file exists at the computed path.
    """
    file_name = f'{idx:06d}.jpg'
    image_path = os.path.join(folder_name, 'images', file_name)
    try:
        opened = Image.open(image_path)
    except FileNotFoundError:
        # Re-raise with the full path so the missing file is easy to identify.
        raise FileNotFoundError(f"Image not found at path: {image_path}")
    else:
        return {"image": opened}


# Step 4: Create Hugging Face Dataset with only the image and caption (no extra columns)
def create_dataset(df, folder_name, max_images=None):
    """Build a Hugging Face ``Dataset`` with exactly two columns: ``image`` and ``text``.

    Args:
        df: DataFrame with a ``Captions`` column; row position ``i`` is paired
            with the image file loaded by ``load_image`` for index ``i``.
        folder_name: Folder that contains the ``images`` sub-folder.
        max_images: If not ``None``, only the first ``max_images`` rows are
            used (``0`` yields no rows). ``None`` uses every row.

    Returns:
        The dataset with the ``Captions`` column replaced by ``text`` and the
        loaded image stored under ``image``.
    """
    captions_df = df[['Captions']]  # Select only the caption column

    # Compare against None explicitly: a plain truthiness test would treat
    # max_images=0 as "no limit" and silently keep every row.
    if max_images is not None:
        captions_df = captions_df.iloc[:max_images]

    # preserve_index=False keeps a non-default DataFrame index from being
    # stored as an extra column next to the image and the caption.
    dataset = Dataset.from_pandas(captions_df, preserve_index=False)

    def _attach_image(example, idx):
        # idx is the row position supplied by map(with_indices=True).
        return {**load_image(example, folder_name, idx), "text": example["Captions"]}

    dataset = dataset.map(_attach_image, with_indices=True)

    # Drop the original 'Captions' column; its content now lives in 'text'.
    return dataset.remove_columns(['Captions'])


# Step 5: Save dataset to Hugging Face Hub
def push_to_hub(dataset, username, dataset_name):
    """Upload ``dataset`` as the ``test`` split of ``<username>/<dataset_name>``."""
    repo_id = f"{username}/{dataset_name}"
    splits = DatasetDict({"test": dataset})
    splits.push_to_hub(repo_id)

In [12]:
# Main script
if __name__ == "__main__":
    # Step 1: Log in to Hugging Face with the token from the HF_TOKEN
    # environment variable (os.getenv returns None when it is not set).
    hf_token = os.getenv("HF_TOKEN")
    huggingface_login(hf_token)

    # Step 2: Load CSV file containing captions
    folder_name = '/content/drive/My Drive/Colab Notebooks/Deep Learning Final'
    csv_file = os.path.join(folder_name, 'coco_adj_seed.csv')
    df = load_csv(csv_file)

    # Step 3: Create Hugging Face Dataset
    max_images = None  # Adjust this to the number of images you currently have
    dataset = create_dataset(df, folder_name, max_images=max_images)

    # Inspect the dataset before uploading: show the first 5 entries only.
    # The limit is checked before printing so exactly 5 entries are shown.
    for i, entry in enumerate(dataset):
        if i >= 5:
            break
        print(f"Entry {i}: {entry}")

    # Step 4: Push dataset to Hugging Face Hub
    username = "wlsdml357"  # Replace with your Hugging Face username
    dataset_name = "coco_adj_image_caption_pair"  # Replace with the desired dataset name
    # NOTE: this uploads immediately; comment the call out for a dry run.
    push_to_hub(dataset, username, dataset_name)

    # Printed after the upload call returns, so report it as done.
    print(f"Dataset pushed to Hugging Face Hub: {username}/{dataset_name}")

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Entry 0: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x78409103E4D0>, 'text': 'A coral-hued bicycle rests half-buried in golden sand under the shade of a twisted obsidian tree.'}
Entry 1: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x78409103D750>, 'text': 'A patchwork kite flutters gently across an amber sky, tethered to a towering, emerald-green windmill.'}
Entry 2: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x78409103C8E0>, 'text': 'A polished brass coffee mug sits forgotten on a floating mahogany plank above still, violet water.'}
Entry 3: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x78409103FAF0>, 'text': 'A sapphire-hued bird with crystalline wings hovers silently over a bed of golden roses.'}
Entry 4: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x78409103D510>, 'text': 'A weathered wooden violin lies among sil

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset is ready to be pushed to Hugging Face Hub: wlsdml357/coco_adj_image_caption_pair
