# From Title to Thumbnail: Baseline 1 Stable Diffusion Text-to-Image Model

A Hugging Face pre-trained text-to-image model (Stable Diffusion v1-4), fine-tuned on a YouTube thumbnail dataset.

References:
Pre-trained model fine-tuning guide (Diffusers text-to-image training) - https://huggingface.co/docs/diffusers/training/text2image

Datasets:
https://www.kaggle.com/datasets/praneshmukhopadhyay/youtubers-saying-things
https://www.kaggle.com/datasets/praneshmukhopadhyay/youtube-thumbnail-dataset/data

I. Prerequisites

In [None]:
# Upgrade the Hugging Face Hub client before importing it.
!pip install huggingface_hub --upgrade

from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub; the push_to_hub calls
# later in this notebook presumably rely on it.
login()

In [None]:
# Install diffusers from its GitHub repository, then upgrade the remaining
# training dependencies from PyPI.
!pip install git+https://github.com/huggingface/diffusers.git
!pip install --upgrade diffusers transformers scipy accelerate datasets huggingface_hub torchvision ftfy Jinja2
# Print the GPU attached to this runtime.
!nvidia-smi

# NOTE(review): StableDiffusionPipeline is not referenced in the visible cells
# (inference uses DiffusionPipeline) — confirm whether this import is needed.
from diffusers import StableDiffusionPipeline

# device = 'cuda' if torch.cuda.is_available() else 'cpu'

II. Dataset Preparation

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive/; force_remount=True asks for a fresh
# mount even when the drive is already mounted.
drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# # Used previously to convert Jasmine's train.csv -> metadata.csv
# import pandas as pd

# # Read the CSV file
# df = pd.read_csv('baseline/train.csv')

# # Select and rename columns
# df = df[['Id', 'Title']]
# df.columns = ['file_name', 'text']

# # Modify the file_name column
# df['file_name'] = df['file_name'].astype(str) + '.jpg'

# # Save to new CSV file
# df.to_csv('baseline/metadata.csv', index=False)

In [None]:
%cd gdrive/MyDrive

from datasets import load_dataset
train_dataset = load_dataset('imagefolder', data_dir='train_baseline')

In [None]:
# Sanity check: show the first training example's image and its text.

from IPython.display import display

first_example = train_dataset["train"][0]
display(first_example["image"])

print(first_example["text"])

In [None]:
# Publish the loaded dataset to the Hugging Face Hub as `lakong/yt-thumbnails-train`.
train_dataset.push_to_hub("lakong/yt-thumbnails-train")

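III. Script for Training

Training uses the `train_text_to_image.py` script from the Diffusers text-to-image example (see References); the launch cell in the next section expects it in the current working directory.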

IV. Running the model

In [3]:
# Settings interpolated into the `accelerate launch` command in this section.
MODEL_NAME = "CompVis/stable-diffusion-v1-4"  # passed as --pretrained_model_name_or_path
OUTPUT_DIR = "lauren_finetuned_model"  # passed as --output_dir
TRAIN_DIR = "lakong/yt-thumbnails-train"  # passed as --dataset_name (the id the dataset was pushed under)

In [4]:
from accelerate.utils import write_basic_config

# Write a default Accelerate configuration file; the recorded cell output shows
# the path of the file it produced.
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [5]:
import os

import torch

# `!export ...` only changes the throwaway subshell IPython spawns for that one
# line, so the setting never reached the kernel or the training process.
# Writing it to os.environ applies to this process and is inherited by every
# `!` command launched afterwards (including `accelerate launch`).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# Release cached, unused GPU memory held by this process before training.
torch.cuda.empty_cache()

In [None]:
# Fine-tune with `train_text_to_image.py` through Accelerate.
# IPython substitutes $MODEL_NAME, $TRAIN_DIR and $OUTPUT_DIR from the Python
# variables defined earlier in this section.
# NOTE(review): --resume_from_checkpoint="checkpoint-1200" combined with
# --max_train_steps=1201 looks like a one-step continuation of an earlier run
# (presumably to export/push the final weights) — confirm before reusing; a
# fresh training run would need different values.
!accelerate launch train_text_to_image.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$TRAIN_DIR \
  --use_ema \
  --resolution=128 --center_crop --random_flip \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --gradient_checkpointing \
  --max_train_steps=1201 \
  --learning_rate=1e-05 \
  --max_grad_norm=1 \
  --lr_scheduler="constant" --lr_warmup_steps=0 \
  --checkpointing_steps=100 \
  --output_dir=$OUTPUT_DIR \
  --snr_gamma=5.0 \
  --resume_from_checkpoint="checkpoint-1200" \
  --push_to_hub

V. Inference

In [None]:
!pip install transformers
!pip install diffusers
!pip install accelerate

from huggingface_hub import notebook_login

# This line will prompt you to enter your token interactively to avoid exposing it in the notebook.
notebook_login()

from transformers import pipeline
from diffusers import DiffusionPipeline
import torch

# Load your pretrained model
# Replace 'model_name' with the actual name of the model you want to load
model_name = 'lakong/lauren_finetuned_model'
pipe = DiffusionPipeline.from_pretrained(model_name)

import pandas as pd
import os
from PIL import Image  # For saving images

# Load data from CSV
df = pd.read_csv('/content/drive/MyDrive/236/test.csv').set_index('Id')

# Directory to save thumbnails
save_dir = '/content/drive/MyDrive/236/test_thumbnails_stable_diffusion_baseline1'
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

titles_dict = pd.read_csv('/content/drive/MyDrive/236/test.csv').set_index('Id').to_dict()['Title']

for key in titles_dict:
    title = titles_dict[key]
    full_prompt = f"Youtube thumbnail with title of {title}."

    # Generate one image
    image = pipe(full_prompt, num_images_per_prompt=1, num_inference_steps=25, guidance_scale=9).images[0]

    # Save the image
    image_filename = os.path.join(save_dir, f"{key}.png")
    image.save(image_filename)

print("All thumbnails generated and saved.")

In [None]:
# Regenerate thumbnails for a hand-picked subset of test videos.

import pandas as pd
import os
from PIL import Image  # For saving images

# Output folder, created on demand
save_dir = '/content/drive/MyDrive/236/test_thumbnails_stable_diffusion_baseline1'
os.makedirs(save_dir, exist_ok=True)

ids = ["OYCvbBijvQM", "oAADFdfa-G4", "dDagv6SA8nw", "GTW8IplsKmM"]

# Keep only the CSV rows whose Id appears in the list above
df = pd.read_csv('/content/drive/MyDrive/236/test.csv')
filtered_df = df[df['Id'].isin(ids)]

# Id -> Title lookup for the selected videos
titles_dict = dict(zip(filtered_df['Id'], filtered_df['Title']))

for key, title in titles_dict.items():
    full_prompt = f"Youtube thumbnail with title of {title}."

    # One image per prompt
    generation = pipe(full_prompt, num_images_per_prompt=1, num_inference_steps=25, guidance_scale=9)
    image = generation.images[0]

    # Write it out as <Id>.png
    image_filename = os.path.join(save_dir, f"{key}.png")
    image.save(image_filename)

print("Thumbnail generated and saved.")

VI. Run Evaluations

In [None]:
import numpy as np
import os
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing import image
from scipy.linalg import sqrtm
from sklearn.metrics.pairwise import polynomial_kernel

# Load the InceptionV3 model
# include_top=False with pooling='avg' drops the classification head and
# applies global average pooling, so predict() yields one feature vector per
# 299x299x3 input image.
model = InceptionV3(include_top=False, pooling='avg', input_shape=(299, 299, 3))

def load_and_preprocess_images(directory):
    """Load every JPEG/PNG in *directory* as one InceptionV3-ready batch.

    Each image is loaded at 299x299, converted to an array and passed through
    ``preprocess_input``.

    Args:
        directory: Path of a folder containing ``.jpg``, ``.jpeg`` or ``.png``
            files. The extension is matched case-insensitively; every other
            entry is ignored.

    Returns:
        A single array with all images stacked along the first axis.

    Raises:
        ValueError: If the folder contains no matching image files.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    # Sort so the batch order does not depend on the filesystem's listing order.
    filenames = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(image_extensions)
    )
    if not filenames:
        # np.vstack([]) would fail with an error that does not name the folder.
        raise ValueError(f"No .jpg/.jpeg/.png images found in {directory}")

    images = []
    for filename in filenames:
        img_path = os.path.join(directory, filename)
        img = image.load_img(img_path, target_size=(299, 299))
        img = image.img_to_array(img)
        # Add a leading batch axis so the per-image arrays can be stacked.
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        images.append(img)
    return np.vstack(images)

def calculate_fid(model, real_images, fake_images):
    """Return the Frechet Inception Distance between two image batches.

    Args:
        model: Feature extractor exposing ``predict(batch)``; it must return a
            2-D array with one feature vector per image.
        real_images: Preprocessed batch of reference images.
        fake_images: Preprocessed batch of generated images.

    Returns:
        The FID, clamped to be non-negative.

    Raises:
        ValueError: If either batch yields fewer than two feature vectors, in
            which case a covariance cannot be estimated.
    """
    # Calculate activations
    act1 = model.predict(real_images)
    act2 = model.predict(fake_images)

    if act1.shape[0] < 2 or act2.shape[0] < 2:
        raise ValueError("FID needs at least two images in each set")

    # Mean and covariance of each feature set. The small ridge on the diagonal
    # keeps the covariances well-conditioned for the matrix square root.
    epsilon = 1e-6
    ridge = epsilon * np.identity(act1.shape[1])
    mu1, sigma1 = act1.mean(axis=0), np.cov(act1, rowvar=False) + ridge
    mu2, sigma2 = act2.mean(axis=0), np.cov(act2, rowvar=False) + ridge

    # Squared distance between the means.
    ssdiff = np.sum((mu1 - mu2) ** 2.0)

    # Matrix square root of the covariance product. Called without `disp` so a
    # plain matrix is returned rather than a (matrix, error-estimate) tuple.
    covmean = sqrtm(sigma1.dot(sigma2))
    if np.iscomplexobj(covmean):
        # Numerical error can leave a tiny imaginary component; discard it.
        covmean = covmean.real

    # Clamp tiny negative values caused by floating-point error.
    fid = max(ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean), 0)
    return fid

def calculate_kid(model, real_images, fake_images):
    """Return the Kernel Inception Distance between two image batches.

    KID is the unbiased estimate of the squared maximum mean discrepancy
    between the two feature sets under a polynomial kernel.
    ``polynomial_kernel`` is called with its defaults.

    Because the estimator is unbiased, the result can be slightly negative,
    and comparing a set with itself does not give exactly zero.

    Args:
        model: Feature extractor exposing ``predict(batch)``; it must return a
            2-D array with one feature vector per image.
        real_images: Preprocessed batch of reference images.
        fake_images: Preprocessed batch of generated images.

    Returns:
        The KID estimate as a float.

    Raises:
        ValueError: If either batch yields fewer than two feature vectors.
    """
    # Calculate activations
    act1 = model.predict(real_images)
    act2 = model.predict(fake_images)

    m = act1.shape[0]
    n = act2.shape[0]
    if m < 2 or n < 2:
        raise ValueError("KID needs at least two images in each set")

    # Compute kernel matrices
    kernel_real = polynomial_kernel(act1)
    kernel_fake = polynomial_kernel(act2)
    kernel_real_fake = polynomial_kernel(act1, act2)

    # Unbiased MMD^2: the within-set averages leave out the diagonal k(x, x)
    # terms, which would otherwise inflate the score for small sample sizes.
    real_term = (kernel_real.sum() - np.trace(kernel_real)) / (m * (m - 1))
    fake_term = (kernel_fake.sum() - np.trace(kernel_fake)) / (n * (n - 1))
    cross_term = kernel_real_fake.mean()

    kid = real_term + fake_term - 2 * cross_term
    return kid

# Folders holding the ground-truth thumbnails and the generated ones; each
# name is then rebound to the preprocessed image batch loaded from its folder.
real_images = '/content/drive/MyDrive/236/test_images' #@param
fake_images = '/content/drive/MyDrive/236/test_thumbnails_stable_diffusion_baseline1' #@param
real_images = load_and_preprocess_images(real_images)
fake_images = load_and_preprocess_images(fake_images)

# First pass scores real vs. generated; second pass scores real vs. itself.
for comparison_images in (fake_images, real_images):
    fid = calculate_fid(model, real_images, comparison_images)
    kid = calculate_kid(model, real_images, comparison_images)

    print("FID:", fid)
    print("KID:", kid)

VII. Save Model to Hugging Face for Future Reference

In [None]:
# Import the client class itself: the original imported the `hf_api` module,
# which left the name `HfApi` undefined and made the next line raise NameError.
from huggingface_hub import HfApi

api = HfApi()

# Upload the local training output folder to the Hub.
# NOTE(review): repo_type="space" targets a Space repository; if
# `lakong/colab_results` is meant to hold model weights, "model" may be the
# intended type — confirm against the repo.
api.upload_folder(
    folder_path="lauren_finetuned_model",
    repo_id="lakong/colab_results",
    repo_type="space",
)

In [None]:
# Push the in-memory inference pipeline (`pipe`) to the Hub repo `lauren_finetuned_model`.
pipe.push_to_hub("lauren_finetuned_model")