In [1]:
!pip install transformers spacy diffusers
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m109.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `first_token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `first_token`

In [3]:
import torch
torch.cuda.empty_cache()

In [None]:
# Implementation for Generating Diverse Captions

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, CLIPTextModel, CLIPTokenizer
import random
import spacy

# Step 1: Parse MS-COCO captions and identify adjectives
nlp = spacy.load("en_core_web_sm")
def extract_adjectives(captions):
    adjectives = {}
    for caption in captions:
        doc = nlp(caption)
        for token in doc:
            if token.pos_ == "ADJ":
                adjectives[token.text] = adjectives.get(token.text, 0) + 1
    return adjectives

# Step 2: Filter adjectives by concreteness (using a threshold of 4.5 from Hessel et al., 2018)
def filter_concrete_adjectives(adjectives, concreteness_scores, threshold=4.5):
    return {adj: score for adj, score in concreteness_scores.items() if adj in adjectives and score >= threshold}

# Step 3: Rephrase captions with diverse adjectives
instruction_tuned_llm = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf", device=0)
def rephrase_captions(captions, adjectives):
    diverse_captions = []
    for caption in captions:
        prompt = f"Rephrase the caption '{caption}' using the adjective '{random.choice(list(adjectives.keys()))}'."
        diverse_caption = instruction_tuned_llm(prompt, top_p=0.9, temperature=0.6, max_length=50)[0]['generated_text']
        diverse_captions.append(diverse_caption)
    print("Diverse captions")
    print(diverse_captions)
    return diverse_captions

# Step 4: Filter captions with rare adjectives
def filter_rare_adjectives(captions, adjectives, rarity_threshold=0.1):
    adjective_frequency = sorted(adjectives.values())
    rarity_cutoff = adjective_frequency[int(len(adjective_frequency) * rarity_threshold)]
    rare_adjectives = {adj for adj, freq in adjectives.items() if freq <= rarity_cutoff}
    filtered_captions = [caption for caption in captions if any(adj in caption for adj in rare_adjectives)]
    return filtered_captions

# Step 5: Generate new captions using the same pipeline
def generate_new_captions(few_shot_captions, num_captions=20):
    diverse_captions = []
    for caption in few_shot_captions:
        prompt = f"Generate {num_captions} diverse captions inspired by this: '{caption}'."
        # Use the correct call to the pipeline to get the generated captions
        generated_text = instruction_tuned_llm(prompt, top_p=0.9, temperature=0.8, max_new_tokens=50, num_return_sequences=num_captions)
        # Extend the list with the generated captions
        diverse_captions.extend([res['generated_text'] for res in generated_text])
    print("Diverse captions 2")
    print(diverse_captions)
    return diverse_captions

# Step 6: Generate images using Stable Diffusion XL
from diffusers import StableDiffusionPipeline

def generate_images(captions):
    pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl")
    pipe.to("cuda")

    # Load text encoder for embeddings
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to("cuda")

    pipe.scheduler.num_inference_steps = 40
    pipe.scheduler.guidance_scale = 10

    generated_images = []

    for caption in captions:
        # Generate text embeddings
        inputs = tokenizer(caption, return_tensors="pt", padding="max_length", truncation=True).to("cuda")
        text_embeds = text_encoder(**inputs).pooler_output

        # Pass text embeddings as part of added_cond_kwargs
        generated_images.append(pipe(
            caption,
            negative_prompt="unclear, deformed, out of image, disfigured, body out of frame",
            added_cond_kwargs={"text_embeds": text_embeds}
        ))

    return generated_images

# Example workflow:
# # Load MS-COCO captions (placeholder, replace with actual loading)
# captions = ["A man riding a horse on a sunny day.", "A woman sitting on a bench in a park.", "A cat lying on a carpet next to a table."]

# # Example concreteness scores (replace with actual scores)
# concreteness_scores = {"rainy": 4.8, "chair": 3.2, "mattress": 4.7, "behind": 2.5, "frog": 4.9, "rabbit": 4.6}

# adjectives = extract_adjectives(captions)
# concrete_adjectives = filter_concrete_adjectives(adjectives, concreteness_scores)

# rephrased_captions = rephrase_captions(captions, concrete_adjectives)
# rare_captions = filter_rare_adjectives(rephrased_captions, adjectives)

# # Few-shot captions for base LLM
# generated_captions = generate_new_captions(rare_captions)

generated_captions = ["A man riding a horse on a sunny day.", "A woman sitting on a bench in a park.", "A cat lying on a carpet next to a table."]
# Generate images from captions
generated_images = generate_images(generated_captions)

# Outputs are the generated images and captions


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

text_encoder_2/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/602 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.78G [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer_2/tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

tokenizer_2/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/10.3G [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]