In [None]:
!pip install transformers datasets spacy huggingface_hub python-dotenv
!python -m spacy download en_core_web_sm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

import torch
from datasets import load_dataset
from transformers import pipeline
from huggingface_hub import login
import spacy
from tqdm import tqdm

import pandas as pd
from collections import Counter

In [None]:
# Mount Google Drive into the Colab filesystem at the path given below.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
def huggingface_login(token):
    """Authenticate with the Hugging Face Hub when a token is available.

    Args:
        token: Hugging Face access token, or ``None``/empty string when no
            token is configured (for example when the ``HF_TOKEN``
            environment variable is unset).

    Returns:
        None.
    """
    if not token:
        # Calling login() without a token falls back to an interactive prompt,
        # which gets in the way of an unattended "Run All". Skip instead and
        # continue unauthenticated.
        print("No Hugging Face token provided; skipping login (public models and datasets remain accessible).")
        return
    login(token=token)

# Step 1: Load Dataset from Hugging Face
def load_huggingface_dataset(dataset_path):
    """Fetch the dataset identified by ``dataset_path`` via ``load_dataset``.

    Args:
        dataset_path: Identifier handed unchanged to ``load_dataset``.

    Returns:
        Whatever ``load_dataset`` returns for that identifier.
    """
    hub_dataset = load_dataset(dataset_path)
    return hub_dataset

# Step 2: Initialize Captioning Models
def initialize_captioning_models():
    """Build one ``image-to-text`` pipeline per captioning checkpoint.

    Returns:
        dict mapping a short model label to its pipeline, in the order
        ``BLIP``, ``GIT``, ``VIT-GPT2``, ``BLIP-Large``.
    """
    # Device index 0 when CUDA is available, otherwise -1 (the value the
    # pipelines are given for a CPU-only run).
    if torch.cuda.is_available():
        device = 0
    else:
        device = -1

    # (label, checkpoint) pairs; the order fixes both load order and key order.
    checkpoints = (
        ("BLIP", "Salesforce/blip-image-captioning-base"),
        ("GIT", "microsoft/git-large"),
        ("VIT-GPT2", "nlpconnect/vit-gpt2-image-captioning"),
        ("BLIP-Large", "Salesforce/blip-image-captioning-large"),
    )
    return {
        label: pipeline("image-to-text", model=checkpoint, device=device)
        for label, checkpoint in checkpoints
    }

# Step 3: Generate Captions for Each Image
def generate_captions_for_images(dataset, models):
    """Caption every example in ``dataset`` with every model in ``models``.

    Args:
        dataset: Sized iterable of examples, each a mapping with an
            ``"image"`` entry and a ``"text"`` entry (the original caption),
            e.g. a Hugging Face ``Dataset`` split.
        models: Mapping of model label -> callable. Each callable is invoked
            as ``model(image, max_new_tokens=30)`` and must return a sequence
            whose first item has a ``"generated_text"`` key.

    Returns:
        pandas.DataFrame with one row per successfully captioned image and the
        columns ``image_id``, ``original_caption`` plus one column per model
        label. Images for which any step raised are reported and skipped.
    """
    captions = []
    # Walk the dataset one example at a time. Indexing the whole "image"
    # column (dataset["image"]) decodes every image up front, and looking the
    # row up again by index would decode the same image a second time.
    progress = tqdm(enumerate(dataset), total=len(dataset), desc="Generating Captions")
    for idx, example in progress:
        try:
            image_captions = {
                "image_id": f"image_{idx}",
                "original_caption": example["text"],
            }
            image = example["image"]
            for model_name, model in models.items():
                image_captions[model_name] = model(image, max_new_tokens=30)[0]["generated_text"]
            captions.append(image_captions)
        except Exception as e:
            # Best effort: report the failure and move on to the next image.
            print(f"Error generating caption for image {idx}: {e}")
    return pd.DataFrame(captions)

# Step 4: Extract Adjectives Using spaCy
def extract_adjectives(caption):
    """Return the text of every token in ``caption`` tagged as an adjective.

    Relies on the module-level ``nlp`` pipeline, which must be assigned before
    this function is called.
    """
    adjectives = []
    for token in nlp(caption):
        if token.pos_ == "ADJ":
            adjectives.append(token.text)
    return adjectives

def extract_all_adjectives(captions):
    """Extract adjectives from every model caption of every image.

    Args:
        captions: DataFrame with an ``image_id`` column and one caption column
            per model. An ``original_caption`` column, if present, is ignored.

    Returns:
        dict mapping each ``image_id`` to ``{model column: [adjectives]}``.
    """
    # Columns that carry metadata rather than a model-generated caption.
    non_model_columns = {"image_id", "original_caption"}
    adjectives = {}
    for _, row in tqdm(captions.iterrows(), total=len(captions), desc="Extracting Adjectives"):
        adjectives[row["image_id"]] = {
            model: extract_adjectives(row[model])
            for model in row.keys()
            if model not in non_model_columns
        }
    return adjectives


# Step 5: Voting Mechanism for Adjectives
def vote_on_adjectives(adjective_dict, threshold=2):
    """Keep, per image, the adjectives that at least ``threshold`` models produced.

    Each model contributes at most one vote per adjective, so a single model
    repeating a word inside its own caption cannot reach the threshold alone.

    Args:
        adjective_dict: Mapping ``image_id -> {model name: [adjectives]}``.
        threshold: Minimum number of distinct models that must have produced
            an adjective for it to be kept.

    Returns:
        dict mapping each ``image_id`` to the list of adjectives that met the
        threshold, in first-seen order.
    """
    voted = {}
    for image_id, adj_dict in adjective_dict.items():
        votes = Counter()
        for model_adjs in adj_dict.values():
            # dict.fromkeys de-duplicates while keeping first-seen order; the
            # list() matters because Counter.update treats a mapping as
            # ready-made counts rather than as elements to tally.
            votes.update(list(dict.fromkeys(model_adjs)))
        voted[image_id] = [adj for adj, count in votes.items() if count >= threshold]
    return voted

# Step 6: Combine and Save Results
def save_combined_results(captions_df, adjective_dict, voted_adjectives, output_path):
    """Write one CSV row per captioned image with its captions and voted adjectives.

    Args:
        captions_df: DataFrame with ``image_id``, ``original_caption`` and the
            model columns ``BLIP``, ``BLIP-Large``, ``GIT`` and ``VIT-GPT2``.
        adjective_dict: Accepted for call compatibility; not read here.
        voted_adjectives: Mapping ``image_id -> list of adjectives``; images
            missing from it get an empty list.
        output_path: Destination of the CSV file (written without the index).
    """
    # (output column, source column) pairs, in the order they appear in the CSV.
    caption_columns = (
        ("BLIP_caption", "BLIP"),
        ("BLIP-Large_caption", "BLIP-Large"),
        ("GIT_caption", "GIT"),
        ("VIT-GPT2_caption", "VIT-GPT2"),
    )

    records = []
    progress = tqdm(captions_df.iterrows(), total=len(captions_df), desc="Saving Results")
    for _, row in progress:
        image_id = row["image_id"]
        record = {"image_id": image_id, "original_caption": row["original_caption"]}
        for output_column, source_column in caption_columns:
            record[output_column] = row[source_column]
        record["voted_adjectives"] = voted_adjectives.get(image_id, [])
        records.append(record)

    results_df = pd.DataFrame(records)
    results_df.to_csv(output_path, index=False)
    print(f"Combined results saved to {output_path}")

In [None]:
# Main Script
if __name__ == "__main__":
    # Authenticate with the token read from the HF_TOKEN environment variable.
    hf_token = os.getenv("HF_TOKEN")
    huggingface_login(hf_token)

    # Step 1: Load dataset from Hugging Face
    dataset_path = "wlsdml357/coco_adj_image_caption_pair"  # Replace with your dataset path on Hugging Face
    dataset = load_huggingface_dataset(dataset_path)["test"]

    # Set to a small integer (e.g. 10) for a quick smoke test; None processes
    # the whole split.
    max_images = None
    if max_images is not None:
        dataset = dataset.select(range(min(max_images, len(dataset))))

    # Step 2: Initialize spaCy and captioning models
    # prefer_gpu() uses the GPU when one is available and otherwise stays on
    # the CPU, matching the CPU fallback of the captioning pipelines;
    # require_gpu() would raise on a CPU-only runtime.
    spacy.prefer_gpu()
    # `nlp` is read as a global by extract_adjectives().
    nlp = spacy.load("en_core_web_sm")
    models = initialize_captioning_models()

    # Step 3: Generate captions for images
    captions = generate_captions_for_images(dataset, models)

    # Step 4: Extract adjectives
    adjective_dict = extract_all_adjectives(captions)

    # Step 5: Conduct voting on adjectives
    threshold = 2  # Set your desired voting threshold
    voted_adjectives = vote_on_adjectives(adjective_dict, threshold=threshold)

    # Step 6: Combine all results and save them to a single CSV
    output_path = "/content/drive/My Drive/Colab Notebooks/Deep Learning Final/method2_results.csv"
    save_combined_results(captions, adjective_dict, voted_adjectives, output_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/306 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/5.76M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/40 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Generating Captions:   0%|          | 0/40 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Generating Captions:  25%|██▌       | 10/40 [00:15<00:42,  1.43s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating Captions: 100%|██████████| 40/40 [00:56<00:00,  1.41s/it]
Extracting Adjectives: 100%|██████████| 40/40 [00:15<00:00,  2.62it/s]
Saving Results: 100%|██████████| 40/40 [00:00<00:00, 15603.81it/s]

Combined results saved to /content/drive/My Drive/Colab Notebooks/Deep Learning Final/method2_results.csv



