In [24]:
import torch
import spacy
from transformers import CLIPModel, CLIPProcessor
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd
import numpy as np

In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [38]:
# Load CLIP Model and Processor
def load_clip_model(device):
    """Load the pretrained CLIP checkpoint together with its processor.

    Args:
        device: Target passed to ``model.to(...)``.

    Returns:
        tuple: ``(model, processor)`` for ``openai/clip-vit-base-patch32``,
        with the model already moved to ``device``.
    """
    checkpoint = "openai/clip-vit-base-patch32"
    clip_processor = CLIPProcessor.from_pretrained(checkpoint)
    clip_model = CLIPModel.from_pretrained(checkpoint)
    clip_model.to(device)
    return clip_model, clip_processor

# Compute CLIP Embedding for Image
def get_image_embedding(model, processor, image, device):
    """Run an image through the model's image encoder and return NumPy features.

    Args:
        model: Object exposing ``get_image_features(**inputs)``.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        image: The image (or images) handed to the processor unchanged.
        device: Device the processed inputs are moved to before the forward pass.

    Returns:
        numpy.ndarray: The output of ``get_image_features`` copied to the CPU.
    """
    pixel_batch = processor(images=image, return_tensors="pt")
    pixel_batch = pixel_batch.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        features = model.get_image_features(**pixel_batch)

    features_on_cpu = features.cpu()
    return features_on_cpu.numpy()

# Compute CLIP Embedding for Text
def get_text_embedding(model, processor, text, device):
    """Run text through the model's text encoder and return NumPy features.

    Args:
        model: Object exposing ``get_text_features(**inputs)``.
        processor: Callable used to tokenize ``text``; its result must support
            ``.to(device)`` and ``**`` unpacking.
        text: A caption string, or a list of caption strings.
        device: Device the tokenized inputs are moved to before the forward pass.

    Returns:
        numpy.ndarray: The output of ``get_text_features`` copied to the CPU.
    """
    inputs = processor(
        text=text,
        return_tensors="pt",
        # Pad to the longest entry so a list of captions with different
        # lengths can still be stacked into one tensor (no-op for one string).
        padding=True,
        # Cut captions that exceed the tokenizer's maximum length instead of
        # letting the over-long sequence fail inside the text encoder.
        truncation=True,
    ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        text_embedding = model.get_text_features(**inputs)
    return text_embedding.cpu().numpy()

# Calculate Cosine Similarity
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity between the first rows of two embeddings.

    Both arguments are passed to ``cosine_similarity`` as given; only the
    ``[0][0]`` entry of the resulting matrix is returned.
    """
    similarity_matrix = cosine_similarity(embedding1, embedding2)
    first_row = similarity_matrix[0]
    return first_row[0]

# Extract adjectives from a given caption
# Use the GPU for spaCy when one is available and fall back to the CPU
# otherwise. require_gpu() would raise on a CPU-only machine, even though the
# main cells already select "cpu" for torch in that situation.
spacy.prefer_gpu()
# NER and the dependency parser are disabled in the loaded pipeline.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def extract_adjectives(caption):
    """Return the text of every token in ``caption`` tagged as an adjective.

    The caption is run through the module-level ``nlp`` pipeline. Tokens whose
    ``pos_`` is ``"ADJ"`` are collected in document order; repeated adjectives
    are kept, not de-duplicated.
    """
    adjective_texts = []
    for tok in nlp(caption):
        if tok.pos_ == "ADJ":
            adjective_texts.append(tok.text)
    return adjective_texts

# Compare Adjective-Modified Captions
def compare_adjective_embeddings(image_id, original_caption, generated_caption, adjectives, image_embedding, model, processor, device):
    """
    Flag each adjective whose presence moves the caption away from the image.

    The generated caption is embedded as-is, and once more per adjective with
    that adjective removed. Each text embedding is compared with the image
    embedding by angular distance (arccos of the cosine similarity). An
    adjective is marked hallucinated when the full caption sits at a strictly
    larger angle from the image than the adjective-free caption.

    Args:
        image_id: Identifier copied into the result unchanged.
        original_caption: Reference caption copied into the result unchanged.
        generated_caption: Caption whose adjectives are being tested.
        adjectives: Iterable of adjective strings found in ``generated_caption``.
        image_embedding: Image embedding passed to ``calculate_similarity``.
        model, processor, device: Forwarded to ``get_text_embedding``.

    Returns:
        dict with keys ``image_id``, ``original_caption``, ``generated_caption``,
        ``adjectives`` and ``hallucinated``. The last two are strings of the
        form ``"[a, b]"`` and ``"[True, False]"``, aligned by position.
    """
    adjectives = list(adjectives)
    adjectives_list = []
    hallucinated_list = []

    if adjectives:
        # The full-caption embedding does not depend on the adjective under
        # test, so it is computed once instead of once per adjective.
        generated_embedding = get_text_embedding(model, processor, generated_caption, device)
        sim_with_adjective = calculate_similarity(image_embedding, generated_embedding)
        # Clip before arccos: rounding can push a cosine slightly outside
        # [-1, 1], which would produce NaN and a silently False comparison.
        angle_with_adjective = np.arccos(np.clip(sim_with_adjective, -1.0, 1.0))

    for adjective in adjectives:
        # Remove every whole-word occurrence of the adjective from the caption
        pattern = r'\b' + re.escape(adjective) + r'\b'
        adjective_free_caption = re.sub(pattern, '', generated_caption).strip()

        # Embed the adjective-free caption and compare it with the image
        adjective_free_embedding = get_text_embedding(model, processor, adjective_free_caption, device)
        sim_without_adjective = calculate_similarity(image_embedding, adjective_free_embedding)

        # Convert cosine similarity to angular distance (in radians)
        angle_without_adjective = np.arccos(np.clip(sim_without_adjective, -1.0, 1.0))

        # The adjective is hallucinated if keeping it increases the angle
        hallucinated = angle_with_adjective > angle_without_adjective

        adjectives_list.append(adjective)
        hallucinated_list.append(hallucinated)

    # One row per caption; the per-adjective lists are serialized as strings
    result = {
        "image_id": image_id,
        "original_caption": original_caption,
        "generated_caption": generated_caption,
        "adjectives": f"[{', '.join(adjectives_list)}]",  # Format adjectives as a list
        "hallucinated": f"[{', '.join([str(hall) for hall in hallucinated_list])}]"  # Format hallucinated status as a list
    }

    return result


In [39]:
# Load Hugging Face Dataset for both images and captions
def load_hf_image_and_caption_datasets(image_dataset_name="wlsdml357/OpenCHAIR_Adjective", caption_dataset_name="wlsdml357/OpenCHAIR_Adjective_Captions"):
    """Load the image dataset and the caption dataset by name.

    Args:
        image_dataset_name: Name passed to ``load_dataset`` for the images.
        caption_dataset_name: Name passed to ``load_dataset`` for the captions.

    Returns:
        tuple: ``(image_dataset, caption_dataset)``, loaded in that order.
    """
    requested_names = (image_dataset_name, caption_dataset_name)
    images, captions = (load_dataset(name) for name in requested_names)
    return images, captions

# Process Images and Captions
def process_dataset(image_dataset, caption_dataset, model, processor, device, caption_column_name="text", output_file_name="results.csv"):
    """Score every caption's adjectives against its image and save the results.

    Row ``idx`` of ``caption_dataset["train"]`` is paired with row ``idx`` of
    ``image_dataset["test"]``; the two splits are assumed to be aligned
    row-for-row (TODO confirm against the dataset cards).

    Args:
        image_dataset: Mapping with a ``"test"`` split whose rows have an ``"image"`` field.
        caption_dataset: Mapping with a ``"train"`` split whose rows have
            ``"image_id"``, ``"original_caption"`` and ``caption_column_name`` fields.
        model, processor, device: Forwarded to the embedding helpers.
        caption_column_name: Column holding the generated caption to evaluate.
        output_file_name: CSV path handed to ``save_results_to_csv``.

    Returns:
        list: One result dict per caption row, in dataset order.

    Raises:
        IndexError: If there are more caption rows than images. The check is
            made before any model work, so the run no longer fails part-way
            through without saving.
    """
    caption_split = caption_dataset["train"]
    image_split = image_dataset["test"]
    total = len(caption_split)

    if total > len(image_split):
        raise IndexError(
            f"caption split has {total} rows but image split has only {len(image_split)}"
        )

    results = []
    for idx in range(total):
        # Fetch the caption row once instead of re-indexing the dataset per field
        row = caption_split[idx]
        image_id = row["image_id"]
        original_caption = row["original_caption"]
        image = image_split[idx]["image"]
        generated_caption = row[caption_column_name]

        # Extract adjectives from the caption
        adjectives = extract_adjectives(generated_caption)

        # Get image embedding once for each image
        image_embedding = get_image_embedding(model, processor, image, device)

        # Compare each adjective and check if it's hallucinated
        result = compare_adjective_embeddings(image_id, original_caption, generated_caption, adjectives, image_embedding, model, processor, device)
        results.append(result)

        # Denominator is the number of rows actually iterated (the caption split)
        print(f"Processed {idx+1}/{total}")

    save_results_to_csv(results, output_file_name)

    return results

# Save Results to CSV
def save_results_to_csv(results, output_file_name="results.csv"):
    """Write the result records to a CSV file and report where they went.

    Args:
        results: Records accepted by ``pd.DataFrame`` (a list of dicts here).
        output_file_name: Destination path; the index column is not written.
    """
    pd.DataFrame(results).to_csv(output_file_name, index=False)
    message = f"Results saved to {output_file_name}"
    print(message)

In [40]:
if __name__ == "__main__":
    # Evaluate the captions stored in the "BLIP_caption" column.
    caption_column_name = "BLIP_caption"
    output_file_name = "method3_BLIP_caption_results.csv"

    # Run on the GPU when torch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model, processor = load_clip_model(device)
    image_dataset, caption_dataset = load_hf_image_and_caption_datasets()
    results = process_dataset(
        image_dataset,
        caption_dataset,
        model,
        processor,
        device,
        caption_column_name=caption_column_name,
        output_file_name=output_file_name,
    )


Processed 1/40
Processed 2/40
Processed 3/40
Processed 4/40
Processed 5/40
Processed 6/40
Processed 7/40
Processed 8/40
Processed 9/40
Processed 10/40
Processed 11/40
Processed 12/40
Processed 13/40
Processed 14/40
Processed 15/40
Processed 16/40
Processed 17/40
Processed 18/40
Processed 19/40
Processed 20/40
Processed 21/40
Processed 22/40
Processed 23/40
Processed 24/40
Processed 25/40
Processed 26/40
Processed 27/40
Processed 28/40
Processed 29/40
Processed 30/40
Processed 31/40
Processed 32/40
Processed 33/40
Processed 34/40
Processed 35/40
Processed 36/40
Processed 37/40
Processed 38/40
Processed 39/40
Processed 40/40
Results saved to method3_BLIP_caption_results.csv


In [41]:
if __name__ == "__main__":
    # Evaluate the captions stored in the "BLIP-Large_caption" column.
    caption_column_name = "BLIP-Large_caption"
    output_file_name = "method3_BLIP-Large_caption_results.csv"

    # Run on the GPU when torch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model, processor = load_clip_model(device)
    image_dataset, caption_dataset = load_hf_image_and_caption_datasets()
    results = process_dataset(
        image_dataset,
        caption_dataset,
        model,
        processor,
        device,
        caption_column_name=caption_column_name,
        output_file_name=output_file_name,
    )


Processed 1/40
Processed 2/40
Processed 3/40
Processed 4/40
Processed 5/40
Processed 6/40
Processed 7/40
Processed 8/40
Processed 9/40
Processed 10/40
Processed 11/40
Processed 12/40
Processed 13/40
Processed 14/40
Processed 15/40
Processed 16/40
Processed 17/40
Processed 18/40
Processed 19/40
Processed 20/40
Processed 21/40
Processed 22/40
Processed 23/40
Processed 24/40
Processed 25/40
Processed 26/40
Processed 27/40
Processed 28/40
Processed 29/40
Processed 30/40
Processed 31/40
Processed 32/40
Processed 33/40
Processed 34/40
Processed 35/40
Processed 36/40
Processed 37/40
Processed 38/40
Processed 39/40
Processed 40/40
Results saved to method3_BLIP-Large_caption_results.csv


In [42]:
if __name__ == "__main__":
    # Evaluate the captions stored in the "GIT_caption" column.
    caption_column_name = "GIT_caption"
    output_file_name = "method3_GIT_caption_results.csv"

    # Run on the GPU when torch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model, processor = load_clip_model(device)
    image_dataset, caption_dataset = load_hf_image_and_caption_datasets()
    results = process_dataset(
        image_dataset,
        caption_dataset,
        model,
        processor,
        device,
        caption_column_name=caption_column_name,
        output_file_name=output_file_name,
    )


Processed 1/40
Processed 2/40
Processed 3/40
Processed 4/40
Processed 5/40
Processed 6/40
Processed 7/40
Processed 8/40
Processed 9/40
Processed 10/40
Processed 11/40
Processed 12/40
Processed 13/40
Processed 14/40
Processed 15/40
Processed 16/40
Processed 17/40
Processed 18/40
Processed 19/40
Processed 20/40
Processed 21/40
Processed 22/40
Processed 23/40
Processed 24/40
Processed 25/40
Processed 26/40
Processed 27/40
Processed 28/40
Processed 29/40
Processed 30/40
Processed 31/40
Processed 32/40
Processed 33/40
Processed 34/40
Processed 35/40
Processed 36/40
Processed 37/40
Processed 38/40
Processed 39/40
Processed 40/40
Results saved to method3_GIT_caption_results.csv


In [43]:
if __name__ == "__main__":
    # Evaluate the captions stored in the "VIT-GPT2_caption" column.
    caption_column_name = "VIT-GPT2_caption"
    output_file_name = "method3_VIT-GPT2_caption_results.csv"

    # Run on the GPU when torch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model, processor = load_clip_model(device)
    image_dataset, caption_dataset = load_hf_image_and_caption_datasets()
    results = process_dataset(
        image_dataset,
        caption_dataset,
        model,
        processor,
        device,
        caption_column_name=caption_column_name,
        output_file_name=output_file_name,
    )


Processed 1/40
Processed 2/40
Processed 3/40
Processed 4/40
Processed 5/40
Processed 6/40
Processed 7/40
Processed 8/40
Processed 9/40
Processed 10/40
Processed 11/40
Processed 12/40
Processed 13/40
Processed 14/40
Processed 15/40
Processed 16/40
Processed 17/40
Processed 18/40
Processed 19/40
Processed 20/40
Processed 21/40
Processed 22/40
Processed 23/40
Processed 24/40
Processed 25/40
Processed 26/40
Processed 27/40
Processed 28/40
Processed 29/40
Processed 30/40
Processed 31/40
Processed 32/40
Processed 33/40
Processed 34/40
Processed 35/40
Processed 36/40
Processed 37/40
Processed 38/40
Processed 39/40
Processed 40/40
Results saved to method3_VIT-GPT2_caption_results.csv
