In [None]:
!pip install git+https://github.com/huggingface/transformers.git#egg=transformers

In [None]:
!pip install fiftyone umap-learn

In [4]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

In [None]:
# Load the IllusionAnimals dataset from the Hugging Face Hub.
# NOTE(review): overwrite=True presumably replaces an existing local dataset
# of the same name — confirm before running against data you want to keep.
dataset = load_from_hub("harpreetsahota/IllusionAnimals", overwrite=True)

In [None]:
# Distinct values of the "label.label" field; reused below as the class list
# for the zero-shot models and inside the VLM prompts.
class_names = dataset.distinct("label.label")

In [7]:
import os

# Opt in to legacy orchestrators for this process and any child processes.
# NOTE(review): presumably required for the delegate=True operator calls
# later in this notebook — confirm against the FiftyOne docs.
os.environ["FIFTYONE_ALLOW_LEGACY_ORCHESTRATORS"] = "true"

In [None]:
# Download the AIMv2 embeddings plugin from its GitHub repository.
!fiftyone plugins download https://github.com/harpreetsahota204/aimv2_embeddings

In [None]:
# Install the Python requirements declared by the AIMv2 embeddings plugin.
!fiftyone plugins requirements @harpreetsahota/aimv2_embeddings --install

In [None]:
# Download the Hiera embeddings plugin from its GitHub repository.
# NOTE(review): no Hiera operator is called in the visible part of this
# notebook — confirm this plugin is still needed.
!fiftyone plugins download https://github.com/harpreetsahota204/hiera-embeddings-plugin

In [None]:
# Install the Python requirements declared by the Hiera embeddings plugin.
!fiftyone plugins requirements @harpreetsahota/hiera_embeddings --install

### Computing Embeddings

In [None]:
import torch 

import fiftyone.zoo as foz

# Zero-shot classification model from the FiftyOne zoo, loaded with the
# SigLIP2 checkpoint and the dataset's class names as candidate labels.
# Uses the GPU when CUDA is available and falls back to the CPU otherwise.
siglip_model = foz.load_zoo_model(
    "zero-shot-classification-transformer-torch",
    name_or_path="google/siglip2-base-patch16-512",
    classes=class_names,
    device=("cuda" if torch.cuda.is_available() else "cpu"),
)

In [None]:
# Compute embeddings with the SigLIP model and store them on each sample
# in the `siglip_emb` field.
dataset.compute_embeddings(model=siglip_model, embeddings_field="siglip_emb")

In [None]:
import fiftyone.operators as foo

# Look up the AIMv2 embedding operator by its URI so it can be called
# directly from the notebook.
aim_embeddings = foo.get_operator("@harpreetsahota/aimv2_embeddings/compute_aimv2_embeddings")

In [None]:
# Run the operator on your dataset
await aim_embeddings(
    dataset,
    model_name="apple/aimv2-large-patch14-224",  # Choose any supported model
    embedding_types="mean",
    emb_field="aimv2_mean_embeddings",
    delegate=True
)

In [None]:
# Run the operator on your dataset
await aim_embeddings(
    dataset,
    model_name="apple/aimv2-large-patch14-224",  # Choose any supported model
    embedding_types="cls",
    emb_field="aimv2_cls_embeddings",
    delegate=True
)

In [None]:
import fiftyone.brain as fob

# Embedding fields to visualize. The two AIMv2 fields are produced by the
# operator calls earlier in this notebook, which were issued with
# delegate=True, so they only exist once those operations have finished.
embedding_fields = [
    "aimv2_mean_embeddings",
    "aimv2_cls_embeddings",
    "siglip_emb",
]

# Pick up schema changes that may have been written by another process
# (e.g. a delegated worker) before checking for the fields.
dataset.reload()

# Fail early with an actionable message instead of visualizing something
# other than the intended embeddings.
# NOTE(review): confirm how compute_visualization behaves when the named
# embeddings field does not exist.
missing_fields = [
    name for name in embedding_fields if not dataset.has_sample_field(name)
]
if missing_fields:
    raise ValueError(
        f"Embedding field(s) {missing_fields} not found on the dataset. "
        "The AIMv2 operators were called with delegate=True; make sure the "
        "delegated operations have completed (e.g. via "
        "`fiftyone delegated launch`) before computing visualizations."
    )

for embedding_field in embedding_fields:
    # Strip a trailing "_embeddings" to build the brain key, e.g.
    # "aimv2_mean_embeddings" -> "aimv2_mean_viz". "siglip_emb" has no such
    # suffix and therefore becomes "siglip_emb_viz".
    key_prefix = embedding_field.split("_embeddings")[0]

    # `results` holds only the most recent run after the loop; each run is
    # stored on the dataset under its own brain key.
    results = fob.compute_visualization(
        dataset,
        embeddings=embedding_field,
        method="umap",
        brain_key=f"{key_prefix}_viz",
        num_dims=2,
        seed=51,  # fixed seed so the UMAP layout is reproducible across runs
    )

### Zero-shot classification using SigLIP and AIMv2

In [None]:
# Run the SigLIP zero-shot classifier over the dataset and store its
# output in the `siglip2_predictions` field.
dataset.apply_model(model=siglip_model, label_field="siglip2_predictions")

In [None]:
# Zero-shot classification model from the FiftyOne zoo, loaded with the
# AIMv2 "-lit" checkpoint and the dataset's class names as candidate labels.
# trust_remote_code=True is passed through for this checkpoint; the device
# is the GPU when CUDA is available, otherwise the CPU.
aimv2_model = foz.load_zoo_model(
    "zero-shot-classification-transformer-torch",
    name_or_path="apple/aimv2-large-patch14-224-lit",
    classes=class_names,
    trust_remote_code=True,
    device=("cuda" if torch.cuda.is_available() else "cpu"),
)

In [None]:
# Run the AIMv2 zero-shot classifier over the dataset and store its
# output in the `aimv2_predictions` field.
dataset.apply_model(model=aimv2_model, label_field="aimv2_predictions")

In [None]:
# Open the FiftyOne App on the dataset to inspect the embeddings
# visualizations and the predictions computed above.
fo.launch_app(dataset)

Evaluate classifications and see the results


### Can VLMs do any better?

In [None]:
# Download the Janus VQA plugin from its GitHub repository.
!fiftyone plugins download https://github.com/harpreetsahota204/janus-vqa-fiftyone

# Install the Python requirements declared by the Janus VQA plugin.
!fiftyone plugins requirements @harpreetsahota/janus_vqa --install

# Download the Moondream2 plugin from its GitHub repository.
!fiftyone plugins download https://github.com/harpreetsahota204/moondream2-plugin

# Install the Python requirements declared by the Moondream2 plugin.
!fiftyone plugins requirements @harpreetsahota/moondream2 --install

In [None]:
# Prompt that lists the candidate classes and asks for a one-word answer,
# without mentioning that the image may contain an illusion.
# Written as adjacent literals so the trailing spaces and line breaks that
# are part of the prompt text are explicit.
NO_HINT_PROMPT = (
    f"Which class is in the picture: {', '.join(class_names)}. \n"
    "Your answer must be one of these exact classes, no other answers allowed. \n"
    "Respond in one word for your guess of the correct class without any extra explanation."
)



In [None]:
import fiftyone.operators as foo

# Look up the Janus VQA operator by its URI.
janus_vqa = foo.get_operator("@harpreetsahota/janus_vqa/janus_vqa")

# Look up the Moondream2 operator by its URI.
moondream = foo.get_operator("@harpreetsahota/moondream2/moondream")

In [6]:
await janus_vqa(
    dataset,
    model_path="deepseek-ai/Janus-Pro-1B",
    question=NO_HINT_PROMPT,
    question_field="no_hint_prompt",
    answer_field="janus_no_hint_answer",
    delegate=True
    )

In [None]:
await moondream(
    dataset,
    revision="2025-01-09",
    operation="query",
    output_field="moondream_no_hint_answer",
    query_text=NO_HINT_PROMPT,
    delegate=True
    )

In [24]:
# Prompt that tells the model an illusion may be present, lists the
# candidate classes and asks for a one-word answer.
# Written as adjacent literals so the trailing spaces and line breaks that
# are part of the prompt text (including the final newline) are explicit.
HINT_PROMPT = (
    "There might be an image illusion of something in this image. \n"
    f"These are the classes that the image illusion might belong to: {', '.join(class_names)}.\n"
    "Your answer must be one of these exact classes, no other answers allowed.  \n"
    "Respond in one word for your guess of the correct class without any extra explanation.\n"
)

In [19]:
await janus_vqa(
    dataset,
    model_path="deepseek-ai/Janus-Pro-1B",
    question=HINT_PROMPT,
    question_field="hint_prompt",
    answer_field="janus_hint_answer",
    delegate=True
    )

In [None]:
await moondream(
    dataset,
    revision="2025-01-09",
    operation="query",
    output_field="moondream_hint_answer",
    query_text=HINT_PROMPT,
    delegate=True
    )

Moondream2 also produces short captions. Let's generate short captions and then compute the similarity between each caption and the ground truth prompt.

Then let's also see if any of the captions actually include the classes of interest.