# Using PaliGemma2-Mix as a Remotely Sourced Zoo Model

In [None]:
import fiftyone as fo

from fiftyone.utils.huggingface import load_from_hub

# Pull the hand-keypoints dataset from the Hugging Face Hub, capped at
# 10 samples and registered locally under the name "hands_subset".
dataset = load_from_hub(
    "voxel51/hand-keypoints",
    max_samples=10,
    name="hands_subset",
)

For context, here is the first image:

In [None]:
from PIL import Image

# Open the image file behind the first sample; as the last expression in
# the cell, the notebook renders it inline.
first_sample = dataset.first()
Image.open(first_sample.filepath)

# Setup Zoo Model

In [None]:
import fiftyone.zoo as foz
# Register the GitHub repository as a remote zoo model source.
# overwrite=True asks FiftyOne to replace any previously registered copy.
foz.register_zoo_model_source(
    "https://github.com/harpreetsahota204/paligemma2",
    overwrite=True,
)

In [None]:
# Download the requested checkpoint from the remote model source.
source_url = "https://github.com/harpreetsahota204/paligemma2"
checkpoint_name = "google/paligemma2-10b-mix-448"
foz.download_zoo_model(source_url, model_name=checkpoint_name)

In [None]:
import fiftyone.zoo as foz
# Instantiate the zoo model; later cells reconfigure this same object
# through its `operation`, `prompt`, and `detail_level` attributes.
model = foz.load_zoo_model("google/paligemma2-10b-mix-448")

# Use PaliGemma2-Mix for Captions

Captioning needs no prompt: set `model.operation` to `caption` and choose one of the three `detail_level` values.

Supported `detail_level` values:

* `short`

* `coco-style`

* `detailed`

In [None]:
# Switch the model to captioning with COCO-style output, then run it over
# the dataset and store the results in the "coco_captions" field.
model.operation = "caption"
model.detail_level = "coco-style"

dataset.apply_model(model, label_field="coco_captions")

In [None]:
# Inspect the caption stored on the first sample.
first_sample = dataset.first()
first_sample["coco_captions"]

To change the caption detail level:

In [None]:
# Only the detail level changes; the operation is whatever was set
# previously on the model object.
model.detail_level = "detailed"

dataset.apply_model(model, label_field="detailed_captions")

# Inspect the result stored on the first sample.
first_sample = dataset.first()
first_sample["detailed_captions"]

# Use PaliGemma2-Mix for Detection

The operations for `detection`, `dense_region_caption`, `region_proposal` don't require additional parameters for general use. 

However, `open_vocabulary_detection` requires a `text_prompt` parameter to guide the detection towards specific objects. 

The results are stored as Detections objects containing bounding boxes and labels:

In [None]:
model.operation = "detection"

# Prompt given as a list of strings. You can also pass in a single string
# like "horse; grass; train; sheep; home".
model.prompt = ["person", "bookshelf"]

dataset.apply_model(model, label_field="detection_results")

# Inspect the result stored on the first sample.
first_sample = dataset.first()
first_sample["detection_results"]

# Use PaliGemma2-Mix for Segmentation

Segmentation requires either a direct expression or a reference to a field containing expressions. 

Similar to phrase grounding, you can provide this in two ways:

In [None]:
model.operation = "segment"

# Expressions to segment. Could pass a list of strings or a single string
# delimited by , or ;
model.prompt = [
    "person relaxing",
    "person playing sports",
    "person talking",
]

dataset.apply_model(model, label_field="segment_results")

In [None]:
# Display the first sample and all of its fields.
dataset.first()

In [None]:
# Drill into the first detection of the segmentation result and show its
# mask.
first_sample = dataset.first()
first_detection = first_sample["segment_results"]["detections"][0]
first_detection["mask"]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Take the mask of the first detection on the first sample, wrap it in a
# PIL image, and write it to disk as a PNG in the working directory.
first_sample = dataset.first()
mask = first_sample["segment_results"]["detections"][0]["mask"]
mask_image = Image.fromarray(mask)
mask_image.save("segmentation_mask.png")


In [None]:
# PIL reports an image's size as (width, height).
mask_image.size

In [None]:
# Shape of the raw mask array — presumably (height, width), i.e. the
# reverse of the PIL size ordering; confirm against the output.
mask.shape

In [21]:
# Show the metadata field of the first sample.
first_sample = dataset.first()
first_sample["metadata"]

# Use PaliGemma2-Mix for OCR



In [None]:
# Switch to OCR and store the output in the "text" field. No new prompt is
# set in this cell.
model.operation = "ocr"

dataset.apply_model(model, label_field="text")

In [None]:
# Inspect the OCR output stored on the first sample.
first_sample = dataset.first()
first_sample["text"]

# Use PaliGemma2-Mix for Zero-Shot Classification

In [None]:
model.operation = "classify"

# Candidate labels passed to the model as the prompt.
model.prompt = [
    "a person doing yoga",
    "a person playing sports",
    "a person talking to someone",
    "people working",
]

dataset.apply_model(model, label_field="classify_results")

In [None]:
# Inspect the classification stored on the first sample.
first_sample = dataset.first()
first_sample["classify_results"]

# Use PaliGemma2-Mix for Answering Questions

Note: The model's answer is parsed into a FiftyOne `Classification`.

In [None]:
# Ask one free-form question about every image and store the replies in
# the "answer_results" field.
model.operation = "answer"
model.prompt = "What activity are the people doing?"

dataset.apply_model(model, label_field="answer_results")

In [None]:
# Inspect the answer stored on the first sample.
first_sample = dataset.first()
first_sample["answer_results"]