Testing the implementation of Florence2 as a FiftyOne (FO) model

In [None]:
!pip install einops timm

In [1]:
import sys
# Add the current working directory to the module search path. This must run
# before the `florence2` import below (presumably florence2.py lives next to
# this notebook — confirm).
sys.path.append(".")
from florence2 import Florence2, run_florence2_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import fiftyone as fo
import fiftyone.zoo as foz

# Load the "quickstart" dataset from the FiftyOne zoo; every
# run_florence2_model call below operates on this `dataset` object.
dataset = foz.load_zoo_dataset("quickstart")

Dataset already downloaded
Loading existing dataset 'quickstart'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use


In [3]:
# Model identifier passed as `model_path` to every run_florence2_model call below.
MODEL_PATH = "microsoft/Florence-2-base-ft"

##### Captioning

```python

    "caption": {
        "params": {"detail_level": ["basic", "detailed", "more_detailed"]},
        "required": [],
        "task_mapping": {
            "detailed": "<DETAILED_CAPTION>",
            "more_detailed": "<MORE_DETAILED_CAPTION>",
            "basic": "<CAPTION>",
            None: "<CAPTION>"  # Default value
        }
    },
```

In [4]:
# Basic captions: detail_level="basic" maps to the "<CAPTION>" task in the
# config excerpt above; results go to the `basic_caption` field.
run_florence2_model(
    dataset,
    operation="caption",
    detail_level="basic",
    model_path=MODEL_PATH,
    output_field="basic_caption",
)

Using device: mps


Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


 100% |█████████████████| 200/200 [59.0s elapsed, 0s remaining, 3.5 samples/s]      


In [5]:
# Detailed captions: detail_level="detailed" maps to the "<DETAILED_CAPTION>"
# task; results go to the `detailed_caption` field.
run_florence2_model(
    dataset,
    operation="caption",
    detail_level="detailed",
    model_path=MODEL_PATH,
    output_field="detailed_caption",
)

Using device: mps
 100% |█████████████████| 200/200 [2.4m elapsed, 0s remaining, 1.4 samples/s]      


In [6]:
# Most verbose captions: detail_level="more_detailed" maps to the
# "<MORE_DETAILED_CAPTION>" task; results go to `more_detailed_caption`.
run_florence2_model(
    dataset,
    operation="caption",
    detail_level="more_detailed",
    model_path=MODEL_PATH,
    output_field="more_detailed_caption",
)

Using device: mps
 100% |█████████████████| 200/200 [3.1m elapsed, 0s remaining, 1.4 samples/s]      


In [9]:
# Display the first sample of the dataset (e.g. to check the fields written
# by the captioning runs above).
dataset.first()

<Sample: {
    'id': '67dda8b4de3b57ba0da3dc35',
    'media_type': 'image',
    'filepath': '/Users/harpreetsahota/fiftyone/quickstart/data/000880.jpg',
    'tags': ['validation'],
    'metadata': None,
    'created_at': datetime.datetime(2025, 3, 21, 17, 58, 12, 429000),
    'last_modified_at': datetime.datetime(2025, 3, 21, 20, 20, 5, 540000),
    'ground_truth': <Detections: {
        'detections': [
            <Detection: {
                'id': '5f452471ef00e6374aac53c8',
                'attributes': {},
                'tags': [],
                'label': 'bird',
                'bounding_box': [0.21084375, 0.0034375, 0.46190625, 0.9442083333333334],
                'mask': None,
                'mask_path': None,
                'confidence': None,
                'index': None,
                'area': 73790.37944999996,
                'iscrowd': 0.0,
            }>,
            <Detection: {
                'id': '5f452471ef00e6374aac53c9',
                'attributes': {},


##### Phrase grounding


```python
    "phrase_grounding": {
        "params": {"caption_field": str, "caption": str},
        "required": [],  # Will be validated in code
        "task": "<CAPTION_TO_PHRASE_GROUNDING>"
    },
```

In [13]:
# Phrase grounding ("<CAPTION_TO_PHRASE_GROUNDING>") driven by an existing
# field: `caption_field` names the `detailed_caption` field written earlier
# (presumably read per sample — confirm in florence2.py).
run_florence2_model(
    dataset,
    operation="phrase_grounding",
    caption_field="detailed_caption",
    model_path=MODEL_PATH,
    output_field="florence_phrase_grounding",
)

Using device: mps


In [None]:
# Phrase grounding with a literal `caption` string instead of a per-sample
# field (presumably the same caption is applied to every sample — confirm).
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="phrase_grounding",
    caption="The inanimate object",
    output_field="fake_caption_phrase_grounding",
)

In [None]:
# Display the dataset summary (e.g. to see which fields now exist).
dataset

In [None]:
# Display the first sample (e.g. to inspect the phrase-grounding output fields).
dataset.first()

##### Detection

```python
"detection": {
        "params": {"detection_type": ["detection", "dense_region_caption", "region_proposal", "open_vocabulary_detection"],
                   "text_prompt": str},
        "required": [],
        "task_mapping": {
            "detection": "<OD>",
            "dense_region_caption": "<DENSE_REGION_CAPTION>",
            "region_proposal": "<REGION_PROPOSAL>",
            "open_vocabulary_detection": "<OPEN_VOCABULARY_DETECTION>",
            None: "<OD>"  # Default value
        }
    },
```

In [None]:
# Plain object detection: detection_type="detection" maps to the "<OD>" task
# in the config excerpt above.
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover from the phrase
# grounding cells. Confirm the default field name run_florence2_model uses.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="detection",
    detection_type="detection",
)

In [None]:
# Dense region captioning: detection_type="dense_region_caption" maps to the
# "<DENSE_REGION_CAPTION>" task in the config excerpt above.
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover. Confirm the
# default field name so this run does not overwrite another detection run.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="detection",
    detection_type="dense_region_caption",
)

In [None]:
# Region proposals: detection_type="region_proposal" maps to the
# "<REGION_PROPOSAL>" task in the config excerpt above.
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover. Confirm the
# default field name so this run does not overwrite another detection run.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="detection",
    detection_type="region_proposal",
)

In [None]:
# Open-vocabulary detection: detection_type="open_vocabulary_detection" maps
# to the "<OPEN_VOCABULARY_DETECTION>" task in the config excerpt above.
# NOTE(review): the detection config also lists a `text_prompt` parameter and
# none is passed here — this task presumably needs one; confirm and add it.
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="detection",
    detection_type="open_vocabulary_detection",
)

##### Segmentation

```python

    "segmentation": {
        "params": {"expression": str, "expression_field": str},
        "required": [],  # Will be validated in code
        "task": "<REFERRING_EXPRESSION_SEGMENTATION>"
    }
```

In [None]:
# Referring-expression segmentation ("<REFERRING_EXPRESSION_SEGMENTATION>")
# driven by a literal `expression` string.
# NOTE(review): the empty string is a placeholder kept from the original cell;
# replace it with a real referring expression before running (the config says
# these parameters are validated in code).
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="segmentation",
    expression="",
)

In [None]:
# Referring-expression segmentation ("<REFERRING_EXPRESSION_SEGMENTATION>")
# driven by a sample field named via `expression_field`.
# NOTE(review): the empty string is a placeholder kept from the original cell;
# replace it with the name of an existing field before running (the config
# says these parameters are validated in code).
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="segmentation",
    expression_field="",
)

##### OCR

```python
    "ocr": {
        "params": {"store_region_info": bool},
        "required": [],
        "task": "<OCR>",
        "region_task": "<OCR_WITH_REGION>"
    },
```

In [None]:
# OCR with region information. `operation="ocr"` is passed explicitly, like
# every other call in this notebook; the original cell omitted it.
# store_region_info=True presumably selects the "<OCR_WITH_REGION>" task from
# the config excerpt above — confirm in florence2.py.
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="ocr",
    store_region_info=True,
)

In [None]:
# OCR without region information. `operation="ocr"` is passed explicitly,
# like every other call in this notebook; the original cell omitted it.
# store_region_info=False presumably selects the plain "<OCR>" task from the
# config excerpt above — confirm in florence2.py.
# NOTE(review): no output_field is passed; the previously commented-out
# "florence_phrase_grounding" name was a copy-paste leftover.
run_florence2_model(
    dataset,
    model_path=MODEL_PATH,
    operation="ocr",
    store_region_info=False,
)