In [None]:
!pip install -r requirements.txt

We'll be making use of a couple of plugins to help us with our analysis. Let's go ahead and install them now:

In [None]:
# Download the plugin(s) hosted in this GitHub repository using the FiftyOne CLI
!fiftyone plugins download https://github.com/harpreetsahota204/synthetic_gui_samples_plugins

In [None]:
# Download the plugin(s) hosted in this GitHub repository using the FiftyOne CLI
!fiftyone plugins download https://github.com/ehofesmann/edit_label_attributes

## Downloading the Dataset

In this section, we'll download a dataset from Google Drive using `gdown`. The file is hosted at this URL:
https://drive.google.com/file/d/1YEhdlXuTV_nalVvKuXSHZa_JgkuIXjcG/view?usp=sharing

After downloading, we'll extract its contents to work with the data.


In [None]:
import gdown
import zipfile

# Direct-download form of the Google Drive share link for the dataset archive
url = "https://drive.google.com/uc?id=1YEhdlXuTV_nalVvKuXSHZa_JgkuIXjcG"
archive_path = "data.zip"

# Download the file only when a valid archive is not already on disk, so that
# re-running the notebook does not fetch the data again. `is_zipfile` returns
# False both when the file is missing and when it is not a readable zip.
if not zipfile.is_zipfile(archive_path):
    gdown.download(url, archive_path, quiet=False)

# Extract the contents into the ./data directory
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall("data")

## Loading Annotations

Now we'll load and explore the COCO4GUI annotations from the JSON file.

In [None]:
import json
import random
from pprint import pprint

# Load the annotations. JSON files are UTF-8, so pass the encoding explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
annotations_path = "./data/annotations_coco.json"
with open(annotations_path, 'r', encoding="utf-8") as f:
    annotations = json.load(f)

# Print basic info about the dataset: top-level keys, image/annotation counts,
# and the category names
print(f"Keys in the annotations file: {list(annotations.keys())}")
print(f"Number of images: {len(annotations['images'])}")
print(f"Number of annotations: {len(annotations['annotations'])}")
print(f"Categories: {[cat['name'] for cat in annotations['categories']]}")

In [None]:
from PIL import Image

# Pick one image record at random and remember its id
random_image = random.choice(annotations['images'])
image_id = random_image['id']

# Collect every annotation whose image_id matches the chosen image
image_annotations = list(
    filter(lambda ann: ann['image_id'] == image_id, annotations['annotations'])
)

# Report which file was picked
print(f"Selected image: {random_image['file_name']}")

# The trailing expression makes the notebook render the image inline
Image.open(f"./data/{random_image['file_name']}")


In [None]:
# Summarize the selected image: its id, pixel size, and annotation count
print(
    f"Image ID: {image_id}",
    f"Image size: {random_image['width']} x {random_image['height']}",
    f"Number of annotations for this image: {len(image_annotations)}",
    sep="\n",
)

# Show the first annotation, when there is one, as a representative example
if len(image_annotations) > 0:
    print("\nExample annotation:")
    pprint(image_annotations[0])

## Loading Data into FiftyOne

[FiftyOne](https://voxel51.com/fiftyone/) is a powerful tool for visualizing, exploring and analyzing image datasets. We'll use a specialized COCO4GUI format to load our GUI interaction dataset into FiftyOne.

The [COCO4GUI](https://github.com/harpreetsahota204/coco4gui_fiftyone) format extends the standard COCO detection format to handle GUI-specific features:

- **Dual annotation support**: Both bounding boxes for UI elements and keypoints for interaction points

- **Sequence information**: Tracking user workflows and interaction chains

- **GUI metadata**: Application, platform, and timing information

- **Rich attributes**: Task descriptions, element information, and custom metadata


In [None]:
!curl -s https://raw.githubusercontent.com/harpreetsahota204/coco4gui_fiftyone/main/coco4gui.py -o coco4gui.py

In [None]:
import os

import fiftyone as fo
from coco4gui import COCO4GUIDataset

# Define paths to our dataset. The archive was extracted to ./data relative to
# the notebook's working directory, so resolve from there rather than from a
# machine-specific absolute path.
dataset_dir = os.getcwd()
data_path = "data"  # Image directory (relative to dataset_dir)
labels_path = "data/annotations_coco.json"  # COCO annotations file (relative to dataset_dir)
dataset_name = "session_2_dataset"

# The dataset is persistent, so on a re-run it already exists in the database
# and creating it again under the same name would fail. Reuse it when present.
if fo.dataset_exists(dataset_name):
    gui_dataset = fo.load_dataset(dataset_name)
else:
    gui_dataset = fo.Dataset.from_dir(
        dataset_dir=dataset_dir,
        dataset_type=COCO4GUIDataset,
        name=dataset_name,
        data_path=data_path,
        labels_path=labels_path,
        persistent=True,
        include_sequence_info=False,  # Do not extract sequence info
        include_gui_metadata=True,    # Include GUI metadata
        extra_attrs=True,             # Include all attributes
    )

In [None]:
# Bare expression: the notebook displays the dataset's repr as the cell output
gui_dataset

Let's see what a sample looks like.

In [None]:
# Skip the first 51 samples and take the next one as our example sample
sample = gui_dataset.skip(51).first()
# Bare expression: display the sample as the cell output
sample

In [None]:
# 1. Accessing all detections for a sample
# The list of detections lives under the sample's `detections` field
print("All detections for this sample:", sample.detections.detections, sep="\n")


In [None]:
# 2. Accessing a specific detection by index
# Index 0 selects the first entry of the detections list
print("\nFirst detection:", sample.detections.detections[0], sep="\n")

In [None]:
# 3. Filtering detections by label
# Keep only the detections whose label is exactly "hover"
hover_detections = list(
    filter(lambda det: det.label == "hover", sample.detections.detections)
)

print(f"\nFound {len(hover_detections)} hover detections")

In [None]:
# 4. Accessing specific attributes of a detection
# Only runs when the previous cell found at least one "hover" detection
if len(hover_detections) > 0:
    detection = hover_detections[0]
    print("\nDetection details:")
    print(f"- Label: {detection.label}")
    print(f"- Bounding box: {detection.bounding_box}")
    print(f"- Task description: {detection.task_description}")

    # Accessing nested attributes: custom_metadata may be absent or empty,
    # so fetch it defensively and only index into it when it is truthy
    metadata = getattr(detection, "custom_metadata", None)
    if metadata:
        print(f"- Custom metadata value: {metadata['value']}")

# Note: FiftyOne allows you to access labels as Python objects
# with dot notation, making it easy to navigate complex label structures


## Exploring the Dataset

With the dataset loaded in FiftyOne, you can:

1. **Visualize annotations**: See both bounding boxes and keypoints in the same view

2. **Filter interactions**: Use the query interface to find specific interaction types like clicks or drags

3. **Explore sequences**: View related interactions in a workflow sequence

4. **View metadata**: See application info, platform details, and custom attributes

5. **Create views**: Focus on subsets of data for detailed analysis

Try these example queries in the FiftyOne App:

- `F("ground_truth.detections.label") == "click"` - Show only click interactions

- `F("application") == "Chrome"` - Filter by application

In [None]:
# Example: Creating filtered views of the dataset
from fiftyone import ViewField as F

# Create a view with only click interactions
clicks = gui_dataset.filter_labels("detections", F("label") == "click")

gui_dataset.save_view("click_boxes", clicks)

print(f"Click interactions (bounding boxes): {len(clicks)}")


In [None]:
# Create a view with missing task descriptions: keep detections whose
# `task_description` attribute is the empty string

no_tasks = gui_dataset.filter_labels("detections", F("task_description") == "")

# overwrite=True lets this cell be re-run against the persistent dataset
# without raising because a saved view named "no_tasks" already exists.
gui_dataset.save_view("no_tasks", no_tasks, overwrite=True)


## Analyzing Bounding Box Areas

To compute the area of bounding boxes in FiftyOne, you can use a [ViewExpression](https://docs.voxel51.com/api/fiftyone.core.expressions.html). Bounding boxes are stored in the format [top-left-x, top-left-y, width, height], where all values are relative to the image size (values between 0 and 1).

This computes the area as width × height in relative coordinates (as a fraction of the image).

You can use these expressions in view stages like `filter_labels` to filter detections by area.

In [None]:
from fiftyone import ViewField as F

# Indices 2 and 3 of a bounding box are its width and height; their product is
# the box area expressed as a fraction of the image
box_width = F("bounding_box")[2]
box_height = F("bounding_box")[3]
rel_bbox_area = box_width * box_height

# Expressions for the image dimensions from the sample metadata
# (defined here but not used by the rest of this cell)
im_width = F("$metadata.width")
im_height = F("$metadata.height")

# Write the computed area onto every detection and persist the change
gui_dataset.set_field(
    "detections.detections.relative_bbox_area",
    rel_bbox_area,
).save()

In [None]:
# Display the same example sample (after skipping 51) as the cell output
gui_dataset.skip(51).first()

## Advanced Embedding Analysis with FiftyOne

FiftyOne's Brain module provides powerful capabilities for working with embeddings. 

Here are some of the most powerful things you can do:

### Visualizing Embeddings with Dimensionality Reduction

You can project high-dimensional embeddings into 2D or 3D space using techniques like:
- **UMAP**: Non-linear dimensionality reduction that preserves local relationships
- **t-SNE**: Excellent for visualizing clusters in high-dimensional data
- **PCA**: Linear dimensionality reduction that captures maximum variance

This visualization allows you to:
- Discover patterns and clusters in your GUI dataset
- Identify outliers or unusual interactions
- Compare different interaction types visually
- Explore the feature space of your data


Here's how to compute embeddings for GUI screenshots. We'll use a pretrained model from the FiftyOne Model Zoo. For actual GUI analysis, you might want a model trained on UI elements.

In [None]:
import fiftyone.brain as fob
import fiftyone.zoo as foz

# Pretrained CLIP model from the FiftyOne Model Zoo used to embed screenshots
clip_model = foz.load_zoo_model("open-clip-torch")

# Embed every sample with the model, store the vectors in the
# `clip_embeddings` field, and project them to 2D under the `clip_viz` key.
# UMAP is stochastic, so fix the seed to get the same layout on every run.
results = fob.compute_visualization(
    gui_dataset,
    model=clip_model,
    embeddings="clip_embeddings",
    method="umap",  # "umap", "tsne", "pca", etc
    brain_key="clip_viz",
    seed=51,
)


### Similarity Search

Build powerful similarity indexes for:
- Finding visually similar GUI screens
- Identifying related interaction patterns
- Retrieving nearest neighbors for any sample

FiftyOne supports multiple backends including:
- Sklearn (default, in-memory)
- Vector databases (Qdrant, Pinecone, Redis, Milvus, etc.)
- Document stores (MongoDB, Elasticsearch)

In [None]:
# Build a similarity index named "clip_sim" from the vectors stored in the
# `clip_embeddings` field, using the sklearn backend
results = fob.compute_similarity(
    gui_dataset,
    embeddings="clip_embeddings",
    brain_key="clip_sim",
    backend="sklearn",
)


### Computing Representativeness

Create diverse subsets of your GUI dataset by:
- Selecting samples that best represent the entire distribution
- Ensuring coverage of all interaction types and patterns
- Avoiding redundancy in your selected examples


In [None]:
# Score samples for representativeness, reusing the "clip_sim" similarity index
results = fob.compute_representativeness(gui_dataset, similarity_index="clip_sim")

## Patch Views in FiftyOne
 
Patch views are a powerful feature in FiftyOne that allow you to:
- Extract regions of interest (like bounding boxes) as standalone samples
- Work with these regions directly instead of the full images
- Maintain all metadata and annotations from the parent sample
- Perform analysis on specific UI elements rather than entire screens

When we convert detections to patches (as in the next cell), each bounding box becomes its own sample with all associated metadata like task descriptions, action types, and element information preserved.


In [None]:
# Build a patches view from the `detections` field of the dataset
box_patches = gui_dataset.to_patches("detections")


In [None]:
# Display one example patch (after skipping 51) as the cell output
box_patches.skip(51).first()

In [None]:
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Pick an available device instead of assuming Apple Silicon: prefer "mps"
# (the original setting), then CUDA, and fall back to the CPU so the cell runs
# on any machine.
mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Initialize the embedding model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True, device=device)

# Iterate through patches and compute embeddings
for patch in tqdm(box_patches, desc="Computing embeddings"):
    # Construct prompt from the three fields, substituting "" for empty/None values
    task_desc = patch.detections.task_description or ""
    action_type = patch.detections.action_type or ""
    element_info = patch.detections.element_info or ""

    # Combine into a single prompt string
    prompt = f"clustering: Task: {task_desc}; Action: {action_type}; Element: {element_info}"

    # Compute embedding
    embedding = model.encode(prompt)

    # Add as new field (converted to a plain list) and persist the patch
    patch['text_embedding'] = embedding.tolist()
    patch.save()

print("Done! Embeddings added to all patches.")


In [None]:
# Project the per-patch `text_embedding` vectors to 2D under the `text_viz` key.
# UMAP is stochastic, so fix the seed to get the same layout on every run.
results = fob.compute_visualization(
    box_patches,
    embeddings="text_embedding",
    method="umap",  # "umap", "tsne", "pca", etc
    brain_key="text_viz",
    seed=51,
)

In [None]:
import fiftyone as fo

# Open the FiftyOne App on the patches view
fo.launch_app(box_patches)


Could not connect session, trying again in 10 seconds



Let's get more hands-on with the FiftyOne App!

In [None]:
import fiftyone as fo

# Open the FiftyOne App on the full dataset
fo.launch_app(gui_dataset)

# or run this in the terminal: fiftyone app launch