## Setup

In [None]:
# Mount Google Drive (Colab only); the AP-10K files read later live under
# /content/drive/MyDrive/.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Sun Mar  9 00:37:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   28C    P8             11W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

**NOTE:** To make it easier to manage datasets, images, and models, we create a `HOME` constant that stores the notebook's starting working directory.

In [None]:
import os
# Remember the working directory at this point; the install cell uses it to build
# the path of the cloned repository.
# NOTE(review): the install cell runs `%cd` into that repository, so re-running this
# cell afterwards would set HOME to the repository directory instead. Run it once,
# on a fresh kernel.
HOME = os.getcwd()
print("HOME:", HOME)

HOME: /content


### Install SAM2 and dependencies

In [None]:
# Fetch the SAM2 sources and install them in editable mode so that the `sam2`
# package imported in the next section is available.
!git clone https://github.com/facebookresearch/segment-anything-2.git
# `%cd` (unlike `!cd`) changes the kernel's working directory for every later cell.
%cd {HOME}/segment-anything-2
!pip install -e . -q
# Build the package's extension modules in place.
!python setup.py build_ext --inplace

Cloning into 'segment-anything-2'...
remote: Enumerating objects: 1070, done.[K
remote: Total 1070 (delta 0), reused 0 (delta 0), pack-reused 1070 (from 1)[K
Receiving objects: 100% (1070/1070), 134.70 MiB | 21.67 MiB/s, done.
Resolving deltas: 100% (376/376), done.
/content/segment-anything-2
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m1

### Imports

In [None]:
import torch
import torchvision
import os
import requests
import json
from PIL import Image, ImageOps
from torchvision import transforms
import torch
from transformers import CLIPProcessor, CLIPModel
from torchvision import transforms
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sam2.sam2_image_predictor import SAM2ImagePredictor
#from ultralytics import YOLO

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Fetch the DINOv2 ViT-L/14 backbone through torch.hub (the repository and weights are
# downloaded on first use and cached). The model is moved to `device` later, in the
# feature-extraction cell.
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14') # Loading the Large DINO Backbone/feature extractor

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitl14_pretrain.pth
100%|██████████| 1.13G/1.13G [00:07<00:00, 170MB/s]


In [None]:
# CLIP ViT-L/14 image/text model, moved to `device` straight away.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
# NOTE(review): the visible cells preprocess CLIP inputs with the hand-written
# `clip_transform` and never call `clip_processor` — confirm it is still needed.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
# SAM2 (Hiera-Large) image predictor; the feature-extraction cell prompts it with each
# animal's bounding box to obtain a segmentation mask.
predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")

# NOTE(review): only box prompts are used in the visible cells, so `point_labels`
# appears to be unused — confirm before relying on it.
point_labels = [1] # 1 indicates the foreground

sam2_hiera_large.pt:   0%|          | 0.00/898M [00:00<?, ?B/s]

In [None]:
def visualize_image_mask_overlay(image_path, mask):
    """
    Plot an image twice, side by side: untouched, and with the mask blended on top.

    Args:
        image_path (str): Path to the original image.
        mask (np.ndarray): Binary mask of the same size as the image.
    """
    rgb_pixels = np.array(Image.open(image_path).convert("RGB"))

    # (panel title, whether the mask is drawn over the image), in left-to-right order.
    panels = (
        ("Original Image", False),
        ("Image with Mask Overlay", True),
    )

    plt.figure(figsize=(10, 10))
    for position, (caption, draw_mask) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.imshow(rgb_pixels)
        if draw_mask:
            # Half-transparent so the underlying image stays visible.
            plt.imshow(mask, alpha=0.5, cmap="jet")
        plt.title(caption)
        plt.axis("off")

    plt.show()

In [None]:
def visualize_mask(image_path, mask):
    """
    Show the original image next to a cut-out view of the mask.

    In the right-hand panel the masked region (``mask > 0``) is fully transparent
    and everything else is opaque black. The image's own pixels are not drawn in
    that panel; the image only determines the canvas size.

    Args:
        image_path (str): Path to the original image.
        mask (np.ndarray): Binary mask of the same size as the image.
    """
    # Decode the file once and reuse it for both the left panel and the canvas size.
    original = Image.open(image_path).convert("RGB")
    width, height = original.size

    # RGBA canvas: RGB starts at 0 (black), alpha starts fully opaque ...
    cutout = np.zeros((height, width, 4), dtype=np.uint8)
    cutout[:, :, 3] = 255
    # ... and only the masked pixels are punched out to fully transparent.
    cutout[mask > 0, 3] = 0

    # Display the results
    plt.figure(figsize=(10, 10))
    plt.subplot(1, 2, 1)
    plt.imshow(original)
    plt.title("Original Image")
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.imshow(cutout)
    plt.title("Masked Transparent Image")
    plt.axis("off")

    plt.show()

In [None]:
import cv2
import matplotlib.pyplot as plt

def visualize_bbox(image_path, gt_bbox):
    """
    Visualizes the bounding box on an image using the AP-10K format.

    Parameters:
    - image_path (str): Path to the image file.
    - gt_bbox (list): Bounding box in AP-10K format [x_min, y_min, width, height].
      Non-integer values are rounded to the nearest pixel.

    Raises:
    - FileNotFoundError: if the image cannot be read from ``image_path``.
    """
    # cv2.imread signals failure by returning None instead of raising; fail here with
    # a clear message rather than letting cvtColor raise an obscure assertion error.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct visualization

    # [x, y, w, h] -> opposite corners. cv2.rectangle needs integer pixel coordinates.
    x_min, y_min, width, height = gt_bbox
    top_left = (int(round(x_min)), int(round(y_min)))
    bottom_right = (int(round(x_min + width)), int(round(y_min + height)))

    # The image is RGB at this point, so (255, 0, 0) draws a red box.
    cv2.rectangle(image, top_left, bottom_right, color=(255, 0, 0), thickness=2)

    # Display the image
    plt.figure(figsize=(8, 6))
    plt.imshow(image)
    plt.axis("off")
    plt.title("Bounding Box Visualization")
    plt.show()

In [None]:
# Generic ImageNet-style preprocessing: resize to 256, centre-crop to 224x224,
# then normalise with the ImageNet channel statistics.
_imagenet_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_imagenet_steps)

# Preprocessing applied before the DINOv2 backbone. The crop side must be a multiple
# of the model's patch size.
# NOTE(review): the normalisation uses scalar mean=0.5 / std=0.2 rather than the
# per-channel ImageNet statistics used above — confirm this is intentional.
_dino_steps = [
    transforms.Resize(520),
    transforms.CenterCrop(518),
    transforms.ToTensor(),
    transforms.Normalize(mean=0.5, std=0.2),
]
transform1 = transforms.Compose(_dino_steps)

# Preprocessing for CLIP: fixed 224x224 input with CLIP's own channel statistics.
_clip_mean = [0.48145466, 0.4578275, 0.40821073]
_clip_std = [0.26862954, 0.26130258, 0.27577711]
clip_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_clip_mean, std=_clip_std),
])

In [None]:
import os
import json
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Move the DINOv2 backbone and the SAM2 model onto the selected device.
for _module in (dinov2_vitl14, predictor.model):
    _module.to(device)

# Dataset layout on Drive: images under <ap10k>/data, labels in
# <ap10k>/annotations/annotations.json.
ap10k_folder = "/content/drive/MyDrive/ap10k/"
data_folder = os.path.join(ap10k_folder, "data")
annotations_path = os.path.join(ap10k_folder, "annotations", "annotations.json")

with open(annotations_path, 'r') as annotations_file:
    annotations_data = json.load(annotations_file)

# Index annotations by image_id. When several annotations share an image_id,
# only the last one listed is kept.
image_annotations = {}
for ann in annotations_data["annotations"]:
    image_annotations[ann["image_id"]] = ann

# category_id -> species name, built from the 'categories' section of the file.
categories = annotations_data['categories']
category_mapping = dict((category['id'], category['name']) for category in categories)
print(category_mapping)

# species name -> list of per-image feature tensors (filled by the loop below).
species_features = {}

# DINOv2 settings
patch_size = dinov2_vitl14.patch_size  # 14 for ViT-L/14
# NOTE(review): the patch grid is derived from a 224-pixel input, while `transform1`
# crops to 518 pixels — confirm which input size the later PCA reshape expects.
patch_h = patch_w = 224 // patch_size
feat_dim = 1024  # ViT-L/14
cnt = 0  # progress counter used by the extraction loop
img_size = 840  # img size for resizing as DINOv2 + SD paper resizes to 840x840

# Iterate through images in the data folder.
#
# Per image: look up its annotation -> prompt SAM2 with the bounding box -> keep the
# largest mask -> build a grayscale, background-blacked crop -> letterbox it to
# img_size x img_size -> extract DINOv2 patch tokens and a CLIP image embedding ->
# concatenate them and file the result under the image's species.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')


def _select_largest_mask(masks):
    """Return the candidate mask with the most foreground, or None if all are empty."""
    best_mask, best_area = None, 0
    for candidate in masks:
        area = candidate.sum().item()
        if area > best_area:
            best_mask, best_area = candidate, area
    return best_mask


def _masked_grayscale_crop(image_np, mask, box):
    """Black out the background, paint the foreground in grayscale, and crop to `box`.

    `box` is (x_min, y_min, x_max, y_max) in integer pixel coordinates.
    """
    gray = np.dot(image_np[..., :3], [0.2989, 0.5870, 0.1140])
    # Compress luminance into [0.1, 1.0) so foreground pixels never become pure
    # black, which is the value used for the background fill.
    gray = gray / 255.0 * 0.9 + 0.1
    gray = (gray * 255).astype(np.uint8)

    is_foreground = mask == 1
    masked = image_np.copy()
    masked[mask == 0] = 0
    # (N,) grayscale values broadcast across the three colour channels.
    masked[is_foreground] = gray[is_foreground][:, None]

    x_min, y_min, x_max, y_max = box
    return Image.fromarray(masked[y_min:y_max, x_min:x_max])


def _resize_and_pad(image, target):
    """Scale the longer side to `target` (aspect preserved), then pad with black to a square."""
    width, height = image.size
    aspect_ratio = width / height
    if width > height:
        new_width = target
        new_height = int(target / aspect_ratio)
    else:
        new_height = target
        new_width = int(target * aspect_ratio)
    # Extremely elongated crops could otherwise round down to a zero-sized side.
    new_width, new_height = max(1, new_width), max(1, new_height)

    resized = image.resize((new_width, new_height), Image.BICUBIC)

    # Split the padding as evenly as possible; the odd pixel goes right/bottom.
    left_pad = (target - new_width) // 2
    top_pad = (target - new_height) // 2
    right_pad = target - new_width - left_pad
    bottom_pad = target - new_height - top_pad
    return ImageOps.expand(resized, (left_pad, top_pad, right_pad, bottom_pad), fill='black')


def _extract_combined_features(object_image, context_image):
    """Return per-patch features: L2-normalised DINOv2 tokens concatenated with CLIP.

    DINOv2 runs on `object_image`; CLIP runs on `context_image`, and its single
    embedding is repeated for every DINOv2 patch before concatenation.
    """
    dino_input = transform1(object_image).to(device)
    dino_tokens = dinov2_vitl14.forward_features(dino_input.unsqueeze(0))['x_norm_patchtokens']

    clip_input = clip_transform(context_image).unsqueeze(0).to(device)
    clip_embedding = clip_model.get_image_features(clip_input)
    # Following Fusing DINO & SD paper for how to reshape CLIP features to match DINOv2
    clip_tokens = clip_embedding.unsqueeze(1).expand(1, dino_tokens.shape[1], -1)

    # Normalize DINOv2 and CLIP features - Following Fusing DINO & SD paper
    dino_tokens = dino_tokens / dino_tokens.norm(dim=-1, keepdim=True)
    clip_tokens = clip_tokens / clip_tokens.norm(dim=-1, keepdim=True)

    return torch.cat([dino_tokens, clip_tokens], dim=-1)


# Why images were skipped; reported once after the loop instead of failing silently.
skipped = {"no annotation found": 0, "empty bounding box": 0, "no usable SAM mask": 0}

with torch.no_grad():
    # Sorted so the processing order (and the "first image" previews) is reproducible.
    for file in sorted(os.listdir(data_folder)):
        if not file.lower().endswith(IMAGE_EXTENSIONS):
            continue

        file_path = os.path.join(data_folder, file)
        image_id = int(os.path.splitext(file)[0])

        # Get species and bbox info from annotations
        annotation = image_annotations.get(image_id)
        if not annotation:
            skipped["no annotation found"] += 1
            continue
        species_name = category_mapping[annotation["category_id"]]

        image = Image.open(file_path).convert("RGB")
        image_np = np.array(image)
        image_height, image_width = image_np.shape[:2]

        # [x, y, w, h] -> integer corner coordinates clamped to the image, so that
        # array slicing never wraps around (negative start) or receives floats.
        x_min, y_min, width, height = annotation["bbox"]
        x_max, y_max = x_min + width, y_min + height
        x_min, y_min = max(0, int(x_min)), max(0, int(y_min))
        x_max, y_max = min(image_width, int(x_max)), min(image_height, int(y_max))
        if x_max <= x_min or y_max <= y_min:
            skipped["empty bounding box"] += 1
            continue
        bbox = [x_min, y_min, x_max, y_max]

        # Get segmentation mask using SAM
        predictor.set_image(image)
        masks, _, _ = predictor.predict(box=bbox, multimask_output=True)
        largest_mask = _select_largest_mask(masks)
        if largest_mask is None:
            skipped["no usable SAM mask"] += 1
            continue

        final_image = _masked_grayscale_crop(image_np, largest_mask, bbox)
        final_image = _resize_and_pad(final_image, img_size)

        # NOTE(review): CLIP embeds the full, unmasked image while DINOv2 sees the
        # masked crop. This is kept exactly as before; confirm it is intended.
        combined_features = _extract_combined_features(final_image, image)

        # Store features in species dictionary
        if species_name not in species_features:
            species_features[species_name] = []
            print(f"First image for {species_name} with size {final_image.size}")
            plt.figure(figsize=(10, 10))
            plt.imshow(final_image)
            plt.axis('off')  # Hide axes
            plt.show()
        # Keep the accumulated features in host memory: every consumer below calls
        # .cpu() anyway, and holding them on the GPU only eats device memory.
        species_features[species_name].append(combined_features.cpu())

        # Count only images that actually produced features.
        cnt += 1
        if cnt % 100 == 0:
            print(f"Have Generated Features for {cnt} Images.")

for reason, count in skipped.items():
    if count:
        print(f"Skipped {count} image(s): {reason}.")

# Convert lists to tensors (one stacked tensor per species, images along dim 0)
for species in species_features:
    species_features[species] = torch.cat(species_features[species], dim=0)

print("Feature extraction completed!")


## Analysis

### Quantitative Species Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Mean embedding per species: each image's features are flattened into one long
# vector, and the vectors are averaged over that species' images.
species_avg_vectors = {
    name: feats.reshape(feats.shape[0], -1).mean(axis=0)
    for name, feats in species_features.items()
}

# The antelope mean is the reference every species is compared against.
if "antelope" not in species_avg_vectors:
    raise ValueError("No feature vector found for 'antelope'. Ensure it's in the species list.")
antelope_vector = species_avg_vectors["antelope"]

# (species, cosine similarity to antelope), most similar first. Antelope itself is
# included in the ranking.
cosine_similarities = sorted(
    (
        (
            name,
            cosine_similarity(
                vector.cpu().reshape(1, -1),
                antelope_vector.cpu().reshape(1, -1),
            )[0][0],
        )
        for name, vector in species_avg_vectors.items()
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print results
for species_name, similarity in cosine_similarities:
    print(f"{species_name}: {similarity:.4f}")

antelope: 1.0000
deer: 0.9020
giraffe: 0.8307
cheetah: 0.8216
argali sheep: 0.8103
moose: 0.8085
fox: 0.8077
buffalo: 0.8063
sheep: 0.8037
zebra: 0.8018
rabbit: 0.7990
leopard: 0.7968
bobcat: 0.7955
cow: 0.7949
wolf: 0.7895
bison: 0.7894
spider monkey: 0.7894
monkey: 0.7888
weasel: 0.7875
elephant: 0.7798
lion: 0.7752
tiger: 0.7737
squirrel: 0.7701
dog: 0.7674
rhino: 0.7647
hippo: 0.7630
pig: 0.7556
jaguar: 0.7556
brown bear: 0.7549
raccoon: 0.7487
mouse: 0.7487
polar bear: 0.7468
horse: 0.7461
snow leopard: 0.7381
otter: 0.7373
beaver: 0.7351
king cheetah: 0.7341
panda: 0.7317
rat: 0.7257
skunk: 0.7244
marmot: 0.7227
noisy night monkey: 0.7222
cat: 0.7159
chimpanzee: 0.7121
alouatta: 0.7060
panther: 0.6988
hamster: 0.6717
black bear: 0.6236
gorilla: 0.6136


In [None]:
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from collections import Counter

# Every species except antelope forms the search pool; antelope images are the queries.
other_species = [
    (name, feats) for name, feats in species_features.items() if name != "antelope"
]

# One row per image (features flattened), with a parallel list of species labels.
all_features = np.vstack([
    feats.reshape(feats.shape[0], -1).cpu().numpy() for _, feats in other_species
])
labels = [name for name, feats in other_species for _ in range(feats.shape[0])]

if "antelope" not in species_features:
    raise ValueError("No feature vector found for 'antelope'. Ensure it's in the species list.")

# Flatten antelope images to the same row width as the pool.
antelope_features = species_features["antelope"].reshape(-1, all_features.shape[1]).cpu().numpy()

# Cosine-distance neighbour index over the non-antelope pool.
knn = NearestNeighbors(n_neighbors=10, metric="cosine")  # Adjust k as needed
knn.fit(all_features)

# 10 nearest pool images for every antelope image.
distances, indices = knn.kneighbors(antelope_features)

# Tally which species those neighbours belong to.
nearest_species = Counter(labels[i] for i in indices.flatten())

print("\nMost Similar Species to Antelopes (Excluding Other Antelopes):")
for species_name, count in nearest_species.most_common():
    print(f"{species_name}: {count}")


Most Similar Species to Antelopes (Excluding Other Antelopes):
deer: 35
giraffe: 27
moose: 17
bison: 8
rabbit: 7
cheetah: 4
argali sheep: 4
sheep: 2
zebra: 2
leopard: 2
fox: 1
bobcat: 1


In [None]:
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from collections import Counter

# Same neighbour analysis as the previous cell, but in a 10-dimensional PCA space
# fitted on the non-antelope images only.

# Flatten and collect all species feature vectors (excluding antelope)
all_features = []
labels = []

for species_name, species_feature in species_features.items():
    if species_name == "antelope":
        continue  # Skip antelope embeddings

    num_images = species_feature.shape[0]
    species_feature = species_feature.reshape(num_images, -1).cpu().numpy()  # Convert to CPU NumPy
    all_features.append(species_feature)
    labels.extend([species_name] * num_images)  # Track species labels

# Stack all non-antelope feature vectors into a single matrix (one row per image)
all_features = np.vstack(all_features)

# Apply PCA for dimensionality reduction.
# random_state is fixed because scikit-learn's default solver choice can select the
# randomized SVD for inputs like this one; without a seed the projection, and hence
# the neighbour ranking printed below, can change from run to run.
pca = PCA(n_components=10, random_state=0)  # Adjust n_components based on your dataset
all_features_pca = pca.fit_transform(all_features)

# Get antelope features and transform them using the same PCA model
if "antelope" not in species_features:
    raise ValueError("No feature vector found for 'antelope'. Ensure it's in the species list.")

antelope_features = species_features["antelope"].reshape(-1, all_features.shape[1]).cpu().numpy()
antelope_features_pca = pca.transform(antelope_features)  # Apply PCA transformation

# Train k-NN only on PCA-transformed non-antelope species
knn = NearestNeighbors(n_neighbors=10, metric="cosine")  # Adjust k as needed
knn.fit(all_features_pca)

# Find the k nearest neighbors for antelope images (excluding other antelopes)
distances, indices = knn.kneighbors(antelope_features_pca)

# Count occurrences of nearest species
nearest_species = Counter([labels[i] for i in indices.flatten()])

# Print species ranked by frequency in nearest neighbors
print("\nMost Similar Species to Antelopes (Using PCA & Excluding Other Antelopes):")
for species_name, count in nearest_species.most_common():
    print(f"{species_name}: {count}")


Most Similar Species to Antelopes (Using PCA & Excluding Other Antelopes):
giraffe: 32
deer: 22
cheetah: 18
moose: 11
leopard: 7
argali sheep: 7
buffalo: 4
sheep: 2
zebra: 2
bison: 2
wolf: 1
cow: 1
elephant: 1


### PCA Visualization

##### Antelope Images with DINOv2 PCA

In [None]:
# Re-fit PCA on foreground patches only, min-max scale the first three components
# into RGB, paint background patches black, and show the first four images as a
# 2x2 grid of patch-level colour maps.
#
# NOTE(review): `pre_pca_features`, `pca_features`, `pca_features_fg`,
# `pca_features_bg`, `folder_path` and `pca_animal` are not defined in any visible
# cell, so this cell fails on Restart & Run All unless the cell that created them is
# restored. The most recent visible `pca` is PCA(n_components=10), while this cell
# scales 3 components and reshapes to (..., 3) — confirm that a 3-component PCA is
# expected here.
# 2nd PCA for only foreground patches
pca.fit(pre_pca_features[pca_features_fg].cpu())
pca_features_left = pca.transform(pre_pca_features[pca_features_fg].cpu())

for i in range(3):
    # min_max scaling
    # (a component that is constant over the foreground would divide by zero here)
    pca_features_left[:, i] = (pca_features_left[:, i] - pca_features_left[:, i].min()) / (pca_features_left[:, i].max() - pca_features_left[:, i].min())

pca_features_rgb = pca_features.copy()
# for black background
pca_features_rgb[pca_features_bg] = 0
# new scaled foreground features
pca_features_rgb[pca_features_fg] = pca_features_left

# Construct the path to the subfolder
subfolder_path = os.path.join(folder_path, pca_animal)

# Get a list of image files in the subfolder
# (used only for its length: one block of patches per file)
image_files = [f for f in os.listdir(subfolder_path)]

# reshaping to numpy image format
pca_features_rgb = pca_features_rgb.reshape(len(image_files), patch_h, patch_w, 3)
# Assumes the folder holds at least four images.
for i in range(4):
    plt.subplot(2, 2, i+1)
    plt.imshow(pca_features_rgb[i])

plt.show()

##### Unaltered Antelope Images

In [None]:
# Show the first four files of the species folder (in os.listdir order) on a 2x2 grid.
subfolder_path = os.path.join(folder_path, pca_animal)
image_files = os.listdir(subfolder_path)

# Subplot slots are 1-based; zip stops early when the folder has fewer than four files.
for slot, file_name in zip(range(1, 5), image_files):
    picture = Image.open(os.path.join(subfolder_path, file_name)).convert('RGB').resize((1000, 700))
    plt.subplot(2, 2, slot)
    plt.imshow(picture)

plt.show()