## Loading the Test Data

In [6]:
import sys

# Report the interpreter and module search path first, so they are still
# printed if the bioclip import below fails.
print(sys.executable, sys.path, sep="\n")

import bioclip

# Show which copy of the bioclip package this kernel actually loaded.
print(bioclip.__file__)
# bioclip.__version__ could be printed here as well, if the package defines it.


/Users/jayjivandas/Research_Imageomics/BioClip/env/bin/python
['/Users/jayjivandas/Research_Imageomics/BioClip/env/lib/python311.zip', '/Users/jayjivandas/Research_Imageomics/BioClip/env/lib/python3.11', '/Users/jayjivandas/Research_Imageomics/BioClip/env/lib/python3.11/lib-dynload', '', '/Users/jayjivandas/Research_Imageomics/BioClip/env/lib/python3.11/site-packages', '/Users/jayjivandas/Research_Imageomics/BioClip/pybioclip/src', '/var/folders/tc/csl5ykwn1v36vwqf20swsn880000gn/T/tmpx8v67igt']
/Users/jayjivandas/Research_Imageomics/BioClip/pybioclip/src/bioclip/__init__.py


In [7]:
import json

# Read the test-split records from disk. The encoding is passed explicitly
# because open() otherwise falls back to the platform's locale encoding,
# which is not guaranteed to be UTF-8.
with open("test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Quick sanity check: record count plus one sample record.
print("Loaded", len(test_data), "test examples")
print("Example:", test_data[20])

Loaded 5794 test examples
Example: {'image_id': 39, 'img_path': 'CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0001_796111.jpg', 'label_id': 1, 'label_name': '001.Black_footed_Albatross'}


## Setting Up the Class Names and Removing Their Numeric ID Prefixes for BioCLIP

In [8]:
# Build the label list for BioCLIP from classes.txt, keeping only the part of
# each entry that follows the first "." (e.g. "001.Name" -> "Name") so the
# labels are passed as plain names rather than numbered ones.
class_id_to_name = {}  # 1-based position in the file -> class name
class_names = []       # class names in file order

with open("CUB_200_2011/classes.txt", "r", encoding="utf-8") as f:
    for line in f:
        full = line.strip()
        if not full:
            # Tolerate blank or trailing empty lines instead of failing on unpack.
            continue
        # Split on the first dot only, matching how label_name is parsed in
        # the prediction cells; a name containing another "." no longer
        # raises "too many values to unpack".
        class_number, class_name = full.split(".", 1)
        class_id = len(class_names) + 1  # class IDs start at 1
        class_id_to_name[class_id] = class_name
        class_names.append(class_name)

print("Loaded", len(class_names), "class names")
print("First 5:", class_names[:5])


Loaded 200 class names
First 5: ['Black_footed_Albatross', 'Laysan_Albatross', 'Sooty_Albatross', 'Groove_billed_Ani', 'Crested_Auklet']


## Setting Up BioCLIP with the Custom Labels from the CUB Dataset

In [9]:
from bioclip import CustomLabelsClassifier

# Set up BioCLIP with the custom labels collected in class_names.
classifier = CustomLabelsClassifier(cls_ary=class_names)

open_clip_model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

open_clip_config.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

## Testing BioCLIP's Output on the First Test Image

In [10]:
# Smoke test on a single record: the first entry of the test set.
first_item = test_data[0]
image_path = first_item["img_path"]
# label_name has the form "001.Black_footed_Albatross"; keep everything that
# follows the first dot.
true_label = first_item["label_name"].split(".", 1)[1]

# Classify the image and list every returned label with its score.
predictions = classifier.predict(image_path)
for pred in predictions:
    print(f"{pred['classification']:30s}  |  Score: {round(pred['score'], 4)}")

# Entry 0 is treated as the top prediction (assumes predict() returns results
# ordered by descending score -- confirm against the bioclip docs).
best = predictions[0]
top_prediction = best["classification"]
score = best["score"]

print("Predicted:", top_prediction)
print("Confidence:", round(score, 4))
print("True label:", true_label)
if top_prediction == true_label:
    print("✅ Correct!")
else:
    print("❌ Incorrect.")


100%|██████████| 1/1 [00:00<00:00,  1.10images/s]

Black_footed_Albatross          |  Score: 0.9541
Laysan_Albatross                |  Score: 0.0371
Sooty_Albatross                 |  Score: 0.0088
Frigatebird                     |  Score: 0.0
Northern_Fulmar                 |  Score: 0.0
Rhinoceros_Auklet               |  Score: 0.0
Pomarine_Jaeger                 |  Score: 0.0
Brown_Pelican                   |  Score: 0.0
Horned_Puffin                   |  Score: 0.0
Crested_Auklet                  |  Score: 0.0
Heermann_Gull                   |  Score: 0.0
Ivory_Gull                      |  Score: 0.0
Brandt_Cormorant                |  Score: 0.0
White_Pelican                   |  Score: 0.0
Pacific_Loon                    |  Score: 0.0
Western_Gull                    |  Score: 0.0
Parakeet_Auklet                 |  Score: 0.0
Slaty_backed_Gull               |  Score: 0.0
Elegant_Tern                    |  Score: 0.0
Red_faced_Cormorant             |  Score: 0.0
Red_legged_Kittiwake            |  Score: 0.0
Least_Auklet             




## Running BioCLIP on the Entire Test Set and Computing Accuracy

In [11]:
from collections import defaultdict

# Top-1 evaluation over the whole test set. Alongside the overall counters,
# per-class hit and sample counts are tallied for the per-class breakdown.
correct = 0
total = 0
class_correct = defaultdict(int)  # class name -> number of correct top-1 predictions
class_total = defaultdict(int)    # class name -> number of test images seen

for image in test_data:
    image_path = image["img_path"]
    # label_name has the form "001.Name"; keep everything after the first dot.
    true_label = image["label_name"].split(".", 1)[1]

    # Run BioCLIP prediction
    predictions = classifier.predict(image_path)

    # Entry 0 is treated as the top prediction (assumes predict() returns
    # results ordered by descending score -- confirm against the bioclip docs).
    top_prediction = predictions[0]["classification"]

    # Track overall and per-class stats
    total += 1
    class_total[true_label] += 1
    if top_prediction == true_label:
        correct += 1
        class_correct[true_label] += 1

# Overall accuracy. An empty test set reports NaN instead of raising
# ZeroDivisionError.
overall_acc = correct / total if total else float("nan")
print(f"\n Overall Accuracy: {overall_acc:.4f}")



100%|██████████| 1/1 [00:00<00:00,  3.54images/s]
100%|██████████| 1/1 [00:00<00:00,  3.32images/s]
100%|██████████| 1/1 [00:00<00:00,  3.56images/s]
100%|██████████| 1/1 [00:00<00:00,  3.45images/s]
100%|██████████| 1/1 [00:00<00:00,  3.36images/s]
100%|██████████| 1/1 [00:00<00:00,  3.45images/s]
100%|██████████| 1/1 [00:00<00:00,  3.15images/s]
100%|██████████| 1/1 [00:00<00:00,  2.67images/s]
100%|██████████| 1/1 [00:00<00:00,  2.50images/s]
100%|██████████| 1/1 [00:00<00:00,  3.14images/s]
100%|██████████| 1/1 [00:00<00:00,  3.10images/s]
100%|██████████| 1/1 [00:00<00:00,  3.39images/s]
100%|██████████| 1/1 [00:00<00:00,  3.54images/s]
100%|██████████| 1/1 [00:00<00:00,  3.52images/s]
100%|██████████| 1/1 [00:00<00:00,  3.05images/s]
100%|██████████| 1/1 [00:00<00:00,  3.44images/s]
100%|██████████| 1/1 [00:00<00:00,  3.59images/s]
100%|██████████| 1/1 [00:00<00:00,  3.62images/s]
100%|██████████| 1/1 [00:00<00:00,  3.55images/s]
100%|██████████| 1/1 [00:00<00:00,  3.59images/s]



 Overall Accuracy: 0.9001





## Computing Per-Class Accuracy from the Results

In [12]:
# Per-class accuracy: correct top-1 predictions divided by the number of test
# images seen for that class, as (class name, accuracy) pairs.
per_class_accuracy = [
    (class_name, class_correct[class_name] / class_total[class_name])
    for class_name in class_total
]

# Order from highest to lowest accuracy; ties keep their original relative order.
per_class_accuracy = sorted(per_class_accuracy, key=lambda pair: pair[1], reverse=True)

print("\n Top classes by accuracy:")
for class_name, acc in per_class_accuracy:
    print(f"{class_name:30s}  |  Accuracy: {acc:.3f}")

# Write the ranked table to CSV without the DataFrame index column.
import pandas as pd
df = pd.DataFrame(per_class_accuracy, columns=["Class_Name", "Accuracy"])
df.to_csv("per_class_accuracy_BC_2.csv", index=False)
print("\n Saved per_class_accuracy_BC_2.csv")


 Top classes by accuracy:
Black_footed_Albatross          |  Accuracy: 1.000
Groove_billed_Ani               |  Accuracy: 1.000
Crested_Auklet                  |  Accuracy: 1.000
Red_winged_Blackbird            |  Accuracy: 1.000
Yellow_headed_Blackbird         |  Accuracy: 1.000
Bobolink                        |  Accuracy: 1.000
Indigo_Bunting                  |  Accuracy: 1.000
Painted_Bunting                 |  Accuracy: 1.000
Cardinal                        |  Accuracy: 1.000
Spotted_Catbird                 |  Accuracy: 1.000
Gray_Catbird                    |  Accuracy: 1.000
Yellow_breasted_Chat            |  Accuracy: 1.000
Eastern_Towhee                  |  Accuracy: 1.000
Brown_Creeper                   |  Accuracy: 1.000
Yellow_billed_Cuckoo            |  Accuracy: 1.000
Gray_crowned_Rosy_Finch         |  Accuracy: 1.000
Purple_Finch                    |  Accuracy: 1.000
Northern_Flicker                |  Accuracy: 1.000
Scissor_tailed_Flycatcher       |  Accuracy: 1.000
Verm