## Importing the 2 JSON files and paths

In [25]:
import pandas as pd
import json
import os
from PIL import Image
import sys
import bioclip
from tqdm.notebook import tqdm

# Load your prepared data
DATA_PATH = '../data/processed/final_data.json'
CLASS_NAMES_PATH = '../data/processed/class_names.json'

with open(DATA_PATH, 'r') as f:
    image_data = json.load(f)

with open(CLASS_NAMES_PATH, 'r') as f:
    class_names = json.load(f)

print(f"First few entries in the data:", image_data[:5])
print(f"Printing the class names:", class_names[:7])

print(f"Loaded {len(image_data)} image records.")
print(f"Loaded {len(class_names)} unique class names.")

First few entries in the data: [{'image_path': '../data/raw/Tick_Images-6_25_25/ZOE-0013-01-1.jpg', 'true_label': 'Ixodes', 'sample_id': 'ZOE-0013-01', 'view': 'dorsal', 'sex': 'Female', 'life_stage': 'Adult', 'attached': 'Yes'}, {'image_path': '../data/raw/Tick_Images-6_25_25/ZOE-0013-01-2.jpg', 'true_label': 'Ixodes', 'sample_id': 'ZOE-0013-01', 'view': 'ventral', 'sex': 'Female', 'life_stage': 'Adult', 'attached': 'Yes'}, {'image_path': '../data/raw/Tick_Images-6_25_25/ZOE-0013-02-1.jpg', 'true_label': 'Amblyomma\xa0americanum', 'sample_id': 'ZOE-0013-02', 'view': 'dorsal', 'sex': 'Male', 'life_stage': 'Adult', 'attached': 'No'}, {'image_path': '../data/raw/Tick_Images-6_25_25/ZOE-0013-02-2.jpg', 'true_label': 'Amblyomma\xa0americanum', 'sample_id': 'ZOE-0013-02', 'view': 'ventral', 'sex': 'Male', 'life_stage': 'Adult', 'attached': 'No'}, {'image_path': '../data/raw/Tick_Images-6_25_25/ZOE-0013-03-1.jpg', 'true_label': 'Ixodes', 'sample_id': 'ZOE-0013-03', 'view': 'dorsal', 'sex': '

## Setting up BioClip with the data

In [26]:
# Import the specific pybioclip classifier
from bioclip import CustomLabelsClassifier

# Initialize the classifier with your list of class names
# The library handles all the model loading and setup internally.
classifier = CustomLabelsClassifier(
    cls_ary=class_names)

print("BioCLIP classifier initialized successfully.")

BioCLIP classifier initialized successfully.


## Testing bioclip on the first image 

In [27]:
# Test BioClip on the first image in your tick dataset
first_item = image_data[0]
image_path = first_item["image_path"]
true_label = first_item["true_label"]

# Run prediction
predictions = classifier.predict(image_path)
for pred in predictions:
    print(f"{pred['classification']:30s}  |  Score: {round(pred['score'], 4)}")

# Show top prediction
top_prediction = predictions[0]["classification"]
score = predictions[0]["score"]

print("\nPredicted:", top_prediction)
print("Confidence:", round(score, 4))
print("True label:", true_label)
print("✅ Correct!" if top_prediction == true_label else "❌ Incorrect.")

100%|██████████| 1/1 [00:00<00:00,  1.13images/s]

Dermacentor variabilis          |  Score: 0.4554
Dermacentor variablis           |  Score: 0.2608
Ixodes                          |  Score: 0.1205
Ixodes scapularis               |  Score: 0.1189
Haemaphysalis longicornis       |  Score: 0.0315
Amblyomma maculatum             |  Score: 0.0125
Amblyomma americanum            |  Score: 0.0004

Predicted: Dermacentor variabilis
Confidence: 0.4554
True label: Ixodes
❌ Incorrect.





In [28]:
# Test BioClip on the first image in your tick dataset
first_item = image_data[0]
image_path = first_item["image_path"]
true_label = first_item["true_label"]

# Run prediction
predictions = classifier.predict(image_path)
for pred in predictions:
    print(f"{pred['classification']:30s}  |  Score: {round(pred['score'], 4)}")

# Show top prediction
top_prediction = predictions[0]["classification"]
score = predictions[0]["score"]

print("\nPredicted:", top_prediction)
print("Confidence:", round(score, 4))
print("True label:", true_label)
print("✅ Correct!" if top_prediction == true_label else "❌ Incorrect.")

100%|██████████| 1/1 [00:00<00:00,  1.95images/s]

Dermacentor variabilis          |  Score: 0.4554
Dermacentor variablis           |  Score: 0.2608
Ixodes                          |  Score: 0.1205
Ixodes scapularis               |  Score: 0.1189
Haemaphysalis longicornis       |  Score: 0.0315
Amblyomma maculatum             |  Score: 0.0125
Amblyomma americanum            |  Score: 0.0004

Predicted: Dermacentor variabilis
Confidence: 0.4554
True label: Ixodes
❌ Incorrect.





## Running all the images on BioClip, getting the dorsal, ventral and total accuracies.

In [29]:
from collections import defaultdict

# Group entries by sample_id
samples = defaultdict(dict)
for entry in image_data:
    samples[entry['sample_id']][entry['view']] = entry

dorsal_correct = 0
ventral_correct = 0
highest_conf_correct = 0
total = 0

# Per-class stats
dorsal_class_correct = defaultdict(int)
dorsal_class_total = defaultdict(int)
ventral_class_correct = defaultdict(int)
ventral_class_total = defaultdict(int)
highest_conf_class_correct = defaultdict(int)
highest_conf_class_total = defaultdict(int)

for sample_id, views in samples.items():
    dorsal = views.get('dorsal')
    ventral = views.get('ventral')
    if not dorsal or not ventral:
        continue  # skip incomplete pairs

    # Predict dorsal
    dorsal_preds = classifier.predict(dorsal['image_path'])
    dorsal_top = dorsal_preds[0]['classification']
    dorsal_score = dorsal_preds[0]['score']
    dorsal_true = dorsal['true_label']

    # Predict ventral
    ventral_preds = classifier.predict(ventral['image_path'])
    ventral_top = ventral_preds[0]['classification']
    ventral_score = ventral_preds[0]['score']
    ventral_true = ventral['true_label']

    # Dorsal accuracy
    dorsal_class_total[dorsal_true] += 1
    if dorsal_top == dorsal_true:
        dorsal_correct += 1
        dorsal_class_correct[dorsal_true] += 1

    # Ventral accuracy
    ventral_class_total[ventral_true] += 1
    if ventral_top == ventral_true:
        ventral_correct += 1
        ventral_class_correct[ventral_true] += 1

    # Highest confidence
    if dorsal_score >= ventral_score:
        best_pred = dorsal_top
        best_true = dorsal_true
    else:
        best_pred = ventral_top
        best_true = ventral_true
    highest_conf_class_total[best_true] += 1
    if best_pred == best_true:
        highest_conf_correct += 1
        highest_conf_class_correct[best_true] += 1

    total += 1

print(f"Dorsal accuracy: {dorsal_correct/total:.3f}")
print(f"Ventral accuracy: {ventral_correct/total:.3f}")
print(f"Highest confidence accuracy: {highest_conf_correct/total:.3f}")
print(f"Total evaluated samples: {total}")

# Per-class accuracy
print("\nPer-class dorsal accuracy:")
for cls in sorted(dorsal_class_total):
    acc = dorsal_class_correct[cls] / dorsal_class_total[cls]
    print(f"{cls:30s} | {acc:.3f}")

print("\nPer-class ventral accuracy:")
for cls in sorted(ventral_class_total):
    acc = ventral_class_correct[cls] / ventral_class_total[cls]
    print(f"{cls:30s} | {acc:.3f}")

print("\nPer-class highest confidence accuracy:")
for cls in sorted(highest_conf_class_total):
    acc = highest_conf_class_correct[cls] / highest_conf_class_total[cls]
    print(f"{cls:30s} | {acc:.3f}")

100%|██████████| 1/1 [00:00<00:00,  1.71images/s]
100%|██████████| 1/1 [00:00<00:00,  1.95images/s]
100%|██████████| 1/1 [00:00<00:00,  1.84images/s]
100%|██████████| 1/1 [00:00<00:00,  1.85images/s]
100%|██████████| 1/1 [00:00<00:00,  2.10images/s]
100%|██████████| 1/1 [00:00<00:00,  2.04images/s]
100%|██████████| 1/1 [00:00<00:00,  1.98images/s]
100%|██████████| 1/1 [00:00<00:00,  1.95images/s]
100%|██████████| 1/1 [00:00<00:00,  1.99images/s]
100%|██████████| 1/1 [00:00<00:00,  2.01images/s]
100%|██████████| 1/1 [00:00<00:00,  1.94images/s]
100%|██████████| 1/1 [00:00<00:00,  2.03images/s]
100%|██████████| 1/1 [00:00<00:00,  1.90images/s]
100%|██████████| 1/1 [00:00<00:00,  1.87images/s]
100%|██████████| 1/1 [00:00<00:00,  1.91images/s]
100%|██████████| 1/1 [00:00<00:00,  1.82images/s]
100%|██████████| 1/1 [00:00<00:00,  1.95images/s]
100%|██████████| 1/1 [00:00<00:00,  1.98images/s]
100%|██████████| 1/1 [00:00<00:00,  2.00images/s]
100%|██████████| 1/1 [00:00<00:00,  1.84images/s]


Dorsal accuracy: 0.360
Ventral accuracy: 0.439
Highest confidence accuracy: 0.393
Total evaluated samples: 369

Per-class dorsal accuracy:
Amblyomma maculatum            | 1.000
Amblyomma americanum           | 0.205
Dermacentor variabilis         | 0.846
Dermacentor variablis          | 0.056
Haemaphysalis longicornis      | 0.857
Ixodes                         | 0.115
Ixodes scapularis              | 0.864

Per-class ventral accuracy:
Amblyomma maculatum            | 0.667
Amblyomma americanum           | 0.051
Dermacentor variabilis         | 0.423
Dermacentor variablis          | 0.430
Haemaphysalis longicornis      | 0.857
Ixodes                         | 0.082
Ixodes scapularis              | 0.830

Per-class highest confidence accuracy:
Amblyomma maculatum            | 1.000
Amblyomma americanum           | 0.205
Dermacentor variabilis         | 0.846
Dermacentor variablis          | 0.113
Haemaphysalis longicornis      | 1.000
Ixodes                         | 0.098
Ixodes scapu


