## Setting up imports and paths

In [1]:
import pandas as pd
import os
import json

# --- Configuration ---
# Define the paths to your data
METADATA_PATH = '../data/raw/annotations.csv'
IMAGE_DIR = '../data/raw/Tick_Images-6_25_25'
OUTPUT_JSON_PATH = '../data/processed/final_data.json'
CLASS_NAMES_PATH = '../data/processed/class_names.json'

## Loading in the csv

In [2]:
# 1. Load the data from your CSV into a DataFrame.
df = pd.read_csv(METADATA_PATH)
df.fillna('', inplace=True)
# removing duplicates based on 'Sample ID'
df.drop_duplicates(subset=['Sample ID'], keep='first', inplace=True)

# 2. Get All Available Image Filenames
# Get a set of all available image filenames for fast lookups.
# We convert all filenames to uppercase to handle potential inconsistencies (e.g., .jpg vs .JPG).
all_image_files = {f.upper(): f for f in os.listdir(IMAGE_DIR)}

print(f"Loaded {len(df)} records from the metadata file.")
print(f"Found {len(all_image_files)} image files to search through.")

Loaded 374 records from the metadata file.
Found 1056 image files to search through.


  df.fillna('', inplace=True)


## Determining the final list of valid ids
### cross referencing the csv specimens to that of the images 

In [3]:
# --- 3. Determine the Final List of Valid IDs ---
csv_ids = set(df['Sample ID'].astype(str)) # only unique IDs from the CSV
image_base_ids = {f.upper().rsplit('-', 1)[0] for f in all_image_files}
valid_ids = csv_ids.intersection(image_base_ids)

In [4]:
# --- 4. Filter DataFrame to Final Valid Set ---
final_df = df[df['Sample ID'].isin(valid_ids)].copy()
print(f"Validated. Proceeding to create JSON for {len(final_df)} specimens.")

Validated. Proceeding to create JSON for 370 specimens.


## Building the JSON structure

In [5]:
# --- 5. Build the JSON Structure ---
final_data_list = []
class_names_set = set()
missing_ids = []

for index, row in final_df.iterrows():
    base_id = row['Sample ID'].upper()
    dorsal_path, ventral_path = None, None

    dorsal_patterns = [f"{base_id}-01.JPG", f"{base_id}-1.JPG"]
    ventral_patterns = [f"{base_id}-02.JPG", f"{base_id}-2.JPG"]

    for pattern in dorsal_patterns:
        if pattern in all_image_files:
            original_filename = all_image_files[pattern]
            dorsal_path = os.path.join(IMAGE_DIR, original_filename)
            break
    for pattern in ventral_patterns:
        if pattern in all_image_files:
            original_filename = all_image_files[pattern]
            ventral_path = os.path.join(IMAGE_DIR, original_filename)
            break

    if dorsal_path and ventral_path:
        final_data_list.append({
            'image_path': dorsal_path,
            'true_label': row['Species of Tick'],
            'sample_id': row['Sample ID'],
            'view': 'dorsal',
            'sex': row.get('Tick Sex1'),
            'life_stage': row.get('Life Stage'),
            'attached': row.get('Attached?')
        })
        final_data_list.append({
            'image_path': ventral_path,
            'true_label': row['Species of Tick'],
            'sample_id': row['Sample ID'],
            'view': 'ventral',
            'sex': row.get('Tick Sex1'),
            'life_stage': row.get('Life Stage'),
            'attached': row.get('Attached?')
        })
        class_names_set.add(row['Species of Tick'])
    else:
        missing_ids.append(row['Sample ID'])

print(f"Created {len(final_data_list)} total entries for the JSON file.")
if missing_ids:
    print(f"Missing images for {len(missing_ids)} specimens: {missing_ids}")


Created 738 total entries for the JSON file.
Missing images for 1 specimens: ['246-01']


In [6]:
# --- 6. Save the JSON File ---
output_file = os.path.join(os.path.dirname(OUTPUT_JSON_PATH), "final_data.json")
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w') as f:
    json.dump(final_data_list, f, indent=4)

print(f"Success! Data manifest saved to: {output_file}")

# Convert the class names set to a sorted list for consistent order
final_class_names = sorted(list(class_names_set))
with open(CLASS_NAMES_PATH, 'w') as f:
    json.dump(final_class_names, f, indent=4)

print(f"Success! Found {len(final_class_names)} unique class names and saved them to: {CLASS_NAMES_PATH}")

Success! Data manifest saved to: ../data/processed/final_data.json
Success! Found 6 unique class names and saved them to: ../data/processed/class_names.json


## Getting number of Specimens in each resource. Checks for duplicates and missing datapoints.


In [7]:
# --- Step 1: Get all Sample IDs from the CSV ---
# This is our "source of truth".
df = pd.read_csv(METADATA_PATH)
csv_ids = set(df['Sample ID'].astype(str))
print(f"Found {len(csv_ids)} unique Sample IDs in the CSV file.")

# --- Step 2: Get all base IDs from the image filenames ---
# This is what's actually in our image folder.
image_base_ids = set()
for filename in os.listdir(IMAGE_DIR):
    # We split "ZOE-0013-09-01.JPG" and take the first 3 parts
    parts = filename.split('-')
    if len(parts) >= 3:
        # Rejoin to form the base ID, e.g., "ZOE-0013-09"
        base_id = '-'.join(parts[:-1]) # Takes all parts except the last one
        image_base_ids.add(base_id)

print(f"Found {len(image_base_ids)} unique base IDs from the image filenames.")

# --- Step 3: Find the Mismatches ---
# This is the most important part. We find what's in one list but not the other.

missing_from_images = csv_ids.difference(image_base_ids)
missing_from_csv = image_base_ids.difference(csv_ids)

print("\n--- DIAGNOSTIC RESULTS ---\n")

if not missing_from_images:
    print("✅ All Sample IDs from the CSV have a corresponding image base ID.")
else:
    print(f"🚨 Found {len(missing_from_images)} IDs in the CSV that have NO matching image base ID:")
    # Print the first 10 examples so we can inspect them
    print(list(missing_from_images)[:10])

print("-" * 30)

if not missing_from_csv:
    print("✅ All image base IDs have a corresponding entry in the CSV.")
else:
    print(f"🚨 Found {len(missing_from_csv)} image base IDs that are NOT in the CSV:")
    # Print the first 10 examples
    print(list(missing_from_csv)[:])

Found 374 unique Sample IDs in the CSV file.
Found 530 unique base IDs from the image filenames.

--- DIAGNOSTIC RESULTS ---

🚨 Found 4 IDs in the CSV that have NO matching image base ID:
['ZOE-0070-01', 'ZOE-0091-01', 'ZOE-0082-07', 'ZOE-0086-01']
------------------------------
🚨 Found 160 image base IDs that are NOT in the CSV:
['46-05', '41-02', '70-01', '43-04', '53-01', '43-09', '30-02', '53-03', '47-03', '37-03', '38-01', '55-04', '38-03', '67-02', '69-02', '286-00', '54-03', 'ZOE-0023-01', '42-01', '54-09', '36-01', '50-01', '29-01', '56-06', '43-16', '55-09', '31-02', '66-08', '43-06', '66-05', '41-05', '66-03', '53-04', '46-03', '66-02', '56-02', '37-07', '46-01', '43-07', '45-04', '59-02', '61-02', '28-01', '366-1', '55-06', '52-02', '34-02', '33-01', '55-10', '45-03', '362-01', '35-02', '54-02', '32-01', '372-01', '61-03', 'ZOE-0086-02', '53-05', '49-02', '31-03', '55-01', '55-05', '37-06', '365-02', '56-03', '50-02', '66-04', '47-02', '54-11', '47-01', '43-02', '51-01', '56

## Checking how many Specimens have only 1 image

In [8]:
# We'll reuse the 'image_base_ids' logic from our diagnostic script,
# but this time we'll count the occurrences.
from collections import Counter

image_id_counts = []
for filename in os.listdir(IMAGE_DIR):
    parts = filename.split('-')
    if len(parts) >= 3:
        base_id = '-'.join(parts[:-1])
        image_id_counts.append(base_id)

# Count how many times each base_id appears
id_counts = Counter(image_id_counts)

# Now, find the IDs that don't have exactly 2 images
specimens_with_one_image = {id: count for id, count in id_counts.items() if count == 1}
specimens_with_more_than_two = {id: count for id, count in id_counts.items() if count > 2}


print("--- IMAGE COUNT ANALYSIS ---")
print(f"Total unique base IDs found: {len(id_counts)}")

if specimens_with_one_image:
    print(f"\n Found {len(specimens_with_one_image)} specimens with only ONE image:")
    print(specimens_with_one_image)
else:
    print("\n All specimens appear to have at least two images.")

if specimens_with_more_than_two:
    print(f"\n Found {len(specimens_with_more_than_two)} specimens with MORE than two images:")
    print(specimens_with_more_than_two)

--- IMAGE COUNT ANALYSIS ---
Total unique base IDs found: 530

 Found 4 specimens with only ONE image:
{'286-00': 1, '366-1': 1, '246-01': 1, '33-01': 1}


## Matching the images with the available data and building the JSON

In [9]:
# final_data_list = []
# records_with_missing_images = []

# # Iterate through each row of the metadata DataFrame.
# for index, row in df.iterrows():
#     # Get the complete identifier directly from the 'Sample ID' column.
#     base_id = row['Sample ID']

#     # Construct the expected dorsal and ventral filenames (converted to uppercase for matching).
#     dorsal_filename = f"{base_id}-01.JPG" or f"{base_id}-1.JPG"
#     ventral_filename = f"{base_id}-02.JPG"

#     # Check if both uppercase filenames exist in our set of actual filenames.
#     if dorsal_filename in all_image_filenames and ventral_filename in all_image_filenames:
        
#         # Create the dictionary for the DORSAL image.
#         dorsal_entry = {
#             'image_path': os.path.join(IMAGE_DIR, dorsal_filename),
#             'true_label': row['Species of Tick'],
#             'sample_id': row['Sample ID'],
#             'view': 'dorsal',
#             'sex': row.get('Sex'),
#             'life_stage': row.get('Life Stage'),
#             'attached': row.get('Attached?')
#         }
#         final_data_list.append(dorsal_entry)

#         # Create the dictionary for the VENTRAL image.
#         ventral_entry = {
#             'image_path': os.path.join(IMAGE_DIR, ventral_filename),
#             'true_label': row['Species of Tick'],
#             'sample_id': row['Sample ID'],
#             'view': 'ventral',
#             'sex': row.get('Sex'),
#             'life_stage': row.get('Life Stage'),
#             'attached': row.get('Attached?')
#         }
#         final_data_list.append(ventral_entry)
#     else:
#         # If one or both images are missing, log the record for review.
#         records_with_missing_images.append(base_id)

# print(f"Successfully created {len(final_data_list)} data entries for the JSON file.")
# if records_with_missing_images:
#     print(f"\nWarning: Could not find image pairs for the following {len(records_with_missing_images)} records:")
#     print(records_with_missing_images)