In [None]:
# Upgrade pip itself before the dependency installs in the following cells.
# Written as the explicit %pip magic rather than relying on IPython automagic.
%pip install --upgrade pip



In [None]:
# Core dependencies: data handling (pandas, lxml), imaging (pillow, matplotlib),
# deep learning (torch, torchvision) and progress bars (tqdm).
%pip install pandas pillow lxml matplotlib torchvision torch tqdm



In [None]:
# Install the grad-cam package (explicit %pip magic instead of automagic).
%pip install grad-cam



In [None]:
# Quiet install of the same package set as the earlier install cell.
# %pip (not !pip) guarantees the packages land in the environment of the
# running kernel rather than whichever `pip` is first on the shell PATH.
%pip install pandas pillow lxml matplotlib tqdm torch torchvision --quiet

In [None]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch
import matplotlib.pyplot as plt

# ==== Set dataset paths ====
# Root of the TN5000 dataset on the mounted Google Drive; the sub-folders use
# Pascal-VOC-style names.
BASE_DIR = "/content/drive/MyDrive/FDL_Thyroid_Disease/Project/28455641/TN5000_forReview/TN5000_forReview"
ANN_DIR = os.path.join(BASE_DIR, "Annotations")          # XML annotation files
IMG_DIR = os.path.join(BASE_DIR, "JPEGImages")           # source images
SPLIT_DIR = os.path.join(BASE_DIR, "ImageSets", "Main")  # split id lists (*.txt)

# Check the folders BEFORE printing the success banner, so that a missing
# Drive mount fails with an explicit message instead of after "found".
for required_dir in (ANN_DIR, IMG_DIR, SPLIT_DIR):
    if not os.path.isdir(required_dir):
        raise FileNotFoundError(f"Dataset folder not found: {required_dir}")

# Count only entries with the expected extension; a plain listdir() length
# would also count stray files or checkpoint folders as "xml"/"jpg" files.
xml_count = sum(1 for entry in os.listdir(ANN_DIR) if entry.lower().endswith(".xml"))
jpg_count = sum(1 for entry in os.listdir(IMG_DIR) if entry.lower().endswith(".jpg"))

print("✅ Dataset folders found:")
print("Annotations:", xml_count, "xml files")
print("Images:", jpg_count, "jpg files")
print("Splits:", os.listdir(SPLIT_DIR))

✅ Dataset folders found:
Annotations: 5013 xml files
Images: 5000 jpg files
Splits: ['train.txt', 'val.txt', 'test.txt', 'trainval.txt']


# STEP 3: Parse XML Annotations → Create a Master CSV

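Each `.xml` file contains the bounding-box coordinates and the class label (0 = benign, 1 = malignant) of the annotated nodule(s) in one image.
This step reads all XML files and builds `tn5000_annotations.csv`, with one row per annotated bounding box.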

In [None]:
# parse_voc_xmls_to_csv.py
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd
from PIL import Image

# -------- CONFIG --------
ANNOTATIONS_DIR = "/content/drive/MyDrive/FDL_Thyroid_Disease/Project/28455641/TN5000_forReview/TN5000_forReview/Annotations"
IMAGES_DIR      = "/content/drive/MyDrive/FDL_Thyroid_Disease/Project/28455641/TN5000_forReview/TN5000_forReview/JPEGImages"
OUT_CSV         = "tn5000_annotations.csv"
VALIDATE_BBOXES = True   # set False to skip image-size checks
# ------------------------

# Column order shared by every parsed row and by the output CSV.
cols = ["image","label","img_width","img_height","img_depth","xmin","ymin","xmax","ymax","xml_file"]

# Errors raised while reading a numeric field: missing child element
# (AttributeError on None), missing text (TypeError), or text that is not a
# finite number (ValueError / OverflowError).
_FIELD_ERRORS = (AttributeError, TypeError, ValueError, OverflowError)


def _stripped_text(node):
    """Return ``node.text`` without surrounding whitespace, or None when the
    node itself or its text is missing (e.g. an empty ``<filename/>``)."""
    if node is None or node.text is None:
        return None
    return node.text.strip()


def _read_size(root):
    """Return ``(width, height, depth)`` from the ``<size>`` element.

    All three are None when ``<size>`` is absent or any field is malformed;
    ``depth`` alone is None when only ``<depth>`` is absent.
    """
    size_node = root.find("size")
    if size_node is None:
        return None, None, None
    try:
        width = int(size_node.find("width").text)
        height = int(size_node.find("height").text)
        depth_node = size_node.find("depth")
        depth = int(depth_node.text) if depth_node is not None else None
    except _FIELD_ERRORS:
        return None, None, None
    return width, height, depth


def _read_bbox(obj):
    """Return ``(xmin, ymin, xmax, ymax)`` as ints (fractions truncated), or
    four Nones when ``<bndbox>`` is absent or any coordinate is malformed."""
    bnd = obj.find("bndbox")
    if bnd is None:
        return None, None, None, None
    try:
        return tuple(int(float(bnd.find(tag).text)) for tag in ("xmin", "ymin", "xmax", "ymax"))
    except _FIELD_ERRORS:
        return None, None, None, None


def parse_voc_xml(xml_path, images_dir=None, validate_bboxes=None):
    """Parse one annotation XML file into a list of rows ordered like ``cols``.

    Parameters
    ----------
    xml_path : str
        Path of the XML annotation file.
    images_dir : str, optional
        Folder holding the images named by ``<filename>``; defaults to IMAGES_DIR.
    validate_bboxes : bool, optional
        When true, the real image size (if the image can be opened) replaces
        the XML size and boxes are clamped to the image; defaults to
        VALIDATE_BBOXES.

    Returns
    -------
    list[list]
        One row per ``<object>``; a single row with empty label/bbox when the
        file has no objects; an empty list when the file cannot be parsed or
        has no usable ``<filename>`` (a warning is printed in both cases).
    """
    if images_dir is None:
        images_dir = IMAGES_DIR
    if validate_bboxes is None:
        validate_bboxes = VALIDATE_BBOXES

    try:
        root = ET.parse(xml_path).getroot()
    except (ET.ParseError, OSError) as e:
        print(f"Warning: failed to parse {xml_path}: {e}")
        return []

    # filename as given in XML (e.g., 000001.jpg). An element without text
    # used to crash the whole run on ``None.strip()``; now it is skipped.
    filename = _stripped_text(root.find("filename"))
    if not filename:
        print(f"Warning: no <filename> in {xml_path}, skipping")
        return []

    width, height, depth = _read_size(root)

    # When validating, the size of the actual image wins over the XML value
    # (covers both a missing and an inconsistent <size>).
    if validate_bboxes:
        img_path = os.path.join(images_dir, filename)
        if os.path.exists(img_path):
            try:
                with Image.open(img_path) as im:
                    width, height = im.size
            except Exception as e:  # best effort: keep the XML size if the image is unreadable
                print(f"Warning: cannot open image {img_path}: {e}")

    xml_name = os.path.basename(xml_path)
    object_nodes = root.findall("object")
    if not object_nodes:
        # No object: keep the image in the table with an empty label and bbox.
        return [[filename, None, width, height, depth, None, None, None, None, xml_name]]

    parsed_rows = []
    for obj in object_nodes:
        # Labels are expected to be '0'/'1'; anything non-numeric is kept as a string.
        label_text = _stripped_text(obj.find("name"))
        label = None
        if label_text is not None:
            try:
                label = int(label_text)
            except ValueError:
                label = label_text

        xmin, ymin, xmax, ymax = _read_bbox(obj)

        # clamp boxes to valid pixel indices if we know width/height
        if validate_bboxes and width is not None and height is not None and xmin is not None:
            xmin = max(0, min(xmin, width - 1))
            xmax = max(0, min(xmax, width - 1))
            ymin = max(0, min(ymin, height - 1))
            ymax = max(0, min(ymax, height - 1))

        parsed_rows.append([filename, label, width, height, depth, xmin, ymin, xmax, ymax, xml_name])
    return parsed_rows


xml_files = sorted(glob.glob(os.path.join(ANNOTATIONS_DIR, "*.xml")))
print(f"Found {len(xml_files)} XML files in {ANNOTATIONS_DIR}")

# One row per annotated object across all XML files.
rows = []
for xml_path in xml_files:
    rows.extend(parse_voc_xml(xml_path))

# Build DataFrame
df = pd.DataFrame(rows, columns=cols)

# basic sanity checks and reporting
print("\nParsed dataframe shape:", df.shape)
print("Label value counts (first 20):")
print(df['label'].value_counts(dropna=False).head(20))
print("\nSome sample rows:")
print(df.head(8))

# save CSV
df.to_csv(OUT_CSV, index=False)
print(f"\nSaved parsed annotations to {OUT_CSV}")

In [None]:
# step4_add_splits.py
import os
import pandas as pd

# --- Config (adjust if your folders are different) ---
# Dataset root on the mounted Drive; images and split lists live beneath it.
BASE_DIR = "/content/drive/MyDrive/FDL_Thyroid_Disease/Project/28455641/TN5000_forReview/TN5000_forReview"
IMG_DIR = os.path.join(BASE_DIR, "JPEGImages")
SPLIT_DIR = os.path.join(BASE_DIR, "ImageSets", "Main")
# Input: CSV written by the XML-parsing cell; output: same rows plus a "split" column.
IN_CSV, OUT_CSV = "tn5000_annotations.csv", "tn5000_annotations_split.csv"
# -----------------------------------------------------

# 1) load annotations CSV
df = pd.read_csv(IN_CSV)
n_rows = len(df)
print(f"Loaded annotations: {n_rows} rows from {IN_CSV}")

# 2) helper to read id lists and append the image extension
def read_ids(path, ext=".jpg"):
    """Read a split file (one image id per line) and return the ids as filenames.

    Parameters
    ----------
    path : str
        Path of the split list, e.g. ``train.txt``.
    ext : str, optional
        Suffix appended to every id so it matches the ``image`` column of the
        annotations CSV. Defaults to ``".jpg"``.

    Returns
    -------
    set[str]
        The filenames listed in the file; blank lines are ignored and
        surrounding whitespace is stripped. An empty set (after printing a
        warning) when the file does not exist.
    """
    if not os.path.exists(path):
        print(f"Warning: split file not found: {path}")
        return set()
    with open(path, "r") as f:
        return {line.strip() + ext for line in f if line.strip()}

# Load the filename set of every split, in train / val / test order.
train_ids, val_ids, test_ids = (
    read_ids(os.path.join(SPLIT_DIR, f"{split_name}.txt"))
    for split_name in ("train", "val", "test")
)
n_train, n_val, n_test = len(train_ids), len(val_ids), len(test_ids)
print(f"Split sizes found — train: {n_train}, val: {n_val}, test: {n_test}")

# 3) map each image -> split
def map_split(img_name):
    """Return the split ("train", "val" or "test") whose id set contains
    ``img_name``, or "unknown" if none does.

    The sets are checked in train, val, test order, so the first match wins
    for a name that is listed in more than one split.
    """
    if img_name in train_ids:
        split_name = "train"
    elif img_name in val_ids:
        split_name = "val"
    elif img_name in test_ids:
        split_name = "test"
    else:
        split_name = "unknown"
    return split_name

# Tag every annotation row with the split of its image.
df["split"] = df["image"].apply(map_split)

# 4) sanity check: any unknown images?
is_unknown = df["split"].eq("unknown")
unknown_count = is_unknown.sum()
if unknown_count > 0:
    print(f"Note: {unknown_count} annotation rows have image not present in train/val/test splits (marked 'unknown').")
    # optional: list a few examples
    print(df.loc[is_unknown, "image"].drop_duplicates().head(10).tolist())

# 5) validate image files exist
missing_images = [
    name for name in df["image"].unique()
    if not os.path.exists(os.path.join(IMG_DIR, name))
]
if not missing_images:
    print("All referenced images exist in JPEGImages folder.")
else:
    print(f"Warning: {len(missing_images)} images referenced in CSV are missing in {IMG_DIR}. Example(s): {missing_images[:5]}")

# 6) print per-split counts (rows and unique images)
rows_per_split = df["split"].value_counts()
print("\nAnnotation counts by split (rows):")
print(rows_per_split)

images_per_split = df.groupby("split")["image"].nunique()
print("\nUnique image counts by split:")
print(images_per_split)

# 7) per-split class balance summary (only for splits that actually occur)
print("\nPer-split label distribution (label counts):")
present_splits = set(df["split"])
for s in ("train", "val", "test", "unknown"):
    if s not in present_splits:
        continue
    sub = df[df["split"] == s]
    label_counts = sub["label"].value_counts().to_dict()
    print(f" {s}: {label_counts}  (total rows: {len(sub)})")

# 8) save output CSV
df.to_csv(OUT_CSV, index=False)
print(f"\nSaved updated CSV with splits to: {OUT_CSV}")


Loaded annotations: 5013 rows from tn5000_annotations.csv
Split sizes found — train: 3500, val: 500, test: 1000
All referenced images exist in JPEGImages folder.

Annotation counts by split (rows):
split
train    3508
test     1004
val       501
Name: count, dtype: int64

Unique image counts by split:
split
test     1000
train    3500
val       500
Name: image, dtype: int64

Per-split label distribution (label counts):
 train: {1: 2473, 0: 1035}  (total rows: 3508)
 val: {1: 376, 0: 125}  (total rows: 501)
 test: {1: 733, 0: 271}  (total rows: 1004)

Saved updated CSV with splits to: tn5000_annotations_split.csv


In [None]:
# List the entries of /content whose listing line contains "tn5000"
# (with human-readable sizes) to confirm the CSV files were written.
!ls -lh /content | grep tn5000

-rw-r--r-- 1 root root 244K Oct 15 13:23 tn5000_annotations.csv
-rw-r--r-- 1 root root 271K Oct 15 13:44 tn5000_annotations_split.csv


In [None]:
# Copy the CSV with split labels from the Colab working directory to the
# Drive project folder, which is where the cropping cell reads it from.
!cp /content/tn5000_annotations_split.csv "/content/drive/MyDrive/FDL_Thyroid_Disease/Project/"

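# STEP 5: Crop Nodules and Save Patches

This step reads `tn5000_annotations_split.csv`, crops each bounding box from the original images, and saves the cropped nodule patches into one folder per split (`train`, `val`, `test`, `unknown`) under `TN5000_crops`. It also writes a manifest of all saved crops to `tn5000_crops_manifest.csv`.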

In [None]:
# Step 5: crop_nodules_and_save.py
import os, pandas as pd
from PIL import Image
from tqdm import tqdm

# ----------------- CONFIG -----------------
# Drive folder holding the CSVs and, further down, the original JPEGImages folder
DRIVE_PROJECT = "/content/drive/MyDrive/FDL_Thyroid_Disease/Project"
CSV_PATH      = os.path.join(DRIVE_PROJECT, "tn5000_annotations_split.csv")
IMG_DIR       = os.path.join(DRIVE_PROJECT, "28455641", "TN5000_forReview", "TN5000_forReview", "JPEGImages")
OUT_ROOT      = os.path.join(DRIVE_PROJECT, "TN5000_crops")   # output root (will create train/val/test)
# Optional: additionally write a fixed-size copy of every crop for quick experiments
SAVE_RESIZED = False
RESIZED_SIZE = (224,224)   # only used if SAVE_RESIZED=True

# If you've copied dataset to local /content for speed, use these instead:
# CSV_PATH = "/content/tn5000_annotations_split.csv"
# IMG_DIR  = "/content/TN5000_local/JPEGImages"
# OUT_ROOT = "/content/TN5000_crops"
# ------------------------------------------

# Collect every output sub-folder first (a "<split>_resized" sibling is added
# only when resized copies are requested), then create them in one pass.
out_subdirs = []
for s in ("train", "val", "test", "unknown"):
    out_subdirs.append(s)
    if SAVE_RESIZED:
        out_subdirs.append(f"{s}_resized")

os.makedirs(OUT_ROOT, exist_ok=True)
for subdir in out_subdirs:
    os.makedirs(os.path.join(OUT_ROOT, subdir), exist_ok=True)

# ----------------- LOAD CSV -----------------
# Annotation table with one row per bounding box, including the split column.
df = pd.read_csv(CSV_PATH)

images_per_split = df.groupby('split')['image'].nunique()
print("Total annotation rows:", len(df))
print("Unique images:", df['image'].nunique())
print("Split counts (images):")
print(images_per_split)

# ----------------- Group by image and crop -----------------
# One group per source image; crops get a 1-based index suffix so that several
# boxes on the same image produce distinct filenames.
grouped = df.groupby('image')

total_saved = 0
skipped_boxes = 0     # boxes left without area after clamping
missing_images = []   # images absent on disk or that failed while processing
crop_records = []     # one dict per saved crop, used to build the manifest
ensured_dirs = set()  # output folders already created during this run


def _resolve_crop_box(row, img_w, img_h):
    """Return the PIL crop box ``(left, upper, right, lower)`` for one
    annotation row, or None if the box has no area.

    A missing (NaN) coordinate falls back to the corresponding image edge, so
    a row without any box yields the whole image. PIL excludes the right/lower
    edge from the crop, therefore those two values may be as large as
    ``img_w`` / ``img_h``; capping them at ``size - 1`` would make the last
    column and row of the image unreachable.
    """
    xmin = 0 if pd.isna(row['xmin']) else int(row['xmin'])
    ymin = 0 if pd.isna(row['ymin']) else int(row['ymin'])
    xmax = img_w if pd.isna(row['xmax']) else int(row['xmax'])
    ymax = img_h if pd.isna(row['ymax']) else int(row['ymax'])
    # clamp: left/upper are pixel indices, right/lower are exclusive bounds
    xmin = max(0, min(xmin, img_w - 1))
    ymin = max(0, min(ymin, img_h - 1))
    xmax = max(0, min(xmax, img_w))
    ymax = max(0, min(ymax, img_h))
    if xmax <= xmin or ymax <= ymin:
        return None
    return xmin, ymin, xmax, ymax


def _ensure_dir(path):
    """Create ``path`` if this run has not done so yet and return it."""
    if path not in ensured_dirs:
        os.makedirs(path, exist_ok=True)
        ensured_dirs.add(path)
    return path


for img_name, sub in tqdm(grouped, desc="Processing images"):
    img_path = os.path.join(IMG_DIR, img_name)
    if not os.path.exists(img_path):
        missing_images.append(img_name)
        continue
    base, ext = os.path.splitext(img_name)
    try:
        with Image.open(img_path) as im:
            rgb = im.convert("RGB")
            img_w, img_h = rgb.size
            # iterate objects for this image (preserves the row order)
            for i, (_, row) in enumerate(sub.reset_index(drop=True).iterrows(), start=1):
                box = _resolve_crop_box(row, img_w, img_h)
                if box is None:
                    # count degenerate boxes so they show up in the report
                    skipped_boxes += 1
                    continue
                xmin, ymin, xmax, ymax = box
                crop = rgb.crop(box)
                split = row['split'] if 'split' in row and pd.notna(row['split']) else 'unknown'
                # build unique filename: originalname_idx.jpg (idx for multiple bboxes)
                out_name = f"{base}_{i}{ext}"
                # the folder is created on demand so that a split value outside
                # train/val/test/unknown does not make save() fail
                out_path = os.path.join(_ensure_dir(os.path.join(OUT_ROOT, split)), out_name)
                crop.save(out_path)
                total_saved += 1
                crop_records.append({
                    "orig_image": img_name,
                    "crop_name": out_name,
                    "split": split,
                    "label": row['label'],
                    "xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax,
                    "out_path": out_path
                })
                # optional resized save
                if SAVE_RESIZED:
                    resized = crop.resize(RESIZED_SIZE, Image.BILINEAR)
                    resized_dir = _ensure_dir(os.path.join(OUT_ROOT, f"{split}_resized"))
                    resized.save(os.path.join(resized_dir, out_name))
    except Exception as e:
        # best effort: report the image as failed and carry on with the rest
        print(f"Warning: failed to process {img_path}: {e}")
        missing_images.append(img_name)

if skipped_boxes:
    print(f"Note: skipped {skipped_boxes} bounding box(es) with no area after clamping.")

# ----------------- Summary -----------------
n_failed = len(missing_images)
print("\nDONE.")
print("Total crops saved:", total_saved)
print("Missing/failed images:", n_failed)
if n_failed > 0:
    print("Example missing images:", missing_images[:5])

# Write one manifest row per saved crop next to the other project CSVs
manifest_csv = os.path.join(DRIVE_PROJECT, "tn5000_crops_manifest.csv")
manifest_df = pd.DataFrame(crop_records)
manifest_df.to_csv(manifest_csv, index=False)
print("Saved crop manifest:", manifest_csv)


Total annotation rows: 5013
Unique images: 5000
Split counts (images):
split
test     1000
train    3500
val       500
Name: image, dtype: int64


Processing images: 100%|██████████| 5000/5000 [01:47<00:00, 46.41it/s]



DONE.
Total crops saved: 5013
Missing/failed images: 0
Saved crop manifest: /content/drive/MyDrive/FDL_Thyroid_Disease/Project/tn5000_crops_manifest.csv
