# Deep Learning for Tuberculosis Detection and Classification: A YOLO-v8 + Convolutional Block Attention Module Approach

By: John Abuel



## Importing Libraries

In [1]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import shutil
from tqdm import tqdm
from collections import defaultdict, Counter
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by any imported library are hidden as well -- consider narrowing it.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" style and a wide (15, 6) default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 6)

In [2]:
class Config:
    """Paths and tunable parameters for turning the TBX11K data into a YOLO folder layout."""

    # Input Paths: raw image folder and the COCO-style JSON annotation file
    RAW_IMAGES = "../dataset/raw_data/TBX11K/imgs"
    ANNOTATION_FILE = "../dataset/raw_data/TBX11K/annotations/json/TBX11K_train.json"

    # Output Paths: images/<split> and labels/<split> folders under OUTPUT_BASE
    OUTPUT_BASE = "../dataset/yolo_ready"
    TRAIN_IMAGES = os.path.join(OUTPUT_BASE, "images/train")
    VAL_IMAGES = os.path.join(OUTPUT_BASE, "images/val")
    TEST_IMAGES = os.path.join(OUTPUT_BASE, "images/test")
    TRAIN_LABELS = os.path.join(OUTPUT_BASE, "labels/train")
    VAL_LABELS = os.path.join(OUTPUT_BASE, "labels/val")
    TEST_LABELS = os.path.join(OUTPUT_BASE, "labels/test")

    # Parameters
    MIN_BOX_AREA = 50       # Minimum bbox width*height kept by clean_dataset (filters tiny noise boxes)
    TRAIN_RATIO = 0.8       # Fraction of each group that split_dataset assigns to train
    VAL_RATIO = 0.2         # NOTE(review): split_dataset does not read this; val receives everything after the train cut
    TARGET_SIZE = (512, 512) # Size handed to cv2.resize before the images are written out

# Single shared settings object passed to the pipeline functions.
config = Config()

## Loading Data

In [3]:
def load_data(json_path):
    """Read a COCO-style annotation file and group annotations by image.

    Args:
        json_path: Path to a JSON file with top-level 'images' and
            'annotations' lists.

    Returns:
        dict mapping image id -> the image record from the file, extended with
        an 'annotations' list. Every attached annotation also gets a
        'category_name' key derived from its 'category_id'. Annotations whose
        'image_id' matches no image record are left out.
    """
    print(f"Loading annotations from {json_path}...")
    with open(json_path, 'r') as handle:
        coco = json.load(handle)

    # Index the image records by id, each starting with no annotations.
    images = {}
    for record in coco['images']:
        images[record['id']] = record
        record['annotations'] = []

    for ann in coco['annotations']:
        owner = images.get(ann['image_id'])
        if owner is None:
            # Annotation points at an image that is not in the file.
            continue

        # Category ids: 1 = active TB, 2 = obsolete (latent) TB,
        # any other id is treated as uncertain.
        cat_id = ann['category_id']
        ann['category_name'] = (
            'ActiveTuberculosis' if cat_id == 1
            else 'ObsoletePulmonaryTuberculosis' if cat_id == 2
            else 'Uncertain'
        )
        owner['annotations'].append(ann)

    return images

## Data Cleaning

In [4]:
def clean_dataset(images_data, config):
    """Drop unusable images and filter out degenerate bounding boxes.

    An image is removed when its file is missing under ``config.RAW_IMAGES``,
    or when it had annotations but none of them survive the box filter. The
    second rule prevents a TB image from being silently relabelled as healthy
    just because all of its boxes were too small. Images that never had
    annotations are kept unchanged.

    A box survives when its width and height are both positive and
    ``width * height >= config.MIN_BOX_AREA``.

    Args:
        images_data: dict of image id -> image record with 'file_name' and
            'annotations' (each annotation has a 'bbox' of [x, y, w, h]).
        config: object exposing ``RAW_IMAGES`` and ``MIN_BOX_AREA``.

    Returns:
        New dict of image id -> shallow copy of the image record whose
        'annotations' list holds only the surviving boxes. ``images_data`` and
        its records are not modified, so the function can be re-run on the
        same raw data.
    """
    print("\n" + "="*80)
    print("DATA CLEANING")
    print("="*80)

    cleaned_data = {}
    removed_count = 0

    for img_id, img_info in images_data.items():
        src_path = os.path.join(config.RAW_IMAGES, img_info['file_name'])

        # 1. The image file must exist on disk.
        if not os.path.exists(src_path):
            removed_count += 1
            continue

        # 2. Keep only boxes with positive size that meet the area threshold.
        annotations = img_info['annotations']
        valid_annotations = [
            ann for ann in annotations
            if ann['bbox'][2] > 0
            and ann['bbox'][3] > 0
            and ann['bbox'][2] * ann['bbox'][3] >= config.MIN_BOX_AREA
        ]

        # 3. The image had boxes but lost all of them: drop it rather than
        #    letting it pass as a healthy (box-free) sample.
        if annotations and not valid_annotations:
            removed_count += 1
            continue

        # Store a copy so the caller's records keep their original boxes.
        cleaned_data[img_id] = {**img_info, 'annotations': valid_annotations}

    print(f"✓ Original dataset: {len(images_data)} images")
    print(f"✓ Cleaned dataset: {len(cleaned_data)} images")
    print(f"✓ Removed: {removed_count} images (missing files or invalid boxes)")

    return cleaned_data

## Split the dataset for training and validation

In [5]:
def split_dataset(images_data, config):
    """Split the images into train and val lists, stratified by TB presence.

    Images with at least one annotation and images without any are shuffled
    separately and each group is cut at ``config.TRAIN_RATIO``, so both splits
    keep roughly the same TB / non-TB proportion. Everything after the cut
    goes to validation; the 'test' list is always empty.

    The shuffle uses a private ``RandomState(42)``. It produces the same order
    as seeding NumPy's global generator with 42, but leaves the global
    generator untouched for the rest of the notebook.

    Args:
        images_data: dict of image id -> image record with an 'annotations' list.
        config: object exposing ``TRAIN_RATIO``.

    Returns:
        dict with keys 'train', 'val' and 'test', each a list of image records.
    """
    print("\n" + "="*80)
    print("DATASET SPLITTING")
    print("="*80)

    # Separate images with and without TB boxes.
    with_tb = []
    without_tb = []
    for img_info in images_data.values():
        if img_info['annotations']:
            with_tb.append(img_info)
        else:
            without_tb.append(img_info)

    # Local, fixed-seed generator: reproducible without a global side effect.
    rng = np.random.RandomState(42)
    rng.shuffle(with_tb)
    rng.shuffle(without_tb)

    def get_split_indices(data_list):
        """Cut a list at TRAIN_RATIO; the tail becomes validation."""
        train_end = int(len(data_list) * config.TRAIN_RATIO)
        return data_list[:train_end], data_list[train_end:]

    train_tb, val_tb = get_split_indices(with_tb)
    train_no_tb, val_no_tb = get_split_indices(without_tb)

    splits = {
        'train': train_tb + train_no_tb,
        'val': val_tb + val_no_tb,
        'test': []  # Optional, can add back if needed
    }

    print(f"✓ Train: {len(splits['train'])} images")
    print(f"✓ Val:   {len(splits['val'])} images")

    return splits

## Data Conversion

In [6]:
def get_class_id(category_name):
    """Translate a category name into its YOLO class index.

    Matching is a case-insensitive substring test, tried in this order:
    'active' -> 0, then 'obsolete' or 'latent' -> 1. A name containing none
    of these keywords returns -1, which callers treat as "skip".
    """
    lowered = category_name.lower()
    # Order matters: 'active' wins when several keywords are present.
    keyword_to_class = (('active', 0), ('obsolete', 1), ('latent', 1))
    for keyword, class_id in keyword_to_class:
        if keyword in lowered:
            return class_id
    return -1

def convert_to_yolo_format(bbox, img_w, img_h):
    """Convert a COCO box to a normalized YOLO box, clipped to the image.

    Args:
        bbox: COCO box ``[x, y, w, h]`` (top-left corner plus size), in the
            same units as ``img_w`` / ``img_h``.
        img_w: Width of the image the box was annotated on.
        img_h: Height of the image the box was annotated on.

    Returns:
        ``[cx, cy, w, h]`` with every value in [0, 1]: box centre and size as
        fractions of the image dimensions.

    The box corners are clipped to the image before the centre and size are
    computed. Clamping the centre and size independently, as a naive version
    would, gives a box that overhangs the image edge a wrong centre and
    leaves it extending outside the image. A box lying entirely outside the
    image collapses to zero width and/or height.
    """
    x, y, w, h = bbox

    # Clip the corners to the image rectangle.
    x1 = max(0, x)
    y1 = max(0, y)
    x2 = min(img_w, x + w)
    y2 = min(img_h, y + h)

    # Size after clipping; never negative, even for boxes fully outside.
    clipped_w = max(0, x2 - x1)
    clipped_h = max(0, y2 - y1)

    cx = (x1 + clipped_w / 2) / img_w
    cy = (y1 + clipped_h / 2) / img_h
    nw = clipped_w / img_w
    nh = clipped_h / img_h

    # Final clamp guards against boxes entirely outside the image.
    return [max(0, min(1, val)) for val in [cx, cy, nw, nh]]

def process_and_save_split(split_data, split_name, config):
    """Write the resized images and YOLO label files for one dataset split.

    For every image record the source file is read from ``config.RAW_IMAGES``,
    stretched to ``config.TARGET_SIZE`` and saved under its flattened file
    name (``'tb/tb001.png'`` -> ``'tb001.png'``). A ``.txt`` label file with
    one ``"<class> <cx> <cy> <w> <h>"`` line per usable annotation is written
    next to it. Box coordinates are normalized with the ORIGINAL image
    dimensions, so they stay valid after the resize.

    Images are skipped, and counted in the printed summary, when:
      * the source file cannot be decoded;
      * the image has annotations but none maps to a YOLO class (for example
        only "Uncertain" boxes). Writing it with an empty label file would
        present a TB image to the trainer as healthy background;
      * the resized image cannot be written.

    Args:
        split_data: list of image records ('file_name', 'annotations').
        split_name: 'train', 'val' or 'test'.
        config: object exposing RAW_IMAGES, TARGET_SIZE and the
            ``<SPLIT>_IMAGES`` / ``<SPLIT>_LABELS`` output folders.

    Raises:
        ValueError: if ``split_name`` is not one of the known splits.
            Previously any unknown name, including 'test', was silently
            written into the val folders.
    """
    print(f"\nProcessing {split_name} split...")

    # Map split name to its output folders.
    if split_name == 'train':
        img_dir, lbl_dir = config.TRAIN_IMAGES, config.TRAIN_LABELS
    elif split_name == 'val':
        img_dir, lbl_dir = config.VAL_IMAGES, config.VAL_LABELS
    elif split_name == 'test':
        img_dir, lbl_dir = config.TEST_IMAGES, config.TEST_LABELS
    else:
        raise ValueError(
            f"Unknown split name {split_name!r}; expected 'train', 'val' or 'test'"
        )

    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)

    unreadable = 0      # source image missing or not decodable
    unlabeled = 0       # had annotations, but none maps to a YOLO class
    write_failed = 0    # cv2.imwrite reported failure

    for img_info in tqdm(split_data):
        src_path = os.path.join(config.RAW_IMAGES, img_info['file_name'])

        # Flatten 'tb/tb001.png' -> 'tb001.png' to avoid subdirectory errors.
        flat_name = os.path.basename(img_info['file_name'])
        dst_img_path = os.path.join(img_dir, flat_name)

        img = cv2.imread(src_path)
        if img is None:
            unreadable += 1
            continue

        h_orig, w_orig = img.shape[:2]

        # Build the label lines first so the image can still be rejected
        # before anything is written to disk.
        label_lines = []
        for ann in img_info['annotations']:
            cls_id = get_class_id(ann['category_name'])
            if cls_id == -1:
                continue  # Skip uncertain TB
            # Convert coords using ORIGINAL dimensions.
            yolo_box = convert_to_yolo_format(ann['bbox'], w_orig, h_orig)
            label_lines.append(f"{cls_id} {' '.join(map(str, yolo_box))}\n")

        if img_info['annotations'] and not label_lines:
            unlabeled += 1
            continue

        # Resize now to save space/time during training.
        img_resized = cv2.resize(img, config.TARGET_SIZE)
        if not cv2.imwrite(dst_img_path, img_resized):
            # Do not leave a label file behind without its image.
            write_failed += 1
            continue

        txt_name = os.path.splitext(flat_name)[0] + ".txt"
        txt_path = os.path.join(lbl_dir, txt_name)
        with open(txt_path, 'w') as f:
            f.writelines(label_lines)

    if unreadable or unlabeled or write_failed:
        print(
            f"Skipped in {split_name}: {unreadable} unreadable, "
            f"{unlabeled} with no usable class label, "
            f"{write_failed} failed to write"
        )

In [7]:
# Run the full preparation pipeline: load -> clean -> split -> convert.
if __name__ == "__main__":
    # 1. Load the annotation JSON into a dict of image records
    raw_data = load_data(config.ANNOTATION_FILE)
    
    # 2. Clean: drop images with missing files or with no valid box left
    cleaned_data = clean_dataset(raw_data, config)
    
    # 3. Split into train / val lists
    splits = split_dataset(cleaned_data, config)
    
    # 4. Process: write resized images and YOLO label files for each split
    for split in ['train', 'val']:
        process_and_save_split(splits[split], split, config)
        
    # NOTE(review): the path in this message is hard-coded; the files are
    # written under config.OUTPUT_BASE.
    print("\n✓ SUCCESS! Dataset is ready at: dataset/yolo_ready")

Loading annotations from ../dataset/raw_data/TBX11K/annotations/json/TBX11K_train.json...

DATA CLEANING
✓ Original dataset: 6600 images
✓ Cleaned dataset: 6600 images
✓ Removed: 0 images (missing files or invalid boxes)

DATASET SPLITTING
✓ Train: 5279 images
✓ Val:   1321 images

Processing train split...


100%|██████████| 5279/5279 [01:14<00:00, 71.20it/s]



Processing val split...


100%|██████████| 1321/1321 [00:17<00:00, 74.91it/s]


✓ SUCCESS! Dataset is ready at: dataset/yolo_ready



