# Deep Learning for Tuberculosis Detection and Classification: A YOLO-v8 + Convolutional Block Attention Module Approach

By: John Abuel



## Importing Libraries

In [8]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import shutil
from tqdm import tqdm
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a wide
# 15x6 default figure size for every figure created afterwards.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (15, 6)})

In [9]:
class Config:
    """Paths and preprocessing parameters shared by the conversion cells."""

    # 1. ANNOTATIONS
    # Official TBX11K train / val annotation files.
    TRAIN_JSON = "../dataset/raw_data/TBX11K/annotations/json/all_train.json"
    VAL_JSON = "../dataset/raw_data/TBX11K/annotations/json/all_val.json"

    # Optional: for final deployment only (combines train+val)
    # TRAINVAL_JSON = "../dataset/raw_data/TBX11K/annotations/json/all_trainval.json"

    # 2. IMAGE ROOT
    # Contains the 'tb', 'health', 'sick' AND 'extra' folders.
    RAW_IMAGES = "../dataset/raw_data/TBX11K/imgs"

    # 3. OUTPUT PATHS
    # Root under which images/<split> and labels/<split> are created.
    OUTPUT_BASE = "../dataset/yolo_official"

    # 4. PARAMETERS
    # Boxes whose width * height is below this value are discarded.
    MIN_BOX_AREA = 50
    # Size handed to cv2.resize for every exported image.
    TARGET_SIZE = (512, 512)


config = Config()

## Loading Data

In [10]:
def load_data(json_path):
    """Load a COCO-style annotation file and group its annotations per image.

    Args:
        json_path: Path to a JSON file holding an 'images' list and,
            optionally, an 'annotations' list.

    Returns:
        dict mapping image id -> the image record from the file, extended
        with an 'annotations' list. Every attached annotation also receives
        a 'category_name' key derived from its 'category_id'.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the file has no 'images' entry.
    """
    print(f"Loading annotations from {json_path}...")
    # Explicit encoding: do not depend on the platform's default code page.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # TBX11K IDs: 1=Active, 2=Latent; any other id is labelled 'Uncertain'.
    category_names = {
        1: 'ActiveTuberculosis',
        2: 'ObsoletePulmonaryTuberculosis',
    }

    images = {}
    for img in data['images']:
        img['annotations'] = []
        images[img['id']] = img

    # Tolerate files that carry no 'annotations' key at all (e.g. an
    # unlabeled split): every image then simply keeps an empty list.
    for ann in data.get('annotations', []):
        owner = images.get(ann['image_id'])
        if owner is None:
            # Annotation refers to an image that this file does not list.
            continue
        ann['category_name'] = category_names.get(ann['category_id'], 'Uncertain')
        owner['annotations'].append(ann)

    return images

## Data Cleaning

In [12]:
def clean_dataset(images_data, config):
    """Drop unusable images and undersized boxes from a loaded dataset.

    An image is removed when its file is not found under
    ``config.RAW_IMAGES``, or when it had annotations but none of them
    survive the box filter. A box survives when its width and height are
    both positive and width * height >= ``config.MIN_BOX_AREA``.

    The input is left untouched: each kept record is a shallow copy whose
    'annotations' entry is the filtered list.

    Args:
        images_data: dict mapping image id -> record with at least the keys
            'file_name' and 'annotations' (a list of dicts with a 'bbox'
            of the form [x, y, w, h]).
        config: object exposing ``RAW_IMAGES`` and ``MIN_BOX_AREA``.

    Returns:
        dict mapping image id -> cleaned record, for the kept images only.
    """
    print("\n" + "="*80)
    print("DATA CLEANING")
    print("="*80)

    cleaned_data = {}
    removed_count = 0

    for img_id, img_info in images_data.items():
        src_path = os.path.join(config.RAW_IMAGES, img_info['file_name'])

        # 1. The image file must exist on disk.
        if not os.path.exists(src_path):
            removed_count += 1
            continue

        # 2. Keep only boxes with positive sides that meet the area threshold.
        original_annotations = img_info['annotations']
        valid_annotations = [
            ann for ann in original_annotations
            if ann['bbox'][2] > 0
            and ann['bbox'][3] > 0
            and ann['bbox'][2] * ann['bbox'][3] >= config.MIN_BOX_AREA
        ]

        # 3. If the image HAD boxes (was TB) but every one was filtered out,
        #    drop it: a TB image must not silently turn into a "Healthy"
        #    one just because its boxes were small.
        if original_annotations and not valid_annotations:
            removed_count += 1
            continue

        # Healthy (0 boxes) or still has valid boxes: keep it. Store a copy
        # so the caller's record keeps its original, unfiltered annotations.
        cleaned_data[img_id] = {**img_info, 'annotations': valid_annotations}

    print(f"✓ Original dataset: {len(images_data)} images")
    print(f"✓ Cleaned dataset: {len(cleaned_data)} images")
    print(f"✓ Removed: {removed_count} images (missing files or invalid boxes)")

    return cleaned_data

## Data Conversion

In [13]:
def get_class_id(category_name):
    """Translate a category name into a YOLO class index.

    The name is stringified and lower-cased, then checked for keywords in
    order: 'active' -> 0; 'obsolete' or 'latent' -> 1. A name matching
    neither yields -1, the skip marker.
    """
    lowered = str(category_name).lower()
    keyword_table = (
        (('active',), 0),
        (('obsolete', 'latent'), 1),
    )
    for keywords, class_id in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return class_id
    return -1  # Skip

def convert_to_yolo_format(bbox, img_w, img_h):
    """Convert a COCO pixel box into a normalized YOLO box.

    Args:
        bbox: ``[x, y, w, h]`` in pixels, (x, y) being the corner the
            width and height extend from.
        img_w: Width in pixels of the image the box belongs to.
        img_h: Height in pixels of the image the box belongs to.

    Returns:
        ``[cx, cy, w, h]``: box centre and size as fractions of the image
        size. The box is clipped to the image, so every value lies in
        [0, 1] and the box never extends past the image border.
    """
    x, y, w, h = bbox

    # Clip the box EDGES to the image before normalizing. Clamping centre
    # and size independently does not clip a box that crosses the border:
    # the result keeps its full size around a shifted centre.
    left = min(max(x, 0), img_w)
    top = min(max(y, 0), img_h)
    right = min(max(x + w, 0), img_w)
    bottom = min(max(y + h, 0), img_h)

    # An inverted box (negative w or h) collapses to zero size.
    box_w = max(right - left, 0)
    box_h = max(bottom - top, 0)

    cx = (left + box_w / 2) / img_w
    cy = (top + box_h / 2) / img_h
    return [cx, cy, box_w / img_w, box_h / img_h]

def _yolo_label_lines(annotations, img_w, img_h, min_box_area):
    """Return the YOLO label lines for the annotations that pass the filters."""
    lines = []
    for ann in annotations:
        bbox = ann['bbox']
        box_w, box_h = bbox[2], bbox[3]

        # Filter: degenerate or tiny boxes (noise). Both sides must be
        # positive; two negative sides would otherwise pass the area test.
        if box_w <= 0 or box_h <= 0 or box_w * box_h < min_box_area:
            continue

        # Filter: classes that have no YOLO id.
        cls_id = get_class_id(ann['category_name'])
        if cls_id == -1:
            continue

        yolo_box = convert_to_yolo_format(bbox, img_w, img_h)
        lines.append(f"{cls_id} {' '.join(map(str, yolo_box))}\n")
    return lines


def process_json_file(json_path, split_name, config, drop_filtered_positives=True):
    """
    Convert one official JSON split into YOLO images and label files.

    Every image listed in the JSON that can be read from
    ``config.RAW_IMAGES`` is resized to ``config.TARGET_SIZE`` and written
    to ``<OUTPUT_BASE>/images/<split_name>``; its label file goes to
    ``<OUTPUT_BASE>/labels/<split_name>``. Box coordinates are normalized
    against the original image size, so resizing does not affect them.

    Args:
        json_path: Path of the COCO-style annotation file. If it does not
            exist the split is skipped with a message.
        split_name: Name of the split sub-folder, e.g. "train" or "val".
        config: Object exposing ``RAW_IMAGES``, ``OUTPUT_BASE``,
            ``MIN_BOX_AREA`` and ``TARGET_SIZE``.
        drop_filtered_positives: When True (default), an image that HAD
            annotations but lost all of them to the filters is skipped
            instead of being exported with an empty label file, which
            would present a TB image as a negative. Pass False to export
            such images with empty labels.

    Returns:
        None. Progress and a summary are printed.
    """
    if not os.path.exists(json_path):
        print(f"⚠️  Skipping {split_name}: File not found at {json_path}")
        return

    print(f"\n" + "="*60)
    print(f"PROCESSING SPLIT: {split_name.upper()}")
    print(f"Source: {os.path.basename(json_path)}")
    print("="*60)

    # 1. Setup Output Directories
    img_dir = os.path.join(config.OUTPUT_BASE, "images", split_name)
    lbl_dir = os.path.join(config.OUTPUT_BASE, "labels", split_name)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)

    # 2. Load Data
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Map Images
    images_lookup = {img['id']: img for img in data['images']}

    # Map Annotations (grouped per image id)
    fallback_names = {1: 'ActiveTuberculosis', 2: 'ObsoletePulmonaryTuberculosis'}
    anns_lookup = {}
    for ann in data['annotations']:
        img_id = ann['image_id']
        # Inject Category Name if missing
        if 'category_name' not in ann:
            ann['category_name'] = fallback_names.get(ann['category_id'], 'Unknown')
        anns_lookup.setdefault(img_id, []).append(ann)

    # 3. Process Images
    processed_count = 0
    missing_files = 0
    unreadable_files = 0
    dropped_positives = 0
    write_failures = 0
    renamed_files = 0
    used_stems = set()

    for img_id, img_info in tqdm(images_lookup.items(), desc=f"Converting {split_name}"):
        # Construct path (handles 'extra/Shenzhen/...' automatically)
        src_path = os.path.join(config.RAW_IMAGES, img_info['file_name'])

        # Validation: Check file existence
        if not os.path.exists(src_path):
            missing_files += 1
            continue

        img = cv2.imread(src_path)
        if img is None:
            # File exists but could not be decoded as an image.
            unreadable_files += 1
            continue
        h_orig, w_orig = img.shape[:2]

        # Build the labels BEFORE writing anything, so that a rejected
        # image leaves no files behind.
        annotations = anns_lookup.get(img_id, [])
        label_lines = _yolo_label_lines(annotations, w_orig, h_orig, config.MIN_BOX_AREA)
        if drop_filtered_positives and annotations and not label_lines:
            dropped_positives += 1
            continue

        # Flatten Filename for YOLO structure. Two source folders may hold
        # files with the same base name; give the later one a unique stem
        # rather than silently overwriting the earlier image and label.
        stem, ext = os.path.splitext(os.path.basename(img_info['file_name']))
        if stem in used_stems:
            stem = f"{stem}_{img_id}"
            renamed_files += 1
        used_stems.add(stem)
        dst_img_path = os.path.join(img_dir, stem + ext)
        dst_lbl_path = os.path.join(lbl_dir, stem + ".txt")

        # Resize & write the image; skip the label if the image write failed.
        img_resized = cv2.resize(img, config.TARGET_SIZE)
        if not cv2.imwrite(dst_img_path, img_resized):
            write_failures += 1
            continue

        # Write Labels (the file is left empty for images without boxes)
        with open(dst_lbl_path, 'w') as f:
            f.writelines(label_lines)

        processed_count += 1

    print(f"✓ Finished {split_name}: {processed_count} images processed.")
    if missing_files > 0:
        print(f"  Warning: {missing_files} images were missing from disk.")
    if unreadable_files > 0:
        print(f"  Warning: {unreadable_files} images could not be decoded.")
    if dropped_positives > 0:
        print(f"  Warning: {dropped_positives} annotated images were skipped because no box passed the filters.")
    if write_failures > 0:
        print(f"  Warning: {write_failures} images could not be written.")
    if renamed_files > 0:
        print(f"  Warning: {renamed_files} images were renamed to avoid file name collisions.")

In [14]:
if __name__ == "__main__":
    # Convert the two official labeled splits, train first and then val.
    # NOTE: 'all_test.json' is deliberately NOT processed because it has no
    # labels. YOLO cannot train on images without .txt files.
    official_splits = (
        ("train", config.TRAIN_JSON),
        ("val", config.VAL_JSON),
    )
    for split_name, json_path in official_splits:
        process_json_file(json_path, split_name, config)

    print("\n SUCCESS! Dataset is ready at:", config.OUTPUT_BASE)


PROCESSING SPLIT: TRAIN
Source: all_train.json


Converting train: 100%|██████████| 6888/6888 [01:31<00:00, 75.10it/s] 


✓ Finished train: 6888 images processed.

PROCESSING SPLIT: VAL
Source: all_val.json


Converting val: 100%|██████████| 2088/2088 [00:28<00:00, 73.83it/s] 

✓ Finished val: 2088 images processed.

 SUCCESS! Dataset is ready at: ../dataset/yolo_official



