___

In [None]:
"""
Python 3, using Conda env: orthanc
@Author : Hasan Shaikh
@Email  : hasanshaikh3198@gmail.com
@GitHub : https://github.com/hash123shaikh
"""

---

### **Cell 1: Introduction**

In [None]:
"""
# --------------------------------------------------- DICOM RT Structure Name Analysis Tool ---------------------------------------------------

## Overview

This notebook performs a comprehensive analysis of RT structure nomenclature across 
the MAASTRO head & neck cancer dataset. It systematically extracts and catalogs structure 
names from RTSTRUCT DICOM files to establish naming conventions used in the dataset.

## Clinical Context

Radiation therapy structure sets (RTSTRUCT) contain contoured anatomical structures 
and treatment volumes. Naming conventions for these structures vary significantly 
across institutions, time periods, and clinical protocols. Common variations include:

- Gross Tumor Volume      :   GTV, GTV-1, GTV-Primary, gtv
- Clinical Target Volume  :   CTV, CTV-1, CTV1
- Planning Target Volume  :   PTV, PTV-1, PTV1

## Objectives

1. Systematically scan all patient directories in the dataset
2. Extract RT structure names from RTSTRUCT DICOM files
3. Generate frequency distributions for structure nomenclature
4. Provide data-driven recommendations for structure selection algorithms
5. Identify data quality issues and missing files

## Prerequisites

- Python 3.7+
- pydicom (DICOM file parsing)
- dcmrtstruct2nii (RT structure extraction)
- Dataset organized in standardized directory structure
"""

---

### **Cell 2: Import Required Libraries**

In [None]:
"""
Module Imports and Compatibility Configuration

This cell imports necessary Python libraries for file system operations,
DICOM file parsing, and RT structure extraction. A compatibility patch
ensures backward compatibility with both legacy and current pydicom versions.

Libraries:

- os                       : Operating system interface for path operations
- glob                     : Unix-style pathname pattern expansion
- pydicom                  : DICOM standard implementation for medical imaging
- dcmrtstruct2nii          : Specialized library for RT structure extraction
- collections.defaultdict  : Efficient counting and aggregation
"""

In [None]:
import os
import glob
import pydicom
from collections import defaultdict

# Backward compatibility patch for pydicom API changes.
# pydicom 1.0 introduced dcmread() as the preferred name for read_file(); the
# deprecated read_file() alias was removed entirely in pydicom 3.0. When the
# alias is missing, re-create it so code that still calls pydicom.read_file()
# keeps working.
if not hasattr(pydicom, "read_file"):
    pydicom.read_file = pydicom.dcmread

# NOTE(review): this import deliberately comes after the patch above -
# presumably dcmrtstruct2nii still calls pydicom.read_file(); confirm before
# moving the import up to the main import group.
from dcmrtstruct2nii import list_rt_structs

print("Status: All required libraries imported successfully")

---

### **Cell 3: Dataset Configuration**

In [None]:
"""
Dataset Path Configuration

Define the root directory containing the MAASTRO dataset. The expected 
directory structure follows standardized medical imaging organization:

Expected Structure:
  MAASTRO Dataset/
  └── Images/                  # base_dir points here
      ├── HN1-001/
      │   ├── RTSTRUCT/        # RT structure set DICOM files (*.dcm)
      │   └── CT/              # CT imaging series
      ├── HN1-002/
      └── ...

Configuration Notes:
- Relative paths are resolved from the notebook's working directory
- Absolute paths provide more robust file system references
- Update base_dir to match your local directory structure
"""

In [None]:
# Root folder that holds one sub-directory per patient - adjust to your setup
base_dir = "./MAASTRO Dataset/Images"

# Emit the configuration banner in a single call; sep="\n" puts every
# argument on its own line, exactly as separate print() calls would.
print(
    "=" * 70,
    "DATASET CONFIGURATION",
    "=" * 70,
    f"\nConfigured dataset path: {os.path.abspath(base_dir)}",
    f"Current working directory: {os.getcwd()}",
    "\nNote: Verify base_dir matches your local directory structure",
    sep="\n",
)

---

### **Cell 4: Dataset Path Validation**

In [None]:
"""
Dataset Accessibility Verification

Performs preliminary validation to ensure:
1. The specified dataset path exists on the file system
2. The directory contains accessible subdirectories (patient folders)
3. Basic dataset structure conforms to expected organization

This validation step prevents downstream errors by failing early with
actionable error messages if the dataset cannot be accessed.
"""

In [None]:
print("="*70)
print("DATASET PATH VALIDATION")
print("="*70 + "\n")

# Fail early, with an actionable checklist, when the dataset path is missing.
if not os.path.exists(base_dir):
    error_msg = f"""
    Dataset path not found: {os.path.abspath(base_dir)}
    
    Verification checklist:
    1. Confirm dataset directory name matches exactly (case-sensitive on Unix)
    2. Verify current working directory: {os.getcwd()}
    3. Consider using absolute path for unambiguous reference
    4. Check file system permissions allow read access
    """
    print(error_msg)
    raise FileNotFoundError(f"Dataset not accessible at: {base_dir}")

# os.path.exists() is also true for a regular file. Without this check the
# cell would report success and then fail inside os.listdir() below with an
# unexplained NotADirectoryError.
if not os.path.isdir(base_dir):
    raise NotADirectoryError(
        f"Dataset path is not a directory: {os.path.abspath(base_dir)}"
    )

print("Status: Dataset path validated successfully")

# Enumerate patient directories: every immediate sub-directory of base_dir is
# treated as one patient; plain files are ignored. Sorting makes the
# processing order (and therefore all later output) reproducible.
patient_folders = sorted([
    d for d in os.listdir(base_dir) 
    if os.path.isdir(os.path.join(base_dir, d))
])

print(f"Patient directories identified: {len(patient_folders)}")

if len(patient_folders) > 0:
    print(f"\nSample patient IDs (first 5): {patient_folders[:5]}")
    if len(patient_folders) > 5:
        print(f"Additional patients: {len(patient_folders) - 5}")
else:
    print("Warning: No patient subdirectories detected in dataset path")

---

### **Cell 5: RT Structure Extraction and Analysis**

In [None]:
"""
Primary Data Collection: RT Structure Name Extraction

This cell implements the core analysis algorithm:

Algorithm Overview:
1. Iterate through all patient directories in the dataset
2. Locate RTSTRUCT subdirectory (case-insensitive search)
3. Identify DICOM files within RTSTRUCT directory
4. Extract structure names using dcmrtstruct2nii library (one RTSTRUCT file is read per patient)
5. Aggregate structure frequency statistics
6. Classify processing outcomes (success/failure/missing data)

Data Structures:
- structure_count: Dictionary mapping structure names to occurrence frequency
- patients_with_structures: List of successfully processed (patient_id, structures) tuples
- patients_with_errors: List of (patient_id, error_message) for failed cases
- patients_without_rtstruct: List of patient IDs lacking RTSTRUCT directory
- patients_without_dcm: List of patient IDs whose RTSTRUCT directory contains no *.dcm files

Error Handling:
Exceptions during DICOM parsing are caught and logged, allowing the analysis
to continue despite individual file corruption or format issues.
"""

In [None]:
print("\n" + "="*70)
print("RT STRUCTURE EXTRACTION ANALYSIS")
print("="*70 + "\n")

# Initialize data collection structures
structure_count = defaultdict(int)   # structure name -> number of patients containing it
patients_with_structures = []        # (patient_id, structures) for every successful read
patients_with_errors = []            # (patient_id, full error message) for failed reads
patients_without_rtstruct = []       # patient IDs with no RTSTRUCT sub-directory
patients_without_dcm = []            # patient IDs whose RTSTRUCT folder has no *.dcm file

# Analysis configuration
total_patients = len(patient_folders)
print(f"Initiating analysis of {total_patients} patient records\n")
print("-"*100)

# Main processing loop
for i, patient_id in enumerate(patient_folders, 1):
    patient_path = os.path.join(base_dir, patient_id)
    
    # Step 1: Identify RTSTRUCT directory (case-insensitive)
    # Rationale: Different systems may use varying case conventions.
    # The dict maps the lower-cased name back to the real on-disk name.
    subdirs = [
        d for d in os.listdir(patient_path) 
        if os.path.isdir(os.path.join(patient_path, d))
    ]
    subdirs_lower = {d.lower(): d for d in subdirs}
    
    # Verify RTSTRUCT directory existence
    if "rtstruct" not in subdirs_lower:
        print(f"[{i:3d}] {patient_id}: RTSTRUCT directory not found")
        patients_without_rtstruct.append(patient_id)
        continue
    
    rtstruct_folder = os.path.join(patient_path, subdirs_lower["rtstruct"])
    
    # Step 2: Locate DICOM files
    # glob.glob() returns matches in arbitrary order; sorting makes the choice
    # of dcm_files[0] below reproducible when a folder holds several files.
    dcm_files = sorted(glob.glob(os.path.join(rtstruct_folder, "*.dcm")))
    
    if not dcm_files:
        print(f"[{i:3d}] {patient_id}: No DICOM files in RTSTRUCT directory")
        patients_without_dcm.append(patient_id)
        continue
    
    # Step 3: Extract structure names
    # Only the first (alphabetically) RTSTRUCT file of each patient is analyzed.
    try:
        # list_rt_structs() returns list of structure names from DICOM header
        structures = list_rt_structs(dcm_files[0])
        
        print(f"[{i:3d}] {patient_id}: {structures}")
        
        # Aggregate structure frequency. Each name is counted once per patient
        # so the tally stays a patient count (coverage can never exceed 100%)
        # even if an RTSTRUCT lists the same ROI name twice. dict.fromkeys()
        # de-duplicates while preserving the original order.
        for struct in dict.fromkeys(structures):
            structure_count[struct] += 1
        
        patients_with_structures.append((patient_id, structures))
        
    except Exception as e:
        # Deliberate best-effort: log the failure and keep going so one
        # unreadable file does not abort the whole scan.
        error_summary = str(e)[:80]  # Truncate for readability
        print(f"[{i:3d}] {patient_id}: Processing error - {error_summary}")
        patients_with_errors.append((patient_id, str(e)))

print("-"*100)
print(f"\nAnalysis complete: Processed {len(patients_with_structures)} patients successfully")


---

### **Cell 6: Statistical Summary and Data Quality Metrics**

In [None]:
"""
Descriptive Statistics and Data Quality Assessment

This cell generates comprehensive statistics on:
1. Overall processing success rates
2. Structure name frequency distributions
3. Data completeness metrics
4. Quality indicators

Metrics Interpretation:
- Success Rate         : Percentage of patients with successfully extracted structures
- Structure Frequency  : Number of patients containing each structure
- Coverage Percentage  : Proportion of successfully processed patients with each structure

A structure appearing in >80% of patients indicates high consistency and
reliability for use in automated processing pipelines. Structures with <20%
coverage may represent optional annotations or inconsistent naming.
"""

In [None]:
print("\n" + "="*70)
print("STATISTICAL SUMMARY")
print("="*70)

# Processing outcome metrics
# Guard the percentage: the validation cell only warns when no patient
# directories exist, so total_patients may legitimately be 0 here.
if total_patients:
    success_rate = len(patients_with_structures) / total_patients * 100
else:
    success_rate = 0.0

print(f"\nDataset Processing Summary:")
print(f"  Total patient records: {total_patients}")
print(f"  Successfully processed: {len(patients_with_structures)} "
      f"({success_rate:.1f}%)")
print(f"  Missing RTSTRUCT directory: {len(patients_without_rtstruct)}")
print(f"  Empty RTSTRUCT directory: {len(patients_without_dcm)}")
print(f"  Processing errors: {len(patients_with_errors)}")

# Structure frequency analysis
if structure_count:
    print(f"\nRT Structure Nomenclature Analysis:")
    print("-"*70)
    print(f"{'Structure Name':<40} {'Count':>8} {'Coverage':>12}")
    print("-"*70)
    
    # Sort by frequency (descending); sorted() is stable, so structures with
    # equal counts keep the order in which they were first encountered.
    sorted_structures = sorted(
        structure_count.items(), 
        key=lambda x: x[1], 
        reverse=True
    )
    
    # Coverage is relative to successfully processed patients only. The
    # denominator is non-zero here: a non-empty structure_count implies at
    # least one patient was processed successfully.
    for struct_name, count in sorted_structures:
        coverage = (count / len(patients_with_structures)) * 100
        print(f"{struct_name:<40} {count:>8} {coverage:>11.1f}%")
    
    print("-"*70)
    print(f"{'Unique structure identifiers:':<40} {len(structure_count):>8}")
    
    # Interpretation guidelines
    print("\nInterpretation Guidelines:")
    print("  High coverage (>80%): Consistent structure, suitable for automated processing")
    print("  Medium coverage (20-80%): Variable structure, may require conditional logic")
    print("  Low coverage (<20%): Rare or optional structure, consider exclusion criteria")
else:
    print("\nWarning: No structure names extracted from dataset")

---

### **Cell 7: Error Analysis and Data Quality Report**

In [None]:
"""
Detailed Error Analysis

Provides comprehensive error reporting for patients that failed processing.
Understanding error patterns helps identify:
- Systematic data quality issues
- File format incompatibilities
- Corrupted or incomplete data
- Scanner-specific encoding problems

Common Error Categories:
1. Corrupted DICOM files (incomplete writes, transmission errors)
2. Non-standard DICOM encoding (vendor-specific extensions)
3. Missing required DICOM tags (incomplete RT structure sets)
4. File system permission issues
5. Unsupported DICOM versions or transfer syntaxes

Error Rate Thresholds:
- <5%: Acceptable data loss, individual patient exclusion recommended
- 5% to <15%: Moderate concern, investigate sample errors for patterns
- >=15%: Significant data quality issue, comprehensive investigation required
"""

In [None]:
if not patients_with_errors:
    print("\nData Quality: All patients processed successfully without errors")
else:
    print("\n" + "="*70)
    print("ERROR ANALYSIS")
    print("="*70 + "\n")

    # Show at most the first ten failures; each entry is followed by a blank line.
    display_limit = min(10, len(patients_with_errors))
    for patient_id, error in patients_with_errors[:display_limit]:
        # Long messages are cut to 120 characters to keep the report compact
        print(f"  Patient: {patient_id}\n  Error: {error[:120]}\n")

    # Mention how many failures were left out of the listing, if any
    remaining = len(patients_with_errors) - display_limit
    if remaining > 0:
        print(f"  Additional {remaining} patients encountered errors (not displayed)")

    # Share of ALL patient records (not just processed ones) that failed
    error_rate = (len(patients_with_errors) / total_patients) * 100

    # Pick the assessment text for the band the error rate falls into:
    # below 5, from 5 up to (not including) 15, and 15 or above.
    if error_rate >= 15:
        report = [
            f"  Error rate: {error_rate:.1f}% - Significant data quality issue",
            "  Recommendation: Comprehensive data integrity investigation required",
            "  Possible causes: Dataset corruption, incorrect path, format mismatch",
        ]
    elif error_rate >= 5:
        report = [
            f"  Error rate: {error_rate:.1f}% - Moderate data quality concern",
            "  Recommendation: Investigate error patterns before proceeding",
            "  Consider: Manual review of sample error cases",
        ]
    else:
        report = [
            f"  Error rate: {error_rate:.1f}% - Within acceptable limits",
            f"  Recommendation: Exclude {len(patients_with_errors)} affected patients",
            "  Impact on statistical power: Minimal",
        ]

    print("\nData Quality Assessment:")
    print("\n".join(report))

---

### **Cell 8: Automated Code Generation for Structure Selection** 

In [None]:
"""
Structure Selection Algorithm Recommendations

Based on the empirical analysis of structure nomenclature in the dataset,
this cell generates optimized structure selection code for integration into
batch processing pipelines.

Code Generation Logic:

Case 1: Single Consistent Structure Name
  - Dataset uses uniform nomenclature
  - Generates exact string matching algorithm
  - Highest reliability, recommended approach

Case 2: Multiple Structure Variants
  - Dataset contains naming variations (e.g., GTV, GTV-1, GTV-2)
  - Generates three algorithmic options:
    a) Exact match (most common variant)
    b) Flexible substring matching (accepts any variant)
    c) Priority-based selection (ordered preference list)

Case 3: No Target Structure Found
  - Target structure (default: GTV) not present in dataset
  - Lists available alternative structures
  - Provides guidance for alternative structure selection

Integration Instructions:
The generated code should replace the structure selection logic in your
main processing script (typically lines 68-73). Ensure proper error handling
is maintained to skip patients without the target structure.
"""

In [None]:
print("\n" + "="*70)
print("STRUCTURE SELECTION ALGORITHM RECOMMENDATIONS")
print("="*70)

# Identify GTV-related structures (primary target for tumor segmentation).
# Matching is a case-insensitive substring test, so 'gtv', 'GTV-1',
# 'GTV-Primary' etc. are all included, keyed by their exact original spelling.
gtv_structures = {
    s: structure_count[s] 
    for s in structure_count.keys() 
    if "GTV" in s.upper()
}

if gtv_structures:
    # Rank the variants once, most frequent first. sorted() is stable, so ties
    # keep the order in which the names were first encountered. The ranking
    # drives the table, the "most common" option and the priority list below.
    gtv_ranked = sorted(
        gtv_structures.items(),
        key=lambda x: x[1],
        reverse=True
    )

    print("\nGross Tumor Volume (GTV) Structures Identified:")
    print("-"*70)
    
    for gtv, count in gtv_ranked:
        coverage = (count / len(patients_with_structures)) * 100
        print(f"  Structure: '{gtv}' | Frequency: {count} patients ({coverage:.1f}% coverage)")
    
    print("\n" + "="*70)
    print("GENERATED CODE FOR MAIN PROCESSING SCRIPT")
    print("="*70)
    print("\nIntegration: Replace structure selection logic (lines 68-73) with code below\n")
    
    # Case 1: Single structure variant
    if len(gtv_structures) == 1:
        gtv_name, gtv_count = gtv_ranked[0]
        
        print(f"# Algorithm: Exact String Matching")
        print(f"# Rationale: Dataset exhibits uniform GTV nomenclature")
        print(f"# Target structure: '{gtv_name}' (present in {gtv_count} patients)")
        print(f"\nselected_structure = None")
        print(f"for s in structures:")
        # !r emits a correctly quoted/escaped Python literal, so the generated
        # line stays valid even if the structure name contains a quote.
        print(f"    if s.upper() == {gtv_name.upper()!r}:")
        print(f"        selected_structure = s")
        print(f"        break")
        print(f"\n# Error handling for missing structures")
        print(f"if not selected_structure:")
        print(f"    print(f'Warning: GTV structure not found for patient {{patient_id}}')")
        print(f"    continue  # Skip patient and proceed to next")
        
    # Case 2: Multiple structure variants
    else:
        most_common_gtv = gtv_ranked[0][0]
        
        print(f"# Multiple GTV nomenclature variants detected")
        print(f"# Select appropriate algorithm based on analysis requirements:\n")
        
        print(f"# ALGORITHM OPTION 1: Most Common Variant")
        print(f"# Target: '{most_common_gtv}' ({structure_count[most_common_gtv]} patients)")
        print(f"# Use case: Maximize consistency, acceptable to exclude variant cases")
        print("-"*70)
        print(f"selected_structure = None")
        print(f"for s in structures:")
        print(f"    if s.upper() == {most_common_gtv.upper()!r}:")
        print(f"        selected_structure = s")
        print(f"        break")
        
        print(f"\n\n# ALGORITHM OPTION 2: Flexible Substring Matching")
        print(f"# Accepts: Any structure containing 'GTV' substring")
        print(f"# Use case: Maximize patient inclusion, heterogeneous acceptable")
        print("-"*70)
        print(f"selected_structure = None")
        print(f"for s in structures:")
        print(f"    if 'GTV' in s.upper():")
        print(f"        selected_structure = s")
        print(f"        break")
        
        print(f"\n\n# ALGORITHM OPTION 3: Priority-Based Selection")
        print(f"# Attempts structures in order of preference")
        print(f"# Use case: Balance between consistency and inclusion")
        print("-"*70)
        print(f"# Define priority order (most preferred first)")
        # Emit the names in frequency order so the generated list really is
        # "most preferred first"; plain dict key order would only reflect the
        # order in which the names happened to be encountered during the scan.
        print(f"gtv_priority = {[name for name, _ in gtv_ranked]}")
        print(f"\nselected_structure = None")
        print(f"for preferred_gtv in gtv_priority:")
        print(f"    for s in structures:")
        print(f"        if s.upper() == preferred_gtv.upper():")
        print(f"            selected_structure = s")
        print(f"            break")
        print(f"    if selected_structure:")
        print(f"        break")
        
        print(f"\n# Error handling (applies to all options)")
        print("-"*70)
        print(f"if not selected_structure:")
        print(f"    print(f'Warning: No GTV structure found for patient {{patient_id}}')")
        print(f"    continue  # Skip patient")

else:
    # No GTV structures found - provide alternatives
    print("\nWarning: No Gross Tumor Volume (GTV) structures detected in dataset")
    print("\nPossible Explanations:")
    print("  1. Alternative nomenclature in use (verify structure name table)")
    print("  2. Case sensitivity mismatch (e.g., 'gtv' vs 'GTV')")
    print("  3. Dataset focuses on different anatomical structures")
    print("  4. RTSTRUCT files may not contain tumor volume annotations")
    
    if structure_count:
        print("\nAvailable RT Structures in Dataset:")
        print("-"*70)
        
        # Display top 10 most common structures
        sorted_structures = sorted(
            structure_count.items(),
            key=lambda x: x[1],
            reverse=True
        )[:10]
        
        # Coverage is relative to successfully processed patients; the
        # denominator is non-zero because structure_count is non-empty.
        for struct_name, count in sorted_structures:
            coverage = (count / len(patients_with_structures)) * 100
            print(f"  Structure: '{struct_name}' | Frequency: {count} ({coverage:.1f}%)")
        
        print("\nRecommendation:")
        print("  Review structure frequency table above")
        print("  Select appropriate alternative structure for analysis")
        print("  Modify code generation target from 'GTV' to chosen structure")

print("\n" + "="*70)
print("ANALYSIS WORKFLOW COMPLETE")
print("="*70)

---