# RTpipeline - Part 2: CPU Analysis

**Radiotherapy DICOM Processing Pipeline - Colab Edition**

This notebook runs CPU-intensive analysis tasks:
- DVH (Dose-Volume Histogram) calculation
- Radiomics feature extraction
- Quality control reports
- Results aggregation

---

## Prerequisites

1. **Part 1 Complete**: Run `rtpipeline_colab_part1_gpu.ipynb` first
2. **Runtime**: CPU runtime is sufficient (no GPU needed)
3. **Time**: ~2-5 min per patient for full analysis

---

## 1. Mount Google Drive and Load Configuration

In [None]:
# Attach Google Drive to the Colab filesystem; the default CONFIG_PATH used
# later in this notebook lives under this mount point.
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

print("Google Drive mounted.")

In [None]:
#@title ### Load Configuration from Part 1 { display-mode: "form" }
#@markdown **Configuration file location** (created by Part 1):
CONFIG_PATH = "/content/drive/MyDrive/RTpipeline/rtpipeline_config.yaml"  #@param {type:"string"}

import yaml
import os
from pathlib import Path

# Top-level keys this cell reads directly from the Part 1 configuration.
REQUIRED_CONFIG_KEYS = ('dicom_root', 'output_dir', 'logs_dir')


def _config_section(cfg, name):
    """Return ``cfg[name]`` when it is a mapping, otherwise an empty dict.

    A section written as ``name:`` with no body is parsed by YAML as None,
    which would break a chained ``.get()`` on the result.
    """
    section = cfg.get(name)
    return section if isinstance(section, dict) else {}


# Load configuration
if os.path.exists(CONFIG_PATH):
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty file and a scalar/list for malformed
    # ones; fail with a message that names the file instead of a bare TypeError.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file is empty or not a YAML mapping: {CONFIG_PATH}"
        )

    missing_keys = [key for key in REQUIRED_CONFIG_KEYS if key not in config]
    if missing_keys:
        raise KeyError(
            f"Configuration file {CONFIG_PATH} is missing required key(s): "
            + ", ".join(missing_keys)
        )

    # Extract key paths
    DICOM_INPUT = config['dicom_root']
    OUTPUT_DIR = config['output_dir']
    LOGS_DIR = config['logs_dir']

    ct_cropping = _config_section(config, 'ct_cropping')
    robustness = _config_section(config, 'radiomics_robustness')

    print("="*60)
    print("CONFIGURATION LOADED FROM PART 1")
    print("="*60)
    print(f"\nInput:  {DICOM_INPUT}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"Logs:   {LOGS_DIR}")
    print(f"\nRegion: {ct_cropping.get('region', 'N/A')}")
    print(f"CT Cropping: {ct_cropping.get('enabled', False)}")
    print(f"Robustness: {robustness.get('enabled', False)}")

    # Verify Part 1 outputs exist
    output_path = Path(OUTPUT_DIR)
    if output_path.exists():
        # Top-level directories are counted as patients; names starting with
        # '_' (e.g. _RESULTS) are skipped.
        patients = [d for d in output_path.iterdir() if d.is_dir() and not d.name.startswith('_')]
        # A course counts as segmented when it contains a
        # Segmentation_TotalSegmentator entry.
        seg_count = sum(1 for p in patients for c in p.iterdir()
                        if c.is_dir() and (c / 'Segmentation_TotalSegmentator').exists())
        print(f"\nPart 1 Status:")
        print(f"  Patients: {len(patients)}")
        print(f"  Segmented courses: {seg_count}")

        if seg_count > 0:
            print("\n✅ Part 1 data found. Ready to continue.")
        else:
            print("\n⚠️  WARNING: No segmentations found!")
            print("   Please run Part 1 first.")
    else:
        print(f"\n❌ Output directory not found: {OUTPUT_DIR}")
        print("   Please run Part 1 first.")
else:
    # Deliberately non-fatal: the user is told how to fix CONFIG_PATH and re-run.
    print(f"❌ Configuration file not found: {CONFIG_PATH}")
    print("\nPlease run Part 1 first to generate the configuration.")
    print("Or update the CONFIG_PATH above to point to your config file.")

---
## 2. Install Dependencies

In [None]:
%%bash
# Stop at the first failing command so a broken download or install is not
# followed by a misleading "installed" / "Done." message.
set -euo pipefail

# Install Miniconda (if not already installed)
if [ ! -d "/content/miniconda" ]; then
    echo "Installing Miniconda..."
    wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
    bash miniconda.sh -b -p /content/miniconda
    rm miniconda.sh
    echo "Miniconda installed."
else
    echo "Miniconda already installed."
fi

export PATH="/content/miniconda/bin:$PATH"

# Install mamba (only when it is not already on PATH)
if ! command -v mamba &> /dev/null; then
    echo "Installing mamba..."
    conda install -y -c conda-forge mamba
fi

echo "Done."

In [None]:
import os

# Put Miniconda's bin directory first on PATH for this kernel's child processes.
# Any existing occurrence is removed before prepending, so re-running the cell
# does not stack duplicate entries; .get() tolerates an unset PATH.
MINICONDA_BIN = '/content/miniconda/bin'

_current_path = os.environ.get('PATH', '')
_other_entries = (
    [entry for entry in _current_path.split(os.pathsep) if entry != MINICONDA_BIN]
    if _current_path
    else []
)
os.environ['PATH'] = os.pathsep.join([MINICONDA_BIN] + _other_entries)

In [None]:
%%bash
export PATH="/content/miniconda/bin:$PATH"

# Fetch the rtpipeline sources on first run; refresh the existing checkout otherwise.
REPO_URL="https://github.com/kstawiski/rtpipeline.git"
REPO_DIR="/content/rtpipeline"

if [ -d "$REPO_DIR" ]; then
    echo "Updating rtpipeline..."
    git -C "$REPO_DIR" pull
else
    echo "Cloning rtpipeline..."
    git clone "$REPO_URL" "$REPO_DIR"
fi

In [None]:
%%bash
export PATH="/content/miniconda/bin:$PATH"

# ensure_env NAME SPEC_FILE NOTICE
# Create conda environment NAME from SPEC_FILE unless `conda env list` already
# shows it. The name is anchored at line start and followed by a space so that
# "rtpipeline" does not match "rtpipeline-radiomics". Aborts the cell when
# creation fails instead of reporting success.
ensure_env() {
    local name="$1" spec="$2" notice="$3"
    if conda env list | grep -q "^${name} "; then
        echo "${name} environment already exists."
        return 0
    fi
    echo "$notice"
    if ! mamba env create -f "$spec"; then
        echo "❌ Failed to create ${name} environment from ${spec}" >&2
        exit 1
    fi
    echo "Environment created."
}

# install_pipeline ENV_NAME
# Editable-install the rtpipeline checkout into ENV_NAME. A pip failure stays
# non-fatal (as before) but is now reported instead of being discarded.
install_pipeline() {
    local env_name="$1"
    if ! conda activate "$env_name"; then
        echo "❌ Could not activate ${env_name}" >&2
        exit 1
    fi
    if ! pip install -q -e /content/rtpipeline; then
        echo "⚠️  WARNING: 'pip install -e /content/rtpipeline' failed in ${env_name}" >&2
    fi
}

# Create rtpipeline-radiomics environment (PyRadiomics requires NumPy 1.x)
ensure_env "rtpipeline-radiomics" /content/rtpipeline/envs/rtpipeline-radiomics.yaml \
    "Creating rtpipeline-radiomics environment (this takes ~10 minutes)..."

# Also ensure main environment exists for DVH
ensure_env "rtpipeline" /content/rtpipeline/envs/rtpipeline.yaml \
    "Creating rtpipeline environment..."

# Install rtpipeline package in both environments
source /content/miniconda/etc/profile.d/conda.sh

install_pipeline rtpipeline
install_pipeline rtpipeline-radiomics

# printf, because bash's builtin echo does not expand "\n" without -e.
printf '\nEnvironments ready!\n'

---
## 3. Copy Configuration

In [None]:
import shutil

# The Snakemake cells below are run from /content/rtpipeline and pass
# --configfile config.colab.yaml, so place a copy of the config there.
local_config = '/content/rtpipeline/config.colab.yaml'
shutil.copy(src=CONFIG_PATH, dst=local_config)

print("Configuration copied to: " + local_config)
print("\nReady to run analysis pipeline.")

---
## 4. Run DVH Calculation

In [None]:
%%bash
# Make the pipeline's exit status reflect snakemake rather than tee, so a
# failed run is not reported as complete.
set -o pipefail

export PATH="/content/miniconda/bin:$PATH"
source /content/miniconda/etc/profile.d/conda.sh
conda activate rtpipeline

cd /content/rtpipeline || exit 1

# Read logs_dir with a YAML parser instead of grep/cut/tr, which mangles quoted
# values and paths containing spaces or colons. PyYAML is a Snakemake dependency,
# so it is expected to be importable in this environment.
LOGS_DIR="$(python -c 'import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))["logs_dir"])' config.colab.yaml)" || {
    echo "❌ Could not read logs_dir from config.colab.yaml" >&2
    exit 1
}
mkdir -p "$LOGS_DIR"

echo "================================================"
echo "Running DVH Calculation"
echo "================================================"

if snakemake \
    --cores 2 \
    --configfile config.colab.yaml \
    --until all_dvh \
    --rerun-incomplete \
    2>&1 | tee -a "$LOGS_DIR/part2_dvh.log"
then
    # printf, because bash's builtin echo does not expand "\n" without -e.
    printf '\n✅ DVH calculation complete!\n'
else
    status=$?
    printf '\n❌ DVH calculation failed (exit code %s). See %s\n' \
        "$status" "$LOGS_DIR/part2_dvh.log" >&2
    exit "$status"
fi

---
## 5. Run Radiomics Extraction

In [None]:
%%bash
# Make the pipeline's exit status reflect snakemake rather than tee, so a
# failed run is not reported as complete.
set -o pipefail

export PATH="/content/miniconda/bin:$PATH"
source /content/miniconda/etc/profile.d/conda.sh
conda activate rtpipeline-radiomics

cd /content/rtpipeline || exit 1

# Read logs_dir with a YAML parser instead of grep/cut/tr, which mangles quoted
# values and paths containing spaces or colons. PyYAML is a Snakemake dependency,
# so it is expected to be importable in this environment.
LOGS_DIR="$(python -c 'import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))["logs_dir"])' config.colab.yaml)" || {
    echo "❌ Could not read logs_dir from config.colab.yaml" >&2
    exit 1
}
mkdir -p "$LOGS_DIR"

echo "================================================"
echo "Running Radiomics Extraction"
echo "================================================"
echo "(This may take several minutes per patient)"

if snakemake \
    --cores 2 \
    --configfile config.colab.yaml \
    --until all_radiomics \
    --rerun-incomplete \
    2>&1 | tee -a "$LOGS_DIR/part2_radiomics.log"
then
    # printf, because bash's builtin echo does not expand "\n" without -e.
    printf '\n✅ Radiomics extraction complete!\n'
else
    status=$?
    printf '\n❌ Radiomics extraction failed (exit code %s). See %s\n' \
        "$status" "$LOGS_DIR/part2_radiomics.log" >&2
    exit "$status"
fi

---
## 6. Run Quality Control

In [None]:
%%bash
# Make the pipeline's exit status reflect snakemake rather than tee, so a
# failed run is not reported as complete.
set -o pipefail

export PATH="/content/miniconda/bin:$PATH"
source /content/miniconda/etc/profile.d/conda.sh
conda activate rtpipeline

cd /content/rtpipeline || exit 1

# Read logs_dir with a YAML parser instead of grep/cut/tr, which mangles quoted
# values and paths containing spaces or colons. PyYAML is a Snakemake dependency,
# so it is expected to be importable in this environment.
LOGS_DIR="$(python -c 'import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))["logs_dir"])' config.colab.yaml)" || {
    echo "❌ Could not read logs_dir from config.colab.yaml" >&2
    exit 1
}
mkdir -p "$LOGS_DIR"

echo "================================================"
echo "Running Quality Control"
echo "================================================"

if snakemake \
    --cores 2 \
    --configfile config.colab.yaml \
    --until all_qc \
    --rerun-incomplete \
    2>&1 | tee -a "$LOGS_DIR/part2_qc.log"
then
    # printf, because bash's builtin echo does not expand "\n" without -e.
    printf '\n✅ Quality control complete!\n'
else
    status=$?
    printf '\n❌ Quality control failed (exit code %s). See %s\n' \
        "$status" "$LOGS_DIR/part2_qc.log" >&2
    exit "$status"
fi

---
## 7. Aggregate All Results

In [None]:
%%bash
# Make the pipeline's exit status reflect snakemake rather than tee, so a
# failed run is not reported as complete.
set -o pipefail

export PATH="/content/miniconda/bin:$PATH"
source /content/miniconda/etc/profile.d/conda.sh
conda activate rtpipeline

cd /content/rtpipeline || exit 1

# Read logs_dir with a YAML parser instead of grep/cut/tr, which mangles quoted
# values and paths containing spaces or colons. PyYAML is a Snakemake dependency,
# so it is expected to be importable in this environment.
LOGS_DIR="$(python -c 'import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))["logs_dir"])' config.colab.yaml)" || {
    echo "❌ Could not read logs_dir from config.colab.yaml" >&2
    exit 1
}
mkdir -p "$LOGS_DIR"

echo "================================================"
echo "Aggregating All Results"
echo "================================================"

# "all" is passed as the target (no --until), unlike the per-stage cells.
if snakemake \
    --cores 2 \
    --configfile config.colab.yaml \
    all \
    --rerun-incomplete \
    2>&1 | tee -a "$LOGS_DIR/part2_aggregate.log"
then
    # printf, because bash's builtin echo does not expand "\n" without -e.
    printf '\n✅ Aggregation complete!\n'
else
    status=$?
    printf '\n❌ Aggregation failed (exit code %s). See %s\n' \
        "$status" "$LOGS_DIR/part2_aggregate.log" >&2
    exit "$status"
fi

---
## 8. View Results

In [None]:
import pandas as pd
from pathlib import Path

# Aggregated workbooks are looked up in <OUTPUT_DIR>/_RESULTS.
results_dir = Path(OUTPUT_DIR) / '_RESULTS'

banner = "=" * 60
print(banner)
print("PIPELINE RESULTS")
print(banner)

if not results_dir.exists():
    print(f"Results directory not found: {results_dir}")
else:
    print(f"\nResults directory: {results_dir}")
    print("\nAvailable files:")
    # List each Excel workbook with its size in megabytes (10**6 bytes).
    for workbook in sorted(results_dir.glob('*.xlsx')):
        megabytes = workbook.stat().st_size / 1e6
        print(f"  - {workbook.name} ({megabytes:.2f} MB)")

In [None]:
# Load and preview DVH metrics
dvh_file = Path(OUTPUT_DIR) / '_RESULTS' / 'dvh_metrics.xlsx'

if dvh_file.exists():
    dvh = pd.read_excel(dvh_file)
    print("DVH METRICS")
    print("="*40)
    print(f"Rows: {len(dvh)}")

    # Guard the column lookups so a workbook without these columns still gets
    # a preview instead of aborting the cell with a KeyError.
    for label, column in (("Structures", "Structure"), ("Patients", "PatientID")):
        if column in dvh.columns:
            print(f"{label}: {dvh[column].nunique()}")
        else:
            print(f"{label}: n/a (no '{column}' column)")

    # Only the first ten column names are listed to keep the output short.
    print(f"\nColumns: {list(dvh.columns)[:10]}...")
    print("\nSample data:")
    display(dvh.head(10))
else:
    print("DVH metrics file not found.")

In [None]:
# Summarise the aggregated CT radiomics workbook: size plus a per-prefix feature count.
radiomics_file = Path(OUTPUT_DIR) / '_RESULTS' / 'radiomics_ct.xlsx'

if not radiomics_file.exists():
    print("Radiomics file not found.")
else:
    rad = pd.read_excel(radiomics_file)
    print("RADIOMICS FEATURES")
    print("=" * 40)
    print(f"Rows: {len(rad)}")
    # Note: this is the total column count of the sheet, not only feature columns.
    print(f"Features: {len(rad.columns)}")

    if 'Structure' in rad.columns:
        structure_total = rad['Structure'].nunique()
        print(f"Structures: {structure_total}")

    # Classify columns by the prefix of their name.
    prefix_by_label = {
        'Original': 'original_',
        'Wavelet': 'wavelet',
        'LoG': 'log-sigma',
    }
    counts = {
        label: sum(1 for name in rad.columns if name.startswith(prefix))
        for label, prefix in prefix_by_label.items()
    }
    print("\nFeature breakdown:")
    for label, total in counts.items():
        print(f"  {label}: {total}")

In [None]:
# Preview the aggregated QC workbook and tally rows per overall status.
qc_file = Path(OUTPUT_DIR) / '_RESULTS' / 'qc_reports.xlsx'

if not qc_file.exists():
    print("QC file not found.")
else:
    qc = pd.read_excel(qc_file)
    print("QUALITY CONTROL SUMMARY")
    print("=" * 40)

    # The status tally is shown only when the workbook has an Overall_Status column.
    if 'Overall_Status' in qc.columns:
        print("\nStatus breakdown:")
        for status, count in qc['Overall_Status'].value_counts().items():
            print(f"  {status}: {count}")

    print("\nSample data:")
    display(qc.head())

---
## 9. Download Results

Results are saved to the output directory on your Google Drive.

You can also download specific files directly:

In [None]:
# Optional browser download of individual result workbooks (Colab only).
from google.colab import files

results_dir = Path(OUTPUT_DIR) / '_RESULTS'

print(
    "To download files, uncomment the lines below and run this cell.",
    "\nResults are already saved in Google Drive at:",
    f"  {results_dir}",
    sep="\n",
)

# Uncomment to download specific files:
# files.download(str(results_dir / 'dvh_metrics.xlsx'))
# files.download(str(results_dir / 'radiomics_ct.xlsx'))
# files.download(str(results_dir / 'qc_reports.xlsx'))
# files.download(str(results_dir / 'case_metadata.xlsx'))

---
## 10. Summary

In [None]:
from pathlib import Path

print("="*60)
print("RTPIPELINE PROCESSING COMPLETE")
print("="*60)

output_path = Path(OUTPUT_DIR)
results_path = output_path / '_RESULTS'

# Count outputs. Guarded because iterdir() raises FileNotFoundError when the
# output directory is missing (e.g. Drive not mounted in this session).
if output_path.exists():
    # Top-level directories are counted as patients; names starting with '_'
    # (e.g. _RESULTS) are skipped. Every sub-directory of a patient is a course.
    patients = [d for d in output_path.iterdir() if d.is_dir() and not d.name.startswith('_')]
    courses = sum(1 for p in patients for c in p.iterdir() if c.is_dir())
else:
    patients = []
    courses = 0
    print(f"\n❌ Output directory not found: {output_path}")

print(f"\nProcessed:")
print(f"  Patients: {len(patients)}")
print(f"  Treatment courses: {courses}")

print(f"\nResults location:")
print(f"  {results_path}")

print(f"\nKey output files:")
result_files = sorted(results_path.glob('*.xlsx')) if results_path.exists() else []
if result_files:
    for f in result_files:
        print(f"  - {f.name}")
else:
    # Say so explicitly rather than leaving the section blank.
    print("  (none found)")

print(f"\nConfiguration used:")
print(f"  {CONFIG_PATH}")

print(f"\nLogs location:")
print(f"  {LOGS_DIR}")

print("\n" + "="*60)
print("Thank you for using RTpipeline!")
print("="*60)