# GDC Data Inventory - BRCA Phase 1

This notebook inventories **ALL** data available in GDC for the BRCA paired cohort.

**Input:** `data/processed/brca_subtyping/brca_paired_cohort.csv`

**Output:** Complete inventory saved to `data/inventory/brca_phase1/`

**Date:** February 2026

## 1. Setup and Configuration

In [2]:
import pandas as pd
import requests
import json
from collections import defaultdict, Counter
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# GDC API endpoint
CASES_ENDPT = 'https://api.gdc.cancer.gov/cases'

print("‚úì Imports successful")
print(f"‚úì Pandas version: {pd.__version__}")

‚úì Imports successful
‚úì Pandas version: 2.3.3


## 2. Explore Available Input Files

In [6]:
import os

# Check current working directory
print("Current working directory:")
print(os.getcwd())
print("\n" + "=" * 80)

# List what's actually in the current directory
print("\nContents of current directory:")
for item in sorted(Path('.').iterdir()):
    if item.is_dir():
        print(f"  üìÅ {item.name}/")
    else:
        print(f"  üìÑ {item.name}")

Current working directory:
d:\Projects\histo-to-omics-framework\notebooks\brca_subtyping


Contents of current directory:
  üìÑ 01_brca_risk_data_inventory.ipynb
  üìÑ 01_tcga_rnaseq_pam50_prelim.ipynb
  üìÑ 02_brca_phase1_progress_for_stakeholders.html
  üìÑ 02_brca_phase1_progress_for_stakeholders.ipynb
  üìÑ A_feature_sanity_check.ipynb
  üìÑ B_slide_level_aggregation_sanity_check.ipynb
  üìÑ gdc_inventory_brca.ipynb
  üìÅ outputs/


In [7]:
# Search for ALL CSV files related to BRCA
print("=" * 80)
print("SEARCHING FOR BRCA CSV FILES...")
print("=" * 80)

# Start from current directory and go up to find data folder
current = Path.cwd()

# Try to find 'data' folder
data_locations = [
    Path('.') / 'data',
    Path('..') / 'data',
    Path('../..') / 'data',
    current / 'data',
    current.parent / 'data',
    current.parent.parent / 'data',
]

data_dir = None
for loc in data_locations:
    if loc.exists() and loc.is_dir():
        data_dir = loc.resolve()
        print(f"‚úì Found data directory: {data_dir}\n")
        break

if data_dir:
    # Find all BRCA-related CSV files
    brca_files = list(data_dir.rglob('*brca*.csv'))
    
    print(f"Found {len(brca_files)} BRCA CSV files:\n")
    for f in sorted(brca_files):
        rel_path = f.relative_to(data_dir.parent)
        size_mb = f.stat().st_size / (1024**2)
        print(f"  {rel_path}")
        print(f"    Size: {size_mb:.2f} MB")
        print()
else:
    print("‚úó Could not locate 'data' directory")
    print("\nManual search from root...")

SEARCHING FOR BRCA CSV FILES...
‚úì Found data directory: D:\Projects\histo-to-omics-framework\data

Found 5 BRCA CSV files:

  data\processed\brca_subtyping\brca_expr_counts.csv
    Size: 173.91 MB

  data\processed\brca_subtyping\brca_expr_counts_log2cpm.csv
    Size: 742.97 MB

  data\processed\brca_subtyping\brca_expr_counts_preprocessed.csv
    Size: 162.19 MB

  data\processed\brca_subtyping\brca_expr_samples.csv
    Size: 0.24 MB

  data\processed\brca_subtyping\brca_paired_cohort.csv
    Size: 0.29 MB



In [8]:
# Preview the two small files that likely contain case information
print("=" * 80)
print("PREVIEWING CANDIDATE INPUT FILES")
print("=" * 80)

# These two are small and likely have case IDs
candidate_files = [
    'data/processed/brca_subtyping/brca_expr_samples.csv',
    'data/processed/brca_subtyping/brca_paired_cohort.csv'
]

for file_path in candidate_files:
    print(f"\n{'‚îÄ' * 80}")
    print(f"FILE: {file_path}")
    print(f"{'‚îÄ' * 80}")
    
    df = pd.read_csv(file_path)
    print(f"Shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
    print(f"Columns: {list(df.columns)}")
    print(f"\nFirst 5 rows:")
    print(df.head())
    print(f"\nColumn details:")
    print(df.info())

PREVIEWING CANDIDATE INPUT FILES

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
FILE: data/processed/brca_subtyping/brca_expr_samples.csv
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/brca_subtyping/brca_expr_samples.csv'

In [9]:
# Also check outputs/ folder for any labels/results files
print("\n" + "=" * 80)
print("CHECKING outputs/ FOLDER")
print("=" * 80)

outputs_dir = Path('outputs')
if outputs_dir.exists():
    brca_outputs = list(outputs_dir.rglob('*brca*.csv'))
    print(f"Found {len(brca_outputs)} BRCA files in outputs/:\n")
    for f in sorted(brca_outputs):
        print(f"  {f}")
        # Quick preview
        df = pd.read_csv(f, nrows=2)
        print(f"    Shape: {df.shape}, Columns: {list(df.columns)[:5]}...")
        print()
else:
    print("outputs/ folder not found in current directory")


CHECKING outputs/ FOLDER
Found 0 BRCA files in outputs/:



In [10]:
# Fix the paths - go up to project root
project_root = Path('..').resolve() / '..'  # Go up from notebooks/brca_subtyping to root
processed_dir = project_root / 'data' / 'processed' / 'brca_subtyping'

print(f"Project root: {project_root.resolve()}")
print(f"Processed dir: {processed_dir}")
print(f"Processed dir exists: {processed_dir.exists()}")
print()

if processed_dir.exists():
    print("Files in processed directory:")
    for f in sorted(processed_dir.glob('*.csv')):
        print(f"  ‚úì {f.name}")

Project root: D:\Projects\histo-to-omics-framework
Processed dir: D:\Projects\histo-to-omics-framework\notebooks\..\data\processed\brca_subtyping
Processed dir exists: True

Files in processed directory:
  ‚úì brca_expr_counts.csv
  ‚úì brca_expr_counts_log2cpm.csv
  ‚úì brca_expr_counts_preprocessed.csv
  ‚úì brca_expr_samples.csv
  ‚úì brca_paired_cohort.csv


In [11]:
# Now preview the files with correct paths
file1 = processed_dir / 'brca_expr_samples.csv'
file2 = processed_dir / 'brca_paired_cohort.csv'

print("=" * 80)
print(f"FILE 1: brca_expr_samples.csv")
print("=" * 80)

if file1.exists():
    df1 = pd.read_csv(file1)
    print(f"Shape: {df1.shape}")
    print(f"Columns: {list(df1.columns)}")
    print(f"\nFirst 5 rows:")
    display(df1.head())
else:
    print(f"‚úó File not found: {file1}")

FILE 1: brca_expr_samples.csv
Shape: (1095, 3)
Columns: ['case_id', 'file_uuid', 'tsv_path']

First 5 rows:


Unnamed: 0,case_id,file_uuid,tsv_path
0,6a186809-3422-41d0-83d2-867145830936,0019c951-16c5-48d0-85c8-58d96b12d330,data\raw\gdc\brca_phase1\rnaseq_star_counts\00...
1,c2a742fe-3e8b-4210-85a6-7191a1123609,0022cd20-f64f-4773-b9ff-a3de0b71b259,data\raw\gdc\brca_phase1\rnaseq_star_counts\00...
2,5b2a4f11-ca46-4974-9420-59b4820920bf,00469928-b243-4cae-acd7-134508e99ceb,data\raw\gdc\brca_phase1\rnaseq_star_counts\00...
3,23b7aaea-1119-4b10-aa1a-0ae255d2f2a6,0081f507-b104-4214-9ea1-31dd69130991,data\raw\gdc\brca_phase1\rnaseq_star_counts\00...
4,4922cddc-575c-4b8a-8245-ce5f6876760c,0094f9d0-45ec-4aad-bca0-71c60bdd7113,data\raw\gdc\brca_phase1\rnaseq_star_counts\00...


In [12]:
print("=" * 80)
print(f"FILE 2: brca_paired_cohort.csv")
print("=" * 80)

if file2.exists():
    df2 = pd.read_csv(file2)
    print(f"Shape: {df2.shape}")
    print(f"Columns: {list(df2.columns)}")
    print(f"\nFirst 5 rows:")
    display(df2.head())
else:
    print(f"‚úó File not found: {file2}")

FILE 2: brca_paired_cohort.csv
Shape: (1098, 9)
Columns: ['case_id', 'submitter_id', 'has_rnaseq', 'has_wsi', 'rnaseq_file_id', 'rnaseq_file_name', 'wsi_slide_count', 'example_wsi_file_id', 'example_wsi_file_name']

First 5 rows:


Unnamed: 0,case_id,submitter_id,has_rnaseq,has_wsi,rnaseq_file_id,rnaseq_file_name,wsi_slide_count,example_wsi_file_id,example_wsi_file_name
0,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,True,True,41e79241-b5a4-4541-848b-e20e693e8ee3,22c2b380-799e-4fad-ae38-46a916c592d5.rna_seq.a...,8,8be4eefd-d367-4757-8ece-b581b3fac2d2,TCGA-E2-A1IU-11A-02-TSB.b1e1d4f0-7a97-4a67-a15...
1,0045349c-69d9-4306-a403-c9c1fa836644,TCGA-A1-A0SB,True,True,0e0df72c-33c0-4e4f-939c-a4d45a6e1ea3,36125e17-48fd-4eea-874c-ed2e2e218402.rna_seq.a...,3,cea82b7d-135a-49d5-b4f6-3fb0215f7188,TCGA-A1-A0SB-01Z-00-DX1.B34C267B-CAAA-4AB6-AD5...
2,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,TCGA-A2-A04W,True,True,94468afc-faea-4091-af0f-d2df8aec37da,0781a18f-ce2a-478f-945e-49b2b8d7d941.rna_seq.a...,3,3a9802b4-90f6-427e-9f28-742a2952e04b,TCGA-A2-A04W-01A-03-TSC.83981bb3-9dfd-4632-91c...
3,00a2d166-78c9-4687-a195-3d6315c27574,TCGA-AN-A0AM,True,True,e14858ca-8bb5-4d6d-906c-5d62722d90f8,fd6f4c9b-ee43-4939-8cfa-2e447aedbcf3.rna_seq.a...,3,1eab7934-10a1-4d44-abb5-ee7c969de28f,TCGA-AN-A0AM-01A-01-TSA.63a22828-01b7-4b90-976...
4,00b11ca8-8540-4a3d-b602-ec754b00230b,TCGA-LL-A440,True,True,e6f927ef-840f-438a-9559-863cc64f71e0,98447eef-e5af-4bfa-a215-7a1edd601c46.rna_seq.a...,2,2c9c1bbe-dc6e-44ca-848e-3eb33ed5f914,TCGA-LL-A440-01A-01-TSA.7F50766F-CBFA-40A8-8B9...


In [13]:
# Search for the WSI PAM50 labels file
print("=" * 80)
print("SEARCHING FOR: brca_wsi_pam50_case_labels.csv")
print("=" * 80)

# Check multiple possible locations
possible_locations = [
    processed_dir / 'brca_wsi_pam50_case_labels.csv',
    project_root / 'outputs' / 'brca_wsi_pam50_case_labels.csv',
    project_root / 'data' / 'processed' / 'brca_wsi_pam50_case_labels.csv',
]

wsi_pam50_file = None
for loc in possible_locations:
    if loc.exists():
        wsi_pam50_file = loc
        print(f"‚úì FOUND: {loc}\n")
        break

if wsi_pam50_file:
    df_wsi = pd.read_csv(wsi_pam50_file)
    print(f"Shape: {df_wsi.shape}")
    print(f"Columns: {list(df_wsi.columns)}")
    print(f"\nFirst 5 rows:")
    display(df_wsi.head())
    
    print(f"\n{'‚îÄ' * 80}")
    print("COMPARISON:")
    print(f"{'‚îÄ' * 80}")
    print(f"brca_paired_cohort.csv:          {len(cohort_df)} cases")
    print(f"brca_wsi_pam50_case_labels.csv:  {len(df_wsi)} cases")
    
else:
    print("‚úó File not found in standard locations")
    print("\nSearching entire project...")
    found_files = list(project_root.rglob('*wsi*pam50*.csv'))
    if found_files:
        for f in found_files:
            print(f"  Found: {f.relative_to(project_root)}")
    else:
        print("  No matches found")

SEARCHING FOR: brca_wsi_pam50_case_labels.csv
‚úó File not found in standard locations

Searching entire project...
  Found: outputs\brca_subtyping\tables\brca_wsi_pam50_case_labels.csv
  Found: outputs\brca_subtyping\tables\brca_wsi_pam50_slide_labels.csv


In [14]:
# Load the WSI PAM50 case labels file
wsi_pam50_file = project_root / 'outputs' / 'brca_subtyping' / 'tables' / 'brca_wsi_pam50_case_labels.csv'

df_wsi = pd.read_csv(wsi_pam50_file)

print("=" * 80)
print("FILE: brca_wsi_pam50_case_labels.csv")
print("=" * 80)
print(f"Shape: {df_wsi.shape}")
print(f"Columns: {list(df_wsi.columns)}")
print(f"\nFirst 5 rows:")
display(df_wsi.head())

print(f"\n{'‚ïê' * 80}")
print("COHORT COMPARISON")
print(f"{'‚ïê' * 80}")
print(f"brca_paired_cohort.csv:          {len(cohort_df):,} cases")
print(f"brca_wsi_pam50_case_labels.csv:  {len(df_wsi):,} cases")
print(f"Difference:                       {len(cohort_df) - len(df_wsi):,} cases")

# Check if case IDs overlap
if 'case_id' in df_wsi.columns:
    overlap = set(cohort_df['case_id']) & set(df_wsi['case_id'])
    print(f"\nCase ID overlap:                  {len(overlap):,} cases")
    print(f"Only in paired_cohort:            {len(cohort_df) - len(overlap):,} cases")
    print(f"Only in wsi_pam50:                {len(df_wsi) - len(overlap):,} cases")

FILE: brca_wsi_pam50_case_labels.csv
Shape: (1095, 3)
Columns: ['case_id', 'pam50_subtype', 'n_wsi_slides']

First 5 rows:


Unnamed: 0,case_id,pam50_subtype,n_wsi_slides
0,001cef41-ff86-4d3f-a140-a647ac4b10a1,LumA,8
1,0045349c-69d9-4306-a403-c9c1fa836644,Normal,3
2,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,Her2,3
3,00a2d166-78c9-4687-a195-3d6315c27574,LumB,3
4,00b11ca8-8540-4a3d-b602-ec754b00230b,LumA,2



‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
COHORT COMPARISON
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê


NameError: name 'cohort_df' is not defined