# TCGA-BRCA Data Exploration
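This notebook explores the processed TCGA-BRCA dataset: summary statistics of the clinical table, the overall-survival and tumor-stage distributions, and sample image patches from the processed slides.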


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import numpy as np

# Locations of the processed dataset, given relative to the notebook's
# working directory.
DATA_DIR = Path("../data")
CLINICAL_PATH = Path(DATA_DIR, "processed/clinical_processed.csv")
PATCHES_DIR = Path(DATA_DIR, "processed/patches")



## 1. Clinical Data Analysis


In [None]:
# Read the processed clinical table, report how many rows (patients) it
# holds, and preview the first rows as the cell output.
df = pd.read_csv(CLINICAL_PATH)
print(f"Total Patients: {df.shape[0]}")
df.head()



In [None]:
# Histogram (30 bins, KDE overlay) of overall survival time in months.
# Rows with a missing survival value are excluded.
fig, ax = plt.subplots(figsize=(10, 6))
survival_months = df['overall_survival_months'].dropna()
sns.histplot(survival_months, bins=30, kde=True, ax=ax)
ax.set_title("Overall Survival Distribution (Months)")
ax.set_xlabel("Months")
plt.show()



In [None]:
# Tumor Stage Distribution
# Horizontal bar chart of the number of patients per numeric tumor stage.
#
# Missing stages are dropped before the bar order is built: sorted() cannot
# order NaN (every comparison against it is False), so leaving NaN in the
# list can scramble the order of the remaining stages and would also pass
# NaN to countplot as if it were a category.
stage_order = sorted(df['stage_numeric'].dropna().unique())

plt.figure(figsize=(8, 4))
sns.countplot(y=df['stage_numeric'], order=stage_order)
plt.title("Tumor Stage Count")
plt.show()



## 2. Image Patch Visualization
Let's visualize a few randomly chosen patches from the first few processed slides.


In [None]:
# Sample a few random patches from the first few slides and show them in a grid.
MAX_SLIDES = 3         # number of slide folders to sample from
PATCHES_PER_SLIDE = 3  # random patches drawn per slide (fewer if the slide has fewer)

# Find all patches
# Every sub-directory of PATCHES_DIR is treated as one slide. The is_dir()
# guard lets the cell reach the "No patches found yet." message instead of
# raising FileNotFoundError when the directory has not been created, and
# sorting makes "the first slides" reproducible (iterdir() order is arbitrary).
if PATCHES_DIR.is_dir():
    slides = sorted(d for d in PATCHES_DIR.iterdir() if d.is_dir())
else:
    slides = []
print(f"Found {len(slides)} slides.")

# Collect sample patches
sample_patches = []
for slide in slides[:MAX_SLIDES]:
    patches = sorted(slide.glob("*.png"))
    if patches:
        # Draw without replacement so the same patch is never shown twice.
        n_pick = min(PATCHES_PER_SLIDE, len(patches))
        sample_patches.extend(np.random.choice(patches, n_pick, replace=False))

# Plot
if sample_patches:
    # Size the grid from the number of collected patches so none of them is
    # silently dropped (a fixed 1x5 grid never reached the third slide).
    n_cols = min(PATCHES_PER_SLIDE, len(sample_patches))
    n_rows = -(-len(sample_patches) // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
    )
    # Turn every axis off first so unused cells in the last row stay blank.
    for ax in axes.flat:
        ax.axis('off')
    for ax, p in zip(axes.flat, sample_patches):
        # The context manager closes the underlying file handle once the
        # image has been handed to matplotlib.
        with Image.open(p) as img:
            ax.imshow(img)
        slide_name = p.parent.name
        # Only add the ellipsis when the slide name was actually shortened.
        ax.set_title(slide_name if len(slide_name) <= 10 else slide_name[:10] + "...")
    plt.show()
else:
    print("No patches found yet.")

