## Add training labels to CLAM preprocessing


This notebook evaluates the dataset and creates training labels.

The CSV tables are read from `datasets/brca-psj-path/contest-phase-2/csv-train/`:

- `cancer-dx.csv`  
- `comorbidities.csv`
- `demographics.csv`
- `outcomes.csv`
- `pathology-items.csv`
- `social-determinants.csv`
- `treatments.csv`

data dictionary:
https://docs.ngsci.org/datasets/brca-psj-path/data-dictionary.html


In [6]:
import os
from pathlib import Path
from openslide import OpenSlide
from PIL import Image
import matplotlib.pyplot as plt
import h5py
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.models import resnet18, alexnet
import pytorch_lightning as pl
from tqdm import tqdm
import pandas as pd

# Optional CUDA sanity checks (currently disabled):
# torch.cuda.init()
# assert torch.cuda.is_initialized()
# print(torch.cuda.get_device_properties(0))

# Every location is anchored at the current user's home directory
# (typically /home/<user>).
BRCA_ROOT = Path.home().joinpath("datasets", "brca-psj-path")
IMAGE_ROOT = BRCA_ROOT.joinpath("contest-phase-2", "png-downsampled-train")
# Alternative table location, not in use:
# TABLE_DIR = BRCA_ROOT / "v2"
TABLE_DIR = BRCA_ROOT.joinpath("contest-phase-2", "csv-train")
LOGGER_DIR = Path.home().joinpath("logs")

In [3]:
# Dataset root and the CLAM preprocessing tree beneath it.
brca_dir = Path.home() / 'datasets' / 'brca-psj-path'
ndpi_dir = brca_dir / 'ndpi'
clam_train_dir = brca_dir / 'contest-phase-2' / 'clam-preprocessing-train'

# Sub-folders located directly under the CLAM preprocessing directory.
masks_dir, patches_dir, stitches_dir = (
    clam_train_dir / subdir for subdir in ('masks', 'patches', 'stitches')
)

# The two feature folders share the 'resnet50-features' parent.
features_h5_dir, features_pt_dir = (
    clam_train_dir / 'resnet50-features' / subdir
    for subdir in ('h5_files', 'pt_files')
)

# Example slide used to build a path into the NDPI directory.
slide_id = '0000459a-838d-4865-8bbf-ea66f2e5ee4d'
ndpi_fp = ndpi_dir.joinpath(f'{slide_id}.ndpi')

In [14]:
# Read every CSV found in TABLE_DIR into a DataFrame, keyed by file name.
tables = {
    csv_fp.name: pd.read_csv(csv_fp)
    for csv_fp in TABLE_DIR.glob("*.csv")
}

# Short handles for the individual tables. The outcomes table is of interest
# for most of the summary stats.
outcomes_df = tables["outcomes.csv"]
cancerdx_df = tables["cancer-dx.csv"]
demographics_df = tables["demographics.csv"]
comorbidities_df = tables["comorbidities.csv"]
pathology_df = tables["pathology-items.csv"]
treatments_df = tables["treatments.csv"]

# Add a column holding just the base stage: every "A", "B" or "C" character
# is removed from `stage` (e.g. "IIA" -> "II"). The column is written onto the
# same DataFrame object that is stored in `tables["outcomes.csv"]`.
outcomes_df["stage_number"] = outcomes_df["stage"].str.replace(
    pat="(A|B|C)", repl="", regex=True
)

# Show which tables were loaded.
tables.keys()

dict_keys(['outcomes.csv', 'cancer-dx.csv', 'demographics.csv', 'comorbidities.csv', 'pathology-items.csv', 'social-determinants.csv', 'treatments.csv'])

In [23]:
# Number of rows in the outcomes table for each (stage, mortality) pair.
outcomes_df.groupby(by=["stage", "mortality"]).size()

stage  mortality
0      0            174
       1              6
IA     0            317
       1             18
IB     0             51
       1              4
IIA    0            110
       1             26
IIB    0             49
       1             15
IIIA   0             64
       1             25
IIIB   0             22
       1              8
IIIC   0             29
       1             22
IV     0             29
       1             31
dtype: int64

In [19]:
# Column names of the outcomes table, including the derived `stage_number` column.
outcomes_df.columns

Index(['biopsy_id', 'patient_ngsci_id', 'case_year', 'biopsy_dt', 'mortality',
       'death_dt', 'in_registry', 'stage', 'strict_metastatic_dx',
       'strict_metastatic_dx_dt', 'stage_number'],
      dtype='object')

**TODO:** What is `strict_metastatic_dx`? (Check the data dictionary.)

In [26]:
# Peek at the last three rows of the treatments table.
treatments_df.tail(n=3)

Unnamed: 0,biopsy_id,cancer_registry_dx_dt,most_definitive_surgical_procedure_cd,most_definitive_radiation_modality_cd,surgical_margin_cd,radiation_summ_cd,chemo_summ_cd,immuno_therapy_cd,hormone_summ_cd,rx_dx_stg_proc_dt,rx_mst_defn_srg_dt,first_surgery_dt,radiation_start_dt,rx_chemo_dt,rx_hormone_dt,stg_dx_summ_cd
997,8881a5ec-5a09-4079-9ee6-616eac6b97f8,2115-04-14,45.0,25.0,0.0,1.0,3.0,0.0,1.0,2115-04-14,2115-05-11,2115-05-11,2115-11-03,2115-06-14,2115-12-29,2.0
998,febec13d-fd24-4cff-a079-2dd7140ee4cd,2116-03-18,23.0,32.0,0.0,1.0,0.0,0.0,1.0,2116-03-27,2116-05-30,2116-05-02,2116-06-20,,2116-09-01,2.0
999,2b62e9ee-86f9-4855-82b5-ac7c634487c8,2115-06-14,22.0,31.0,0.0,1.0,0.0,0.0,82.0,2115-06-14,2115-08-02,2115-08-02,2115-09-06,,,2.0
