# Generate Data Splits (npz files)

This notebook shows how to generate data splits and save them as `npz` files.

`TCGA-BRCA` will be taken as the example for illustration.

In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
# Switch to the parent directory so that the relative './data_split/...' paths used below resolve.
# NOTE(review): presumably the kernel starts in the notebook's own folder — confirm.
# NOTE(review): this is not idempotent; re-running the cell moves up one more directory.
os.chdir('..')
# Display the resulting working directory as the cell output.
os.getcwd()

'/home/liup/repo/PseMix'

## 1. Loading the CSV file with the label information

In [2]:
# Dataset tag: used verbatim in file names and lower-cased for the folder name.
NAME_DATASET = 'TCGA_BRCA'
PATH_LABEL_CSV = './data_split/{}/{}_path_full_subtype.csv'.format(NAME_DATASET.lower(), NAME_DATASET)

# Read the label table and keep only the four columns used by the rest of the notebook.
# Alternative (not used here): `patient_id` can be derived as the first 12 characters
# of `pathology_id`, and `label` by mapping `subtype` to an integer.
data_csv = pd.read_csv(PATH_LABEL_CSV)[['patient_id', 'pathology_id', 'subtype', 'label']]
data_csv.head()

Unnamed: 0,patient_id,pathology_id,subtype,label
0,TCGA-D8-A1XL,TCGA-D8-A1XL-01Z-00-DX2.FDE2C80D-5DC4-4743-A18...,IDC,1
1,TCGA-D8-A1XW,TCGA-D8-A1XW-01Z-00-DX1.10187A1F-086B-4CD9-AC0...,IDC,1
2,TCGA-C8-A12Y,TCGA-C8-A12Y-01Z-00-DX1.A15CB3E2-E145-4C75-8FE...,IDC,1
3,TCGA-GM-A2DM,TCGA-GM-A2DM-01Z-00-DX1.652038F4-C370-40EB-A54...,IDC,1
4,TCGA-AR-A24R,TCGA-AR-A24R-01Z-00-DX1.47D79205-63E7-43E6-A51...,IDC,1


In [3]:
# Group the slides (rows) by patient and report patients whose slides disagree.
gps = data_csv.groupby('patient_id')
for pid, slides in gps:
    n_slides = len(slides.index)
    if n_slides <= 1:
        # a single slide cannot be inconsistent with itself
        continue
    # Column position 3 is `label` given the column selection made when loading the CSV;
    # every slide is compared against the patient's first slide.
    reference = slides.iloc[0, 3]
    for row in range(n_slides):
        if slides.iloc[row, 3] != reference:
            print('The patient {} has two subtype-different slides'.format(pid))

# Summary counts: one row per WSI, one group per patient.
print("There are {} WSIs".format(len(data_csv)))
print("There are {} patients".format(len(gps)))

There are 953 WSIs
There are 898 patients


Get patient-level information for **patient-level splitting**: keep one row per patient, taking the subtype and label of that patient's first slide.

In [4]:
# One row per patient: keep the patient-level columns, retain each patient's
# first occurrence, and renumber the rows 0..n-1.
data_pat = (
    data_csv[['patient_id', 'subtype', 'label']]
    .drop_duplicates(subset=['patient_id'], keep='first')
    .reset_index(drop=True)
)
data_pat.head()

Unnamed: 0,patient_id,subtype,label
0,TCGA-D8-A1XL,IDC,1
1,TCGA-D8-A1XW,IDC,1
2,TCGA-C8-A12Y,IDC,1
3,TCGA-GM-A2DM,IDC,1
4,TCGA-AR-A24R,IDC,1


## Generating and Saving the `npz` data splits (Binary Class)

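We randomly split the patients into three subsets (train/val/test) with a ratio of 60/15/25: each of the 4 stratified folds holds out 25% of the patients for testing, and 20% of the remaining 75% (i.e., 15% of all patients) is used for validation.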

The split will be saved as `npz` files like the following:

```python
PATH_TO_SAVE = './data_split_xx.npz'
pids_new_train = ['train_001', ..., 'train_100'] # each string should correspond to the patient IDs for training
pids_new_val = ['val_001', ..., 'val_100'] # each string should correspond to the patient IDs for validation
pids_new_test = ['test_001', ..., 'test_100'] # each string should correspond to the patient IDs for test
np.savez(
    PATH_TO_SAVE,
    train_patients=pids_new_train,
    val_patients=pids_new_val,
    test_patients=pids_new_test
)
```

In [5]:
# Folder that receives the generated npz split files: './data_split/<lower-cased dataset name>',
# i.e. the same folder the label CSV was read from.
DIR_TO_SAVE = './data_split/{}'.format(NAME_DATASET.lower())

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

SEED = 42

# np.savez does not create missing folders, so make sure the target exists.
os.makedirs(DIR_TO_SAVE, exist_ok=True)

# 4 stratified folds: each fold holds out 1/4 (25%) of the patients as its test set.
skf = StratifiedKFold(n_splits=4, random_state=SEED, shuffle=True)
for i, (train_index, test_index) in enumerate(skf.split(data_pat['patient_id'], data_pat['label'])):
    print("{}-th fold:".format(i+1))
    # The splitter yields positional indices, so rows are selected by position (.iloc).
    pat_train = data_pat['patient_id'].iloc[train_index]
    y_train   = data_pat['label'].iloc[train_index]
    pat_test  = data_pat['patient_id'].iloc[test_index]

    # further split train into train/val (20% of the non-test patients go to validation)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    # n_splits=1, so the generator yields exactly one (train, val) index pair
    new_train_index, new_val_index = next(sss.split(pat_train, y_train))
    print("\tfurther split into train/val")
    pat_new_train = pat_train.iloc[new_train_index]
    pat_new_val   = pat_train.iloc[new_val_index]

    # stats of splits
    print("\t# train/val/test: {}/{}/{}".format(len(pat_new_train), len(pat_new_val), len(pat_test)))

    PATH_TO_NPZ = osp.join(DIR_TO_SAVE, f'{NAME_DATASET}-fold{i}.npz')
    # save the splits: three arrays of patient IDs under fixed keys
    np.savez(PATH_TO_NPZ,
        train_patients=list(pat_new_train),
        val_patients=list(pat_new_val),
        test_patients=list(pat_test)
    )
    print(f"\t[info] see the npz file at {PATH_TO_NPZ}.")

1-th fold:
	further split into train/val
	# train/val/test: 538/135/225
	[info] see the npz file at ./data_split/tcga_brca/TCGA_BRCA-kfold0.npz.
2-th fold:
	further split into train/val
	# train/val/test: 538/135/225
	[info] see the npz file at ./data_split/tcga_brca/TCGA_BRCA-kfold1.npz.
3-th fold:
	further split into train/val
	# train/val/test: 539/135/224
	[info] see the npz file at ./data_split/tcga_brca/TCGA_BRCA-kfold2.npz.
4-th fold:
	further split into train/val
	# train/val/test: 539/135/224
	[info] see the npz file at ./data_split/tcga_brca/TCGA_BRCA-kfold3.npz.


## Generating and Saving the data splits (Multi-Class)

In [None]:
# Folder that receives the generated npz split files: './data_split/<lower-cased dataset name>'.
# NOTE(review): this is the same value the binary-class section assigns to DIR_TO_SAVE.
DIR_TO_SAVE = './data_split/{}'.format(NAME_DATASET.lower())

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

NUM_CLASS = 3  # per-class statistics are printed for labels 0 .. NUM_CLASS-1
SEED = 42

# np.savez does not create missing folders, so make sure the target exists.
os.makedirs(DIR_TO_SAVE, exist_ok=True)

# 4 stratified folds: each fold holds out 1/4 (25%) of the patients as its test set.
skf = StratifiedKFold(n_splits=4, random_state=SEED, shuffle=True)
for i, (train_index, test_index) in enumerate(skf.split(data_pat['patient_id'], data_pat['label'])):
    print("{}-th fold:".format(i+1))
    # The splitter yields positional indices, so rows are selected by position (.iloc).
    pat_train, y_train = data_pat['patient_id'].iloc[train_index], data_pat['label'].iloc[train_index]
    pat_test,  y_test  = data_pat['patient_id'].iloc[test_index],  data_pat['label'].iloc[test_index]

    # further split train into train/val (20% of the non-test patients go to validation)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    # n_splits=1, so the generator yields exactly one (train, val) index pair
    new_train_index, new_val_index = next(sss.split(pat_train, y_train))
    print("\tfurther split into train/val")
    pat_new_train, y_new_train = pat_train.iloc[new_train_index], y_train.iloc[new_train_index]
    pat_new_val,   y_new_val   = pat_train.iloc[new_val_index],   y_train.iloc[new_val_index]

    # stats of splits
    print("\t# train/val/test: {}/{}/{}".format(len(pat_new_train), len(pat_new_val), len(pat_test)))
    for i_cls in range(NUM_CLASS):
        # number of patients carrying label i_cls in each subset
        print("\t[{}-th class] # patients = {}/{}/{} in train/val/test".format(
            i_cls, (y_new_train == i_cls).sum(), (y_new_val == i_cls).sum(), (y_test == i_cls).sum()))

    # NOTE: these file names are identical to the ones written by the binary-class cell,
    # so running both cells overwrites the files.
    PATH_TO_NPZ = osp.join(DIR_TO_SAVE, f'{NAME_DATASET}-fold{i}.npz')
    # save the splits: three arrays of patient IDs under fixed keys
    np.savez(PATH_TO_NPZ,
        train_patients=list(pat_new_train),
        val_patients=list(pat_new_val),
        test_patients=list(pat_test)
    )
    print(f"\t[info] see the npz file at {PATH_TO_NPZ}.")