# Data set H5 compression

Import necessary packages

In [2]:
import os 
import shutil 
import sys
import h5py
import time

import numpy as np
import pandas as pd

from skimage import transform

from tensorflow.keras import applications, layers, models

import matplotlib.pyplot as plt

In [3]:
sys.path.append('/home/jmalagont/Documentos/MLDS6project/src/nombre_paquete/preprocessing')
import pyWSI as pywsi

Important variables

In [26]:
# --- Filesystem locations ---
# Root folder of the TCGA-BRCA data set; the H5 archive is written inside it.
dataset_path = '/run/media/jmalagont/Thesis/Thesis/DataSet/TCGA-BRCA/'
h5_path = dataset_path + 'TCGA-BRCA.h5'

# --- Image encoder settings ---
# Name of the tensorflow.keras.applications constructor used as backbone and
# the value passed to it as `input_shape`.
back_bone = 'EfficientNetV2B0'
image_size = (256, 256, 3)

# --- Clinical metadata ---
# Columns kept from the clinical CSV: patient barcode, vital status and the two
# day-count fields used to build the survival time.
columns = [
  'bcr_patient_barcode',
  'vital_status',
  'days_to_last_followup',
  'days_to_death',
]

## 1.) Create the H5 file




In [27]:
# Start from an empty H5 file. A previous archive at `h5_path` is deleted first
# so the rest of the notebook always writes into a fresh file.
exist = os.path.exists(h5_path)

if exist:
  os.remove(h5_path)

# Mode 'w' creates the file. The context manager closes the handle when the
# block ends, including when an exception is raised inside it.
with h5py.File(h5_path, 'w') as hf:
  print('Replacing file ...' if exist else 'Creating file ...')

Replacing file ...


## 2.) Add patient ID

In [28]:
# Register one patient ID per pseudo-WSI array found on disk.
# The handle is not closed here because `ID_dataset` is used again by the
# cells that follow.
hf = h5py.File(h5_path, 'r+')

# os.listdir returns entries in arbitrary order and may contain stray files:
# keep only the '.npy' arrays and sort them so the row order of the H5 file is
# reproducible between runs.
pWSI_paths = sorted(
  file_name
  for file_name in os.listdir(dataset_path + 'pseudo-WSI/')
  if file_name.endswith('.npy')
)
# Drop the extension with splitext rather than assuming it is 4 characters long.
pWSI_names = [os.path.splitext(pWSI_path)[0] for pWSI_path in pWSI_paths]

print(f'Founded {len(pWSI_names)} cases')

# IDs are stored as fixed-width 12-byte strings ('S12').
ID_dataset = hf.create_dataset("Patient ID", [len(pWSI_names)], dtype='S12')
ID_dataset[:] = pWSI_names


Founded 1062 cases


## 3.) Add pseudo-WSI CNN representation


## 4.) Add CNN representation


In [29]:
def compression_CNN (backbone, shape):
  """Build a CNN that maps an image to a single feature vector.

  Parameters
  ----------
  backbone : str
    Name of a model constructor exposed by `applications`
    (e.g. 'EfficientNetV2B0').
  shape : tuple
    Value forwarded to the constructor as `input_shape`.

  Returns
  -------
  A `models.Sequential` holding the backbone (built with
  `include_top=False` and `weights="imagenet"`) followed by a
  `GlobalAveragePooling2D` layer.

  Raises
  ------
  AttributeError
    If `applications` has no attribute named `backbone`.
  """
  # Look the constructor up by name instead of eval()-ing a formatted string:
  # valid names behave the same, but `backbone` can no longer inject code.
  backbone_factory = getattr(applications, backbone)
  base_model = backbone_factory(include_top=False, weights="imagenet", input_shape=shape)

  cnn = models.Sequential()
  cnn.add(base_model)
  cnn.add(layers.GlobalAveragePooling2D())

  return cnn

In [30]:
# Instantiate the feature extractor from the backbone name and input shape
# set in the "Important variables" cell.
cnn = compression_CNN(backbone=back_bone, shape=image_size)

In [None]:
# Encode every pseudo-WSI with the CNN and store one embedding row per patient.
hf = h5py.File(h5_path, 'r+')

# Take the vector length from the model instead of hard-coding 1280, so the
# cell keeps working when `back_bone` is switched to another architecture.
embedding_dim = cnn.output_shape[-1]

# NOTE(review): the reason for pausing between images is not documented; the
# original 10 s are kept, which adds 10 s of idle time per patient.
pause_seconds = 10

# Read and decode all IDs once instead of querying the H5 file on every access.
patient_ids = [raw_id.decode('UTF-8') for raw_id in ID_dataset[:]]
n_patients = len(patient_ids)

pWSI_embedding_dataset = hf.create_dataset("embedding", [n_patients, embedding_dim], dtype='f')

for i, patient_id in enumerate(patient_ids):
  pWSI_path = dataset_path + 'pseudo-WSI/' + patient_id + '.npy'
  pWSI = np.load(pWSI_path)

  pWSI_assambly = pywsi.patch_assembly(pWSI, assambly_size=None)
  # NOTE(review): the original cell carried a disabled resize to `image_size`
  # at this point; presumably the assembled image already matches the CNN
  # input size - TODO confirm.
  # Wrap the image in a batch of one, run the CNN and keep the single output row.
  pWSI_CNN_embbeding = cnn(np.array([pWSI_assambly]))
  pWSI_embedding_dataset[i] = pWSI_CNN_embbeding.numpy()[0]

  sys.stdout.write(f'\r Charged {i+1} of {n_patients}')

  time.sleep(pause_seconds)
  # Release the large arrays before loading the next patient.
  del pWSI, pWSI_assambly

# The handle stays open after this cell, so ask HDF5 to write its buffers to
# disk now rather than leaving the result of this long loop unflushed.
hf.flush()

## 5.) Add Survival information

In [21]:
# Attach a survival time and an event indicator to every patient in the H5 file.
hf = h5py.File(h5_path, 'r+')

metadata = pd.read_csv(dataset_path + 'Meta data/TCGA-BRCA_clinical.csv')
metadata = metadata[columns]

# Survival time: days to death when recorded, otherwise days to last follow-up
# (0 when both are missing). Adding the two columns would over-count the time
# for any patient that has both fields filled in.
survival_time = metadata[columns[3]].fillna(metadata[columns[2]]).fillna(0)

survival_info = pd.DataFrame({
  'patient ID': metadata[columns[0]],
  # 1 when vital_status is 'dead', 0 for any other value.
  'vital status': (metadata[columns[1]] == 'dead').astype(int),
  'time': survival_time,
})

time_dataset = hf.create_dataset("time", [len(ID_dataset)], dtype='i')
status_dataset = hf.create_dataset("status", [len(ID_dataset)], dtype='i')

# Index the clinical table by barcode once (first record wins on duplicates) so
# each patient is a direct lookup instead of a scan over the whole table.
survival_by_patient = survival_info.drop_duplicates('patient ID').set_index('patient ID')

for i in range(len(ID_dataset)):
  # Stored IDs are lower-cased before being matched against the clinical table.
  pWSI_name = ID_dataset[i].decode('UTF-8').lower()
  if pWSI_name not in survival_by_patient.index:
    # Fail with the offending ID instead of an opaque IndexError.
    raise KeyError(f'No clinical record found for patient {pWSI_name!r}')
  data = survival_by_patient.loc[pWSI_name]

  time_dataset[i] = data['time']
  status_dataset[i] = data['vital status']

  sys.stdout.write(f'\r Charged {i+1} of {len(ID_dataset)}')

# The handle stays open after this cell; write the new datasets to disk now.
hf.flush()

 Charged 1038 of 1038