In [6]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

import cv2
import random
import tables
import openslide

In [7]:
def extract_patch(image, x, y, patch_size):
    """
    Read a square RGB patch of side `patch_size` around position (x, y).

    Parameters
    ----------
    image : object exposing `read_region(location, level, size)`
        (an `openslide.OpenSlide` handle in this notebook).
    x, y : int
        Position of the patch centre; half the patch size is subtracted
        from each to obtain the corner handed to `read_region`.
    patch_size : int
        Side length of the patch, in pixels.

    Returns
    -------
    numpy array holding the first three channels of the region read at
    level 0.

    NOTE(review): the location tuple is built as (y - half, x - half), i.e.
    `y` comes first. OpenSlide documents `location` as (x, y), so this is
    only right if the incoming coordinates are stored as (row, col) --
    TODO confirm against the cell-location files.
    """

    half_size = int(patch_size / 2)
    corner = (y - half_size, x - half_size)

    # level 0 = full resolution
    region = image.read_region(corner, 0, (patch_size, patch_size))

    # keep only the first three channels of the returned region
    return np.asarray(region)[:, :, 0:3]
        
def draw_circle(image, x, y, pixel_value = (0,255,0), radius = 10, thickness = 2):
    """
    Draw a circle outline centred on (x, y) directly onto `image`.

    The drawing is delegated to `cv2.circle`, which modifies `image` in
    place; nothing is returned.

    Parameters
    ----------
    image : array accepted by `cv2.circle`
    x, y : int
        Centre of the circle, passed to OpenCV as the tuple (x, y).
    pixel_value : tuple
        Colour of the circle (default (0, 255, 0)).
    radius : int
        Circle radius in pixels.
    thickness : int
        Line thickness in pixels.
    """

    center = (x, y)
    cv2.circle(image, center, radius, pixel_value, thickness)
    
def is_nucleus(image, x, y, threshold, radius = 10):
    """
    Decide whether a nucleus is present around position (x, y).

    A patch is read with `extract_patch`, using `radius` as the patch side
    length (so the window is `radius` x `radius` pixels, not 2 * radius).
    The mean over every pixel value of that patch (all three channels
    together) is compared with `threshold`.

    Returns
    -------
    bool
        False when the mean is strictly greater than `threshold`,
        True otherwise.
    """

    window = extract_patch(image, x, y, patch_size=radius)
    mean_intensity = np.mean(window.flatten())

    # "not greater than" rather than "<=" so that every case the original
    # else-branch caught still yields True
    return not (mean_intensity > threshold)
    

In [8]:
# Open the H&E whole-slide image (kept open: the extraction cells read patches from it)
HE_open = openslide.OpenSlide('Experiment/HE40X/MELANOMA_AA3_40X/Scan1/MELANOMA_AA3_40X_Scan1.qptiff')

# H&E cell-location files: tab-separated, no header row, columns x / y / type
location_files = [
    '3_MEL3_HE_cells_locations_NoOffset_1.txt',
    '3_MEL3_HE_cells_locations_NoOffset_2_5000_12000.txt',
    # third file, currently not loaded (was df_3):
    # '3_MEL3_HE_cells_locations_NoOffset_4_26270_11150.txt',
]

df_1, df_2 = (
    pd.read_csv(path, header=None, names=['x', 'y', 'type'], sep='\t')
    for path in location_files
)

# Combine both tables; an outer merge with no explicit key joins on all shared columns
Mel3_cells_positions = pd.merge(df_1, df_2, how='outer')

# Optional persistence of the merged table, left disabled:
#Mel3_cells_positions.to_csv('MEL3_HE_cells_locations.txt', header=None, index = None, sep='\t', mode='a')
#Mel3_cells_positions = pd.read_csv('MEL3_HE_cells_locations.txt', header = None, names = ['x','y','type'], sep='\t')

Mel3_cells_positions.head()

Unnamed: 0,x,y,type
0,11965,13605,CD3p
1,11945,9636,CD3p
2,11945,9183,CD3p
3,11936,13602,CD3p
4,11930,13734,CD3p


In [9]:
# Number of rows per cell type (every remaining column is counted separately)
cells_by_type = Mel3_cells_positions.groupby('type')
cells_by_type.count()

Unnamed: 0_level_0,x,y
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CD11Cp,4977,4977
CD20p,37339,37339
CD3p,44621,44621
CD56p,4280,4280
CD68p,18092,18092
SOX10p,96340,96340


In [5]:
#phases = ['train','val']

# Phenotypes written to the database, in the order the extraction loops visit them
phenotypes = [
    'SOX10p',
    'CD3p',
    'CD68p',
]

# Class label stored for each phenotype: CD3p -> 1, SOX10p and CD68p -> 0
phenotype_mask = {
    'SOX10p': 0,
    'CD3p': 1,
    'CD68p': 0,
}

# Number of distinct labels in the mask (used in the dataset file names)
nclasses = len({label for label in phenotype_mask.values()})

In [5]:
# Phenotype group "APC" (presumably antigen-presenting cells -- TODO confirm);
# not referenced by the extraction cells visible in this notebook
APC = [
    'CD11Cp',
    'CD68p',
]

# Phenotypes the extraction cells treat as immune: they are sampled with
# size_Immune, every other phenotype with size_no_Immune
Immune = [
    'CD3p',
]

In [30]:
print("Extracting train dataset... \n")

# NOTE(review): `df_filtered` is not defined in any cell visible in this
# notebook -- it has to exist in the kernel before this cell runs. It is
# mutated below (sampled rows are removed) so that later extraction cells
# cannot pick the same cells again.
# NOTE(review): `.sample()` is unseeded, so every run draws a different
# split; pass `random_state=` if the split must be reproducible.

# side length of a patch, in pixels (also read by the val / test cells)
patchsize = 40

# number of cells sampled for EACH non-immune phenotype
size_no_Immune = 7000

# number of cells sampled for EACH immune phenotype
size_Immune = 14000

# total number of patches, derived from the phenotype lists instead of
# hard-coding "2 non-immune + 1 immune"
n_immune_types = sum(1 for p in phenotypes if p in Immune)
n_no_immune_types = len(phenotypes) - n_immune_types
size = int((n_no_immune_types*size_no_Immune) + (n_immune_types*size_Immune))

# shape of one patch: height, width and the 3 colour channels
patch_shape = np.array((patchsize, patchsize, 3))

filters = tables.Filters(complevel=6, complib='zlib')

# counters, written to `classsizes` at the end
counter_Immune = 0
counter_no_Immune = 0

# The context manager closes the HDF5 file even when sampling or patch
# extraction raises (e.g. fewer rows than requested); otherwise the handle
# would stay open in the kernel.
# NOTE(review): the title reads "Database test" although this is the train file.
with tables.open_file(f"dataset/train_{size}_{patchsize}_{nclasses}classes_CD3p_VS_SOX10p_CD68p_triplefiltered.h5", mode="w", title="Database test") as hdf5_file:

    # extendable array for the patches (first axis grows)
    patch_array = hdf5_file.create_earray(hdf5_file.root, "patch", tables.UInt8Atom(), shape=np.append([0], patch_shape),
                                          chunkshape=np.append([1], patch_shape), filters=filters)

    # extendable array for the labels
    label_array = hdf5_file.create_earray(hdf5_file.root, "label", tables.UInt16Atom(), shape=[0], chunkshape=[1],
                                          filters=filters)

    # extendable array for the class sizes (important for the neural network);
    # UInt16 caps each stored count at 65535
    classsizes_array = hdf5_file.create_earray(hdf5_file.root, "classsizes", tables.UInt16Atom(), shape=[0], chunkshape=[1],
                                               filters=filters)

    # extendable array for the image ID (the row's index label in df_filtered)
    imgid_array = hdf5_file.create_earray(hdf5_file.root, "imgID", tables.UInt32Atom(), shape=[0], chunkshape=[1],
                                          filters=filters)

    for phenotype in phenotypes:

        print(f"phenotype: {phenotype}")

        is_immune = phenotype in Immune
        print("--> Immune \n" if is_immune else "--> NO Immune \n")

        n_samples = size_Immune if is_immune else size_no_Immune
        label = phenotype_mask[phenotype]

        # `.sample()` returns a copy, so iterating it is unaffected by the
        # drop performed on df_filtered afterwards
        sampled_rows = df_filtered[df_filtered['type']==phenotype].sample(n_samples)

        for index, row in sampled_rows.iterrows():

            # extract the patch
            patch = extract_patch(HE_open, row['x'], row['y'], patchsize)

            # append all the information (patch, associated label, ID)
            patch_array.append(patch[None, ::])
            label_array.append([label])
            imgid_array.append([index])

        if is_immune:
            counter_Immune = counter_Immune + len(sampled_rows)
        else:
            counter_no_Immune = counter_no_Immune + len(sampled_rows)

        # remove the sampled rows in one call (instead of one drop per row)
        # so they cannot be drawn again later
        df_filtered.drop(index = sampled_rows.index, inplace = True)

    # class sizes for balancing during training: non-immune first, then immune
    classsizes_array.append([counter_no_Immune])
    classsizes_array.append([counter_Immune])

Extracting train dataset... 

phenotype: SOX10p
--> NO Immune 

phenotype: CD3p
--> Immune 

phenotype: CD68p
--> NO Immune 



In [31]:
print("Extracting val dataset... \n")

# NOTE(review): relies on `df_filtered`, `patchsize`, `nclasses`,
# `phenotypes`, `phenotype_mask`, `Immune` and `HE_open` already existing in
# the kernel. `df_filtered` is mutated below (sampled rows are removed).
# NOTE(review): `.sample()` is unseeded, so every run draws a different
# split; pass `random_state=` if the split must be reproducible.

# number of cells sampled for EACH non-immune / immune phenotype
size_no_Immune = 2000
size_Immune = 4000

# total number of patches, derived from the phenotype lists instead of
# hard-coding "2 non-immune + 1 immune"
n_immune_types = sum(1 for p in phenotypes if p in Immune)
n_no_immune_types = len(phenotypes) - n_immune_types
size = int((n_no_immune_types*size_no_Immune) + (n_immune_types*size_Immune))

# shape of one patch: height, width and the 3 colour channels
patch_shape = np.array((patchsize, patchsize, 3))
filters = tables.Filters(complevel=6, complib='zlib')

# counters, written to `classsizes` at the end
counter_Immune = 0
counter_no_Immune = 0

# The context manager closes the HDF5 file even when sampling or patch
# extraction raises; otherwise the handle would stay open in the kernel.
# NOTE(review): the title reads "Database test" although this is the val file.
with tables.open_file(f"dataset/val_{size}_{patchsize}_{nclasses}classes_CD3p_VS_SOX10p_CD68p_triplefiltered.h5", mode="w", title="Database test") as hdf5_file:

    # extendable arrays: patches, labels, class sizes and image IDs
    patch_array = hdf5_file.create_earray(hdf5_file.root, "patch", tables.UInt8Atom(), shape=np.append([0], patch_shape),
                                          chunkshape=np.append([1], patch_shape), filters=filters)

    label_array = hdf5_file.create_earray(hdf5_file.root, "label", tables.UInt16Atom(), shape=[0], chunkshape=[1],
                                          filters=filters)

    # UInt16 caps each stored count at 65535
    classsizes_array = hdf5_file.create_earray(hdf5_file.root, "classsizes", tables.UInt16Atom(), shape=[0], chunkshape=[1],
                                               filters=filters)

    imgid_array = hdf5_file.create_earray(hdf5_file.root, "imgID", tables.UInt32Atom(), shape=[0], chunkshape=[1],
                                          filters=filters)

    for phenotype in phenotypes:

        print(f"phenotype: {phenotype}")

        is_immune = phenotype in Immune
        print("--> Immune \n" if is_immune else "--> NO Immune \n")

        n_samples = size_Immune if is_immune else size_no_Immune
        label = phenotype_mask[phenotype]

        # `.sample()` returns a copy, so iterating it is unaffected by the
        # drop performed on df_filtered afterwards
        sampled_rows = df_filtered[df_filtered['type']==phenotype].sample(n_samples)

        for index, row in sampled_rows.iterrows():

            patch = extract_patch(HE_open, row['x'], row['y'], patchsize)

            # patch, associated label and row index label
            patch_array.append(patch[None, ::])
            label_array.append([label])
            imgid_array.append([index])

        if is_immune:
            counter_Immune = counter_Immune + len(sampled_rows)
        else:
            counter_no_Immune = counter_no_Immune + len(sampled_rows)

        # remove the sampled rows in one call (instead of one drop per row)
        # so they cannot be drawn again later
        df_filtered.drop(index = sampled_rows.index, inplace = True)

    # class sizes: non-immune first, then immune
    classsizes_array.append([counter_no_Immune])
    classsizes_array.append([counter_Immune])

Extracting val dataset... 

phenotype: SOX10p
--> NO Immune 

phenotype: CD3p
--> Immune 

phenotype: CD68p
--> NO Immune 



In [9]:
print("Extracting test dataset... \n")

# NOTE(review): relies on `df_filtered`, `patchsize`, `nclasses`,
# `phenotypes`, `phenotype_mask`, `Immune` and `HE_open` already existing in
# the kernel. `df_filtered` is mutated below (sampled rows are removed).
# NOTE(review): `.sample()` is unseeded, so every run draws a different
# split; pass `random_state=` if the split must be reproducible.

# number of cells sampled for EACH non-immune / immune phenotype
size_no_Immune = 2700
size_Immune = 5400

# total number of patches, derived from the phenotype lists instead of
# hard-coding "2 non-immune + 1 immune"
n_immune_types = sum(1 for p in phenotypes if p in Immune)
n_no_immune_types = len(phenotypes) - n_immune_types
size = int((n_no_immune_types*size_no_Immune) + (n_immune_types*size_Immune))

# shape of one patch: height, width and the 3 colour channels
patch_shape = np.array((patchsize, patchsize, 3))
filters = tables.Filters(complevel=6, complib='zlib')

# The context manager closes the HDF5 file even when sampling or patch
# extraction raises; otherwise the handle would stay open in the kernel.
# NOTE(review): unlike the train / val files, this file name carries no
# "_triplefiltered" suffix although rows are also drawn from df_filtered.
with tables.open_file(f"dataset/test_{size}_{patchsize}_{nclasses}classes_CD3p_VS_SOX10p_CD68p.h5", mode="w", title="Database test") as hdf5_file:

    # extendable arrays: patches, labels, image IDs and the x / y positions
    # (no `classsizes` array is written for the test set)
    patch_array = hdf5_file.create_earray(hdf5_file.root, "patch", tables.UInt8Atom(), shape=np.append([0], patch_shape),
                                          chunkshape=np.append([1], patch_shape), filters=filters)

    label_array = hdf5_file.create_earray(hdf5_file.root, "label", tables.UInt16Atom(), shape=[0], chunkshape=[1],
                                          filters=filters)

    imgid_array = hdf5_file.create_earray(hdf5_file.root, "imgID", tables.UInt32Atom(), shape=[0], chunkshape=[1],
                                          filters=filters)

    x_array = hdf5_file.create_earray(hdf5_file.root, "x", tables.UInt32Atom(), shape=[0], chunkshape=[1],
                                      filters=filters)

    y_array = hdf5_file.create_earray(hdf5_file.root, "y", tables.UInt32Atom(), shape=[0], chunkshape=[1],
                                      filters=filters)

    for phenotype in phenotypes:

        print(f"phenotype: {phenotype}")

        is_immune = phenotype in Immune
        print("--> IM \n" if is_immune else "--> NO IM \n")

        n_samples = size_Immune if is_immune else size_no_Immune
        label = phenotype_mask[phenotype]

        # `.sample()` returns a copy, so iterating it is unaffected by the
        # drop performed on df_filtered afterwards
        sampled_rows = df_filtered[df_filtered['type']==phenotype].sample(n_samples)

        for index, row in sampled_rows.iterrows():

            patch = extract_patch(HE_open, row['x'], row['y'], patchsize)

            # patch, associated label, row index label and position
            patch_array.append(patch[None, ::])
            label_array.append([label])
            imgid_array.append([index])
            x_array.append([row['x']])
            y_array.append([row['y']])

        # remove the sampled rows in one call (instead of one drop per row)
        # so they cannot be drawn again later
        df_filtered.drop(index = sampled_rows.index, inplace = True)

Extracting test dataset... 

phenotype: SOX10p
--> NO IM 

phenotype: CD3p
--> IM 

phenotype: CD68p
--> NO IM 

