**Changes to original notebook:**
1. Using "benign_objects" to denote objects that should not indicate the
presence of cancer in a patch.
  i. Represented graphically with blue circles, blue ovals, or red circles.

2. Using "cancerous_objects" to denote objects that should indicate the presence
of cancer in a patch.
  i. Represented graphically with red ellipses.

3. Changed the max and min diameter and the number of objects to more closely
match the TMA dataset.

4. Altered the code to accommodate 3-channel (RGB) synthetic images.

5. added visualization scripts from "visualize densenet" file

In [None]:
import random
import tables
import sys

import torch
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image, ImageDraw

from snippets import print_bag

# Draw one fresh seed and apply it to every RNG this notebook uses, so that a
# run can be reproduced by hard-coding the printed value here.
# The seed is limited to [0, 2**32 - 1] because that is the range accepted by
# np.random.seed; all patch generation below draws from np.random, so seeding
# only the stdlib `random` module would not make the data reproducible.
seed = random.randrange(2**32)
random.seed(seed)
np.random.seed(seed)
print(f"random seed (note down for reproducibility): {seed}")

In [None]:
# ---- dataset identity -------------------------------------------------------
dataname = "MIL_32x32_100pos"  # prefix of the generated "<dataname>_<phase>.pytable" files
phases = ["train", "val"]
classes = [0, 1]  # what classes we expect to have in the data

# ---- dataset size -----------------------------------------------------------
patch_size = 32  # edge length of the square tiles put into the DB
data_size = np.array([10000, 3000])  # number of patches per phase, in the same order as `phases`

# The expected ratio of slides to patches is 1:244 (4000^2 / 256^2), so the
# number of slides per phase is the patch count floor-divided by 244.
num_slides = (data_size // 244).astype(int)
train_slide_ids = np.arange(num_slides[0])
val_slide_ids = np.arange(num_slides[1])

# ---- patch content ----------------------------------------------------------
percent_pos_per_slide = 1.0  # fraction of patches on a positive slide that receive cancerous objects

max_benign_objects = 3
max_cancerous_objects = 1
diameter_min = 5
diameter_max = 10

In [None]:
# Change the working directory to data/ so that the relative "./<dataname>_<phase>.pytable"
# paths used by the later cells resolve inside it.
# NOTE(review): this is a relative cd, so re-running the cell without restarting the
# kernel will try to enter data/data.
%cd data

In [None]:
# Element type for the PyTables arrays created in the generation cell. Besides the images,
# the same atom is also used for the "labels" and "slide_ids" arrays, so every stored
# value has to fit in [0, 255].
img_dtype = tables.UInt8Atom()  # dtype in which the images will be saved, this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]

In [None]:
%matplotlib inline
storage={} #holder for future pytables

block_shape=np.array((patch_size,patch_size, 3)) #block shape specifies what we'll be saving into the pytable array, here we assume that masks are 1d and images are 3d

filters=tables.Filters(complevel=6, complib='zlib') #we can also specify filters, such as compression, to improve storage speed


for phase,nimgs in zip(phases,data_size): #now for each of the phases, we'll loop through the files
	print(phase)
	
	totals=np.zeros(2) # we can to keep counts of all the classes in for in particular training

	hdf5_file = tables.open_file(f"./{dataname}_{phase}.pytable", mode='w') #open the respective pytable


	storage["imgs"]= hdf5_file.create_earray(hdf5_file.root, "imgs", img_dtype,  
											shape=np.append([0],block_shape), 
											chunkshape=np.append([1],block_shape),
											filters=filters)
	storage["labels"]= hdf5_file.create_earray(hdf5_file.root, "labels", img_dtype,  
											shape=[0], 
											chunkshape=[1],
											filters=filters)
	storage["slide_ids"]= hdf5_file.create_earray(hdf5_file.root, "slide_ids", img_dtype,
											shape=[0],
											chunkshape=[1],
											filters=filters)

	
	for filei in range(nimgs): #now for each of the files
		print(filei)
		img=np.zeros((patch_size,patch_size))
		img = Image.fromarray(img, mode="RGB")
		draw = ImageDraw.Draw(img)
		
		#draw benign objects on the image.
		for i in range(np.random.randint(1,high=max_benign_objects)):
			d=np.random.randint(diameter_min,diameter_max)
			squeeze_constant = np.random.randint(-d/2, d/2)
			ul=np.random.randint(diameter_min,patch_size-diameter_max,2)
			point2=ul + d
			point2[0] = point2[0] + squeeze_constant
			
			# 3 varieties of benign objects
			variety = np.random.randint(0,3)
			if variety == 0:  # draw blue circle
				draw.ellipse(list(np.append(ul,ul+d)),fill=(0,0,255))
			elif variety == 1:  # draw blue ovals
				draw.ellipse(list(np.append(ul,point2)),fill=(0,0,255))
			elif variety == 2:  # draw red circle
				draw.ellipse(list(np.append(ul,ul+d)),fill=(255,0,0))
	
		slide_id = None
		label = None

		#fairly assign a slide-level id to each patch according to its bag
		if phase == 'train':
			slide_id = train_slide_ids[np.random.randint(0, num_slides[0])]	# random slide_id
			if slide_id < num_slides[0]/2:
				label = 0
			else:
				label = 1
		elif phase == 'val':
			slide_id = val_slide_ids[np.random.randint(0,num_slides[1])]
			if slide_id < num_slides[1]/2:
				label = 0
			else:
				label = 1
		
		
		if label:
			if np.random.random() <= percent_pos_per_slide:		# controls the percentage of cancerous patches per slide
				for i in range(np.random.randint(1,high=max_cancerous_objects+1)):
					d=np.random.randint(diameter_min,diameter_max)
					squeeze_constant = np.random.randint(-d/2,d/2)
					ul=np.random.randint(diameter_min,patch_size-diameter_max,2)
					point2=ul + d
					point2[0] = point2[0] + squeeze_constant
					draw.ellipse(list(np.append(ul,point2)),fill=(255,0,0))#red ellipse represents benign
				totals[1]+=1
		else:
			totals[0]+=1
			#add cancerous object to total
		
		del draw

		storage["imgs"].append(np.array(img)[None,::])
		storage["labels"].append([np.uint8(label)])
		storage["slide_ids"].append([np.uint8(slide_id)])
		
	#lastly, we should store the number of pixels
	npixels=hdf5_file.create_carray(hdf5_file.root, 'classsizes', tables.Atom.from_dtype(totals.dtype), totals.shape)
	npixels[:]=totals
	hdf5_file.close()
	
print("done")

In [None]:
# `totals` is re-created at the start of every phase in the generation loop, so at this
# point it only holds the per-class counts of the last phase that was processed.
print(totals)

Visualization

In [None]:
# Show one stored patch of the chosen phase together with its label and slide id.
phase = "train"
imgid = 4998  # index of the patch to inspect

# Open read-only and close the file again as soon as the values are copied out;
# a handle left open would also prevent the generation cell from reopening the
# same file in write mode.
with tables.open_file(f"./{dataname}_{phase}.pytable", mode="r") as db:
    img = db.root.imgs[imgid, ::]
    label = torch.tensor(np.array(db.root.labels[imgid]))
    slide_id = torch.tensor(np.array(db.root.slide_ids[imgid]))

plt.imshow(img)
print(label)
print(slide_id)

In [None]:
# Count how many training patches carry each label.
phase = "train"

# Read all training labels in one slice and release the file handle right away.
with tables.open_file(f"./{dataname}_{phase}.pytable", mode="r") as db:
    train_labels = db.root.labels[:int(data_size[0])]

# classbalance[c] is the number of patches whose stored label is c, i.e. the
# list is indexed by class just like `totals` in the generation cell.
classbalance = np.bincount(train_labels, minlength=2).tolist()
print(classbalance)

In [None]:
# Call the project-local `print_bag` helper (imported from `snippets`) for the 'train' data.
# NOTE(review): presumably this renders the patches of one bag; the meaning of the positional
# 64 and of child_directory='' is not visible in this notebook - confirm against snippets.print_bag.
print_bag(dataname, 'train', 64, figsize=(20,20),child_directory='')