# This is an interactive file for transforming the raw t-cell/dcell dataset into compressed files.

## The issue comes from high RAM usage both locally and in Google Colab.

## Input format:
- A folder containing images 
- Each image has a counterpart: for each "filename" (letter - digit), there is a red image, and a green image
- red image = tcell
- green image = dendritic cell
- we need both the separated images (B&W) and the combined images (RGB)
- Each image is a 2048x2048, 8 MB TIFF image

## Steps
1. Pass a 192x192 sliding window over the images. 
2. Store the filenames
3. Take each of the reduced images, and combine them to create RGB images (red channel = tcell, green channel = dcell)
4. Calculate the intersection over union overlap for each image and store it in a file.

## Output:
**DATA_full.npz**
- x: raw images 
- y: raw filenames, one per window (there will be duplicates)   

**DATA_overlaps.npz**
- overlaps : intersection-over-union overlap value for each combined RGB image (saved from the `y_overlaps` variable)

### The reason I am not combining tcell+dcell into an RGB dataset is that doing so resulted in a 13 GB dataset (3 times as big) that I cannot upload to Google Drive, which I use to train models with a GPU through Colab.
### Hence the original, unmodified images need to be kept to be processed in-notebook. 

## Step 1: pass a sliding window over the images

In [1]:
from dataset_helpers import read_folder_filenames
from config import repo_path, imw

from skimage.io import imread
import numpy as np

In [7]:
def sliding_window(img, dest_size, rgb=False):
    """
    Cut an image into non-overlapping dest_size x dest_size windows.
    --> more detail
    --> more training data

    The grid is qty x qty windows, where qty = img.shape[0] // dest_size.
    When the image height is not a multiple of dest_size, the first `crop`
    rows and the last `crop` columns are discarded before tiling.

    @parameters:
    img = numpy array of shape (size, size) or, with rgb=True, (size, size, 3);
          only img.shape[0] is inspected, so the image is assumed to be square
    dest_size = side length of each window; must be positive, even and no
                larger than the image
    rgb = True if img carries a trailing axis of 3 channels

    @returns:
    a new uint16 array of shape (qty**2, dest_size, dest_size[, 3]).
    Windows are ordered column of windows by column of windows: the
    vertical position varies fastest.

    @raises:
    ValueError if dest_size is not positive, is odd, or exceeds the image size
    """
    size = img.shape[0]
    if dest_size <= 0:
        raise ValueError("destination size must be a positive even number")
    if dest_size > size or dest_size % 2 != 0:
        raise ValueError(
            "destination size is bigger than picture size or destination size is not even")

    qty = size // dest_size
    crop = size - dest_size * qty

    # Slicing yields a view, so no copy of the (large) source image is made;
    # pixel data is only copied once, into `windows` below.
    # need to crop out the top rows and right columns (less significant in dataset)
    source = img[crop:, :-crop] if crop else img

    window_shape = (qty ** 2, dest_size, dest_size)
    if rgb:
        window_shape += (3,)
    # every slot is overwritten in the loop, so uninitialised memory is fine
    windows = np.empty(window_shape, dtype=np.uint16)

    i = 0
    for col_block in range(qty):
        left = col_block * dest_size
        for row_block in range(qty):
            top = row_block * dest_size
            windows[i] = source[top:top + dest_size, left:left + dest_size]
            i += 1

    return windows

In [8]:
def compress_images(out, filenames, size):
    """
    Tile every image with sliding_window and write all tiles to one npz file.

    The archive holds two arrays:
     - x: the image windows, each of shape (size, size)
     - y: the source filename of each window (unmodified, so every filename
          is repeated once per window cut from it)

    Nothing is returned; a confirmation line is printed once the file is saved.

    @parameters:
    out = name of the outputted compressed file
    filenames = all filenames of files to compress
    size = size of output images

    @assumptions:
    * validity of filenames has been checked
    """
    tiles = []
    tile_sources = []

    for path in filenames:
        windows = sliding_window(imread(path), size)
        # iterating the array yields one window per step along the first axis
        tiles.extend(windows)
        tile_sources.extend([path] * len(windows))

    np.savez(out, x=np.array(tiles), y=np.array(tile_sources))

    print("All files compressed into %s" % out)

### Loading filenames

In [9]:
# Gather the raw image filenames; sorting makes the processing order deterministic.
filenames = sorted(read_folder_filenames(repo_path + 'data/sample_data/raw/images'))

In [10]:
# Tile every raw image into imw x imw windows and save them, with their filenames, to sample_images.npz.
compress_images(repo_path + 'data/sample_data/processed/sample_images.npz', filenames, imw)

All files compressed into /Users/Leonore/Documents/Workspace/l4proj/data/sample_data/processed/sample_images.npz


## Step 2: Capture overlap metrics from combined images.

In [11]:
# Re-open the archive written in Step 1.
npzfile = np.load(repo_path + 'data/sample_data/processed/sample_images.npz')

In [12]:
# 'x' holds the image windows and 'y' the source filename of each window
# (the keys compress_images passes to np.savez).
x = npzfile['x']
filenames = npzfile['y']

In [13]:
# Sanity check: number of windows loaded.
len(x)

3600

In [14]:
from dataset_helpers import combine_images
from segmentation import get_mask, iou

In [15]:
# x_combined: combined images, y_combined: the label associated with each one.
# NOTE(review): the pairing rule and the effect of mask=False live in
# dataset_helpers.combine_images, which is not shown here -- confirm there.
x_combined, y_combined = combine_images(x, filenames, mask=False)

Images preprocessed. Size of dataset: 1800


In [16]:
def get_overlaps(x, y):
    """
    Compute one overlap value per combined image.

    For every image, the intersection over union (iou) of the masks of its
    channel 1 and channel 0 is stored. Presumably these are the green (dcell)
    and red (tcell) channels described in Step 3 -- confirm against
    combine_images. Images whose label equals 3 are treated as faulty and
    get an overlap of 0 without being masked.

    @parameters:
    x = combined images, indexed as x[i, ..., channel]
    y = one label per image in x

    @returns:
    float32 array of length len(x) with the overlap values
    """
    n_images = len(x)
    # every entry is assigned below, so the array can start uninitialised
    overlaps = np.empty((n_images,), dtype=np.float32)

    print("Looping through images...")
    for idx in range(n_images):
        if y[idx] == 3:
            # image is faulty
            overlaps[idx] = 0
            continue

        channel_one_mask = get_mask(x[idx, ..., 1])
        channel_zero_mask = get_mask(x[idx, ..., 0])
        overlaps[idx] = iou(channel_one_mask, channel_zero_mask)

    return overlaps

In [17]:
# One overlap value per combined image; get_overlaps writes 0 for images labelled 3 (faulty).
y_overlaps = get_overlaps(x_combined, y_combined)

Looping through images...


In [18]:
# Destination archive for the overlap values.
out = repo_path + 'data/sample_data/processed/sample_overlaps.npz'

In [19]:
# The values are stored under the key 'overlaps' (not 'y_overlaps').
np.savez(out, overlaps=y_overlaps)