# This is an interactive file for transforming the raw t-cell/dcell dataset into compressed files.

## Input format:
- A folder containing images 
- Each image has a counterpart: for each "filename" (letter - digit), there is a red image, and a green image
- red image = tcell
- green image = dendritic cell
- we need both the separated images (B&W) and the combined images (RGB)
- Each image is 2048x2048 8MB TIFF image

## Steps
1. Pass a 192x192 sliding window over the images. 
2. Store the filenames
3. Take each of the reduced images, and combine them to create RGB images (red channel = tcell, green channel = dcell)
4. While doing this process, store the corresponding category labels in a file and calculate the intersection over union overlap.

## Output:
**DATA_full.npz**
- x: raw images 
- y: source filename of each image in x (filenames repeat, since one file yields many windows)   

**DATA_metrics.npz**
- y_combined : label for corresponding filenames (Unstimulated, OVA, ConA, Faulty)
- y_overlaps : overlap value for RGB image 
- y_no_faulty : label for corresponding filenames, but Faulty images are not singled out 

### I am not combining tcell+dcell into an RGB dataset because doing so produced a 13 GB dataset (3 times as big), which I cannot upload to Google Drive, which I use to train models with a GPU through Colab.
### Hence the original, unmodified images need to be kept so they can be processed in-notebook. 

## Step 1: pass a sliding window over the images

In [1]:
from dataset_helpers import read_folder_filenames, is_dmso
from dataset_helpers import sliding_window

from skimage.io import imread
import numpy as np

In [None]:
def compress_images(out, filenames, size):
    """
    Cut every image into windows and save all of them in one npz archive.

    Nothing is returned; the archive written to `out` contains:
     - x: every window produced by sliding_window, stacked into one array
     - y: for each window, the (unmodified) filename it was cut from

    @parameters:
    out = name of the outputted compressed file
    filenames = all filenames of files to compress
    size = window size handed to sliding_window

    @assumptions:
    * validity of filenames has been checked
    """

    tiles = []
    sources = []

    for path in filenames:
        # The full-size image is only a temporary; just its windows are kept.
        for tile in sliding_window(imread(path), size):
            tiles.append(tile)
            sources.append(path)

    # Rebind so the Python lists can be released before the archive is written.
    tiles = np.array(tiles)
    sources = np.array(sources)
    np.savez(out, x=tiles, y=sources)

    print("All files compressed into %s" % out)

### Loading filenames

In [None]:
#filenames = sorted(read_folder_filenames(your_folder_here))

In [None]:
# Root directory holding the image folders; the trailing slash matters because
# folder names are appended to it by plain string concatenation.
folder = '/Volumes/TARDIS/'

In [None]:
# Sorted file listing of each image folder under `folder`.
CK19_files, CK21_files, CK22_files = [
    sorted(read_folder_filenames(folder + subdir))
    for subdir in ("CK19", "CK21", "CK22")
]

In [None]:
# Flatten the three listings into ONE sorted list of filenames.
# (Sorting a list *of lists* would leave three nested lists, so code that
# iterates all_files would receive whole lists instead of single filenames.)
all_files = sorted(CK19_files + CK21_files + CK22_files)

In [None]:
# Keep only the files that is_dmso accepts, in their original order.
DMSO_files = [candidate for candidate in all_files if is_dmso(candidate)]

### Take into consideration before running:
* this will take A LOT of memory
* a laptop with 8 GB of RAM struggles


In [None]:
#compress_images("your_output", your_files, your_size)

In [None]:
# Window every CK19 image at 192 px and archive the result.
compress_images(
    out="/Volumes/TARDIS/CK19_full.npz",
    filenames=CK19_files,
    size=192,
)

In [None]:
# Window every DMSO image at 192 px and archive the result.
compress_images(
    out="/Volumes/TARDIS/DMSO_full.npz",
    filenames=DMSO_files,
    size=192,
)

In [None]:
# Window every CK22 image at 192 px and archive the result.
compress_images(
    out="/Volumes/TARDIS/CK22_full.npz",
    filenames=CK22_files,
    size=192,
)

## Step 2: Capture metrics from combined images.

In [None]:
#npzfile = np.load("your_npz_file")

In [17]:
# np.load on an .npz archive returns a dict-like NpzFile; each array is only
# read from disk when it is indexed by key.
npzfile = np.load("/Volumes/TARDIS/CK22_full.npz")

In [18]:
# 'x' holds the window images, 'y' the filename each window was cut from.
x, filenames = npzfile['x'], npzfile['y']

In [19]:
from dataset_helpers import is_faulty, minmax, low_clip, get_label
from segmentation import get_mask, iou

In [13]:
def capture_metrics(x, y, windows_per_image=100):
    """
    Label every tcell/dcell window pair and measure how much the two cells overlap.

    @parameters:
    x = window images, laid out as alternating runs: `windows_per_image` tcell
        windows followed by the `windows_per_image` dcell windows of the
        counterpart image, so x[k] is paired with x[k + windows_per_image]
    y = filename of each window, same length as x
    windows_per_image = number of windows cut from one source image
        (defaults to 100, the value that used to be hard-coded)

    returns:
    (y_combined, y_no_faulty, overlaps): three parallel arrays, one entry per pair
     - y_combined (uint32): 3 when either window of the pair is faulty,
       otherwise get_label of the tcell window's filename
     - y_no_faulty (uint8): get_label of the tcell window's filename,
       whether or not the pair is faulty
     - overlaps (float32): iou of the two masks, 0 for faulty pairs

    @notes:
    * x is left untouched (it used to be zeroed in place, which destroyed the
      caller's data without releasing any memory)
    * for well-formed input the arrays have len(x)//2 entries; when x ends
      with an incomplete tcell/dcell run, only the pairs that were actually
      processed are returned instead of uninitialised trailing values

    raises:
    AssertionError if x and y differ in length
    ValueError if windows_per_image is not positive
    """
    assert len(x) == len(y)
    if windows_per_image < 1:
        raise ValueError("windows_per_image must be a positive integer")

    total = len(x)
    capacity = total // 2
    y_combined = np.empty(capacity, dtype=np.uint32)   # labels - combined
    y_no_faulty = np.empty(capacity, dtype=np.uint8)   # labels without faulty - combined
    overlaps = np.empty(capacity, dtype=np.float32)    # overlap values - combined

    # Exclusive upper bound for a tcell index: its dcell partner must exist.
    last = total - windows_per_image
    pair = 0

    print("Looping through images...")
    # Each stride covers one tcell run plus the dcell run that follows it;
    # the dcell run is consumed through the partner index, so it is stepped over.
    for start in range(0, last, 2 * windows_per_image):
        for idx in range(start, min(start + windows_per_image, last)):
            partner = idx + windows_per_image

            if is_faulty(x[idx]) or is_faulty(x[partner]):
                y_combined[pair] = 3
                y_no_faulty[pair] = get_label(y[idx])
                overlaps[pair] = 0
            else:
                label = get_label(y[idx])
                y_combined[pair] = label
                y_no_faulty[pair] = label
                tcell = get_mask(minmax(low_clip(x[idx].astype(np.float32))))
                dcell = get_mask(minmax(low_clip(x[partner].astype(np.float32))))
                overlaps[pair] = iou(tcell, dcell)

            pair += 1

    # Drop any slots that were never written (only possible for ragged input).
    return y_combined[:pair], y_no_faulty[:pair], overlaps[:pair]

In [None]:
# One entry per tcell/dcell window pair: combined label, label that ignores
# faultiness, and overlap value.
y_combined, y_no_faulty, y_overlaps = capture_metrics(x, filenames)

Looping through images...


In [None]:
#out = "your_out_file"

In [None]:
# Destination archive for the metric arrays.
out = "/Volumes/TARDIS/CK22_metrics.npz"

In [None]:
# Store the three metric arrays under the keys listed for DATA_metrics.npz.
np.savez(
    out,
    y_combined=y_combined,
    y_no_faulty=y_no_faulty,
    y_overlaps=y_overlaps,
)