# Cleaning the predictions dataframe & slicing out the targets
There was probably a better way to save the predictions.

### Import things

In [1]:
import ast
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tifffile as tiff
from PIL import Image, ImageDraw
from torchvision.transforms import ToPILImage

In [2]:
# Make a new folder for the sliced targets.
# `exist_ok=True` makes re-running the cell a no-op and missing parent folders
# are created as well; any other failure (e.g. a permissions problem) is raised
# here instead of being silently swallowed and only surfacing when the crops
# are saved.
os.makedirs('../test/boxed', exist_ok=True)

### Load dataset

In [3]:
# Load the saved predictions; the boxes and scores columns arrive as strings.
df = pd.read_csv('../CSVs/test_preds.csv')

In [4]:
# The 'labels' column is not needed for the rest of the notebook, so remove it.
df = df.drop(columns='labels')

In [5]:
# Preview the first rows to see the raw string format of boxes and scores.
df.head()

Unnamed: 0,filename,boxes,scores
0,26dc41664_0_0.tiff,"tensor([[ 21.2187, 20.8178, 348.1617, 579.371...","tensor([0.1147], device='cuda:0')"
1,26dc41664_0_10576.tiff,"tensor([], device='cuda:0', size=(0, 4))","tensor([], device='cuda:0')"
2,26dc41664_0_11237.tiff,"tensor([[130.6645, 587.1138, 219.9371, 594.747...","tensor([0.1392, 0.1108, 0.0629], device='cuda:0')"
3,26dc41664_0_11898.tiff,"tensor([[106.1907, 434.3074, 380.4201, 596.000...","tensor([0.1193], device='cuda:0')"
4,26dc41664_0_12559.tiff,"tensor([], device='cuda:0', size=(0, 4))","tensor([], device='cuda:0')"


### Cleaning

The boxes and scores columns are strings of tensor arrays.  
I need to turn them into lists of lists.

    1. split by '[', get rid of index 0
    2. join by '['
    3. split again by ']', get rid of last index
    4. join by ']'
    5. get rid of extra spaces
    6. replace each '\n' with a single space
    7. parse the boxes column into Python lists (the scores are converted to floats later)

In [6]:
# steps 1 - 6 for boxes
def _tensor_body(raw):
    """Return the text between the first '[' and the last ']' of *raw*.

    All spaces are removed and every newline becomes a single space. A string
    without an opening or closing bracket collapses to ''.
    """
    after_open = raw.partition('[')[2]            # everything after the first '['
    before_close = after_open.rpartition(']')[0]  # everything before the last ']'
    return before_close.replace(' ', '').replace('\n', ' ')


df['boxes'] = df['boxes'].map(_tensor_body)

In [7]:
# steps 1 - 6 for scores
def _tensor_body(raw):
    """Return the text between the first '[' and the last ']' of *raw*.

    All spaces are removed and every newline becomes a single space. A string
    without an opening or closing bracket collapses to ''.
    """
    after_open = raw.partition('[')[2]            # everything after the first '['
    before_close = after_open.rpartition(']')[0]  # everything before the last ']'
    return before_close.replace(' ', '').replace('\n', ' ')


df['scores'] = df['scores'].map(_tensor_body)

In [8]:
# step 7 for boxes
# Parse each cleaned string into Python lists. `ast.literal_eval` only accepts
# literals, so text coming from the CSV cannot execute code the way `eval`
# could. The resulting layout is unchanged:
#   - one box      -> a flat [xmin, ymin, xmax, ymax] list
#   - several boxes -> a list of such lists
#   - no boxes      -> the empty string is kept as-is so it can be filtered out
df['boxes'] = [
    list(ast.literal_eval(text)) if text else text
    for text in df['boxes']
]

In [9]:
# Rows without any detection still hold '' in the boxes column: keep only the
# other rows, then renumber the index from 0.
has_boxes = df['boxes'] != ''
df = df.loc[has_boxes].reset_index(drop=True)

### Save the new clean predictions dataframe

In [10]:
# Convert every comma-separated score string into a list of floats.
def _to_float_list(csv_text):
    """Split *csv_text* on commas and convert each piece with float()."""
    return list(map(float, csv_text.split(',')))


df['scores'] = df['scores'].map(_to_float_list)

In [11]:
# Write the cleaned predictions without the index column.
# NOTE(review): boxes/scores hold Python lists at this point, so they are
# presumably written as their string representation — confirm before reading
# this file back in.
df.to_csv('../CSVs/preds_clean.csv', index=False)

### Making a new dataframe of just the targets

In [12]:
# Preview the cleaned dataframe (boxes and scores are now lists).
df.head()

Unnamed: 0,filename,boxes,scores
0,26dc41664_0_0.tiff,"[21.2187, 20.8178, 348.1617, 579.3714]",[0.1147]
1,26dc41664_0_11237.tiff,"[[130.6645, 587.1138, 219.9371, 594.747], [117...","[0.1392, 0.1108, 0.0629]"
2,26dc41664_0_11898.tiff,"[106.1907, 434.3074, 380.4201, 596.0]",[0.1193]
3,26dc41664_0_1322.tiff,"[[0.0, 317.703, 294.4255, 581.2195], [559.3757...","[0.1837, 0.1632, 0.0716]"
4,26dc41664_0_13220.tiff,"[0.0, 100.0075, 80.7845, 185.2113]",[0.0932]


In [13]:
# Flatten the per-image predictions into one record per confident detection.
SCORE_THRESHOLD = 0.9  # a rounded score must be strictly above this to be kept
SLICES_DIR = '../test/images/slices'

new_df = []

# Walk over every row of df rather than a hard-coded row count, so the cell
# keeps working when the number of predictions changes.
for img_name, boxes, scores in zip(df['filename'], df['boxes'], df['scores']):
    # A row with a single detection stores its box as one flat
    # [xmin, ymin, xmax, ymax] list; wrap it so both layouts index the same way.
    if boxes and not isinstance(boxes[0], (list, tuple)):
        boxes = [boxes]

    # Scores are rounded to 4 decimals before the threshold comparison.
    rounded = [np.round(score, decimals=4) for score in scores]
    kept = [
        (target, box, score)
        for target, (box, score) in enumerate(zip(boxes, rounded))
        if score > SCORE_THRESHOLD
    ]
    if not kept:
        # Nothing passes the threshold: skip reading the image from disk.
        continue

    img_array = tiff.imread(os.path.join(SLICES_DIR, img_name))

    for target, box, score in kept:
        new_df.append({
            'filename': img_name,
            'img_size': img_array.shape,
            'target': target,  # position of the detection within its image
            'xmin': box[0],
            'ymin': box[1],
            'xmax': box[2],
            'ymax': box[3],
            'boxes': box,
            'scores': score,
        })

In [14]:
# One row per kept detection; the columns come from the keys of each record.
targets = pd.DataFrame(new_df)

### Add original filenames and original size columns

In [15]:
# Take every other entry among the first ten sorted names in ../test/images.
# NOTE(review): presumably this selects the five original .tiff files and skips
# their companion files — confirm against the actual directory listing.
originals = sorted(os.listdir('../test/images'))[0:10:2]

In [16]:
# Record the size of every original image next to its shortened name.
original_sizes = []
for tiff_name in originals:
    pixels = tiff.imread(os.path.join('../test/images/', tiff_name))
    if pixels.ndim == 5:
        # Drop the length-1 axes, then move the leading axis to the end.
        pixels = pixels.squeeze().transpose(1, 2, 0)
    original_sizes.append({
        # name minus its last five characters (presumably the '.tiff' suffix)
        'filename': tiff_name[:-5],
        # every dimension except the last one
        'original_size': pixels.shape[:-1],
    })

In [17]:
# Table with one row per original image: 'filename' and 'original_size'.
real_sizes=pd.DataFrame(original_sizes)

In [18]:
# The first nine characters of a slice filename are used as the name of the
# original image it was cut from.
targets['original_filename'] = targets['filename'].str[:9]

In [19]:
# Preview the first ten targets, including the new original_filename column.
targets.head(10)

Unnamed: 0,filename,img_size,target,xmin,ymin,xmax,ymax,boxes,scores,original_filename
0,26dc41664_10132_1322.tiff,"(596, 661)",0,630.2668,312.7975,660.674,461.6491,"[630.2668, 312.7975, 660.674, 461.6491]",0.9509,26dc41664
1,26dc41664_10132_13220.tiff,"(596, 661)",0,312.5899,256.57,518.7542,445.807,"[312.5899, 256.57, 518.7542, 445.807]",0.985,26dc41664
2,26dc41664_10132_13881.tiff,"(596, 661)",0,560.76,104.6,659.35,320.72,"[560.76, 104.6, 659.35, 320.72]",0.9953,26dc41664
3,26dc41664_10132_13881.tiff,"(596, 661)",1,304.91,291.88,464.35,421.01,"[304.91, 291.88, 464.35, 421.01]",0.992,26dc41664
4,26dc41664_10132_13881.tiff,"(596, 661)",2,3.0319,69.226,130.91,314.87,"[3.0319, 69.226, 130.91, 314.87]",0.9106,26dc41664
5,26dc41664_10132_14542.tiff,"(596, 661)",0,0.0,115.7061,139.6929,316.9232,"[0.0, 115.7061, 139.6929, 316.9232]",0.9832,26dc41664
6,26dc41664_10132_15203.tiff,"(596, 661)",0,238.0342,89.8419,459.9461,284.308,"[238.0342, 89.8419, 459.9461, 284.308]",0.9964,26dc41664
7,26dc41664_10132_16525.tiff,"(596, 661)",0,315.1032,527.9118,540.7087,595.5145,"[315.1032, 527.9118, 540.7087, 595.5145]",0.951,26dc41664
8,26dc41664_10132_17847.tiff,"(596, 661)",0,211.731,252.49,418.7977,425.9188,"[211.731, 252.49, 418.7977, 425.9188]",0.9943,26dc41664
9,26dc41664_10132_17847.tiff,"(596, 661)",1,105.11,543.1493,252.5496,596.0,"[105.11, 543.1493, 252.5496, 596.0]",0.964,26dc41664


In [20]:
# Attach the original image size to each target. Both frames have a 'filename'
# column, so the merge yields 'filename_x' (slice) and 'filename_y' (original).
targets = targets.merge(
    real_sizes,
    how='inner',
    left_on='original_filename',
    right_on='filename',
)

In [21]:
# 'filename_y' is the right-hand key of the merge and duplicates
# 'original_filename', so remove it.
targets = targets.drop(columns='filename_y')

In [22]:
# Give the slice filename column its plain name back after the merge suffixing.
targets = targets.rename(columns = {'filename_x': 'filename'})

In [23]:
# Write the targets table without the index column.
targets.to_csv('../CSVs/targets.csv', index=False)

### Slice out targets and save the images

In [24]:
# Crop every target out of its slice image and save it as its own TIFF.
for img, group in targets.groupby('filename', sort=False):
    # Read each slice once, even when it contains several targets.
    img_array = tiff.imread(os.path.join('../test/images/slices', img))

    # Drop the five-character '.tiff' suffix so outputs are named
    # '<slice>_tgt_<idx>.tiff' rather than '<slice>.tiff_tgt_<idx>.tiff'.
    img_name = img[:-5]

    for row in group.itertuples():
        # int() truncates the float coordinates to pixel indices.
        xmin, ymin = int(row.xmin), int(row.ymin)
        xmax, ymax = int(row.xmax), int(row.ymax)

        # Rows are the y axis and columns the x axis of the image array.
        crop = img_array[ymin:ymax, xmin:xmax]

        # row.Index is the target's row label in `targets`, which keeps the
        # output names unique across images.
        Image.fromarray(crop).save(f'../test/boxed/{img_name}_tgt_{row.Index}.tiff')