In [46]:
from utils import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import h5py
from tqdm import tqdm
from sklearn.utils import shuffle

# Problem Statement and Setup
We will be focusing on the Hep G2 cells in the dataset to limit the scope of our project. Depending on our progress, we have a path to extend the project by incorporating other cell types as well.

### Tasks:
1. Separate HEPG2 experiments based on t-SNE plots (2 test, 5 train).
2. Create a baseline method for siRNA classification.
3. Train model on HEPG2 data.
4. Create denoising procedures. 
5. Train model on denoised data.
6. Compare $\Delta$ accuracy between the model trained on the original data and the model trained on the denoised data.

#### Train, test splits

Based on the t-SNE plots from the previous notebook, we will use HEPG2-02 and HEPG2-04 as our test set. The reasoning is that we want the test experiments to be dissimilar both from each other and from the training experiments.

In [2]:
# Experiment split: five HEPG2 experiments for training, two held out for testing.
# Names are built from the experiment numbers, zero-padded to two digits.
train_experiments = ['HEPG2-{:02d}'.format(number) for number in (1, 3, 5, 6, 7)]
test_experiments = ['HEPG2-{:02d}'.format(number) for number in (2, 4)]

Training scheme: 1 input --> VAE --> compare to `k` other experiments and to itself.

In [3]:
def nCk(n, k):
    """Return the binomial coefficient C(n, k): the number of ways to choose k items from n.

    Args:
        n: Non-negative integer, size of the set.
        k: Non-negative integer, number of items chosen.

    Returns:
        int: n! / (k! * (n - k)!), or 0 when k > n (no way to choose more
        items than are available).

    Raises:
        ValueError: If n or k is negative (raised by math.factorial).
    """
    # Without this guard, k > n would ask for the factorial of a negative
    # number and fail with an unrelated-looking ValueError.
    if 0 <= n < k:
        return 0
    return math.factorial(n) // (math.factorial(k) * math.factorial(n - k))

In [4]:
# --- Experiment layout ---
total_experiment_count = 7   # all HEPG2 experiments (train + test lists combined)
train_experiment_count = 5   # experiments used for training
plates_per_experiment = 4

# --- Imaging layout per well ---
sites_per_well = 2
channels_per_image = 6
res = 256                    # side length, in pixels, of each square image crop
images_per_site = 4          # number of crops stored per site

# Number of other experiments an input is compared against (see the note above this cell).
k = 2

# TODO: the training-set size estimate below is disabled. It relies on
# `control_sirna_count`, which is only computed in a later cell, and on
# `images_per_well`, which is not defined in the visible cells.
# n_train = (control_sirna_count * train_experiment_count * plates_per_experiment * images_per_well #choose input
#            * nCk(train_experiment_count-1, k) * plates_per_experiment**k * images_per_well)
# print('{:,} unique training examples'.format(n_train))

In [5]:
# Read the control-well metadata; the cell line is the part of id_code before the first '-'.
controls_csv_path = os.path.join(RECURSION_TRAIN, 'train_controls.csv')
train_controls_df = pd.read_csv(controls_csv_path)
train_controls_df['cell_line'] = [parts[0] for parts in train_controls_df.id_code.str.split('-')]

# Keep only HEPG2 positive-control wells.
is_hepg2 = train_controls_df['cell_line'] == 'HEPG2'
is_positive_control = train_controls_df['well_type'] == 'positive_control'
train_controls_df = train_controls_df[is_hepg2 & is_positive_control]

# Drop every siRNA that is absent from plate 2 of HEPG2-07, so that the
# remaining siRNAs are all present on that plate.
on_reference_plate = (train_controls_df.experiment == 'HEPG2-07') & (train_controls_df.plate == 2)
reference_sirna = set(train_controls_df[on_reference_plate].sirna.values)
missing_sirna = list(set(train_controls_df.sirna) - reference_sirna)
train_controls_df = train_controls_df[~train_controls_df.sirna.isin(missing_sirna)]

# Deterministic row order and a clean 0..N-1 index.
train_controls_df = (
    train_controls_df
    .sort_values(by=['sirna', 'experiment', 'plate'])
    .reset_index(drop=True)
)

# Map each remaining siRNA id to a contiguous integer index (order of first appearance).
sirnas = {sirna: position for position, sirna in enumerate(train_controls_df.sirna.unique())}
control_sirna_count = len(sirnas)
train_controls_df.head(-30)

Unnamed: 0,id_code,experiment,plate,well,sirna,well_type,cell_line
0,HEPG2-01_1_H22,HEPG2-01,1,H22,1108,positive_control,HEPG2
1,HEPG2-01_2_H03,HEPG2-01,2,H03,1108,positive_control,HEPG2
2,HEPG2-01_3_M07,HEPG2-01,3,M07,1108,positive_control,HEPG2
3,HEPG2-01_4_M15,HEPG2-01,4,M15,1108,positive_control,HEPG2
4,HEPG2-02_1_C22,HEPG2-02,1,C22,1108,positive_control,HEPG2
...,...,...,...,...,...,...,...
777,HEPG2-06_2_F22,HEPG2-06,2,F22,1136,positive_control,HEPG2
778,HEPG2-06_3_J11,HEPG2-06,3,J11,1136,positive_control,HEPG2
779,HEPG2-06_4_H11,HEPG2-06,4,H11,1136,positive_control,HEPG2
780,HEPG2-07_1_F03,HEPG2-07,1,F03,1136,positive_control,HEPG2


In [6]:
# Dense array holding every control image crop, indexed as
# (sirna index, experiment, plate, site, crop, <two spatial axes>, channel).
images = np.empty((control_sirna_count, total_experiment_count, plates_per_experiment,
                   sites_per_well, images_per_site, res, res, channels_per_image), dtype=np.float32)
# np.empty leaves the buffer uninitialised, so track which (sirna, experiment, plate)
# slots are written and verify full coverage once the loop is done.
filled = np.zeros(images.shape[:3], dtype=bool)
for i, row in tqdm(train_controls_df.iterrows(), total=len(train_controls_df)):
    sirna = sirnas[row.sirna]
    # Parse the whole numeric suffix ("HEPG2-07" -> 6) rather than only the last
    # character, so experiment numbers of 10 or more map to the right slot.
    exp = int(row.experiment.rsplit('-', 1)[-1]) - 1
    plate = row.plate - 1
    for site in range(sites_per_well):
        for ch in range(channels_per_image):
            path = get_image_path(row.experiment, row.plate, row.well, site+1, ch+1, train=True)
            img = read_image(path)  # presumably a 2-D array at least res x res -- TODO confirm
            # Four res x res corner crops: top-left, top-right, bottom-left, bottom-right.
            crops = np.stack((img[:res, :res], img[:res, -res:],
                              img[-res:, :res], img[-res:, -res:]))
            images[sirna, exp, plate, site, :, :, :, ch] = crops
    filled[sirna, exp, plate] = True

assert filled.all(), (
    '{} (sirna, experiment, plate) slots were never filled and still hold '
    'uninitialised data'.format(int((~filled).sum()))
)

812it [00:42, 19.30it/s]


### Exporting to h5 and npy files

In [14]:
# Sizes of the five leading axes of `images` (everything before the two res-sized axes and the channel axis).
images.shape[:5]

(29, 7, 4, 2, 4)

In [13]:
# Total number of individual image crops: the product of the five leading axes of `images`.
# np.prod is used because the np.product alias is deprecated and was removed in NumPy 2.0.
np.prod(images.shape[:5])

6496

In [41]:
# Collapse the five leading axes into a single sample axis; the trailing three
# axes (two spatial axes and the channel axis) are kept unchanged. Sizes are read
# from `images.shape` so this stays correct if the layout constants change.
dataset_x = images.reshape((-1,) + images.shape[5:])
# One int32 label per sample: the sample's index along axis 0 of `images`
# (the integer assigned to its siRNA in `sirnas`, not the raw siRNA id).
# np.prod replaces np.product, which was removed in NumPy 2.0.
dataset_y = np.repeat(np.arange(images.shape[0], dtype=np.int32), np.prod(images.shape[1:5]))

In [48]:
# Shuffle samples and labels with the same permutation. The fixed random_state
# makes the exported sample order reproducible after a kernel restart.
# Note: re-running this cell shuffles the already-shuffled arrays again.
dataset_x, dataset_y = shuffle(dataset_x, dataset_y, random_state=0)

In [None]:
# Save the structured `images` array as a .npy file. The target directory is
# created first so np.save does not fail when it is missing.
npy_dir = os.path.join(RECURSION_DIR, 'npy')
os.makedirs(npy_dir, exist_ok=True)
np.save(os.path.join(npy_dir, 'train-controls-f32.npy'), images, allow_pickle=False)

In [43]:
# Open (and truncate, because of mode 'w') the HDF5 export file. The target
# directory is created first so the open does not fail when it is missing.
# The handle stays open across the following cells until hf.close() is called.
h5_dir = os.path.join(RECURSION_DIR, 'h5')
os.makedirs(h5_dir, exist_ok=True)
hf = h5py.File(os.path.join(h5_dir, 'train-controls-f32.h5'), 'w')

In [44]:
# Store the full 8-D `images` array in the open HDF5 file under the key 'structured'.
hf.create_dataset('structured', data=images)

<HDF5 dataset "structured": shape (29, 7, 4, 2, 4, 256, 256, 6), type "<f4">

In [51]:
# Store the flattened sample array and its label vector in the same HDF5 file,
# under the keys 'dataset_x' and 'dataset_y'.
hf.create_dataset('dataset_x', data=dataset_x)
hf.create_dataset('dataset_y', data=dataset_y)

<HDF5 dataset "dataset_y": shape (6496,), type "<i4">

In [52]:
# Close the HDF5 file handle `hf`; it has been open since the h5py.File(...) cell above.
hf.close()