In [None]:
import json
import os
from pathlib2 import Path

import numpy as np
import pandas as pd
import h5py as h5
import tqdm
from skimage.io import imsave
from skimage.exposure import rescale_intensity

In [None]:
# Load the two feature tables: `df` (file dated 20210903, i.e. the samples
# from before Oct. 2021) and `df_new` (file dated 20211022, the newest samples).
# They are only loaded here; nothing is merged in this cell.
df, df_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/scratch/hoerl/auto_sir_dna_comp/20210903_glcm_all_extrafeats.csv',
        '/scratch/hoerl/auto_sir_dna_comp/20211022_glcm_all_extrafeats.csv',
    )
)

In [None]:
# Load the results of the earlier manual sorting round from JSON.
# The lookup code in this cell reads the keys 'good' and 'bad' from it.
# (The bare `sorting_dict` expression that used to follow was dropped: it sat
# in the middle of the cell, where IPython does not display it.)
with open('/scratch/hoerl/auto_sir_dna_comp/sorting20210316.json', 'r') as fd:
    sorting_dict = json.load(fd)

def get_classification_from_dict(row, sorting_dict):
    """Look up the manual label of one table row.

    Parameters
    ----------
    row
        Object with ``filename`` and ``dataset_name`` attributes
        (e.g. a DataFrame row).
    sorting_dict
        Mapping with the keys ``'good'`` and ``'bad'``; each value is a
        collection of ``[file name, dataset name]`` lists, where the file
        name has no directory part and no ``'.h5'``.

    Returns
    -------
    str
        ``'good'`` or ``'bad'`` if the row's pair is listed under that key
        (``'good'`` is checked first), otherwise ``'unclassified'``.
    """
    # key in the same form as stored in the dict: bare file name with every
    # '.h5' occurrence removed, paired with the dataset name (as a list)
    key = [os.path.basename(row.filename).replace('.h5', ''), row.dataset_name]

    for label in ('good', 'bad'):
        if key in sorting_dict[label]:
            return label
    return 'unclassified'

# Label every row of the old table with its manual class
# ('good' / 'bad' / 'unclassified'); axis=1 hands each row to the lookup.
df['classification_manual'] = df.apply(
    get_classification_from_dict, axis=1, args=(sorting_dict,)
)

In [None]:
# Root folder for the exported PNGs of this sorting round.
outdir = Path('/scratch/hoerl/auto_sir_dna_comp/sorting20211108')

# One sub-folder per manual class. Each is created independently with
# exist_ok=True, so a folder that already exists is left alone and a
# pre-existing (or half-created) outdir still ends up with both sub-folders.
# Previously 'good' and 'bad' were only made when outdir itself was missing.
# parents=True also creates outdir itself when needed.
for class_name in ('good', 'bad'):
    (outdir / class_name).mkdir(parents=True, exist_ok=True)

In [None]:
# Write every manually classified image as a PNG into outdir/<class>/.
for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    # rows without a manual label are not exported in this loop
    if row.classification_manual == 'unclassified':
        continue

    # load from h5
    with h5.File(row.filename, 'r') as h5_file:
        pixels = h5_file[f'experiment/{row.dataset_name}/0/0'][...].squeeze()

    # rescale as we did in original feature ext:
    # map the row's [perc_low, perc_high] window onto the uint8 range
    pixels_u8 = rescale_intensity(
        pixels, in_range=(row.perc_low, row.perc_high), out_range='uint8'
    ).astype(np.uint8)

    # <h5 file name with '.h5' replaced by '_'><dataset name>.png
    png_name = Path(row.filename).name.replace('.h5', '_') + row.dataset_name + '.png'
    imsave(str(outdir / row.classification_manual / png_name), pixels_u8)

In [None]:
# The next cell draws (to_sample_total - sampled_old) rows.
# NOTE(review): presumably sampled_old is the number of images already drawn
# in an earlier round and to_sample_total the overall target; confirm.
sampled_old = 250
to_sample_total = 500

In [None]:
# save new sample
# Pool to draw from: every row of `df` that is still 'unclassified' plus all
# rows of `df_new`.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; with default arguments both stack the frames the same way.
# A fixed random_state makes the draw repeatable, so re-running this cell
# rewrites the same PNGs instead of adding a different random batch to outdir.
sample_seed = 0
new_sample = pd.concat(
    [df[df.classification_manual == 'unclassified'], df_new]
).sample(to_sample_total - sampled_old, random_state=sample_seed)

for i, r in tqdm.tqdm(new_sample.iterrows(), total=len(new_sample)):
    # load from h5
    with h5.File(r.filename, 'r') as fd:
        img = fd[f'experiment/{r.dataset_name}/0/0'][...].squeeze()

    # rescale as we did in original feature ext:
    # map the row's [perc_low, perc_high] window onto the uint8 range
    percs = (r.perc_low, r.perc_high)
    img = rescale_intensity(img, percs, 'uint8').astype(np.uint8)

    # not yet sorted, so the PNG goes directly into outdir (no class sub-folder)
    out_filename = Path(r.filename).name.replace('.h5', '_') + r.dataset_name + '.png'
    imsave(str(outdir / out_filename), img)