#### Training Image Filename Organization

This notebook renames the marker model training dataset images (and their annotation files) so that each filename reflects the chip type it was captured on.

In [2]:
import os
import os.path as osp
import celldom
import pandas as pd
# Directory holding the marker model training images: the 'training/marker/r0.6'
# subfolder of the celldom dataset directory.
marker_data_dir = osp.join(celldom.get_dataset_dir(), 'training', 'marker', 'r0.6')

In [11]:
#!ls $marker_data_dir/annotations

In [9]:
import shlex

# Generate (but do not execute) a shell script that prepends a chip-type prefix
# to every '.tif' image in `marker_data_dir` and, when present, to the matching
# '.xml' file under `annotations/`. The script is written to /tmp/renames.sh so
# it can be reviewed before being run by hand.

# Image filename prefix -> chip-type prefix to prepend.
chip_prefixes = {'BFF_': 'G03_', 'BF_': 'G02_'}

res = []
# Sorted so the generated script is identical from run to run (os.listdir order
# is arbitrary).
for f in sorted(os.listdir(marker_data_dir)):
    if not f.endswith('.tif'):
        continue
    # Files that already carry a chip-type prefix have been renamed before;
    # skipping them keeps this cell re-runnable after the script was applied.
    if f.startswith(tuple(chip_prefixes.values())):
        continue
    path = osp.join(marker_data_dir, f)
    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier '.tif' occurring inside the name.
    annot_file = f[:-len('.tif')] + '.xml'
    annot_path = osp.join(marker_data_dir, 'annotations', annot_file)

    ctyp = next((c for p, c in chip_prefixes.items() if f.startswith(p)), None)
    if ctyp is None:
        raise ValueError('No chip type known for file {}'.format(f))

    pdest = osp.join(marker_data_dir, ctyp + f)
    # shlex.quote keeps paths with spaces or shell metacharacters intact and
    # leaves plain paths unchanged.
    res.append('mv {} {}'.format(shlex.quote(path), shlex.quote(pdest)))
    if osp.exists(annot_path):
        pdest = osp.join(marker_data_dir, 'annotations', ctyp + annot_file)
        res.append('mv {} {}'.format(shlex.quote(annot_path), shlex.quote(pdest)))
script = '\n'.join(res)
with open('/tmp/renames.sh', 'w') as fd:
    fd.write(script)

In [12]:
#!cat /tmp/renames.sh

#### G1 Chip Files

In [13]:
# 'dataset06' folder inside the celldom dataset directory; its subdirectories
# are the ones renamed and globbed in the cells below.
data_dir = osp.join(celldom.get_dataset_dir(), 'dataset06')
data_dir

'/lab/data/celldom/dataset/dataset06'

In [21]:
# Scratch: grab the current timestamp to try out strftime formatting in the next cell.
dt = pd.to_datetime('now')

In [22]:
# Scratch: '%Y' renders the four-digit year of the timestamp.
dt.strftime('%Y')

'2018'

In [27]:
# Print (but do not execute) one `mv` command per directory in `data_dir` whose
# name ends in "hr". Each directory gets a timestamp suffix computed as
# (date in the first token) + (hour count in the second-to-last token).
res = []
for dirname in os.listdir(data_dir):
    if dirname.endswith('hr'):
        tokens = dirname.split()
        # First token is the base date with dots, e.g. "2018.02.17" -> "20180217"
        base_date = tokens[0].replace('.', '')
        # Second-to-last token is the elapsed hour count, e.g. "... 41 hr" -> 41
        elapsed_hours = int(tokens[-2])
        elapsed = pd.to_timedelta('{} hr'.format(elapsed_hours))
        stamp = (pd.to_datetime(base_date) + elapsed).strftime('%Y%m%d%H%M')
        src = osp.join(data_dir, dirname)
        dst = osp.join(data_dir, dirname + ' ' + stamp)
        res.append('mv "{}" "{}"'.format(src, dst))
script = '\n'.join(res)
print(script)

mv "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr" "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000"
mv "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 100 hr" "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 100 hr 201802210400"
mv "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 117 hr" "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 117 hr 201802212100"
mv "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 17 hr" "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 17 hr 201802171700"
mv "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 41 hr" "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 41 hr 201802181700"
mv "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 63 hr" "/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 63 hr 201802191500"
mv "/lab/data/celldom/data

### Training Data

In [29]:
import glob
# Collect every .tif image one directory level below `data_dir`.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the fixed-seed sample drawn from it later reproducible.
files = sorted(glob.glob(osp.join(data_dir, '*/*.tif')))
len(files)

576

In [30]:
# Peek at the first ten matched image paths.
files[:10]

['/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_000_Apt_004_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_000_Apt_008_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_000_Apt_020_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_002_Apt_012_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_002_Apt_016_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_002_Apt_024_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_004_Apt_000_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17 G1 35 K562 Test 2 0 hr 201802170000/BF_16X_St_004_Apt_004_F_000.tif',
 '/lab/data/celldom/dataset/dataset06/2018.02.17

In [31]:
# Draw a random subset of 100 image paths; the fixed random_state makes the
# draw repeatable for a given `files` list.
file_series = pd.Series(files)
rfiles = file_series.sample(n=100, random_state=1)
len(rfiles)

100