In [1]:
import os, glob, shutil, json, csv
import numpy as np
import skimage.io as sio

### Copy images

In [2]:
inp_dir = '/data4/plankton_wi17/plankton/images_orig'
out_dir = 'rawcolor_db/images'
specimen_list = []
image_list = {}
for session in sorted(os.listdir(inp_dir)):
    session_dir = os.path.join(inp_dir, session)
    if not os.path.isdir(session_dir):
        continue
    for specimen in sorted(os.listdir(session_dir)):
        specimen_dir = os.path.join(session_dir, specimen)
        if not os.path.isdir(specimen_dir):
            continue
        images = glob.glob(specimen_dir+'/0000000_static_html/images/*/*_rawcolor.png')
        
        images2 = {}
        for fn in images:
            iid = '_'.join(fn.split('/')[-1].split('-')[1:3])
            if iid not in images2:
                images2[iid] = []
            images2[iid].append(fn)
        
        images3 = []
        for iid in images2:
            if len(images2[iid]) == 1:
                images3.append(images2[iid][0])
                
        if len(images3) < 200:
            continue
        
        spc = '{}_{}'.format(session, specimen)
        specimen_list.append(spc)
        image_list[spc] = images3
        

if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
for i, spc in enumerate(specimen_list):
    print '{:03d}/{:03d}: {}'.format(i+1, len(specimen_list), spc)
    spc_dir = os.path.join(out_dir, spc)
    if not os.path.isdir(spc_dir):
        os.makedirs(spc_dir)
    for src in image_list[spc]:
        dest = os.path.join(spc_dir, src.split('/')[-1])
        shutil.copyfile(src, dest)

001/132: 20170124_001
002/132: 20170124_002
003/132: 20170126_001
004/132: 20170126_002
005/132: 20170126_003
006/132: 20170130_001
007/132: 20170130_002
008/132: 20170130_003
009/132: 20170131_001
010/132: 20170131_002
011/132: 20170203_001
012/132: 20170203_002
013/132: 20170206_001
014/132: 20170206_002
015/132: 20170207_001
016/132: 20170207_003
017/132: 20170207_004
018/132: 20170207_006
019/132: 20170209_001
020/132: 20170209_002
021/132: 20170209_003
022/132: 20170210_001
023/132: 20170210_002
024/132: 20170213_001
025/132: 20170213_002
026/132: 20170213_003
027/132: 20170214_001
028/132: 20170214_002
029/132: 20170214_003
030/132: 20170214_004
031/132: 20170214_005
032/132: 20170214_006
033/132: 20170216_001
034/132: 20170216_002
035/132: 20170216_003
036/132: 20170216_004
037/132: 20170216_005
038/132: 20170216_006
039/132: 20170217_001
040/132: 20170217_002
041/132: 20170217_003
042/132: 20170217_004
043/132: 20170217_005
044/132: 20170217_006
045/132: 20170217_007
046/132: 2

### Create metadata

In [4]:
INP_DIR = '/data4/plankton_wi17/plankton/images_orig/'
OUT_DIR = 'rawcolor_db/meta'
def convert_taffy2json(inp_fn, out_fn):
    # I looked for some 3rd-party reader for taffy but could not find any.
    # This probably needs to be improved, if we start getting errors.
    # However, it is working for now.
    import json
    import re
    database = ''.join([l for l in open(inp_fn)])
    database = re.split('TAFFY\(|\)', database)[1].splitlines()
    
    head = 0
    entry_list = []
    while head < len(database) - 1:
        head += 1
        line = database[head].strip()
        if line.startswith('{'):
            entry = {}
            while True:
                head += 1
                line = database[head].strip()
                if not line:
                    continue
                if line.startswith('}'):
                    break
                spt = line.split(':')
                key = spt[0]
                val = ''.join(spt[1:]).strip().split(',')[0]
                if val.startswith("'"):
                    val = val[1:-1]
                else:
                    val = float(val)
                entry[key] = val
            entry_list.append(entry)
    json.dump(entry_list, open(out_fn, 'w'))

# Load taxonomy labels
taxonomy = open('specimen_taxonomy.txt').read().splitlines()
taxonomy = [entry.split('\t') for entry in taxonomy[1:]]
order = {e[0]: e[1] for e in taxonomy}
family = {e[0]: e[2] for e in taxonomy}
genus = {e[0]: e[3] for e in taxonomy}

specimen_list = sorted(os.listdir('rawcolor_db/images'))
for i, spc in enumerate(specimen_list):
    print '{:03d}/{:03d}: {}'.format(i+1, len(specimen_list), spc)
    session, series = spc.split('_')
    out_fn = os.path.join(OUT_DIR, '{}-meta.json'.format(spc))
    inp_fn = os.path.join(INP_DIR, session, series, '0000000_static_html', 'js', 'database.js')
    
    # Convert database to json
    convert_taffy2json(inp_fn, out_fn)
    
    # Add filenames and labels
    meta = json.load(open(out_fn))
    for m in meta:
        file_bn = os.path.splitext(os.path.basename(m['url']))[0]
        m['filename'] = '{}/{}'.format(spc, file_bn+'_rawcolor.png')
        m['order'] = order[spc]
        m['family'] = family[spc]
        m['genus'] = genus[spc]
        del m['url']
        del m['orientation']
        del m['clipped_fraction']
    
    # Index metadata by filename
    meta = {m['filename']: m for m in meta}
    
    # Filter images in db
    spc_images = [spc + '/' + fn for fn in os.listdir(os.path.join('rawcolor_db', 'images', spc))]
    meta = {fn: meta[fn] for fn in spc_images}
    
    # Rewrite to metadata
    json.dump(meta, open(out_fn, 'w'))

001/132: 20170124_001


IOError: [Errno 2] No such file or directory: 'rawcolor_db/meta/20170124_001-meta.json'

### Create subsets

In [8]:
OUT_DIR = 'rawcolor_db/subsets'
META_DIR = 'rawcolor_db/meta'

specimen_list = sorted(os.listdir('rawcolor_db/images'))
for i, spc in enumerate(specimen_list):
    print '\n{:03d}/{:03d}: {}'.format(i+1, len(specimen_list), spc)
    meta = json.load(open(os.path.join(META_DIR, '{}-meta.json'.format(spc))))
    spc_images = meta.keys()
    unixtimes = [int(''.join(fn.split('/')[-1].split('-')[1:3])) for fn in spc_images]
#     order = np.argsort([float(meta[fn]['timestamp']) for fn in spc_images])
    order = np.argsort(unixtimes)
    
    
    num_images = len(meta)
    train_set = order[:int(num_images * 0.5)]
    valid_set = order[int(num_images * 0.6):int(num_images * 0.7)]
    test_set = order[int(num_images * 0.8):]
    
    open('{}/{}-timeseries.lst'.format(OUT_DIR, spc), 'w').write('\n'.join([spc_images[o] for o in order]))
    open('{}/{}-train.lst'.format(OUT_DIR, spc), 'w').write('\n'.join([spc_images[o] for o in train_set]))
    open('{}/{}-valid.lst'.format(OUT_DIR, spc), 'w').write('\n'.join([spc_images[o] for o in valid_set]))
    open('{}/{}-test.lst'.format(OUT_DIR, spc), 'w').write('\n'.join([spc_images[o] for o in test_set]))
    
    print 'Train set: {}\nValid set: {}\nTest set: {}'.format(len(train_set), len(valid_set), len(test_set))


001/132: 20170124_001
Train set: 631
Valid set: 127
Test set: 253

002/132: 20170124_002
Train set: 501
Valid set: 101
Test set: 201

003/132: 20170126_001
Train set: 277
Valid set: 55
Test set: 111

004/132: 20170126_002
Train set: 499
Valid set: 100
Test set: 200

005/132: 20170126_003
Train set: 500
Valid set: 100
Test set: 201

006/132: 20170130_001
Train set: 475
Valid set: 95
Test set: 191

007/132: 20170130_002
Train set: 420
Valid set: 84
Test set: 168

008/132: 20170130_003
Train set: 497
Valid set: 99
Test set: 199

009/132: 20170131_001
Train set: 501
Valid set: 100
Test set: 201

010/132: 20170131_002
Train set: 329
Valid set: 66
Test set: 132

011/132: 20170203_001
Train set: 460
Valid set: 92
Test set: 184

012/132: 20170203_002
Train set: 527
Valid set: 105
Test set: 211

013/132: 20170206_001
Train set: 502
Valid set: 100
Test set: 201

014/132: 20170206_002
Train set: 569
Valid set: 114
Test set: 228

015/132: 20170207_001
Train set: 526
Valid set: 106
Test set: 211



Train set: 313
Valid set: 63
Test set: 126

129/132: 20171207_003
Train set: 307
Valid set: 61
Test set: 123

130/132: 20171207_004
Train set: 292
Valid set: 58
Test set: 117

131/132: 20171207_005
Train set: 307
Valid set: 61
Test set: 123

132/132: 20171207_006
Train set: 299
Valid set: 60
Test set: 120


### Add turk annotation data

In [3]:
# Load all meta files
meta = {}
specimen_list = sorted(os.listdir('rawcolor_db/images'))
for i, spc in enumerate(specimen_list):
    meta[spc] = json.load(open('rawcolor_db/meta/{}-meta.json'.format(spc)))

# Load turk results
results = list(csv.reader(open('turk_results/Batch_3084800_batch_results.csv')))
header = results[0]
ans_idx = header.index('Answer.annotation_data')
wid_idx = header.index('WorkerId')
worker_ids = [rst[wid_idx] for rst in results[1:]]
results = [rst[ans_idx] for rst in results[1:]]

# Add annotations
specimen_dict = [l.split() for l in open('turk_results/turk_db_correspondance.lst')]
specimen_dict = {spc[0]: spc[1] for spc in specimen_dict}
for it, (wid, rst) in enumerate(zip(worker_ids, results)):
    if it % 100 == 0:
        print it, 'of', len(worker_ids), 'done!'
        
    hit = json.loads(rst)
    for entry in hit:
        # Read image and specimen label
        lbl = entry['url'].split('/')[-2]
        spc = specimen_dict[lbl].replace('/', '_')
        if spc not in specimen_list:
            continue
            
        ann_image = os.path.join(spc, entry['url'].split('/')[-1][:-4]+'_rawcolor.png')
        if not os.path.isfile(os.path.join('rawcolor_db', 'images', ann_image)):
            continue
        img = sio.imread(os.path.join('rawcolor_db', 'images', ann_image))
        scale = float(img.shape[1]) / 200.
        
        # Prepare turk annotations
        annotation = {key: entry[key] for key in ['focus', 'z-dir', 'confidence']}
        annotation['worker_id'] = wid
        for tag in ['head', 'tail']:
            if isinstance(entry[tag], dict):
                annotation[tag] = {coord: val * scale for coord, val in entry[tag].iteritems()}
            else:
                annotation[tag] = entry[tag]
                
        # Add to metadata
        if 'annotation' not in meta[spc][ann_image]:
            meta[spc][ann_image]['annotation'] = []
        meta[spc][ann_image]['annotation'].append(annotation)
        
#         plt.figure()
#         plt.imshow(img)
#         head, tail = annotation['head'], annotation['tail']
#         plt.gca().arrow(tail['x'], tail['y'], (head['x'] - tail['x']), (head['y'] - tail['y']), fc='b', ec='b', head_width=5, head_length=10)
#         sdkvcb

# Save metadata
for spc in meta:
    out_fn = 'rawcolor_db/meta/{}-meta.json'.format(spc)
    json.dump(meta[spc], open(out_fn, 'w'))

IOError: [Errno 2] No such file or directory: 'rawcolor_db/meta/20170124_001-meta.json'