# Packager

Given a root directory with a number of image directories, create a package for each image directory.

A package contains:

    1. A CSV defining the cluster assignments for each image
    2. A number of PNG images, one for each cluster
    3. A JSON "form" file (`good.json`) in which the user can manually mark good clusters by setting them to "true"

In [1]:
import os
import pandas as pd
import json

from nbmod import nbloader
from Clusterer import cluster_directory

import sys
sys.path.append("./utils/")
from ivpy import montage

from multiprocessing import Pool, cpu_count

Using TensorFlow backend.


In [2]:
def package_cluster(df, cluster_id, dst_dir):
    """Write the montage image for a single cluster, unless it already exists.

    The image is saved as ``cluster-<cluster_id>.png`` inside ``dst_dir``.
    When a file with that name is already present the function does nothing,
    so repeated runs only render the clusters that are still missing.

    Args:
        df: DataFrame with a "cluster" column and a "localpath" column.
        cluster_id: Value of the "cluster" column selecting the rows to render.
        dst_dir: Directory that receives the PNG.
    """
    in_cluster = df["cluster"] == cluster_id
    members = df[in_cluster]

    out_path = os.path.join(dst_dir, f"cluster-{cluster_id}.png")
    if not os.path.exists(out_path):
        print(f"Packaging cluster {cluster_id} => {out_path}")
        # montage() receives the cluster's "localpath" column; the object it
        # returns is saved straight to the destination path.
        montage(pathcol=members["localpath"], thumb=60).save(out_path)


In [1]:
def package_df(df, dst_dir):
    """Create the per-cluster images and the "good.json" form for one package.

    For every distinct value in ``df["cluster"]`` a montage image is produced
    via ``package_cluster`` and an entry ``"<cluster_id>": false`` is added to
    the form. The form is then written to ``<dst_dir>/good.json``.

    Args:
        df: DataFrame with a "cluster" column (and whatever columns
            ``package_cluster`` needs).
        dst_dir: Package directory; created if it does not exist.

    Returns:
        dict mapping each cluster id (as a string) to ``False``.

    Raises:
        OSError: if ``dst_dir`` cannot be created or the form cannot be
            written.
    """
    # exist_ok tolerates a pre-existing directory while still surfacing real
    # failures (permissions, path is a file) instead of silently ignoring them.
    os.makedirs(dst_dir, exist_ok=True)

    form_json = {}
    for cluster_id in df["cluster"].unique():
        # JSON object keys must be strings, so the id is stringified here.
        form_json[str(cluster_id)] = False
        package_cluster(df, cluster_id, dst_dir)

    # NOTE(review): the form is rewritten with every cluster set to false on
    # each call, so any manual edits to an existing good.json are lost when
    # the package is regenerated - confirm this is intended.
    with open(os.path.join(dst_dir, "good.json"), "w") as form_file:
        form_file.write(json.dumps(form_json, indent=2))

    return form_json

In [4]:
def package_directory(src_dir, dest_dir):
    """Build the package for one image directory.

    The package is placed in ``<dest_dir>/<name of src_dir>`` and consists of
    ``clusters.csv`` plus everything ``package_df`` writes. Clustering is only
    run when ``clusters.csv`` is not there yet; otherwise the existing CSV is
    reused.

    Args:
        src_dir: Directory containing the images to cluster. A trailing path
            separator is tolerated.
        dest_dir: Root directory under which the package directory is created.

    Returns:
        The cluster assignments as read back from ``clusters.csv``.

    Raises:
        OSError: if the package directory cannot be created.
    """
    # basename() of a path ending in a separator is "", which would drop the
    # package straight into dest_dir; normalising first strips the separator.
    pkg_id = os.path.basename(os.path.normpath(src_dir))
    pkg_dest = os.path.join(dest_dir, pkg_id)
    csv_dest = os.path.join(pkg_dest, "clusters.csv")

    # Tolerate an existing directory, but let genuine failures propagate.
    os.makedirs(pkg_dest, exist_ok=True)

    # Write the CSV only if a previous run has not already produced it.
    if not os.path.exists(csv_dest):
        print(f"Started clustering {src_dir}")
        cluster_directory(src_dir).to_csv(csv_dest, index=False)
        print(f"Created {csv_dest}")

    # Always load from disk so the returned frame is the same whether the CSV
    # was just created or already existed.
    df = pd.read_csv(csv_dest)

    # Create cluster images and the form.
    package_df(df, pkg_dest)

    return df

In [5]:
# #NBMODULE_IGNORE
# df = package_directory("/Volumes/EveryPixel/2020 Data/output/cropped/2012m1_ref32_4a7", "./out/packages/")

In [6]:
def process_subdir(tup):
    """Package one entry of the root directory, if it qualifies.

    Takes a single tuple argument so it can be used directly with
    ``Pool.map``.

    Args:
        tup: ``(root_dir, img_dir)`` or ``(root_dir, img_dir, dest_root)``.
            ``img_dir`` is an entry name inside ``root_dir``. When
            ``dest_root`` is omitted, packages go to "./out/packages".

    Entries that are not directories, or whose name does not start with
    "2012", are skipped silently.
    """
    root_dir, img_dir, *extra = tup
    # Two-element tuples keep working and fall back to the previous
    # hard-coded destination.
    dest_root = extra[0] if extra else "./out/packages"

    sub_dir = os.path.join(root_dir, img_dir)
    if not os.path.isdir(sub_dir):
        return
    if not img_dir.startswith("2012"):
        return
    print(f"Processing image directory: {img_dir}")
    package_directory(sub_dir, dest_root)


def process_all(root_dir, dest_root, multicore=False):
    """Package every qualifying image directory found in ``root_dir``.

    Args:
        root_dir: Directory whose entries are candidate image directories.
        dest_root: Directory under which the packages are created.
        multicore: When true, distribute the entries over a process pool
            sized to half the CPU count (at least one worker); otherwise
            process them one after another in this process.
    """
    # dest_root travels with each job so that workers write to the requested
    # destination rather than a fixed path.
    jobs = [(root_dir, name, dest_root) for name in os.listdir(root_dir)]

    if multicore:
        p_count = max(1, cpu_count() // 2)
        print(f"Running on {p_count} cores")
        with Pool(p_count) as p:
            p.map(process_subdir, jobs)
    else:
        for job in jobs:
            process_subdir(job)



In [None]:
#NBMODULE_IGNORE   
process_all("/Volumes/EveryPixel/2020 Data/output/cropped/", "./out/packages")

Processing image directory: 2012m1_ref32_4a7
Processing image directory: 2012m1_ref37_ckr
Processing image directory: 2012m1_ref67_835
Processing image directory: 2012m1_ref94_yy5
Processing image directory: 2012m1_ref213_2yq
Processing image directory: 2012m1_ref41_v3r
Processing image directory: 2012m1_ref199_241
Processing image directory: 2012m1_ref79_8u9
Processing image directory: 2012m1_ref134_ex1
Processing image directory: 2012m1_ref206_osx
Processing image directory: 2012m1_ref226_9eb
Processing image directory: 2012m1_ref71_2b5
Processing image directory: 2012m1_ref195_426
Processing image directory: 2012m1_ref86_3se
Processing image directory: 2012m2_ref33_oy0
Processing image directory: 2012m2_ref64_3o1
Processing image directory: 2012m2_ref85_896
Processing image directory: 2012m2_ref116_a77
Processing image directory: 2012m2_ref117_vbm
Processing image directory: 2012m2_ref119_h8b
Processing image directory: 2012m2_ref120_2xq
Processing image directory: 2012m2_ref140_15f