# Repackager

Create a package from a list of files that were marked as good, instead of from a whole directory.

In [19]:
from nbmod import nbloader
from Packager import package_df
from Clusterer import add_cluster_column
import pandas as pd
import os

In [2]:
# Published CSV export of the Google Sheet that get_df() downloads. The sheet
# has one row per (folder_name, cluster_id) plus one "<initials>_good" flag
# column per user (jk_good, pl_good, cd_good).
CSV_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSgGCk9pdnXBQoIWNJTLwRcjScrFqPJTWujekS9JXrUsqc3VKpIkkFyVkJrc4B2riy68_YzOIT-yQcY/pub?gid=0&single=true&output=csv"
# NOTE(review): presumably the directory of cropped sample images; it is not
# referenced by any of the cells visible in this notebook -- confirm it is still needed.
IMAGE_DIR = "./sample-images/cropped"

In [3]:
def get_df():
    """Download the spreadsheet published at ``CSV_URL`` and return it as a DataFrame.

    The CSV is fetched again on every call; nothing is cached.
    """
    sheet_df = pd.read_csv(CSV_URL)
    return sheet_df

get_df().head()

Unnamed: 0,folder_name,cluster_id,jk_good,pl_good,cd_good
0,2012m1_ref134_ex1,0,,,
1,2012m1_ref134_ex1,1,,,
2,2012m1_ref134_ex1,2,,,
3,2012m1_ref134_ex1,3,,,
4,2012m1_ref134_ex1,4,,,


In [4]:
def get_good_df(initials):
    """Return the spreadsheet rows that the given user flagged as good.

    Parameters
    ----------
    initials : str
        User initials. They are lower-cased and stripped of surrounding
        whitespace to build the ``<initials>_good`` column name.

    Returns
    -------
    pandas.DataFrame
        The rows of ``get_df()`` whose flag column equals ``True``; rows with
        a missing flag are excluded.
    """
    sheet_df = get_df()
    flag_column = f"{initials.lower().strip()}_good"
    is_good = sheet_df[flag_column] == True
    return sheet_df[is_good]
    
get_good_df("jk").head()

Unnamed: 0,folder_name,cluster_id,jk_good,pl_good,cd_good
6,2012m1_ref134_ex1,6,True,True,True
21,2012m1_ref195_426,5,True,True,True
40,2012m1_ref199_241,8,True,True,
45,2012m1_ref199_241,13,True,True,True
55,2012m1_ref206_osx,7,True,True,True


In [5]:
class Package:
    """Accessor for a package directory that contains a ``clusters.csv`` file."""

    def __init__(self, pkg_dir):
        # Directory in which clusters.csv is looked up.
        self.pkg_dir = pkg_dir

    def clusters(self):
        """Load ``clusters.csv`` from the package directory as a DataFrame."""
        csv_path = os.path.join(self.pkg_dir, "clusters.csv")
        return pd.read_csv(csv_path)

    def cluster(self, cluster_id):
        """Return the rows whose ``cluster`` value matches ``cluster_id``.

        Both sides are compared as strings, so ``4`` and ``"4"`` select the
        same rows.
        """
        table = self.clusters()
        labels = table["cluster"].apply(str)
        wanted = str(cluster_id)
        return table[labels == wanted]
    


In [6]:
Package("./out/packages-firstpass/2012m1_ref134_ex1/").clusters()

Unnamed: 0,localpath,cluster
0,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
1,/Volumes/EveryPixel/2020 Data/output/cropped/2...,8
2,/Volumes/EveryPixel/2020 Data/output/cropped/2...,3
3,/Volumes/EveryPixel/2020 Data/output/cropped/2...,12
4,/Volumes/EveryPixel/2020 Data/output/cropped/2...,3
...,...,...
2582,/Volumes/EveryPixel/2020 Data/output/cropped/2...,0
2583,/Volumes/EveryPixel/2020 Data/output/cropped/2...,13
2584,/Volumes/EveryPixel/2020 Data/output/cropped/2...,13
2585,/Volumes/EveryPixel/2020 Data/output/cropped/2...,5


In [7]:
Package("./out/packages-firstpass/2012m1_ref134_ex1/").cluster(4)

Unnamed: 0,localpath,cluster
0,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
11,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
16,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
20,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
38,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
...,...,...
2466,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
2468,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
2502,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4
2504,/Volumes/EveryPixel/2020 Data/output/cropped/2...,4


In [8]:
def get_mega_cluster(initials):
    """Return a user's 'good' rows, all labelled with the single cluster ``"mega"``.

    Parameters
    ----------
    initials : str
        User initials, passed through to ``get_good_df``.

    Returns
    -------
    pandas.DataFrame
        The rows from ``get_good_df(initials)`` with a ``cluster`` column whose
        value is ``"mega"`` on every row.
    """
    # get_good_df returns a boolean-filtered selection; writing a column onto
    # it in place triggers pandas' SettingWithCopyWarning. assign() builds a
    # new frame with the extra column instead.
    return get_good_df(initials).assign(cluster="mega")
get_mega_cluster("cd")

Unnamed: 0,folder_name,cluster_id,jk_good,pl_good,cd_good,cluster
6,2012m1_ref134_ex1,6,True,True,True,mega
21,2012m1_ref195_426,5,True,True,True,mega
45,2012m1_ref199_241,13,True,True,True,mega
55,2012m1_ref206_osx,7,True,True,True,mega
77,2012m1_ref213_2yq,13,True,True,True,mega
87,2012m1_ref226_9eb,7,True,True,True,mega
103,2012m1_ref32_4a7,7,,True,True,mega
112,2012m1_ref37_ckr,0,,True,True,mega
137,2012m1_ref41_v3r,9,True,True,True,mega
138,2012m1_ref41_v3r,10,True,True,True,mega


In [25]:
def package_good(initials, dst_dir, src_dir="./out/packages-firstpass"):
    """
    Re-cluster a user's selected 'good' images, and then re-cluster
    everything as one big set.

    Parameters
    ----------
    initials : str
        User initials. Selects the ``<initials>_good`` spreadsheet column via
        ``get_good_df`` and names the sub-directory created under ``dst_dir``.
    dst_dir : str
        Output root. Everything is written below ``<dst_dir>/<initials>``.
    src_dir : str, optional
        Directory holding the first-pass packages, one sub-directory per
        ``folder_name``. Defaults to the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The combined frame that was packaged under ``<dst_dir>/<initials>/mega``,
        or an empty frame (nothing packaged) when no images were selected.
    """
    dst_dir = os.path.join(dst_dir, initials)
    # exist_ok tolerates a pre-existing directory without also hiding real
    # failures (e.g. permission errors) as the old blanket try/except did.
    os.makedirs(dst_dir, exist_ok=True)

    good_df = get_good_df(initials)
    reclustered_parts = []
    row_sum = 0
    for _, row in good_df.iterrows():
        first_pass = Package(os.path.join(src_dir, row["folder_name"]))
        row_df = first_pass.cluster(row["cluster_id"])
        row_sum += len(row_df)

        # Report this cluster's own size; the running total is printed once
        # after the loop.
        print(f"Reclustering {len(row_df)} images in folder:{row['folder_name']} cluster:{row['cluster_id']}")
        row_df = add_cluster_column(row_df)
        row_dst = os.path.join(dst_dir, row["folder_name"], str(row["cluster_id"]))
        package_df(row_df, row_dst)

        row_df.to_csv(os.path.join(row_dst, "clusters.csv"), index=False)
        reclustered_parts.append(row_df)

    # Concatenate once instead of growing the frame inside the loop.
    mega_df = pd.concat(reclustered_parts) if reclustered_parts else pd.DataFrame()

    print(row_sum)
    print(len(mega_df))

    if mega_df.empty:
        # No selected images: there is nothing to re-cluster or package.
        return mega_df

    # NOTE(review): the original code concatenated with an undefined name
    # `mega_df2`, which raises NameError on a fresh kernel. Going by the
    # docstring ("re-cluster everything as one big set") it is reconstructed
    # here as the combined set run through add_cluster_column -- confirm this
    # matches the intended output.
    reclustered_df = add_cluster_column(mega_df.copy())
    all_df = mega_df.assign(cluster="all")

    ret = pd.concat([all_df, reclustered_df])

    # The join arguments were reversed before, which produced
    # "mega/<dst_dir>" instead of a "mega" folder inside the output directory.
    package_df(ret, os.path.join(dst_dir, "mega"))

    return ret

package_good("cd", "./out/packages-grading/1")


Reclustering 215 images in folder:2012m1_ref134_ex1 cluster:6
5% 10% 15% 20% 25% 

KeyboardInterrupt: 