In [6]:
from pathlib import Path
import shutil as sh
import json
import pandas as pd
import numpy as np

models_path = Path('../runs/3-2-abs_mag/models')
clusters_path = Path('../runs/3-2-abs_mag/clusters')
stats_path = Path('../runs/3-2-abs_mag/stats')

if not stats_path.exists():
    stats_path.mkdir()


for model in models_path.iterdir():
    if not model.is_dir():
        continue
    sh.copy(str(model / 'stats.json') , str(stats_path / f'{model.stem}.json') )
    
stats_files = list(stats_path.iterdir())

In [7]:
stats_df = pd.DataFrame()
for stat_f in stats_files:
    print("========= " + stat_f.stem + " ==========")
    
    with open(stat_f, 'r') as f:
        stats = json.load(f)
        
        for k, run in stats.items():
            print(f'--- {k}: {len(run)} ---') 
            
            # print(k, run)
            
            loss = 0
            cluster_sizes = []
            for cluster in run:
                loss += cluster['loss']*cluster['cluster_size']
                cluster_sizes.append(cluster['cluster_size'])
            loss /= sum(cluster_sizes) 
            
            stats_df = pd.concat([stats_df, pd.DataFrame([
                    {'model': stat_f.stem, 
                    'run': k, 
                    'cluster sizes':cluster_sizes, 
                    'nclusters': len(run),
                    'total_length': sum([c['cluster_size'] for c in run]),
                    'avg loss': loss,
                    'min loss' : min([c['loss'] for c in run]),
                    'max loss': max([c['loss'] for c in run]),
                    'std loss': np.std([c['loss'] for c in run]),}
                ])
            ])
                
            print("Average Cluster Loss: ", loss)
stats_df.reset_index(drop=True, inplace=True)
stats_df[['run', 'nclusters']] = stats_df[['run', 'nclusters']].astype(int)
        
    

--- 0: 2 ---
Average Cluster Loss:  0.06211064955299221
--- 1: 3 ---
Average Cluster Loss:  0.05335118319823732
--- 2: 4 ---
Average Cluster Loss:  0.048840094270063895
--- 3: 5 ---
Average Cluster Loss:  0.05214514511255002
--- 4: 6 ---
Average Cluster Loss:  0.04743765119854691
--- 0: 2 ---
Average Cluster Loss:  0.08036030664699484
--- 1: 3 ---
Average Cluster Loss:  0.07621072296905453
--- 2: 4 ---
Average Cluster Loss:  0.0752425584070199
--- 3: 5 ---
Average Cluster Loss:  0.07570854658119316
--- 4: 6 ---
Average Cluster Loss:  0.07598973155938195
--- 5: 7 ---
Average Cluster Loss:  0.07278067289848679
--- 0: 2 ---
Average Cluster Loss:  0.08167376998843925
--- 1: 3 ---
Average Cluster Loss:  0.06797041598381147
--- 0: 2 ---
Average Cluster Loss:  0.0819187843486149
--- 1: 3 ---
Average Cluster Loss:  0.07951744688052449
--- 2: 4 ---
Average Cluster Loss:  0.07608754682227958
--- 3: 5 ---
Average Cluster Loss:  0.07517254348202151
--- 4: 6 ---
Average Cluster Loss:  0.07675449699

# Model Results

In [8]:
stats_df

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
0,pca_kmeans_mag,0,"[6204, 5567]",2,11771,0.062111,0.055451,0.068086,0.006317
1,pca_kmeans_mag,1,"[5618, 3407, 2746]",3,11771,0.053351,0.044837,0.076376,0.014021
2,pca_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.048840,0.039608,0.071897,0.012346
3,pca_kmeans_mag,3,"[2977, 2316, 1628, 3258, 1592]",5,11771,0.052145,0.043489,0.091126,0.018149
4,pca_kmeans_mag,4,"[1555, 1984, 1371, 2814, 1210, 2837]",6,11771,0.047438,0.035457,0.077219,0.013490
...,...,...,...,...,...,...,...,...,...
129,ts_kmeans_alpha,1,"[2364, 7208, 2199]",3,11771,0.080787,0.077426,0.083312,0.002420
130,ts_kmeans_alpha,2,"[2364, 3483, 2199, 3725]",4,11771,0.081936,0.080966,0.084015,0.001180
131,quantile_pca_dbscan,4,"[7168, 4521, 82]",3,11771,0.080849,0.077315,0.111069,0.014801
132,quantile_pca_dbscan,5,"[7183, 4530, 58]",3,11771,0.080963,0.080413,0.117962,0.017482


## Best Models

In [9]:
top10 = list(stats_df['avg loss'].nsmallest(10).index)
stats_df.loc[top10]

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
4,pca_kmeans_mag,4,"[1555, 1984, 1371, 2814, 1210, 2837]",6,11771,0.047438,0.035457,0.077219,0.01349
111,quantile_tsne_kmeans,7,"[1052, 1360, 1468, 1282, 1388, 1514, 1474, 108...",9,11771,0.048017,0.038904,0.076967,0.011158
37,quantile_tsne_agg,5,"[2242, 2027, 1612, 1037, 1504, 1914, 1435]",7,11771,0.048802,0.038029,0.073685,0.011037
92,tsne_kmeans_mag,3,"[2977, 2316, 1628, 3258, 1592]",5,11771,0.048822,0.040434,0.075762,0.013049
2,pca_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.04884,0.039608,0.071897,0.012346
93,tsne_kmeans_mag,4,"[1555, 1984, 1371, 2814, 1210, 2837]",6,11771,0.04907,0.035604,0.087748,0.017211
91,tsne_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.049423,0.040598,0.072607,0.012156
31,tsne_agg_mag,3,"[2466, 3115, 3213, 1485, 1492]",5,11771,0.049901,0.039285,0.078845,0.013916
38,quantile_tsne_agg,6,"[1612, 2027, 1914, 1037, 1504, 1143, 1435, 1099]",8,11771,0.049963,0.035505,0.081437,0.013708
56,minisom_mag,0,"[3443, 2772, 3760, 1796]",4,11771,0.050588,0.041635,0.08041,0.015499
