In [5]:
from pathlib import Path
import shutil as sh
import json
import pandas as pd
import numpy as np

models_path = Path('../runs/3-2-abs_mag/models')
clusters_path = Path('../runs/3-2-abs_mag/clusters')
stats_path = Path('../runs/3-2-abs_mag/stats')

if not stats_path.exists():
    stats_path.mkdir()


for model in models_path.iterdir():
    if not model.is_dir():
        continue
    sh.copy(str(model / 'stats.json') , str(stats_path / f'{model.stem}.json') )
    
stats_files = list(stats_path.iterdir())

In [7]:
stats_df = pd.DataFrame()
for stat_f in stats_files:
    print("========= " + stat_f.stem + " ==========")
    
    with open(stat_f, 'r') as f:
        stats = json.load(f)
        
        for k, run in stats.items():
            print(f'--- {k}: {len(run)} ---') 
            
            # print(k, run)
            
            loss = 0
            cluster_sizes = []
            for cluster in run:
                loss += cluster['loss']*cluster['cluster_size']
                cluster_sizes.append(cluster['cluster_size'])
            loss /= sum(cluster_sizes) 
            
            stats_df = pd.concat([stats_df, pd.DataFrame([
                    {'model': stat_f.stem, 
                    'run': k, 
                    'cluster sizes':cluster_sizes, 
                    'nclusters': len(run),
                    'total_length': sum([c['cluster_size'] for c in run]),
                    'avg loss': loss,
                    'min loss' : min([c['loss'] for c in run]),
                    'max loss': max([c['loss'] for c in run]),
                    'std loss': np.std([c['loss'] for c in run]),}
                ])
            ])
                
            print("Average Cluster Loss: ", loss)
stats_df.reset_index(drop=True, inplace=True)
stats_df[['run', 'nclusters']] = stats_df[['run', 'nclusters']].astype(int)
        
    

--- 0: 2 ---
Average Cluster Loss:  0.08036030664699484
--- 1: 3 ---
Average Cluster Loss:  0.07621072296905453
--- 2: 4 ---
Average Cluster Loss:  0.0752425584070199
--- 3: 5 ---
Average Cluster Loss:  0.07570854658119316
--- 4: 6 ---
Average Cluster Loss:  0.07598973155938195
--- 5: 7 ---
Average Cluster Loss:  0.07278067289848679
--- 0: 2 ---
Average Cluster Loss:  0.0819187843486149
--- 1: 3 ---
Average Cluster Loss:  0.07951744688052449
--- 2: 4 ---
Average Cluster Loss:  0.07608754682227958
--- 3: 5 ---
Average Cluster Loss:  0.07517254348202151
--- 4: 6 ---
Average Cluster Loss:  0.07675449699087482
--- 0: 2 ---
Average Cluster Loss:  0.08039916173938927
--- 1: 3 ---
Average Cluster Loss:  0.07907135318323848
--- 2: 4 ---
Average Cluster Loss:  0.07821648505188555
--- 3: 5 ---
Average Cluster Loss:  0.07412533310928206
--- 4: 6 ---
Average Cluster Loss:  0.07409707875467382
--- 5: 7 ---
Average Cluster Loss:  0.07329641236415604
--- 6: 8 ---
Average Cluster Loss:  0.070504276301

# Model Results

In [8]:
stats_df

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
0,minmax_pca_kmeans,0,"[4571, 7200]",2,11771,0.080360,0.079669,0.081449,0.000890
1,minmax_pca_kmeans,1,"[3375, 4677, 3719]",3,11771,0.076211,0.067933,0.080110,0.005515
2,minmax_pca_kmeans,2,"[2525, 3325, 2057, 3864]",4,11771,0.075243,0.069361,0.080161,0.004306
3,minmax_pca_kmeans,3,"[1576, 3063, 1407, 3734, 1991]",5,11771,0.075709,0.049793,0.084557,0.012124
4,minmax_pca_kmeans,4,"[1648, 2931, 1200, 2416, 1737, 1839]",6,11771,0.075990,0.070550,0.084169,0.004354
...,...,...,...,...,...,...,...,...,...
100,maxabs_pca_kmeans,4,"[1648, 2932, 1200, 2414, 1737, 1840]",6,11771,0.074321,0.064444,0.083128,0.006620
101,maxabs_pca_kmeans,5,"[1643, 1252, 1199, 2187, 1736, 1696, 2058]",7,11771,0.072506,0.059177,0.076952,0.005702
102,ts_kmeans_alpha,0,"[4563, 7208]",2,11771,0.081275,0.081059,0.081411,0.000176
103,ts_kmeans_alpha,1,"[2364, 7208, 2199]",3,11771,0.080787,0.077426,0.083312,0.002420


## Best Models

In [9]:
top10 = list(stats_df['avg loss'].nsmallest(10).index)
stats_df.loc[top10]

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
74,tsne_kmeans_mag,3,"[2977, 2316, 1628, 3258, 1592]",5,11771,0.048822,0.040434,0.075762,0.013049
75,tsne_kmeans_mag,4,"[1555, 1984, 1371, 2814, 1210, 2837]",6,11771,0.04907,0.035604,0.087748,0.017211
73,tsne_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.049423,0.040598,0.072607,0.012156
24,tsne_agg_mag,3,"[2466, 3115, 3213, 1485, 1492]",5,11771,0.049901,0.039285,0.078845,0.013916
42,minisom_mag,0,"[3443, 2772, 3760, 1796]",4,11771,0.050588,0.041635,0.08041,0.015499
88,ts_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.050836,0.040128,0.074533,0.012815
89,ts_kmeans_mag,3,"[2977, 2316, 1628, 3258, 1592]",5,11771,0.05088,0.041651,0.08448,0.015965
23,tsne_agg_mag,2,"[4705, 3115, 2466, 1485]",4,11771,0.050963,0.042378,0.085682,0.017438
90,ts_kmeans_mag,4,"[1557, 1984, 1371, 2812, 1210, 2837]",6,11771,0.051384,0.036011,0.098556,0.020563
57,pca_agg_mag,3,"[2466, 3115, 3213, 1485, 1492]",5,11771,0.051918,0.038826,0.092031,0.019024


In [11]:
stats_df.loc[top10]

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
74,tsne_kmeans_mag,3,"[2977, 2316, 1628, 3258, 1592]",5,11771,0.048822,0.040434,0.075762,0.013049
75,tsne_kmeans_mag,4,"[1555, 1984, 1371, 2814, 1210, 2837]",6,11771,0.04907,0.035604,0.087748,0.017211
73,tsne_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.049423,0.040598,0.072607,0.012156
24,tsne_agg_mag,3,"[2466, 3115, 3213, 1485, 1492]",5,11771,0.049901,0.039285,0.078845,0.013916
42,minisom_mag,0,"[3443, 2772, 3760, 1796]",4,11771,0.050588,0.041635,0.08041,0.015499
88,ts_kmeans_mag,2,"[3672, 2473, 1845, 3781]",4,11771,0.050836,0.040128,0.074533,0.012815
89,ts_kmeans_mag,3,"[2977, 2316, 1628, 3258, 1592]",5,11771,0.05088,0.041651,0.08448,0.015965
23,tsne_agg_mag,2,"[4705, 3115, 2466, 1485]",4,11771,0.050963,0.042378,0.085682,0.017438
90,ts_kmeans_mag,4,"[1557, 1984, 1371, 2812, 1210, 2837]",6,11771,0.051384,0.036011,0.098556,0.020563
57,pca_agg_mag,3,"[2466, 3115, 3213, 1485, 1492]",5,11771,0.051918,0.038826,0.092031,0.019024


# Global MSE

In [69]:
from keras.models import load_model
from pathlib import Path
import pandas as pd
from pickle import load
import gc

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import QuantileTransformer

data_path = Path('../../data/compiled/')


mse_dict = {'model':[], 'run':[], 'mse':[], 'eval_mse':[]}

## Helper Functions

In [70]:
def load_original_data(data_path: Path, save_scalers : bool = False):
    """Load the original data from the file."""

    inputs = pd.read_csv(data_path / 'inputsdata_compilation.csv')
    outputs = pd.read_csv(data_path / 'outputsdata_compilation.csv')
    
    input_filenames = inputs[['filename']]
    output_filenames = outputs[['filename']]
    
    scaler_inputs, scaler_ouputs = QuantileTransformer(), QuantileTransformer()
    inputs = scaler_inputs.fit_transform(inputs.iloc[:, 1:])
    outputs = scaler_ouputs.fit_transform(outputs.iloc[:, 1:])
    
    inputs = pd.DataFrame(inputs)
    inputs = pd.concat([input_filenames, inputs], axis=1)
    
    outputs = pd.DataFrame(outputs)
    outputs = pd.concat([output_filenames, outputs], axis=1)
    
    print("Scaled inputs:", inputs.head())
    print("Scaled outputs:", outputs.head())
    return inputs, outputs, scaler_inputs, scaler_ouputs


def join_files_in_cluster(cluster_files, input_data : pd.DataFrame, output_data : pd.DataFrame):
    """Join all files in a cluster into a single dataframe."""
    cluster_inputs, cluster_outputs = pd.DataFrame(), pd.DataFrame()
    
    inputs = [input_data.loc[input_data['filename'] == f].iloc[:, 1:]
              for f in cluster_files]
    
    cluster_inputs = pd.concat(inputs, axis=0, ignore_index=True)
    
    outputs = [output_data.loc[output_data['filename'] == f].iloc[:, 1:]
               for f in cluster_files]
    cluster_outputs = pd.concat(outputs, axis=0, ignore_index=True)      
    
    # print(cluster_inputs.head())
    # print(cluster_inputs.shape)
    # print(cluster_df)
    # print(cluster_df.shape)
    # print(cluster_df.columns)
    print("Cluster shape:", cluster_inputs.shape)
    return cluster_inputs, cluster_outputs


## MSE calculation

In [71]:
stats_df = stats_df.loc[(stats_df['model'].str.contains('quantile_pca_kmeans')) & (stats_df['run'] == 6)]

In [72]:
stats_df

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
72,quantile_pca_kmeans,6,"[1845, 1927, 1309, 1619, 1298, 1493, 1099, 1181]",8,11771,0.04903,0.040163,0.076036,0.010802


In [73]:
original_in, original_out, _scaler_in, scaler_out = load_original_data(data_path)

model_kfold_stats = {}
for _, row in stats_df.iterrows():
    
    with open(clusters_path / f"{row['model']}.pkl", 'rb') as f:
        conf = load(f)
        
    conf = {c['run_id'] : c for c in conf}
        
    run = row['run']
    clusters = conf[run]['clusters']
    kfold_stats = []
    kfold = KFold(n_splits=5, shuffle=True)
    try:
        
        for cluster_id, cluster in clusters.items():
            model_file = f"{conf[run]['method']}_run{conf[run]['run_id']}_{cluster_id}.h5"
            model = load_model(models_path / f"{conf[run]['method']}" / model_file)
            
            inputs, outputs = join_files_in_cluster(cluster, original_in, original_out)
            
            
            test_mses = []
            train_mses = []
            for train, test in kfold.split(inputs, outputs):
                test_predictions = model.predict(inputs.iloc[test])
                test_mse = mean_squared_error(outputs.iloc[test], test_predictions)
                test_mses.append(test_mse)
                
                train_predictions = model.predict(inputs.iloc[train])
                train_mse = mean_squared_error(outputs.iloc[train], train_predictions)
                train_mses.append(train_mse)
            
            stats = {
                'model': conf[run]['method'],
                'cluster' : cluster_id,
                'cluster size': len(cluster),
                'avg test mse': np.mean(test_mses),
                'min test mse': min(test_mses),
                'max test mse': max(test_mses),
                'std test mse': np.std(test_mses),
                'avg train mse': np.mean(train_mses),
                'min train mse': min(train_mses),
                'max train mse': max(train_mses),
                'std train mse': np.std(train_mses),
            }
            print(stats)
            kfold_stats.append(stats)
            # print(scores)
            # predictions = model.predict(inputs)
            # print(predictions.shape)
            
            # cluster_filenames.extend(cluster)
            # all_predictions.append(pd.DataFrame(predictions))
            # all_outs.append(outputs)
    except Exception as e:
        print(e)
        continue
    print(kfold_stats)
    model_kfold_stats[conf[run]['method']] = kfold_stats
    # all_predictions = pd.concat(all_predictions, ignore_index=True)

    # mse = mean_squared_error(pd.concat(all_outs, ignore_index=True), all_predictions)
    # mse_dict['model'].append(conf[run]['method'])
    # mse_dict['run'].append(conf[run]['run_id'])
    # mse_dict['mse'].append(mse)
    # mse_dict['eval_mse'].append(mse)
    
    gc.collect()

Scaled inputs:                        filename    0         1         2         3         4  \
0  profile_wso_CR1992_line_0070  0.0  0.505515  0.504343  0.501124  0.501225   
1  profile_wso_CR1992_line_0073  0.0  0.624823  0.625027  0.623225  0.625704   
2  profile_wso_CR1992_line_0075  0.0  0.732766  0.732816  0.732778  0.735636   
3  profile_wso_CR1992_line_0077  0.0  0.392087  0.387620  0.386084  0.388200   
4  profile_wso_CR1992_line_0078  0.0  0.636709  0.638017  0.636218  0.639128   

          5         6         7         8  ...  1910  1911  1912  1913  1914  \
0  0.500199  0.498023  0.497609  0.497824  ...   0.0   0.0   0.0   0.0   0.0   
1  0.623721  0.622938  0.620052  0.623504  ...   0.0   0.0   0.0   0.0   0.0   
2  0.733143  0.732958  0.733352  0.736522  ...   0.0   0.0   0.0   0.0   0.0   
3  0.387328  0.385791  0.385755  0.386377  ...   0.0   0.0   0.0   0.0   0.0   
4  0.637261  0.636802  0.634617  0.639052  ...   0.0   0.0   0.0   0.0   0.0   

   1915  1916  1917  19

In [74]:
model_kfold_stats

{'quantile_pca_kmeans': [{'model': 'quantile_pca_kmeans',
   'cluster': 0,
   'cluster size': 1845,
   'avg test mse': 0.039125555322466046,
   'min test mse': 0.03669938151661554,
   'max test mse': 0.04256738806334331,
   'std test mse': 0.0025964461362936717,
   'avg train mse': 0.03912555532222762,
   'min train mse': 0.03826509713666593,
   'max train mse': 0.03973209877367365,
   'std train mse': 0.0006491115339670761},
  {'model': 'quantile_pca_kmeans',
   'cluster': 1,
   'cluster size': 1927,
   'avg test mse': 0.04684330144864949,
   'min test mse': 0.045478440763133794,
   'max test mse': 0.048205211209200065,
   'std test mse': 0.001028734768655294,
   'avg train mse': 0.046842267572636834,
   'min train mse': 0.04650188740866388,
   'max train mse': 0.047183801797254316,
   'std train mse': 0.00025723469670014355},
  {'model': 'quantile_pca_kmeans',
   'cluster': 2,
   'cluster size': 1309,
   'avg test mse': 0.04659617419136196,
   'min test mse': 0.043749750784397405,
  