In [65]:
from pathlib import Path
import shutil as sh
import json
import pandas as pd
import numpy as np

models_path = Path('../runs/2-18-shuffle/models')
clusters_path = Path('../runs/2-18-shuffle/clusters')
stats_path = Path('../runs/2-18-shuffle/stats')

if not stats_path.exists():
    stats_path.mkdir()


for model in models_path.iterdir():
    if not model.is_dir():
        continue
    sh.copy(str(model / 'stats.json') , str(stats_path / f'{model.stem}.json') )
    
stats_files = list(stats_path.iterdir())

In [66]:
stats_df = pd.DataFrame()
for stat_f in stats_files:
    print("========= " + stat_f.stem + " ==========")
    
    with open(stat_f, 'r') as f:
        stats = json.load(f)
        
        for k, run in stats.items():
            print(f'--- {k}: {len(run)} ---') 
            
            print(k, run)
            
            loss = 0
            cluster_sizes = []
            for cluster in run:
                loss += cluster['loss']*cluster['cluster_size']
                cluster_sizes.append(cluster['cluster_size'])
            loss /= sum(cluster_sizes) 
            
            stats_df = pd.concat([stats_df, pd.DataFrame([
                    {'model': stat_f.stem, 
                    'run': k, 
                    'cluster sizes':cluster_sizes, 
                    'nclusters': len(run),
                    'total_length': sum([c['cluster_size'] for c in run]),
                    'avg loss': loss,
                    'min loss' : min([c['loss'] for c in run]),
                    'max loss': max([c['loss'] for c in run]),
                    'std loss': np.std([c['loss'] for c in run]),}
                ])
            ])
                
            print("Average Cluster Loss: ", loss)
stats_df.reset_index(drop=True, inplace=True)
stats_df[['run', 'nclusters']] = stats_df[['run', 'nclusters']].astype(int)
        
    

--- 0: 4 ---
0 [{'run': 0, 'cluster': 4, 'loss': 0.07717248797416687, 'mse': 0.07717248797416687, 'cluster_size': 1723}, {'run': 0, 'cluster': 3, 'loss': 0.07843030244112015, 'mse': 0.07843030244112015, 'cluster_size': 2868}, {'run': 0, 'cluster': 1, 'loss': 0.07971988618373871, 'mse': 0.07971988618373871, 'cluster_size': 4563}, {'run': 0, 'cluster': 2, 'loss': 0.07799237221479416, 'mse': 0.07799237221479416, 'cluster_size': 2617}]
Average Cluster Loss:  0.07864872847873063
--- 2: 3 ---
2 [{'run': 2, 'cluster': 0, 'loss': 0.08187959343194962, 'mse': 0.08187959343194962, 'cluster_size': 5840}, {'run': 2, 'cluster': 1, 'loss': 0.0784979909658432, 'mse': 0.0784979909658432, 'cluster_size': 4501}, {'run': 2, 'cluster': 2, 'loss': 0.04726994037628174, 'mse': 0.04726994037628174, 'cluster_size': 1283}]
Average Cluster Loss:  0.07675013906423052
--- 3: 3 ---
3 [{'run': 3, 'cluster': 0, 'loss': 0.08079037815332413, 'mse': 0.08079037815332413, 'cluster_size': 5854}, {'run': 3, 'cluster': 1, 'lo

# Model Results

In [67]:
stats_df

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
0,minisom_alpha,0,"[1723, 2868, 4563, 2617]",4,11771,0.078649,0.077172,0.079720,0.000921
1,quantile_pca_dbscan,2,"[5840, 4501, 1283]",3,11624,0.076750,0.047270,0.081880,0.015579
2,quantile_pca_dbscan,3,"[5854, 4529, 1295]",3,11678,0.077811,0.045706,0.083139,0.017119
3,quantile_pca_dbscan,4,"[5873, 4533, 1295]",3,11701,0.077935,0.050936,0.082233,0.014378
4,quantile_pca_dbscan,5,"[5881, 4543, 1305]",3,11729,0.077511,0.048626,0.081180,0.015317
...,...,...,...,...,...,...,...,...,...
84,pca_kmeans_mag,4,"[3072, 2399, 1729, 1733, 1071, 1767]",6,11771,0.050225,0.043077,0.074703,0.011178
85,quantile_pca_agg,0,"[7208, 4563]",2,11771,0.081258,0.079574,0.082324,0.001375
86,quantile_pca_agg,1,"[5899, 4563, 1309]",3,11771,0.076349,0.045355,0.081182,0.016397
87,quantile_pca_agg,2,"[4563, 3013, 1309, 2886]",4,11771,0.067507,0.047817,0.079750,0.012573


## Best Models

In [68]:
top10 = list(stats_df['avg loss'].nsmallest(10).index)
stats_df.loc[top10]

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
72,quantile_pca_kmeans,6,"[1845, 1927, 1309, 1619, 1298, 1493, 1099, 1181]",8,11771,0.04903,0.040163,0.076036,0.010802
18,ts_kmeans_mag,4,"[3072, 2397, 1729, 1733, 1071, 1769]",6,11771,0.050221,0.041855,0.071897,0.010366
84,pca_kmeans_mag,4,"[3072, 2399, 1729, 1733, 1071, 1767]",6,11771,0.050225,0.043077,0.074703,0.011178
71,quantile_pca_kmeans,5,"[1845, 2609, 1309, 1619, 1430, 1860, 1099]",7,11771,0.050742,0.03469,0.068599,0.009666
83,pca_kmeans_mag,3,"[3069, 3236, 1732, 2663, 1071]",5,11771,0.052838,0.042964,0.062948,0.006371
88,quantile_pca_agg,3,"[3325, 3013, 1309, 2886, 1238]",5,11771,0.053814,0.044633,0.060799,0.005159
17,ts_kmeans_mag,3,"[3070, 3236, 1731, 2663, 1071]",5,11771,0.054086,0.041548,0.068264,0.008606
70,quantile_pca_kmeans,4,"[2123, 2631, 1309, 2440, 1460, 1808]",6,11771,0.056196,0.044667,0.071563,0.009166
69,quantile_pca_kmeans,3,"[2123, 2737, 1309, 2440, 3162]",5,11771,0.0589,0.048133,0.071073,0.008423
13,robust_pca_kmeans,2,[2675],1,2675,0.060996,0.060996,0.060996,0.0


# Global MSE

In [69]:
from keras.models import load_model
from pathlib import Path
import pandas as pd
from pickle import load
import gc

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import QuantileTransformer

data_path = Path('../../data/compiled/')


mse_dict = {'model':[], 'run':[], 'mse':[], 'eval_mse':[]}

## Helper Functions

In [70]:
def load_original_data(data_path: Path, save_scalers : bool = False):
    """Load the original data from the file."""

    inputs = pd.read_csv(data_path / 'inputsdata_compilation.csv')
    outputs = pd.read_csv(data_path / 'outputsdata_compilation.csv')
    
    input_filenames = inputs[['filename']]
    output_filenames = outputs[['filename']]
    
    scaler_inputs, scaler_ouputs = QuantileTransformer(), QuantileTransformer()
    inputs = scaler_inputs.fit_transform(inputs.iloc[:, 1:])
    outputs = scaler_ouputs.fit_transform(outputs.iloc[:, 1:])
    
    inputs = pd.DataFrame(inputs)
    inputs = pd.concat([input_filenames, inputs], axis=1)
    
    outputs = pd.DataFrame(outputs)
    outputs = pd.concat([output_filenames, outputs], axis=1)
    
    print("Scaled inputs:", inputs.head())
    print("Scaled outputs:", outputs.head())
    return inputs, outputs, scaler_inputs, scaler_ouputs


def join_files_in_cluster(cluster_files, input_data : pd.DataFrame, output_data : pd.DataFrame):
    """Join all files in a cluster into a single dataframe."""
    cluster_inputs, cluster_outputs = pd.DataFrame(), pd.DataFrame()
    
    inputs = [input_data.loc[input_data['filename'] == f].iloc[:, 1:]
              for f in cluster_files]
    
    cluster_inputs = pd.concat(inputs, axis=0, ignore_index=True)
    
    outputs = [output_data.loc[output_data['filename'] == f].iloc[:, 1:]
               for f in cluster_files]
    cluster_outputs = pd.concat(outputs, axis=0, ignore_index=True)      
    
    # print(cluster_inputs.head())
    # print(cluster_inputs.shape)
    # print(cluster_df)
    # print(cluster_df.shape)
    # print(cluster_df.columns)
    print("Cluster shape:", cluster_inputs.shape)
    return cluster_inputs, cluster_outputs


## MSE calculation

In [71]:
stats_df = stats_df.loc[(stats_df['model'].str.contains('quantile_pca_kmeans')) & (stats_df['run'] == 6)]

In [72]:
stats_df

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
72,quantile_pca_kmeans,6,"[1845, 1927, 1309, 1619, 1298, 1493, 1099, 1181]",8,11771,0.04903,0.040163,0.076036,0.010802


In [73]:
original_in, original_out, _scaler_in, scaler_out = load_original_data(data_path)

model_kfold_stats = {}
for _, row in stats_df.iterrows():
    
    with open(clusters_path / f"{row['model']}.pkl", 'rb') as f:
        conf = load(f)
        
    conf = {c['run_id'] : c for c in conf}
        
    run = row['run']
    clusters = conf[run]['clusters']
    kfold_stats = []
    kfold = KFold(n_splits=5, shuffle=True)
    try:
        
        for cluster_id, cluster in clusters.items():
            model_file = f"{conf[run]['method']}_run{conf[run]['run_id']}_{cluster_id}.h5"
            model = load_model(models_path / f"{conf[run]['method']}" / model_file)
            
            inputs, outputs = join_files_in_cluster(cluster, original_in, original_out)
            
            
            test_mses = []
            train_mses = []
            for train, test in kfold.split(inputs, outputs):
                test_predictions = model.predict(inputs.iloc[test])
                test_mse = mean_squared_error(outputs.iloc[test], test_predictions)
                test_mses.append(test_mse)
                
                train_predictions = model.predict(inputs.iloc[train])
                train_mse = mean_squared_error(outputs.iloc[train], train_predictions)
                train_mses.append(train_mse)
            
            stats = {
                'model': conf[run]['method'],
                'cluster' : cluster_id,
                'cluster size': len(cluster),
                'avg test mse': np.mean(test_mses),
                'min test mse': min(test_mses),
                'max test mse': max(test_mses),
                'std test mse': np.std(test_mses),
                'avg train mse': np.mean(train_mses),
                'min train mse': min(train_mses),
                'max train mse': max(train_mses),
                'std train mse': np.std(train_mses),
            }
            print(stats)
            kfold_stats.append(stats)
            # print(scores)
            # predictions = model.predict(inputs)
            # print(predictions.shape)
            
            # cluster_filenames.extend(cluster)
            # all_predictions.append(pd.DataFrame(predictions))
            # all_outs.append(outputs)
    except Exception as e:
        print(e)
        continue
    print(kfold_stats)
    model_kfold_stats[conf[run]['method']] = kfold_stats
    # all_predictions = pd.concat(all_predictions, ignore_index=True)

    # mse = mean_squared_error(pd.concat(all_outs, ignore_index=True), all_predictions)
    # mse_dict['model'].append(conf[run]['method'])
    # mse_dict['run'].append(conf[run]['run_id'])
    # mse_dict['mse'].append(mse)
    # mse_dict['eval_mse'].append(mse)
    
    gc.collect()

Scaled inputs:                        filename    0         1         2         3         4  \
0  profile_wso_CR1992_line_0070  0.0  0.505515  0.504343  0.501124  0.501225   
1  profile_wso_CR1992_line_0073  0.0  0.624823  0.625027  0.623225  0.625704   
2  profile_wso_CR1992_line_0075  0.0  0.732766  0.732816  0.732778  0.735636   
3  profile_wso_CR1992_line_0077  0.0  0.392087  0.387620  0.386084  0.388200   
4  profile_wso_CR1992_line_0078  0.0  0.636709  0.638017  0.636218  0.639128   

          5         6         7         8  ...  1910  1911  1912  1913  1914  \
0  0.500199  0.498023  0.497609  0.497824  ...   0.0   0.0   0.0   0.0   0.0   
1  0.623721  0.622938  0.620052  0.623504  ...   0.0   0.0   0.0   0.0   0.0   
2  0.733143  0.732958  0.733352  0.736522  ...   0.0   0.0   0.0   0.0   0.0   
3  0.387328  0.385791  0.385755  0.386377  ...   0.0   0.0   0.0   0.0   0.0   
4  0.637261  0.636802  0.634617  0.639052  ...   0.0   0.0   0.0   0.0   0.0   

   1915  1916  1917  19

In [74]:
model_kfold_stats

{'quantile_pca_kmeans': [{'model': 'quantile_pca_kmeans',
   'cluster': 0,
   'cluster size': 1845,
   'avg test mse': 0.039125555322466046,
   'min test mse': 0.03669938151661554,
   'max test mse': 0.04256738806334331,
   'std test mse': 0.0025964461362936717,
   'avg train mse': 0.03912555532222762,
   'min train mse': 0.03826509713666593,
   'max train mse': 0.03973209877367365,
   'std train mse': 0.0006491115339670761},
  {'model': 'quantile_pca_kmeans',
   'cluster': 1,
   'cluster size': 1927,
   'avg test mse': 0.04684330144864949,
   'min test mse': 0.045478440763133794,
   'max test mse': 0.048205211209200065,
   'std test mse': 0.001028734768655294,
   'avg train mse': 0.046842267572636834,
   'min train mse': 0.04650188740866388,
   'max train mse': 0.047183801797254316,
   'std train mse': 0.00025723469670014355},
  {'model': 'quantile_pca_kmeans',
   'cluster': 2,
   'cluster size': 1309,
   'avg test mse': 0.04659617419136196,
   'min test mse': 0.043749750784397405,
  