In [1]:
from pathlib import Path
import shutil as sh
import json
import pandas as pd
import numpy as np

models_path = Path('../runs/2-18-shuffle/models')
clusters_path = Path('../runs/2-18-shuffle/clusters')
stats_path = Path('../runs/2-18-shuffle/stats')

if not stats_path.exists():
    stats_path.mkdir()


for model in models_path.iterdir():
    if not model.is_dir():
        continue
    sh.copy(str(model / 'stats.json') , str(stats_path / f'{model.stem}.json') )
    
stats_files = list(stats_path.iterdir())

In [2]:
stats_df = pd.DataFrame()
for stat_f in stats_files:
    print("========= " + stat_f.stem + " ==========")
    
    with open(stat_f, 'r') as f:
        stats = json.load(f)
        
        for k, run in stats.items():
            print(f'--- {k}: {len(run)} ---') 
            
            print(k, run)
            
            loss = 0
            cluster_sizes = []
            for cluster in run:
                loss += cluster['loss']*cluster['cluster_size']
                cluster_sizes.append(cluster['cluster_size'])
            loss /= sum(cluster_sizes) 
            
            stats_df = pd.concat([stats_df, pd.DataFrame([
                    {'model': stat_f.stem, 
                    'run': k, 
                    'cluster sizes':cluster_sizes, 
                    'nclusters': len(run),
                    'total_length': sum([c['cluster_size'] for c in run]),
                    'avg loss': loss,
                    'min loss' : min([c['loss'] for c in run]),
                    'max loss': max([c['loss'] for c in run]),
                    'std loss': np.std([c['loss'] for c in run]),}
                ])
            ])
                
            print("Average Cluster Loss: ", loss)
stats_df.reset_index(drop=True, inplace=True)
stats_df[['run', 'nclusters']] = stats_df[['run', 'nclusters']].astype(int)
        
    

--- 2: 2 ---
2 [{'run': 2, 'cluster': 0, 'loss': 0.07977649569511414, 'mse': 0.07977649569511414, 'cluster_size': 5851}, {'run': 2, 'cluster': 1, 'loss': 0.08126482367515564, 'mse': 0.08126482367515564, 'cluster_size': 5835}]
Average Cluster Loss:  0.08051964080580575
--- 3: 2 ---
3 [{'run': 3, 'cluster': 0, 'loss': 0.08215799182653427, 'mse': 0.08215799182653427, 'cluster_size': 5862}, {'run': 3, 'cluster': 1, 'loss': 0.08569347858428955, 'mse': 0.08569347858428955, 'cluster_size': 5841}]
Average Cluster Loss:  0.08392256314602915
--- 4: 2 ---
4 [{'run': 4, 'cluster': 0, 'loss': 0.08146019279956818, 'mse': 0.08146019279956818, 'cluster_size': 5864}, {'run': 4, 'cluster': 1, 'loss': 0.08886979520320892, 'mse': 0.08886979520320892, 'cluster_size': 5847}]
Average Cluster Loss:  0.08515961601313554
--- 5: 2 ---
5 [{'run': 5, 'cluster': 0, 'loss': 0.08157236129045486, 'mse': 0.08157236129045486, 'cluster_size': 5872}, {'run': 5, 'cluster': 1, 'loss': 0.0837041363120079, 'mse': 0.0837041363

# Model Results

In [3]:
stats_df

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
0,pca_dbscan_mag,2,"[5851, 5835]",2,11686,0.080520,0.079776,0.081265,0.000744
1,pca_dbscan_mag,3,"[5862, 5841]",2,11703,0.083923,0.082158,0.085693,0.001768
2,pca_dbscan_mag,4,"[5864, 5847]",2,11711,0.085160,0.081460,0.088870,0.003705
3,pca_dbscan_mag,5,"[5872, 5848]",2,11720,0.082636,0.081572,0.083704,0.001066
4,pca_dbscan_mag,6,"[5873, 5848]",2,11721,0.083222,0.081072,0.085382,0.002155
...,...,...,...,...,...,...,...,...,...
84,quantile_pca_dbscan,2,"[5840, 4501, 1283]",3,11624,0.076750,0.047270,0.081880,0.015579
85,quantile_pca_dbscan,3,"[5854, 4529, 1295]",3,11678,0.077811,0.045706,0.083139,0.017119
86,quantile_pca_dbscan,4,"[5873, 4533, 1295]",3,11701,0.077935,0.050936,0.082233,0.014378
87,quantile_pca_dbscan,5,"[5881, 4543, 1305]",3,11729,0.077511,0.048626,0.081180,0.015317


## Best Models

In [4]:
top10 = list(stats_df['avg loss'].nsmallest(15).index)
stats_df.loc[top10]

Unnamed: 0,model,run,cluster sizes,nclusters,total_length,avg loss,min loss,max loss,std loss
51,quantile_pca_kmeans,6,"[1845, 1927, 1309, 1619, 1298, 1493, 1099, 1181]",8,11771,0.04903,0.040163,0.076036,0.010802
68,ts_kmeans_mag,4,"[3072, 2397, 1729, 1733, 1071, 1769]",6,11771,0.050221,0.041855,0.071897,0.010366
9,pca_kmeans_mag,4,"[3072, 2399, 1729, 1733, 1071, 1767]",6,11771,0.050225,0.043077,0.074703,0.011178
50,quantile_pca_kmeans,5,"[1845, 2609, 1309, 1619, 1430, 1860, 1099]",7,11771,0.050742,0.03469,0.068599,0.009666
8,pca_kmeans_mag,3,"[3069, 3236, 1732, 2663, 1071]",5,11771,0.052838,0.042964,0.062948,0.006371
25,quantile_pca_agg,3,"[3325, 3013, 1309, 2886, 1238]",5,11771,0.053814,0.044633,0.060799,0.005159
67,ts_kmeans_mag,3,"[3070, 3236, 1731, 2663, 1071]",5,11771,0.054086,0.041548,0.068264,0.008606
49,quantile_pca_kmeans,4,"[2123, 2631, 1309, 2440, 1460, 1808]",6,11771,0.056196,0.044667,0.071563,0.009166
48,quantile_pca_kmeans,3,"[2123, 2737, 1309, 2440, 3162]",5,11771,0.0589,0.048133,0.071073,0.008423
63,robust_pca_kmeans,2,[2675],1,2675,0.060996,0.060996,0.060996,0.0


# Global MSE

In [5]:
from keras.models import load_model
from pathlib import Path
import pandas as pd
from pickle import load
import gc

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import QuantileTransformer

data_path = Path('../../data/compiled/')

models = stats_df.loc[top10]["model"]
runs = stats_df.loc[top10]["run"]

mse_dict = {'model':[], 'run':[], 'mse':[]}

2023-02-18 16:04:24.214588: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-18 16:04:24.929185: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-18 16:04:24.929228: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-18 16:04:27.191353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
def load_original_data(data_path: Path, save_scalers : bool = False):
    """Load the original data from the file."""

    inputs = pd.read_csv(data_path / 'inputsdata_compilation.csv')
    outputs = pd.read_csv(data_path / 'outputsdata_compilation.csv')
    
    input_filenames = inputs[['filename']]
    output_filenames = outputs[['filename']]
    
    scaler_inputs, scaler_ouputs = QuantileTransformer(), QuantileTransformer()
    inputs = scaler_inputs.fit_transform(inputs.iloc[:, 1:])
    outputs = scaler_ouputs.fit_transform(outputs.iloc[:, 1:])
    
    inputs = pd.DataFrame(inputs)
    inputs = pd.concat([input_filenames, inputs], axis=1)
    
    outputs = pd.DataFrame(outputs)
    outputs = pd.concat([output_filenames, outputs], axis=1)
    
    print("Scaled inputs:", inputs.head())
    print("Scaled outputs:", outputs.head())
    return inputs, outputs, scaler_inputs, scaler_ouputs


def join_files_in_cluster(cluster_files, input_data : pd.DataFrame, output_data : pd.DataFrame):
    """Join all files in a cluster into a single dataframe."""
    cluster_inputs, cluster_outputs = pd.DataFrame(), pd.DataFrame()
    
    inputs = [input_data.loc[input_data['filename'] == f].iloc[:, 1:]
              for f in cluster_files]
    
    cluster_inputs = pd.concat(inputs, axis=0, ignore_index=True)
    
    outputs = [output_data.loc[output_data['filename'] == f].iloc[:, 1:]
               for f in cluster_files]
    cluster_outputs = pd.concat(outputs, axis=0, ignore_index=True)      
    
    # print(cluster_inputs.head())
    # print(cluster_inputs.shape)
    # print(cluster_df)
    # print(cluster_df.shape)
    # print(cluster_df.columns)
    print("Cluster shape:", cluster_inputs.shape)
    return cluster_inputs, cluster_outputs


In [7]:
original_in, original_out, _scaler_in, scaler_out = load_original_data(data_path)

clusters_confs = [load(open(clusters_path / f"{m}.pkl", 'rb')) for m in models]

for conf, run in zip(clusters_confs, runs):
    clusters = conf[run]['clusters']
    all_predictions = []
    cluster_filenames = []
    try:
        for cluster_id, cluster in clusters.items():
            model_file = f"{conf[run]['method']}_run{conf[run]['run_id']}_{cluster_id}.h5"
            model = load_model(models_path / f"{conf[run]['method']}" / model_file)
            
            inputs, _outputs = join_files_in_cluster(cluster, original_in, original_out)
            
            predictions = model.predict(inputs)
            print(predictions.shape)
            
            cluster_filenames.extend(cluster)
            all_predictions.append(pd.DataFrame(predictions))
    except:
        continue

    all_predictions = pd.concat(all_predictions, ignore_index=True)

    mse = mean_squared_error(original_out.iloc[:, 1:], all_predictions)
    mse_dict['model'].append(conf[run]['method'])
    mse_dict['run'].append(conf[run]['run_id'])
    mse_dict['mse'].append(mse)
    
    gc.collect()

    # # do inverse tranform on the predictions
    # all_predictions.columns = all_predictions.columns.astype(str)
    # all_predictions = scaler_out.inverse_transform(all_predictions)
    # all_predictions = pd.DataFrame(all_predictions)

    # if opts['plots']:
    #     plot_cluster_preds(all_predictions, opts['cluster_file'], out_dir)

    # all_predictions = pd.concat([pd.DataFrame(cluster_filenames, columns=["filename"]), all_predictions], axis=1)
    # if not opts['no_write']:    
    #     print(all_predictions.head())
    #     all_predictions.to_csv(out_dir / f"predictions_compiled.csv", index=False)

    # print(original_out.shape, all_predictions.shape)
    # print("MSE: ", mse)

    # with open('metrics.txt', 'a') as f:
    #     f.write(f"{opts['cluster_file']} : {opts['run_id']} : {mse}\n")
        






Scaled inputs:                        filename    0         1         2         3         4  \
0  profile_wso_CR1992_line_0070  0.0  0.502994  0.499827  0.499132  0.502322   
1  profile_wso_CR1992_line_0073  0.0  0.624102  0.622296  0.623055  0.623179   
2  profile_wso_CR1992_line_0075  0.0  0.734200  0.731681  0.732004  0.731442   
3  profile_wso_CR1992_line_0077  0.0  0.390270  0.387673  0.386860  0.387348   
4  profile_wso_CR1992_line_0078  0.0  0.636728  0.634007  0.635993  0.635896   

          5         6         7         8  ...  1910  1911  1912  1913  1914  \
0  0.497383  0.500909  0.493741  0.496144  ...   0.0   0.0   0.0   0.0   0.0   
1  0.621695  0.623463  0.619049  0.622395  ...   0.0   0.0   0.0   0.0   0.0   
2  0.731776  0.735525  0.732279  0.733841  ...   0.0   0.0   0.0   0.0   0.0   
3  0.386650  0.388422  0.381472  0.383510  ...   0.0   0.0   0.0   0.0   0.0   
4  0.635580  0.637827  0.634017  0.636942  ...   0.0   0.0   0.0   0.0   0.0   

   1915  1916  1917  19

2023-02-18 16:05:05.779823: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-18 16:05:05.781292: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-18 16:05:05.782959: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (archlinux): /proc/driver/nvidia/version does not exist
2023-02-18 16:05:05.795317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Cluster shape: (1845, 1920)


2023-02-18 16:05:18.090962: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 28339200 exceeds 10% of free system memory.


(1845, 1920)
Cluster shape: (1927, 1920)


2023-02-18 16:05:30.659157: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 29598720 exceeds 10% of free system memory.


(1927, 1920)
Cluster shape: (1309, 1920)
(1309, 1920)
Cluster shape: (1619, 1920)
(1619, 1920)
Cluster shape: (1298, 1920)
(1298, 1920)
Cluster shape: (1493, 1920)
(1493, 1920)
Cluster shape: (1099, 1920)
(1099, 1920)
Cluster shape: (1181, 1920)
(1181, 1920)
Cluster shape: (3072, 1920)


2023-02-18 16:06:59.102771: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 47185920 exceeds 10% of free system memory.


(3072, 1920)
Cluster shape: (2397, 1920)


2023-02-18 16:07:16.940786: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 36817920 exceeds 10% of free system memory.


(2397, 1920)
Cluster shape: (1729, 1920)


2023-02-18 16:07:29.375493: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 26557440 exceeds 10% of free system memory.


(1729, 1920)
Cluster shape: (1733, 1920)
(1733, 1920)
Cluster shape: (1071, 1920)
(1071, 1920)
Cluster shape: (1769, 1920)
(1769, 1920)
Cluster shape: (3072, 1920)
(3072, 1920)
Cluster shape: (2399, 1920)
(2399, 1920)
Cluster shape: (1729, 1920)
(1729, 1920)
Cluster shape: (1733, 1920)
(1733, 1920)
Cluster shape: (1071, 1920)
(1071, 1920)
Cluster shape: (1767, 1920)
(1767, 1920)
Cluster shape: (1845, 1920)
(1845, 1920)
Cluster shape: (2609, 1920)
(2609, 1920)
Cluster shape: (1309, 1920)
(1309, 1920)
Cluster shape: (1619, 1920)
(1619, 1920)
Cluster shape: (1430, 1920)
(1430, 1920)
Cluster shape: (1860, 1920)
(1860, 1920)
Cluster shape: (1099, 1920)
(1099, 1920)
Cluster shape: (3069, 1920)
(3069, 1920)
Cluster shape: (3236, 1920)
(3236, 1920)
Cluster shape: (1732, 1920)
(1732, 1920)
Cluster shape: (2663, 1920)
(2663, 1920)
Cluster shape: (1071, 1920)
(1071, 1920)
Cluster shape: (3325, 1920)
(3325, 1920)
Cluster shape: (3013, 1920)
(3013, 1920)
Cluster shape: (1309, 1920)
(1309, 1920)
Clu

In [9]:
pd.DataFrame(mse_dict)

Unnamed: 0,model,run,mse
0,quantile_pca_kmeans,6,0.120959
1,ts_kmeans_mag,4,0.133572
2,pca_kmeans_mag,4,0.132395
3,quantile_pca_kmeans,5,0.117829
4,pca_kmeans_mag,3,0.11712
5,quantile_pca_agg,3,0.11288
6,ts_kmeans_mag,3,0.115225
7,quantile_pca_kmeans,4,0.104702
8,quantile_pca_kmeans,3,0.102568
9,pca_kmeans_mag,2,0.10568
