# Calculate the similarity metric with same profiles as a previous result

This notebook reads the info_df of a previous result and calculates a new distance matrix with the same profiles.

### Set-up

In [None]:
from energyclustering.data.fluvius import read_data_pickle
import energyclustering.clustering.similarity.baselines as baselines
import energyclustering.clustering.similarity.histogram as histosim
import energyclustering.clustering.similarity.matching_similarity as matching_similarity
from energyclustering.clustering.DTW import get_DTW_distance_matrix
import numpy as np 
import pandas as pd 
from dask.distributed import Client, LocalCluster
from pathlib import Path
import itertools
from tqdm import tqdm

In [None]:
# Reload imported modules before each cell runs, so edits to the project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Root directory that holds every distance-matrix result
PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')
# Previous result whose info/data pickles are reused by this notebook
REFERENCE_PATH = PATH / 'random_profiles_no_data_issues_v1'
# Output directories for the baseline distance matrices
RESULT_PATH = PATH / 'baseline_euclidean'
RESULT_PATH_DTW = PATH / 'baseline_dtw'
for _result_dir in (RESULT_PATH, RESULT_PATH_DTW):
    # create the directory (and any missing parents); no error if it already exists
    _result_dir.mkdir(mode=0o770, parents=True, exist_ok=True)


### Read the data (use the same as Koen)

In [None]:
# Load the pickled `info` and `data` objects stored with the reference result
info_df = pd.read_pickle(REFERENCE_PATH / 'info.pkl')
data_df = pd.read_pickle(REFERENCE_PATH / 'data.pkl')

### Write the data

In [None]:
# write to file: data_df and info_df are pickled next to each distance matrix in the cells below


### Calculate the euclidean distance matrix

In [None]:
%%time
if not (RESULT_PATH/'full_distance_matrix.pkl').exists():
    data_df.to_pickle(RESULT_PATH/'data.pkl')
    info_df.to_pickle(RESULT_PATH/'info.pkl')
    dist_df = baselines.euclidean_distance_matrix(data_df)
    distance_df = pd.DataFrame(dist_df, index = data_df.index, columns = data_df.index)
    distance_df.to_pickle(RESULT_PATH/'full_distance_matrix.pkl')

### Calculate the histogram distance_matrix

In [None]:
# Histogram-based distance matrices, one per `resample` setting; each result is
# stored under PATH/'wasserstein_<resample>' (so resample=None ends up in a
# directory literally named 'wasserstein_None').
#
# Set `recalculate` to False to reuse a matrix that is already on disk instead of
# recomputing it. It is defined once, outside the loop, so it is a single switch.
recalculate = True
for resample in [None, '1H', '4H']:
    result_path = PATH/f'wasserstein_{resample}'
    result_path.mkdir(mode = 0o770, parents = True, exist_ok=True)
    matrix_file = result_path/'full_distance_matrix.pkl'
    if not recalculate and matrix_file.exists():
        # cached matrix present and no recalculation requested: nothing to do
        continue
    # store the inputs in the result directory before computing the matrix
    data_df.to_pickle(result_path/'data.pkl')
    info_df.to_pickle(result_path/'info.pkl')
    distances = histosim.distance_matrix(data_df, resample = resample)
    # label rows and columns with the index of data_df
    distance_df = pd.DataFrame(distances, index = data_df.index, columns = data_df.index)
    distance_df.to_pickle(matrix_file)

### Calculate the DTW distance matrix
This is basically infeasible because of the inefficient DTW implementations (they allocate the full cost matrix while only a small part is actually used).

### Calculate variations of our distance 

In [None]:
# Compute the matching-based distance matrix for each combination of distance
# metric and day-matching strategy, skipping combinations already on disk.
distances = ['DTW', 'euclidean']
matchings = ['minimal_cost', 'one_to_one']
# SCHEDULER = "himec07.cs.kuleuven.be:8786"
# Both the local cluster and the client are context-managed, so the worker
# processes are shut down when the cell finishes or fails, instead of
# lingering until the kernel is restarted.
with LocalCluster(n_workers = 40, threads_per_worker = 1, local_directory = '/cw/dtailocal/jonass') as cluster, \
        Client(cluster) as client:
    # derive the progress-bar total from the lists so it stays correct when they change
    n_combinations = len(distances) * len(matchings)
    for distance, matching in tqdm(itertools.product(distances, matchings), total = n_combinations):
        if distance == 'DTW' and matching == 'minimal_cost':
            # NOTE(review): this combination is deliberately skipped; the reason is not recorded here
            continue
        result_path = PATH / f'baselines_own_metric_{distance}_{matching}'
        result_path.mkdir(parents = True, exist_ok = True)
        if (result_path/'full_distance_matrix.pkl').exists():
            # already computed in an earlier run
            continue
        print(f'calculating {distance}_{matching}')
        distance_config = dict(distance_metric = distance, day_matching = matching, window = 4)
        dist_df = matching_similarity.distance_matrix(data_df, client, distance_config, total_blocks = 250)
        # label rows and columns with the index of data_df
        distance_df = pd.DataFrame(dist_df, index = data_df.index, columns = data_df.index)

        # save the results
        data_df.to_pickle(result_path/'data.pkl')
        info_df.to_pickle(result_path/'info.pkl')
        distance_df.to_pickle(result_path/'full_distance_matrix.pkl')