# Calculate the similarity metric with the same profiles as a previous result

This notebook reads the info_df of a previous result and calculates a new distance matrix with the same profiles.

### Set-up

In [None]:
from energyclustering.data.fluvius import read_data_pickle
from energyclustering.clustering.similarity.distmatrix import calculate_distance_matrix
import energyclustering.clustering.similarity.matching_similarity as simple_matching
from energyclustering.clustering.similarity.histogram import WassersteinDistance
import numpy as np 
import pandas as pd 
from dask.distributed import Client, LocalCluster
from pathlib import Path

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Output directory for every file this notebook writes.
# Alternative target, kept for reference:
# RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/small_random1_no_data_issues/')
RESULT_PATH = Path(
    '/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/full_distance_matrix_wasserstein/'
)
# Create the directory (and any missing parents) with mode 0o770;
# an already existing directory is not an error.
RESULT_PATH.mkdir(parents=True, exist_ok=True, mode=0o770)


### Read the data

In [None]:
# Load the data set through read_data_pickle; the call returns two objects
# that are unpacked as info_df and data_df. Incomplete profiles are included
# and error processing is switched on.
info_df, data_df = read_data_pickle(
    include_incomplete_profiles=True,
    process_errors=True,
)

### Sample some profiles

In [None]:
# Optional: uncomment the two lines below to work on a fixed random sample of
# 100 rows of data_df (seeded via random_state) and restrict info_df to the
# same index. Left commented out, the full data set is used.
# data_df = data_df.sample(100, random_state = 12341234)
# info_df = info_df.loc[data_df.index]

### Write the data

In [None]:
# Store the input frames of this run as pickles inside RESULT_PATH
# (data first, then info).
for frame, filename in ((data_df, 'data.pkl'), (info_df, 'info.pkl')):
    frame.to_pickle(RESULT_PATH / filename)

### Calculate the distance matrix

In [None]:
%%time
cluster = LocalCluster(n_workers = 20, threads_per_worker = 2, local_directory = '/cw/dtailocal/jonass')
with Client(cluster) as client: 
    dist_df = calculate_distance_matrix(data_df, WassersteinDistance('1H'), client)
distance_df.to_pickle(RESULT_PATH/'full_distance_matrix.pkl')
cluster.close()

In [None]:
# Persist the distance matrix held in dist_df as a pickle inside RESULT_PATH.
dist_df.to_pickle(RESULT_PATH / 'full_distance_matrix.pkl')