In [49]:
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from dtaidistance import dtw


In [50]:
folder_cluster = "./data/clusters/"
clusters_type = ["cluster_ed","cluster_dtw"]

folder = "./data/20210921/"
dfp_train = pd.read_csv(folder + "train.csv")
dfp_test = pd.read_csv(folder + "test.csv")
print(len(dfp_train))

6036000


In [51]:
def get_clusters(dfp_breath_id, dfp_clusters, cluster_type):
    u_in = dfp_breath_id["u_in"].tolist()
    X = np.concatenate((np.array([u_in]), np.array(dfp_clusters)), axis=0)
    if cluster_type == "cluster_ed":
        distance_X = euclidean_distances(X, X)
    else:
        distance_X = dtw.distance_matrix_fast(X)

    cluster_idx_sorted = np.argsort(distance_X[0,1:]) - 1
    closest_cluster = dfp_clusters.index[cluster_idx_sorted[0]]
    farest_cluster = dfp_clusters.index[cluster_idx_sorted[-1]]
    
    return closest_cluster, farest_cluster

In [52]:
for cluster_type in clusters_type:
    print(cluster_type)
# for cluster_type in ["cluster_dtw"]:
    
    dfp_clusters = pd.read_csv(folder_cluster + cluster_type + ".csv", index_col=0)
    
    print('Work on the training data')
    data = []
    for breath_id in tqdm(dfp_train["breath_id"].unique()):
        dfp_breath_id = dfp_train[dfp_train["breath_id"] == breath_id].sort_values(["time_step"])
        closest_cluster, farest_cluster = get_clusters(dfp_breath_id, dfp_clusters, cluster_type)
        data.append([breath_id, closest_cluster, farest_cluster])

    dfp_breath_id_assignation_train = pd.DataFrame(data, columns=["breath_id", "closest_cluster", "farest_cluster"])
    dfp_breath_id_assignation_train.to_csv(folder_cluster + f"train_{cluster_type}_assignations.csv")
    
    print('Work on the testing data')
    data = []
    for breath_id in tqdm(dfp_test["breath_id"].unique()):
        dfp_breath_id = dfp_test[dfp_test["breath_id"] == breath_id].sort_values(["time_step"])
        closest_cluster, farest_cluster = get_clusters(dfp_breath_id, dfp_clusters, cluster_type)
        data.append([breath_id, closest_cluster, farest_cluster])

    dfp_breath_id_assignation_test= pd.DataFrame(data, columns=["breath_id", "closest_cluster", "farest_cluster"])
    dfp_breath_id_assignation_test.to_csv(folder_cluster + f"test_{cluster_type}_assignations.csv")

  0%|          | 14/75450 [00:00<09:05, 138.27it/s]

cluster_ed
Work on the training data


100%|██████████| 75450/75450 [07:12<00:00, 174.33it/s]
  0%|          | 24/50300 [00:00<03:36, 232.31it/s]

Work on the testing data


100%|██████████| 50300/50300 [03:34<00:00, 234.52it/s]
  0%|          | 1/75450 [00:00<2:07:06,  9.89it/s]

cluster_dtw
Work on the training data


100%|██████████| 75450/75450 [2:22:49<00:00,  8.80it/s]  
  0%|          | 1/50300 [00:00<1:25:06,  9.85it/s]

Work on the testing data


100%|██████████| 50300/50300 [1:33:57<00:00,  8.92it/s]


In [44]:
dfp_breath_id_assignation

Unnamed: 0,breath_id,closest_cluster,farest_cluster
0,1,3,333
1,2,0,333
2,3,0,333
3,4,159,333
4,5,1,333
...,...,...,...
75445,125740,0,333
75446,125742,162,371
75447,125743,161,333
75448,125745,269,333


In [21]:
X

array([[8.33340056e-02, 1.83830415e+01, 2.25092778e+01, ...,
        4.98184740e+00, 4.98468316e+00, 4.98707904e+00],
       [9.34926139e+01, 8.61411015e+01, 7.73674700e+01, ...,
        4.96487447e+00, 4.97019453e+00, 4.97457264e+00],
       [3.97307938e+01, 6.97816926e+01, 5.54851577e+01, ...,
        4.60730630e+00, 4.61542084e+00, 4.62279238e+00],
       ...,
       [2.60718857e+00, 3.68108018e+01, 4.82378274e+01, ...,
        4.98074363e+00, 4.98373708e+00, 4.98628498e+00],
       [1.08891811e+01, 3.09958103e+01, 3.37710576e+01, ...,
        4.98430053e+00, 4.98677883e+00, 4.98884842e+00],
       [4.87712284e+00, 2.10060090e+01, 2.80100161e+01, ...,
        4.96079197e+00, 4.96662859e+00, 4.97153351e+00]])