In [4]:
# Result structure:
# |--result
#    |--[*DataSet*](Irish_2010&London_2013)
#    |  |--clustering
#    |     |--interval
#    |        |--hierarchical
#    |        |  |--euclidean
#    |        |  |--cityblock
#    |        |  |--hausdorff
#    |        |--kmeans


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

In [5]:
data_set = 'Irish_2010'
attr = pd.read_csv('../data/' + data_set + '_attr_final.csv')

In [3]:
# K-means clustering

for num_clusters in range(1, 11):
    labels = []
    for month in tqdm(range(12)):

        X = []
        for i in range(len(attr)):
            id = attr['ID'][i]
            df = pd.read_csv('../data/' + data_set + '_profiles_interval/' + str(id) + '.csv', header = None).values
            X.append(np.hstack((df[month*2], df[month*2+1])))
        X = np.array(X)
        X = (X - np.min(X))/(np.max(X) - np.min(X))

        kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, 
                        verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='full').fit(X)
        labels.append(kmeans.labels_)

    pd.DataFrame(np.array(labels).T).to_csv('../result/' + data_set + '/clustering/interval/kmeans/n_clusters_' + str(num_clusters) + '.csv', header=None, index=False)

100%|██████████| 12/12 [00:23<00:00,  1.90s/it]
100%|██████████| 12/12 [00:24<00:00,  2.02s/it]
100%|██████████| 12/12 [00:24<00:00,  1.99s/it]
100%|██████████| 12/12 [00:23<00:00,  1.92s/it]
100%|██████████| 12/12 [00:26<00:00,  2.26s/it]
100%|██████████| 12/12 [00:25<00:00,  2.07s/it]
100%|██████████| 12/12 [00:27<00:00,  2.27s/it]
100%|██████████| 12/12 [00:26<00:00,  2.16s/it]
100%|██████████| 12/12 [00:26<00:00,  2.14s/it]
100%|██████████| 12/12 [00:24<00:00,  2.03s/it]


In [24]:
# Hierarchical clustering

dist = 'hausdorff'
for num_clusters in range(1, 11):
    labels = []
    for month in tqdm(range(12)):

        mat = pd.read_csv('../result/' + data_set + '/clustering/interval/hierarchical/' + dist + '/mat_month_' + str(month+1) + '.csv', header=None).values
        hierarchical = AgglomerativeClustering(n_clusters=num_clusters, affinity='precomputed', memory=None, connectivity=None, 
                                               compute_full_tree='auto', linkage='average', pooling_func='deprecated', distance_threshold=None).fit(mat)
        labels.append(hierarchical.labels_)

    pd.DataFrame(np.array(labels).T).to_csv('../result/' + data_set + '/clustering/interval/hierarchical/' + dist + '/n_clusters_' + str(num_clusters) + '.csv', header=None, index=False)

100%|██████████| 12/12 [00:13<00:00,  1.07s/it]
100%|██████████| 12/12 [00:12<00:00,  1.05s/it]
100%|██████████| 12/12 [00:12<00:00,  1.06s/it]
100%|██████████| 12/12 [00:12<00:00,  1.04s/it]
100%|██████████| 12/12 [00:12<00:00,  1.08s/it]
100%|██████████| 12/12 [00:13<00:00,  1.10s/it]
100%|██████████| 12/12 [00:12<00:00,  1.11s/it]
100%|██████████| 12/12 [00:12<00:00,  1.08s/it]
100%|██████████| 12/12 [00:12<00:00,  1.03s/it]
100%|██████████| 12/12 [00:12<00:00,  1.03s/it]
