In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from hausdorff import hausdorff_distance

In [10]:
data_set = 'Irish_2010'
attr = pd.read_csv('../data/' + data_set + '_attr.csv')

In [None]:
# Find the IDs with missing values

id_with_missing_values = []
for k in range(len(attr)):
    id = attr['ID'][k]
    df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header = None).values
    for i in range(len(df)):
        for j in range(len(df[0])):
            if df[i][j] == 0:
                id_with_missing_values.append(k)
                break

In [None]:
attr.drop(index=id_with_missing_values, inplace=True)
attr.reset_index(drop=True, inplace=True)
attr.to_csv('../data/' + data_set + '_attr_final.csv', index=None)

In [11]:
attr = pd.read_csv('../data/' + data_set + '_attr_final.csv')

In [None]:
# K-means clustering
num_clusters = 15

labels = []
for month in tqdm(range(12)):
    
    X = []
    for i in range(len(attr)):
        id = attr['ID'][i]
        df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header = None).values
        X.append(df[month])
    X = np.array(X)
    X = (X - np.min(X))/(np.max(X) - np.min(X))
    
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, 
                    verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='full').fit(X)
    labels.append(kmeans.labels_)

pd.DataFrame(np.array(labels).T).to_csv('../result/' + data_set + '/cluster/point/kmeans/n_clusters_' + str(num_clusters) + '.csv', header=None, index=False)

In [4]:
# Hierarchical clustering (euclidean&cityblock)
# num_clusters = 5

dist = 'cityblock'
for num_clusters in range(1, 17, 2):
    labels = []
    for month in tqdm(range(12)):

        X = []
        for i in range(len(attr)):
            id = attr['ID'][i]
            df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header = None).values
            X.append(df[month])
        X = np.array(X)
        X = (X - np.min(X))/(np.max(X) - np.min(X))

        hierarchical = AgglomerativeClustering(n_clusters=num_clusters, affinity=dist, memory=None, connectivity=None, 
                                               compute_full_tree='auto', linkage='average', pooling_func='deprecated', distance_threshold=None).fit(X)
        labels.append(hierarchical.labels_)

    pd.DataFrame(np.array(labels).T).to_csv('../result/' + data_set + '/cluster/point/hierarchical/' + dist + '/n_clusters_' + str(num_clusters) + '.csv', header=None, index=False)

100%|██████████| 12/12 [00:22<00:00,  1.83s/it]
100%|██████████| 12/12 [00:21<00:00,  1.77s/it]
100%|██████████| 12/12 [00:21<00:00,  1.84s/it]
100%|██████████| 12/12 [00:21<00:00,  1.76s/it]
100%|██████████| 12/12 [00:22<00:00,  1.95s/it]
100%|██████████| 12/12 [00:22<00:00,  1.87s/it]
100%|██████████| 12/12 [00:21<00:00,  1.81s/it]
100%|██████████| 12/12 [00:22<00:00,  1.87s/it]


In [12]:
# Hierarchical clustering (hausdorff)
num_clusters = 5

mat = np.zeros((len(attr), len(attr)))
for month in tqdm(range(12)):

    X = []
    for i in range(len(attr)):
        id = attr['ID'][i]
        df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header = None).values
        X.append(df[month])
    X = np.array(X)
    X = (X - np.min(X))/(np.max(X) - np.min(X))
    
    for i in range(len(attr)):
        for j in range(len(attr)):
            mat[i][j] = hausdorff_distance(X[i], X[j], distance="manhattan")
    
    break

  0%|          | 0/12 [00:00<?, ?it/s]


AssertionError: arrays must be 2-dimensional

## Clustering with additional attributes

In [None]:
attr = pd.read_csv('../data/' + data_set + '_attr_final.csv')
cate = np.zeros((len(attr)))

In [None]:
cate_1 = ['ACORN-A', 'ACORN-B', 'ACORN-C']  # Affluent Achievers
cate_2 = ['ACORN-D', 'ACORN-E']  # Rising Prosperity
cate_3 = ['ACORN-F', 'ACORN-G', 'ACORN-H', 'ACORN-I', 'ACORN-J']  # Comfortable Communities
cate_4 = ['ACORN-K', 'ACORN-L', 'ACORN-M', 'ACORN-N']  # Financially Stretched
cate_5 = ['ACORN-O', 'ACORN-P', 'ACORN-Q']  # Urban Adversity

for i in tqdm(range(len(attr))):
    if attr['Acorn'][i] in cate_1:
        cate[i] = 1
    if attr['Acorn'][i] in cate_2:
        cate[i] = 2
    if attr['Acorn'][i] in cate_3:
        cate[i] = 3
    if attr['Acorn'][i] in cate_4:
        cate[i] = 4
    if attr['Acorn'][i] in cate_5:
        cate[i] = 5

In [None]:
attr = pd.concat([attr, pd.DataFrame(cate, columns=['Cate']).astype('int64')], axis=1)

# Transform to 0-1
X_extra = pd.get_dummies(attr['Cate'], prefix='Cate').values

In [None]:
# K-means clustering
# num_clusters = 1

for num_clusters in tqdm(range(1, 17, 2)):
    labels = []
    for month in tqdm(range(12)):

        X = []
        for i in range(len(attr)):
            id = attr['ID'][i]
            df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header = None).values
            X.append(df[month])
        X = np.array(X)
        X = (X - np.min(X))/(np.max(X) - np.min(X))
        X = np.hstack((X, X_extra))

        kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, 
                        verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='full').fit(X)
        labels.append(kmeans.labels_)

    pd.DataFrame(np.array(labels).T).to_csv('../result/' + data_set + '/cluster/point/kmeans/n_clusters_' + str(num_clusters) + '_acorn.csv', header=None, index=False)

In [None]:
# Hierarchical clustering
# num_clusters = 5

dist = 'cityblock'
for num_clusters in range(1, 17, 2):
    labels = []
    for month in tqdm(range(12)):

        X = []
        for i in range(len(attr)):
            id = attr['ID'][i]
            df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header = None).values
            X.append(df[month])
        X = np.array(X)
        X = (X - np.min(X))/(np.max(X) - np.min(X))
        X = np.hstack((X, X_extra))

        hierarchical = AgglomerativeClustering(n_clusters=num_clusters, affinity=dist, memory=None, connectivity=None, 
                                               compute_full_tree='auto', linkage='average', pooling_func='deprecated', distance_threshold=None).fit(X)
        labels.append(hierarchical.labels_)

    pd.DataFrame(np.array(labels).T).to_csv('../result/' + data_set + '/cluster/point/hierarchical/' + dist + '/n_clusters_' + str(num_clusters) + '_acorn.csv', header=None, index=False)

## Plot

In [None]:
# Plot preparation
import matplotlib.pyplot as plt

data_set = 'London_2013'
method = 'kmeans'
num_clusters = 15
month = 7

attr = pd.read_csv('../data/' + data_set + '_attr_final.csv')
# labels = pd.read_csv('../result/' + data_set + '/cluster/point/' + method + '/n_clusters_' + str(num_clusters) + '.csv', header=None)
labels = pd.read_csv('../result/' + data_set + '/cluster/point/' + method + '/n_clusters_' + str(num_clusters) + '_acorn.csv', header=None)

In [None]:
# Plot
for i in range(len(attr)):
    id = attr['ID'][i]
    df = pd.read_csv('../data/' + data_set + '_profiles/' + str(id) + '.csv', header=None).values
    if labels[month-1][i] == 2:
        plt.plot(list(range(1, 25)), df[month-1], alpha=0.3, color='dodgerblue', linewidth=1)

# Plot Parameters
plt.xlabel('Time')
plt.ylabel('Load (kW)')
new_ticks = np.floor(np.linspace(0, 24, 7))
plt.xticks(new_ticks)
plt.xlim(1, 24)