# Building Energy Profiles clustering 

## Import libraries for analysis

In [45]:
# Built-in libraries
import os
import time
from math import log2
from urllib.parse import urlencode

# NumPy, SciPy and Pandas
import numpy as np
import pandas as pd

# Scikit-Learn
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
%matplotlib inline

## Read clustering result file to generate various plots 

* this file contains daily profiles, clustering assignment, and metadata 
* this section below reads the profiles generated in the previous section

In [20]:
# Load the profiles table produced by the previous section of the analysis
profiles_csv_path = 'final_profiles.csv'
combined_profiles = pd.read_csv(profiles_csv_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Separate residential and non-residential buildings.
# Rows whose Industry equals 'Residential' go to the first frame, all other
# rows to the second one.
is_residential = combined_profiles.Industry == 'Residential'

# reset_index(drop=True) returns a brand-new frame with a fresh 0..n-1 index.
# Resetting the index in place on a .loc slice keeps the slice flagged as a
# copy of combined_profiles, so adding columns to it later can raise
# SettingWithCopyWarning.
residential_profiles = combined_profiles.loc[is_residential, :].reset_index(drop=True)
non_residential_profiles = combined_profiles.loc[~is_residential, :].reset_index(drop=True)

## Plot saving functions

* Since we have multiple plots from this analysis, we define saving functions for handling all of these plots
* We describe different types of plot as follows:

### Plot - type 1. The first plots are basic exploratory data analysis 
* 1.1 All the profiles (or samples) with clustering type color coded
* 1.2 Averaged profile for each cluster

In [4]:
from plot_functions.profiles import save_profile_plots

### Plot - type 2. Clustering analysis by numerical metadata 
* 2.1 Cluster assignment by sqm
* 2.2 Cluster assignment by EUI
* 2.3 Cluster assignment by entropy
* 2.4 Cluster assignment by day-of-year

In [5]:
from plot_functions.numerical import save_continuous_plots

### Plot - type 3. Clustering analysis by categorical metadata 
* 3.1 Cluster assignment by timezone (region)
* 3.2 Cluster assignment by industry
* 3.3 Cluster assignment by sub-industry
* 3.4 Cluster assignment by primary-space-usage
* 3.5 Cluster assignment by date-flag (weekday, weekend, holiday)


In [6]:
from plot_functions.stacked import save_stacked_bars

### Plot - type 4. Entropy distribution analysis 
* 4.1 Entropy distribution by profiles 
* 4.2 Entropy distribution by buildings
* 4.3 Entropy distribution by cluster assignment

* 4.4–4.8: entropy distribution by categorical metadata
* 4.4 Entropy distribution by timezone
* 4.5 Entropy distribution by industry
* 4.6 Entropy distribution by sub-industry
* 4.7 Entropy distribution by primary-space-usage
* 4.8 Entropy distribution by date-flag

In [7]:
from plot_functions.entropy import save_entropy_distribution, save_field_level_entropy_distribution

## A complete package for running all the plots with different clustering algorithms

* K-means clustering
* Hierarchical clustering

### Calculation for profile variation (entropy)
* these functions are invoked after each iteration of clustering

In [8]:
def calc_entropy(pd_labels):
    """Return the Shannon entropy (base 2) of the label distribution.

    pd_labels: pandas Series of labels, one entry per sample.

    Returns -sum(p * log2(p)) over the distinct labels, where p is the
    fraction of entries carrying that label. The result is 0 for an empty
    series and for a series in which every entry has the same label.
    """
    total = pd_labels.shape[0]
    result = 0
    for count in pd_labels.value_counts():
        # A categorical series lists unused categories with a count of 0.
        # They contribute nothing to the entropy, and log2(0) would raise.
        if count == 0:
            continue
        share = count / total
        result -= share * log2(share)
    return result

def get_list_of_entropies(profiles):
    """Return the per-building cluster entropy, repeated for every row.

    Rows of `profiles` are grouped by the 'Building' column and calc_entropy
    is applied to the 'cluster' column of each group through `transform`, so
    the result has one entry per input row.
    """
    clusters_by_building = profiles.groupby('Building')[['cluster']]
    per_row_entropy = clusters_by_building.transform(calc_entropy)
    return per_row_entropy

### Assign basic settings for clustering analysis

* K-means clustering, and Hierarchical clustering. Testing various numbers of clusters (K, from 2 to 10).

In [48]:
# params_list should be a list of dictionaries containing the key-value pairs for the input parameters of the models
# e.g. KMeans -> {'k' : 3}

class ClusteringBaseModel:
    """Template for running one clustering algorithm over several parameter sets.

    For every entry of `params_list`, `run` obtains cluster labels (from a
    cached .npy file when one exists, otherwise by fitting the model), stores
    them in the 'cluster' column of `profiles`, recomputes the 'entropy'
    column and calls the plot-saving functions.

    Child classes must override `init_model` and `get_labels`.
    """

    def __init__(self, params_list, profiles, cluster_algo, save_dir, labels_dir):
        """Store the run configuration.

        params_list:  list of dicts with the model parameters, e.g. {'k': 3}
        profiles:     DataFrame to cluster; `run` adds/overwrites its
                      'cluster' and 'entropy' columns in place
        cluster_algo: algorithm name; its lower-cased form names the
                      sub-folder used for plots and cached labels
        save_dir:     root folder handed to the plot-saving functions
        labels_dir:   root folder of the cached label arrays
        """
        self.algorithms = {
            'KMeans': KMeans,
            'DBSCAN': DBSCAN,
            'AgglomerativeClustering': AgglomerativeClustering,
            'GMM': GaussianMixture
        }

        self.params_list = params_list
        self.profiles = profiles
        self.cluster_algo = cluster_algo
        self.save_dir = '%s/%s/' % (save_dir, cluster_algo.lower())
        self.labels_dir = labels_dir

    def run(self):
        """Cluster, score and plot `self.profiles` once per parameter set."""
        labels_folder = './%s/%s/' % (self.labels_dir, self.cluster_algo.lower())
        # np.save does not create missing folders, so make sure the cache
        # location exists before the first lookup or write.
        os.makedirs(labels_folder, exist_ok=True)

        for params in self.params_list:
            # Fit model with varying k
            self.model = self.init_model(params)

            encoded_params = urlencode(params)
            labels_path = '%sparams[%s].npy' % (labels_folder, encoded_params)

            if os.path.isfile(labels_path):
                print('Skip model fitting due to saved labels [params: %s]' % (encoded_params))
                labels = np.load(labels_path)
            else:
                start = time.time()
                # Columns 3..26 are the clustering features -- presumably the
                # 24 hourly values of a daily profile; verify against the CSV.
                labels = self.get_labels(self.profiles.iloc[:, 3:3+24])
                print('Time spent fitting model: %.4f [params: %s]' % (time.time()-start, encoded_params))
                np.save(labels_path, labels)

            self.profiles['cluster'] = labels

            # Calculate entropies
            start = time.time()
            self.profiles['entropy'] = get_list_of_entropies(self.profiles)
            print('Time spent on calculating entropies: %fs' % (time.time() - start))

            # Generate plots
            save_profile_plots(self.cluster_algo, params, self.profiles, self.save_dir)
            save_continuous_plots(self.cluster_algo, params, self.profiles, self.save_dir)
            save_stacked_bars(self.cluster_algo, params, self.profiles, self.save_dir)
            save_entropy_distribution(self.cluster_algo, params, self.profiles, self.save_dir)
            save_field_level_entropy_distribution(self.cluster_algo, params, self.profiles, self.save_dir)

    # Overwrite this method in the child classes
    def init_model(self, params):
        """Build the model for one parameter dict; `run` stores it in self.model."""
        raise NotImplementedError('Not implemented')

    # Overwrite this method in the child classes
    def get_labels(self, data):
        """Fit self.model on `data` and return one cluster label per row."""
        raise NotImplementedError('Not implemented')

In [49]:
def get_labels_default(model, data):
    """Fit `model` on `data` and return the `labels_` attribute it exposes afterwards."""
    model.fit(data)
    fitted_labels = model.labels_
    return fitted_labels

class KmeansClustering(ClusteringBaseModel):
    """K-means variant of the clustering run; every params dict must provide 'k'."""

    def __init__(self, params_list, profiles, save_dir, labels_dir):
        super().__init__(params_list, profiles, 'KMeans', save_dir, labels_dir)

    def init_model(self, params):
        """Create the estimator registered under this algorithm name with params['k'] clusters."""
        estimator_cls = self.algorithms[self.cluster_algo]
        return estimator_cls(n_clusters=params['k'])

    def get_labels(self, data):
        """Fit the current model on `data` and return its labels."""
        return get_labels_default(self.model, data)

class DBSCANClustering(ClusteringBaseModel):
    """DBSCAN variant of the clustering run; every params dict must provide 'min_samples' and 'eps'."""

    def __init__(self, params_list, profiles, save_dir, labels_dir):
        super().__init__(params_list, profiles, 'DBSCAN', save_dir, labels_dir)

    def init_model(self, params):
        """Create the estimator registered under this algorithm name from `params`."""
        estimator_cls = self.algorithms[self.cluster_algo]
        min_samples = params['min_samples']
        eps = params['eps']
        return estimator_cls(min_samples=min_samples, eps=eps)

    def get_labels(self, data):
        """Fit the current model on `data` and return its labels."""
        return get_labels_default(self.model, data)

class HierarchicalClustering(ClusteringBaseModel):
    """Agglomerative variant of the clustering run.

    `k_range` is forwarded to the base class as its params list; every dict
    in it must provide 'k' and 'linkage'.
    """

    def __init__(self, k_range, profiles, save_dir, labels_dir):
        super().__init__(k_range, profiles, 'AgglomerativeClustering', save_dir, labels_dir)

    def init_model(self, params):
        """Create the estimator registered under this algorithm name from `params`."""
        estimator_cls = self.algorithms[self.cluster_algo]
        n_clusters = params['k']
        linkage = params['linkage']
        return estimator_cls(n_clusters=n_clusters, linkage=linkage)

    def get_labels(self, data):
        """Fit the current model on `data` and return its labels."""
        return get_labels_default(self.model, data)

class GMMClustering(ClusteringBaseModel):
    """Gaussian-mixture variant of the clustering run.

    `k_range` is forwarded to the base class as its params list; every dict
    in it must provide 'k', which is used as the number of components.
    """

    def __init__(self, k_range, profiles, save_dir, labels_dir):
        super().__init__(k_range, profiles, 'GMM', save_dir, labels_dir)

    def init_model(self, params):
        """Create the estimator registered under this algorithm name with params['k'] components."""
        estimator_cls = self.algorithms[self.cluster_algo]
        return estimator_cls(n_components=params['k'])

    def get_labels(self, data):
        """Fit the mixture on `data`, then return its prediction for the same rows."""
        mixture = self.model
        mixture.fit(data)
        return mixture.predict(data)

class BisectingKMeansClustering(ClusteringBaseModel):
    """Bisecting K-means built on repeated 2-means splits.

    Starting from the whole data set, the current cluster is split in two;
    while more splits are needed, the cluster with the largest value of
    `calc_mse` among those waiting is chosen as the next one to split.

    `k_range` is forwarded to the base class as its params list; every dict
    in it must provide 'k'.
    """

    def __init__(self, k_range, profiles, save_dir, labels_dir):
        super().__init__(k_range, profiles, 'BisectingKMeans', save_dir, labels_dir)

    # Store params in model
    def init_model(self, params):
        """Return `params` unchanged; get_labels reads 'k' from it via self.model."""
        return params

    def bisect(self, data, index):
        """Split `data` into two clusters with 2-means.

        data:  2-D array of samples (one row per sample)
        index: array with the original row position of every row of `data`

        Returns a list of two (rows, original positions) tuples, for label 0
        and label 1 in that order.
        """
        model = KMeans(n_clusters=2)
        model.fit(data)
        halves = []
        for label in (0, 1):
            members = model.labels_ == label
            halves.append((data[members, :], index[members]))
        return halves

    def calc_mse(self, data):
        """Return the summed squared deviation from the column means, divided by the row count."""
        mu = np.mean(data, axis=0)
        return np.sum((data - mu)**2) / data.shape[0]

    def bisect_k(self, data, k):
        """Partition the rows of `data` into `k` clusters and return their labels.

        Returns an integer array with one label in 0..k-1 per row of `data`.
        """
        data_dict = {}
        mse_dict = {}
        index_dict = {}
        index_array = np.arange(data.shape[0])
        # Sized from the full data set: `data` is rebound to sub-clusters below.
        # Integer dtype keeps the labels consistent with the sklearn-based
        # classes, whose labels_ are integers.
        final_labels = np.zeros(data.shape[0], dtype=int)

        for i in range(k - 1):
            children = self.bisect(data, index_array)
            for key, (child_data, child_index) in zip((i * 2, i * 2 + 1), children):
                data_dict[key] = child_data
                index_dict[key] = child_index
                mse_dict[key] = self.calc_mse(child_data)

            if i < k - 2:
                # The pending cluster with the largest MSE is split next, so
                # it is removed from the set of finished clusters.
                max_mse_idx = max(mse_dict, key=mse_dict.get)
                data = data_dict.pop(max_mse_idx)
                index_array = index_dict.pop(max_mse_idx)
                del mse_dict[max_mse_idx]

        # reconstruct labels: remaining clusters are numbered in key order
        for cluster_id, key in enumerate(sorted(index_dict)):
            final_labels[index_dict[key]] = cluster_id
        return final_labels

    def get_labels(self, data):
        """Convert `data` to a NumPy array and cluster it into self.model['k'] groups."""
        # DataFrame.as_matrix() was removed in pandas 1.0; np.asarray yields
        # the same 2-D array and also accepts plain arrays.
        data = np.asarray(data)
        return self.bisect_k(data, self.model['k'])

In [41]:
# One parameter dict per candidate number of clusters (k = 2, ..., 10)
k_params = [{'k': n_clusters} for n_clusters in range(2, 11)]

In [42]:
# Output folders per dataset name: 'save_dir' holds the plots,
# 'labels_dir' holds the cached cluster labels.
directories = {}
directories['profiles'] = {
    'save_dir': 'final_plots',
    'labels_dir': 'final_labels',
}
directories['residential_profiles'] = {
    'save_dir': 'residential_plots',
    'labels_dir': 'residential_labels',
}
directories['non_residential_profiles'] = {
    'save_dir': 'non_residential_plots',
    'labels_dir': 'non_residential_labels',
}

def get_save_dir(name):
    """Return the 'save_dir' entry registered in `directories` for dataset `name`."""
    dataset_dirs = directories[name]
    return dataset_dirs['save_dir']

def get_labels_dir(name):
    """Return the 'labels_dir' entry registered in `directories` for dataset `name`."""
    dataset_dirs = directories[name]
    return dataset_dirs['labels_dir']

In [43]:
# Dataset name -> DataFrame; the names match the keys of `directories`
dset_map = dict(zip(
    ('profiles', 'residential_profiles', 'non_residential_profiles'),
    (combined_profiles, residential_profiles, non_residential_profiles),
))

In [52]:
# Run K-means, bisecting K-means and GMM on every dataset
for dset_name, profiles in dset_map.items():
    plots_root = get_save_dir(dset_name)
    labels_root = get_labels_dir(dset_name)

    kmeansClustering = KmeansClustering(k_params, profiles, plots_root, labels_root)
    kmeansClustering.run()

    bisectingKmeansClustering = BisectingKMeansClustering(k_params, profiles, plots_root, labels_root)
    bisectingKmeansClustering.run()

    gmmClustering = GMMClustering(k_params, profiles, plots_root, labels_root)
    gmmClustering.run()