# Heatmap Generation

## Import libraries

In [2]:
# Built-in libraries
import os
import re
import time
from datetime import datetime
import pytz
from itertools import compress
from math import log
import random

# NumPy, SciPy and Pandas
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import iqr
import pandas as pd

# Scikit-Learn
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import sqeuclidean
from sklearn.neighbors import NearestNeighbors

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
%matplotlib inline

# JoyPy
import joypy

# Workalendar
from workalendar.europe import Switzerland
from workalendar.europe import UnitedKingdom
from workalendar.usa import Colorado
from workalendar.usa import NewYork
from workalendar.usa import California
from workalendar.usa import Arizona
from workalendar.usa import Illinois
from workalendar.asia import Singapore
from workalendar.oceania import WesternAustralia

In [7]:
# Load the combined profile table from the working directory; later cells read
# its Industry, EUI and Sqm columns.
combined_profiles = pd.read_csv('final_profiles.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Split the profiles into residential and non-residential subsets; each subset
# gets a fresh 0..n-1 index.
is_residential = combined_profiles.Industry == 'Residential'
residential_profiles = combined_profiles.loc[is_residential, :].reset_index(drop=True)
non_residential_profiles = combined_profiles.loc[~is_residential, :].reset_index(drop=True)

### Helper functions

In [8]:
def get_proportions(pd_labels):
    """Return the share of each distinct label in ``pd_labels``.

    The result maps every label reported by ``value_counts()`` to its count
    divided by the total number of entries, plus a ``'count'`` key holding
    that total (``pd_labels.shape[0]``).
    """
    n_entries = pd_labels.shape[0]
    counts = pd_labels.value_counts()
    shares = {label: counts.loc[label] / n_entries for label in counts.index}
    shares['count'] = n_entries
    return shares

def get_list_of_proportions(profiles):
    """Run ``get_proportions`` over the 'cluster' column of each 'Building' group.

    The work is delegated to pandas' groupby ``transform`` and its result is
    returned unchanged.

    NOTE(review): presumably every row ends up carrying the proportions dict
    of its building - confirm against the pandas version in use, since
    ``transform`` is being handed a function that returns a dict.
    """
    by_building = profiles.groupby('Building')
    cluster_groups = by_building[['cluster']]
    return cluster_groups.transform(get_proportions)

### Core plotting settings and functions

In [9]:
def encode(series):
    """Label-encode ``series`` and describe the code-to-label mapping.

    Returns:
        A tuple ``(codes, (code_values, labels))`` where ``codes`` is the
        output of ``LabelEncoder.fit_transform(series)``, ``code_values`` is
        the sorted list of distinct codes, and ``labels`` is
        ``inverse_transform(code_values)``, i.e. the original value behind
        each code.
    """
    enc = LabelEncoder()
    codes = enc.fit_transform(series)
    code_values = sorted(set(codes))
    # A single inverse_transform call is enough to recover the labels.
    labels = enc.inverse_transform(code_values)
    return (codes, (code_values, labels))

def minmax(series):
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1.

    A constant input (max == min) would otherwise divide zero by zero and
    produce NaN everywhere; in that case every element is mapped to 0.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Multiplying by 0.0 yields float zeros with the input's shape/index.
        return (series - lowest) * 0.0
    return (series - lowest) / span

# Fixed RGBA colour (components in 0-1, alpha 1) for each 'Industry' value.
industry_colors = {
    industry: (red / 255, green / 255, blue / 255, 1)
    for industry, (red, green, blue) in (
        ('Education', (24, 158, 73)),
        ('Government', (247, 145, 47)),
        ('Residential', (21, 118, 187)),
        ('Others', (50, 50, 50)),
    )
}

# Fixed RGBA colour (components in 0-1, alpha 1) for each 'Climatezone' value,
# written as 0-255 RGB and scaled while the table is built.
climatezone_colors = {
    zone: tuple(channel / 255 for channel in rgb) + (1,)
    for zone, rgb in (
        (1, (165, 15, 15)),
        (2, (242, 101, 34)),
        (3, (76, 156, 81)),
        (4, (89, 142, 189)),
        (5, (0, 63, 100)),
    )
}

def color(i, n=3, name='Blues', categorical=True, specific=None):
    """Return the colour for value ``i``.

    Args:
        i: Key into a fixed colour table when ``specific`` is set; otherwise
            the value handed to the colormap.
        n: Divisor used in categorical mode.
        name: Matplotlib colormap name.
        categorical: If true the colormap is sampled at ``(i + 1) / n``;
            otherwise ``i`` is passed to the colormap as-is.
        specific: ``'Industry'`` or ``'Climatezone'`` to look ``i`` up in
            ``industry_colors`` / ``climatezone_colors`` instead of using a
            colormap.

    Returns:
        The table entry, or whatever the colormap returns for the sampled
        position.
    """
    if specific == 'Industry':
        return industry_colors[i]
    if specific == 'Climatezone':
        return climatezone_colors[i]

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap has been removed
    # from recent matplotlib releases.
    cmap = plt.get_cmap(name)
    position = (i + 1) / n if categorical else i
    return cmap(position)

# Colormap name for each metadata column, looked up by the column's position
# among the metadata columns in generate_heatmap2.
meta_colors = ['Set1','tab20', 'seismic', 'Greys', 'hot']

def generate_heatmap2(df, k=3, square=False, individual=True, add_rows=False, dominant_to_left=False):
    """Render a table of cluster proportions as an RGBA image array.

    The positional indexing below expects this column layout: three leading
    columns that are not drawn, then the metadata columns, then a ``k2``
    column, then ``k`` proportion columns. Each data row becomes one image
    row: a horizontal bar of width ``N`` split into ``k`` coloured segments
    (boundaries are the cumulative proportions times ``N``, rounded), followed
    from column ``N + 1`` on by one fixed-width colour strip per metadata
    column.

    Args:
        df: Input table; it is copied before any modification.
        k: Number of trailing proportion columns.
        square: If true, ``N`` is derived from the row count
            (``rows / 11 * 5``) and the metadata strips together take about
            70% of ``N``; otherwise ``N`` is 50 and each strip is 50 wide.
        individual: If true, the image is cut at every row whose first pixel
            has a red channel of exactly 0 and the pieces are returned.
        add_rows: If true, one blank row is left each time ``k2`` differs from
            the previous row's ``k2``.
        dominant_to_left: If true, each row's segments are ordered from the
            largest to the smallest proportion, each keeping the colour of the
            cluster it came from.

    Returns:
        ``[pieces, all_handles]`` (a list) when ``individual`` is true,
        otherwise the tuple ``(heatmap_mat, all_handles)``. ``all_handles``
        holds one list of legend patches for each metadata column that is
        'Industry', 'Climatezone' or string-valued.
    """
    meta_df = df.iloc[:, 3:-k-1]
    n_meta = meta_df.shape[1]
    all_handles = []

    if square:
        N = int(df.shape[0] / 11 * 5)
        base_width = int(N * 0.7)
        # Round up to the next multiple of n_meta so that every strip gets the
        # same integer width.
        meta_width_total = base_width + (n_meta - (base_width % n_meta))
        meta_width = int(meta_width_total / n_meta)
    else:
        N = 50
        meta_width = 50
        meta_width_total = meta_width * n_meta

    df = df.copy()

    # One list of per-row colours for every metadata column.
    meta_lists = [[] for _ in range(n_meta)]
    for j in range(n_meta):
        column_name = meta_df.columns[j]
        column = meta_df.iloc[:, j]
        if column_name == 'Industry':
            meta_lists[j] = [color(val, name=meta_colors[j], specific='Industry') for val in column]
            all_handles.append([mpatches.Patch(color=industry_colors[c], label=c) for c in industry_colors])
        elif column_name == 'Climatezone':
            meta_lists[j] = [color(val, name=meta_colors[j], specific='Climatezone') for val in column]
            all_handles.append([mpatches.Patch(color=climatezone_colors[c], label=c) for c in climatezone_colors])
        elif isinstance(meta_df.iloc[0, j], str):
            # Other string columns: label-encode and spread over the colormap.
            vals, transform_map = encode(column)
            max_val = vals.max()
            meta_lists[j] = [color(val, max_val+1, name=meta_colors[j]) for val in vals]
            all_handles.append([
                mpatches.Patch(color=color(code, max_val+1, name=meta_colors[j]), label=label)
                for code, label in zip(*transform_map)
            ])
        else:
            vals = minmax(column)
            # NOTE(review): color() runs here with its defaults
            # (categorical=True, n=3), so a min-max value v is sampled at
            # (v + 1) / 3 of the colormap - confirm this compression is
            # intended rather than categorical=False.
            meta_lists[j] = [color(val, name=meta_colors[j]) for val in vals]

    # DataFrame.values replaces as_matrix(), which newer pandas no longer has.
    ori_mat = df.values

    if dominant_to_left:
        color_indices = []
        for i in range(ori_mat.shape[0]):
            proportions = ori_mat[i, -k:]
            indexed_proportions = [(value, idx) for idx, value in enumerate(proportions)]
            sorted_proportions = sorted(indexed_proportions, key=lambda x: x[0], reverse=True)
            # Remember which cluster each reordered segment belongs to.
            color_indices.append([p[1] for p in sorted_proportions])
            df.iloc[i, -k:] = [p[0] for p in sorted_proportions]

    # convert to cumulative
    for i in range(1, k):
        df.iloc[:, -k+i] += df.iloc[:, -k+i-1]

    mat = df.values

    # Work out each row's vertical offset first. The running counter is kept
    # separate from the boolean `add_rows` flag so the flag is not clobbered.
    row_offsets = []
    gap_count = 0
    current_k2 = df['k2'].iloc[0]
    for k2 in mat[:, -k-1]:
        if k2 != current_k2:
            current_k2 = k2
            if add_rows:
                gap_count += 1
        row_offsets.append(gap_count)

    # Keep the k - 1 spare rows for add_rows=True, but grow if the k2 column
    # changes more often than that so the fill below cannot run out of rows.
    extra_rows = max(k - 1, gap_count) if add_rows else 0
    heatmap_mat = np.zeros((mat.shape[0] + extra_rows, N + 1 + meta_width_total, 4))

    for i in range(mat.shape[0]):
        target_row = i + row_offsets[i]
        stats = [0] + [int(round(val)) for val in mat[i, -k:] * N]

        for j in range(len(stats) - 1):
            cluster = color_indices[i][j] if dominant_to_left else j
            heatmap_mat[target_row, stats[j]:stats[j+1], :] = color(cluster, k)

        for j, strip_colors in enumerate(meta_lists):
            left = N + 1 + j * meta_width
            heatmap_mat[target_row, left:left + meta_width, :] = strip_colors[i]

    if individual:
        # Rows whose first pixel has a zero red channel act as separators.
        breaks = [-1] + [v[0] for v in np.argwhere(heatmap_mat[:, 0, 0] == 0)] + [heatmap_mat.shape[0]]
        return [[heatmap_mat[breaks[i]+1:breaks[i+1], :] for i in range(len(breaks)-1)], all_handles]

    return (heatmap_mat, all_handles)

def proc_continuous_meta(col):
    """Clamp ``col`` to its 10th-90th percentile range.

    Values above the 90th percentile are replaced by it and values below the
    10th percentile are replaced by that one. The result is a new
    ``pd.Series`` built from a plain list, so it carries a default index
    rather than the index of ``col``.
    """
    low, high = np.percentile(col, 10), np.percentile(col, 90)
    clamped = [
        high if val > high else (low if val < low else val)
        for val in col.tolist()
    ]
    return pd.Series(clamped)

def reverse_data(col):
    """Mirror the values of ``col`` within its own range.

    Each value ``v`` becomes ``min + (max - v)``, so the largest value turns
    into the smallest and vice versa. The result is a new ``pd.Series`` with a
    default index.
    """
    lowest, highest = col.min(), col.max()
    return pd.Series([lowest + (highest - val) for val in col.tolist()])

In [10]:
def create_heatmap_df_mix(heatmap_df, k):
    """Order buildings for plotting and tidy the metadata columns.

    Rows are first sorted by the proportion of cluster 0 (descending). Then,
    for each following cluster, the rows in which every earlier cluster is
    below 0.5 are re-sorted among themselves by that cluster's proportion
    (descending). Whole rows are moved, so each building's metadata stays on
    the same row as its proportions.

    Afterwards 'Sqm' and 'EUI' are passed through ``proc_continuous_meta``,
    'Subindustry' and 'Timezone' are dropped, and the metadata columns are put
    in the order Industry, PSU, Climatezone, Sqm, EUI.

    Args:
        heatmap_df: Table whose cluster proportion columns are labelled
            ``0 .. k-1``; it is not modified.
        k: Number of clusters.

    Returns:
        A new, re-indexed DataFrame.
    """
    heatmap_df_mix = heatmap_df.sort_values(0, ascending=False)
    all_columns = heatmap_df_mix.columns.tolist()

    # Positional mask of rows not yet claimed by an earlier cluster.
    filter_vals = [True] * heatmap_df.shape[0]
    for current, following in zip(range(k - 1), range(1, k)):
        below_half = (heatmap_df_mix[current] < .5).tolist()
        filter_vals = [keep and low for keep, low in zip(filter_vals, below_half)]

        df_slice = heatmap_df_mix.loc[filter_vals, :].sort_values(following, ascending=False)
        # Write the re-sorted rows back into the masked positions, column by
        # column, covering every column and not just the proportions.
        for col in all_columns:
            series = heatmap_df_mix.loc[:, col].copy()
            series[filter_vals] = df_slice.loc[:, col].tolist()
            heatmap_df_mix[col] = series

    heatmap_df_mix.reset_index(drop=True, inplace=True)

    heatmap_df_mix['Sqm'] = proc_continuous_meta(heatmap_df_mix['Sqm'])
    heatmap_df_mix['EUI'] = proc_continuous_meta(heatmap_df_mix['EUI'])

    # reorder columns: keep the first three, then the metadata in a fixed
    # order, then everything after the metadata block
    new_columns = heatmap_df_mix.columns.tolist()
    new_columns.remove('Subindustry')
    new_columns.remove('Timezone')
    reordered_fields = ['Industry', 'PSU', 'Climatezone', 'Sqm', 'EUI']
    new_columns = new_columns[:3] + reordered_fields + new_columns[3+len(reordered_fields):]

    return heatmap_df_mix[new_columns]

In [11]:
# optional (further investigation)
def sort_by_second_level(heatmap_df_mix, k):
    """Re-sort each dominant-cluster group by the next cluster's proportion.

    For every cluster ``i`` (with the last cluster wrapping around to 0), the
    rows whose proportion of ``i`` is at least 0.5 are re-sorted among
    themselves by the proportion of the following cluster (descending). Whole
    rows are moved, so the other columns stay attached to their proportions.

    Args:
        heatmap_df_mix: Table whose cluster proportion columns are labelled
            ``0 .. k-1``. It is modified in place and re-indexed.
        k: Number of clusters.

    Returns:
        The same ``heatmap_df_mix`` object.
    """
    all_columns = heatmap_df_mix.columns.tolist()
    cluster_cycle = list(range(k)) + [0]  # loop back

    for current, following in zip(cluster_cycle[:-1], cluster_cycle[1:]):
        resort_vals = (heatmap_df_mix[current] >= .5).tolist()

        df_slice = heatmap_df_mix.loc[resort_vals, :].sort_values(following, ascending=False)

        # Write the re-sorted rows back into the masked positions, column by
        # column, covering every column and not just the proportions.
        for col in all_columns:
            series = heatmap_df_mix.loc[:, col].copy()
            series[resort_vals] = df_slice.loc[:, col].tolist()
            heatmap_df_mix[col] = series

    heatmap_df_mix.reset_index(drop=True, inplace=True)

    return heatmap_df_mix

### Plotting workflow

In [18]:
def plot_heatmap(profiles, k, algo, labels_dir, save_dir, option='show', second_level=False):
    """Build and show or save the cluster-proportion heatmap for one setting.

    Steps: load the cluster labels from
    ``./<labels_dir>/<algo>/params[k=<k>].npy``, compute per-building cluster
    proportions, keep buildings whose proportions dict reports a ``'count'``
    of at least 30, group them with a second KMeans (one cluster per distinct
    label), order the rows with ``create_heatmap_df_mix`` (and optionally
    ``sort_by_second_level``), and draw the result of ``generate_heatmap2``.

    Args:
        profiles: Profile table. It must provide the columns selected below
            (Dataset, Building, entropy, Industry, PSU, Sqm, Subindustry,
            Timezone, EUI, Climatezone). The 'cluster' and 'proportions'
            columns are written onto this object.
        k: Number of clusters of the labelling to load.
        algo: Algorithm name, used as a sub-directory for labels and output.
        labels_dir: Directory holding the saved labels.
        save_dir: Directory the image is written under when saving.
        option: ``'show'`` to display the figure, ``'save'`` to write
            ``./<save_dir>/<algo>/k_<k>.png`` (with a `` (second level)``
            suffix when ``second_level`` is true).
        second_level: Whether to apply ``sort_by_second_level``.

    Raises:
        ValueError: If ``option`` is neither ``'show'`` nor ``'save'``.
    """
    # Validate before doing any work so a bad option cannot leave a figure open.
    if option not in ('show', 'save'):
        raise ValueError('Invalid option: %s' % option)

    labels_path = './%s/%s/params[k=%d].npy' % (labels_dir, algo, k)
    profiles['cluster'] = np.load(labels_path)
    profiles['proportions'] = get_list_of_proportions(profiles)

    building_columns = ['Dataset', 'Building', 'proportions', 'entropy', 'Industry', 'PSU', 'Sqm', 'Subindustry', 'Timezone', 'EUI', 'Climatezone']
    final_buildings = profiles.drop_duplicates(['Dataset', 'Building'])[building_columns]
    final_buildings = final_buildings.reset_index(drop=True)

    has_enough_rows = final_buildings.proportions.map(lambda x: x['count']) >= 30
    filtered_buildings = final_buildings.loc[has_enough_rows, :].reset_index(drop=True)

    cluster_labels = profiles.cluster.unique()
    cluster_labels.sort()

    # One row per building, one column per cluster label; labels a building
    # never received count as 0.
    proportions_data = [
        [dic.get(label, 0) for label in cluster_labels]
        for dic in filtered_buildings.proportions.tolist()
    ]
    proportions_df = pd.DataFrame(proportions_data)

    k2_model = KMeans(n_clusters=proportions_df.shape[1])
    k2_model.fit(proportions_df)

    heatmap_df = filtered_buildings.drop('proportions', axis=1)
    heatmap_df['k2'] = k2_model.labels_
    heatmap_df = pd.concat((heatmap_df, proportions_df), axis=1)
    heatmap_df = heatmap_df.sort_values(['k2', 'entropy'], ascending=[True, True])

    heatmap_df_mix = create_heatmap_df_mix(heatmap_df, k)
    if second_level:
        heatmap_df_mix = sort_by_second_level(heatmap_df_mix, k)

    color_name = 'Blues'
    width = 16

    heatmap, plot_handles = generate_heatmap2(heatmap_df_mix, individual=False, square=True, k=k, dominant_to_left=True)
    # Figure height follows the image's aspect ratio.
    plt.figure(figsize=(width, width/heatmap.shape[1]*heatmap.shape[0]), dpi= 80, facecolor='w', edgecolor='w')
    plt.imshow(heatmap, cmap=color_name, interpolation='nearest')

    plt.xticks([])
    plt.yticks([])

    if option == 'show':
        plt.show()
    else:
        out_dir = './%s/%s' % (save_dir, algo)
        # Create the target directory so savefig does not fail on a fresh checkout.
        os.makedirs(out_dir, exist_ok=True)
        suffix = ' (second level)' if second_level else ''
        plt.savefig('%s/k_%d%s.png' % (out_dir, k, suffix), bbox_inches='tight')
    plt.close()

### Generate plots

In [19]:
# One entry per building subset: the profile frame to plot, the directory its
# cluster labels are read from, and the directory its heatmaps are saved to.
settings_list = [
    {'profiles': frame, 'labels_dir': labels, 'save_dir': target}
    for frame, labels, target in (
        (combined_profiles, 'final_labels', 'heatmaps/all'),
        (residential_profiles, 'residential_labels', 'heatmaps/residential'),
        (non_residential_profiles, 'non_residential_labels', 'heatmaps/non_residential'),
    )
]
# Algorithm names and cluster counts to iterate over.
algo_list = ['kmeans', 'bisectingkmeans', 'gmm']
k_range = range(2, 11)

In [1]:
# Save a heatmap for every combination of sorting mode, building subset,
# algorithm and cluster count.
for second_level in (True, False):
    for settings in settings_list:
        data, labels_dir, save_dir = (
            settings[key] for key in ('profiles', 'labels_dir', 'save_dir')
        )

        for heatmap_algo in algo_list:
            for k in k_range:
                plot_heatmap(
                    data, k, heatmap_algo, labels_dir, save_dir,
                    option='save', second_level=second_level,
                )

### Save plot legends

In [649]:
# Write one legend image per handle list, numbered from 1.
# NOTE(review): in the visible code `plot_handles` is only assigned inside
# plot_heatmap - confirm it exists at module level before running this cell.
counter = 0

for counter, handles in enumerate(plot_handles, start=1):
    plt.figure(figsize=(16,.02), dpi= 80, facecolor='w', edgecolor='w')
    lgd = plt.legend(
        handles=handles,
        bbox_to_anchor=(0., 1.02, 1., .102),
        loc=3,
        ncol=min(5, len(handles)),
        mode="expand",
        borderaxespad=0.,
    )
    plt.xticks([])
    plt.yticks([])
    plt.savefig('./heatmaps/legend_%d.png' % counter, bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close()

In [709]:
from numpy.random import randn

# Horizontal colourbar spanning the EUI range of all profiles, drawn with the
# 'hot' colormap and saved as legend_4.
eui_low = combined_profiles.EUI.min()
eui_high = combined_profiles.EUI.max()

plt.figure(figsize=(16,6), dpi= 80, facecolor='w', edgecolor='w')
data = np.array([eui_low, eui_high]).reshape(1,-1)
cax = plt.imshow(data, interpolation='nearest', cmap=cm.hot)
cbar = plt.colorbar(cax, ticks=[eui_low, eui_high], orientation='horizontal', aspect=40)
plt.savefig('./heatmaps/legend_4.png', bbox_inches='tight')
plt.close()

In [708]:
from numpy.random import randn

# Horizontal colourbar spanning the Sqm range of all profiles, drawn with the
# 'Greys' colormap and saved as legend_5.
sqm_low = combined_profiles.Sqm.min()
sqm_high = combined_profiles.Sqm.max()

plt.figure(figsize=(16,6), dpi= 80, facecolor='w', edgecolor='w')
data = np.array([sqm_low, sqm_high]).reshape(1,-1)
cax = plt.imshow(data, interpolation='nearest', cmap=cm.Greys)
cbar = plt.colorbar(cax, ticks=[sqm_low, sqm_high], orientation='horizontal', aspect=40)
plt.savefig('./heatmaps/legend_5.png', bbox_inches='tight')
plt.close()