# Dominant Cluster Bar Chart

## Import libraries

In [45]:
# Built-in libraries
import os
import re
import time
from datetime import datetime
import pytz
from itertools import compress
from math import log
from math import log2
import random

# NumPy, SciPy and Pandas
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import iqr
import pandas as pd

# Scikit-Learn
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import sqeuclidean
from sklearn.neighbors import NearestNeighbors

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
%matplotlib inline

# JoyPy
import joypy

# Workalendar
from workalendar.europe import Switzerland
from workalendar.europe import UnitedKingdom
from workalendar.usa import Colorado
from workalendar.usa import NewYork
from workalendar.usa import California
from workalendar.usa import Arizona
from workalendar.usa import Illinois
from workalendar.asia import Singapore
from workalendar.oceania import WesternAustralia

In [46]:
combined_profiles = pd.read_csv('final_profiles.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [47]:
# Separate residential and non-residential buildings
is_residential = combined_profiles.Industry == 'Residential'
residential_profiles = combined_profiles.loc[is_residential, :]
non_residential_profiles = combined_profiles.loc[~is_residential, :]

# reset index
residential_profiles.reset_index(inplace = True, drop = True)
non_residential_profiles.reset_index(inplace = True, drop = True)

### helper functions

In [48]:
def get_proportions(pd_labels):
    result = {}
    vals = pd_labels.value_counts()
    total = pd_labels.shape[0]
    for i in vals.index:
        result[i] = (vals.loc[i]/total)
    result['count'] = total
    return result

def get_list_of_proportions(profiles):
    return profiles.groupby('Building')[['cluster']].transform(get_proportions)

In [49]:
def build_df(profiles, k, algo, labels_dir):
    profiles['cluster'] = np.load('./%s/%s/params[k=%d].npy' % (labels_dir, algo, k))
    profiles['proportions'] = get_list_of_proportions(profiles)
    
    final_buildings = profiles.drop_duplicates(['Dataset', 'Building'])[['Dataset', 'Building', 'proportions', 'Industry', 'PSU', 'Sqm', 'Subindustry', 'Timezone', 'EUI', 'Climatezone', 'EUI Binned', 'Sqm Binned']]
    final_buildings.reset_index(drop=True, inplace=True)
    
    filtered_buildings = final_buildings.loc[final_buildings.proportions.map(lambda x: x['count']) >= 30, :]
    filtered_buildings.reset_index(drop=True, inplace=True)
    
    cluster_labels = profiles.cluster.unique()
    cluster_labels.sort()
    
    proportions_data = []

    for dic in filtered_buildings.proportions.tolist():
        proportions_data.append([])
        for label in cluster_labels:
            if label not in dic:
                proportions_data[-1].append(0)
            else:
                proportions_data[-1].append(dic[label])

    proportions_df = pd.DataFrame(proportions_data)
    
    buildings_df = filtered_buildings.drop('proportions', axis=1)
    return (buildings_df, proportions_df)

### Plot stacked bar charts for dominant clusters of buildings by specific fields

In [50]:
from plot_functions.stacked import plot_stacked

In [51]:
def get_dominant_clusters(proportions_mat):
    dominant_flag = np.max(proportions_mat, axis=1) > .5
    dominant_clusters = np.argmax(proportions_mat, axis=1)
    dominant_clusters[~dominant_flag] = -1
    return dominant_clusters

def plot_buildings(buildings, proportions, field, proc_plot, plot_from_df=None):
    buildings['dominant_cluster'] = get_dominant_clusters(proportions.as_matrix())
    plot_stacked(field, buildings, by='dominant_cluster', proc_plot=proc_plot, plot_from_df=plot_from_df)

In [52]:
def get_counts(df):
    return df.as_matrix().tolist()

def get_label_counts(counts):
    return [sum(col) for col in transpose(counts)]

def convert_to_proportions(counts):
    results = []
    for row in counts:
        row_sum = sum(row)
        results.append([e / row_sum if row_sum != 0 else 0 for e in row])
    return results

def convert_to_cumulative(proportions):
    if len(proportions) == 0:
        return []
    results = [proportions[0]]
    for i in range(1, len(proportions)):
        results.append(proportions[i] + results[i-1])
    return results

def transpose(array):
    if len(array) == 0:
        return []
    return [[row[i] for row in array] for i in range(len(array[0]))]

def get_plot_sequences(proportions_list):
    cumulative = [convert_to_cumulative(prop) for prop in proportions_list]
    return transpose(cumulative)

colors = [(0,0,0, 1), (247/255,145/255,47/255, 1), (21/255,118/255,187/255, 1)]

# color func takes current group and index and returns a color
def plot_from_df(plt, df, color_func=None):
    by = [str(e) + ' ' for e in df.index.tolist()]
    groups = df.columns.tolist()
    counts = get_counts(df)
    label_counts = get_label_counts(counts)
    proportions = convert_to_proportions(counts)
    plot_sequences = get_plot_sequences(proportions)
    for i in range(len(plot_sequences)-1, -1, -1):
        seq = plot_sequences[i]
        if isinstance(label_counts[i], int):
            label_text = '%s (%d)' % (groups[i], label_counts[i])
        else:
            label_text = '%s (%.2f)' % (groups[i], label_counts[i])
        plt.barh(by, seq, label=label_text, color=color_func(groups, i) if color_func else None)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

* We want to inject color options based on field

In [53]:
industry_colors = {
    'Education': (24/255,158/255,73/255, 1),
    'Government': (247/255,145/255,47/255, 1),
    'Residential': (21/255,118/255,187/255, 1),
    'Others': (50/255,50/255,50/255, 1)
}

climatezone_colors = {
    1: (165/255, 15/255, 15/255, 1),
    2: (242/255, 101/255, 34/255, 1),
    3: (76/255, 156/255, 81/255, 1),
    4: (89/255, 142/255, 189/255, 1),
    5: (0, 63/255, 100/255, 1)
}

def get_color_func(cm, cmap_name):
    cmap = cm.get_cmap(cmap_name)
    def color_func(groups, i):
        return cmap((i+1)/len(groups))
    return color_func

def generate_plot_fn(field):
    def return_fn(plt, cm, pd, df):
        # sort df columns by names if columns names are intervals
        if re.search('Binned', field):
            df = df[sorted(df.columns, key=lambda x: float(re.search('(?<=\().*(?=,)', x).group(0)))]
        
        if field == 'Industry':
            color_func = lambda groups, i: industry_colors[groups[i]]
        elif field == 'Climatezone':
            color_func = lambda groups, i: climatezone_colors[groups[i]]
        elif field == 'EUI Binned':
            color_func = get_color_func(cm, 'hot')
        elif field == 'Sqm Binned':
            color_func = get_color_func(cm, 'Greys')
        elif field == 'PSU':
            color_func = get_color_func(cm, 'tab20')
        else:
            color_func = None
        plot_from_df(plt, df, color_func)

    return return_fn

### View buildings_df and proportions_df
* For testing purposes

In [54]:
# buildings_df, proportions_df = build_df(combined_profiles, 3, 'kmeans', 'final_labels')

In [55]:
# field_name = 'Industry'
# def proc_plot(plt):
#     plt.show()

# plot_buildings(buildings_df, proportions_df, field_name, proc_plot, generate_plot_fn(field_name))

In [56]:
# def proc_plot(plt):
#     plt.show()

# plot_buildings(buildings_df, proportions_df, 'PSU', proc_plot, plot_from_df)

In [57]:
# plot_from_df(plt, pd.DataFrame([[1, 2, 7], [3, 4, 2], [1, 0, 18]], index=['1', '2', '3'], columns=['a', 'b', 'c']))

### Actual Plotting

In [58]:
# Specify the root directory for all dominant cluster plots here
plot_root = './dominant_cluster'

fields = ['Industry', 'PSU', 'Timezone', 'Climatezone', 'EUI Binned', 'Sqm Binned']
algorithms = ['kmeans', 'bisectingkmeans', 'gmm']
k_range = range(2,11)
settings_list = [
    {
        'name': 'combined_profiles',
        'labels_dir': 'final_labels',
        'profiles': combined_profiles
    },
    {
        'name': 'residential_profiles',
        'labels_dir': 'residential_labels',
        'profiles': residential_profiles
    },
    {
        'name': 'non_residential_profiles',
        'labels_dir': 'non_residential_labels',
        'profiles': non_residential_profiles
    }
]

In [59]:
for settings in settings_list:
    profiles_name = settings['name']
    labels_dir = settings['labels_dir']
    profiles = settings['profiles']
    
    for algo in algorithms:
        for k in k_range:
            buildings_df, proportions_df = build_df(profiles, k, algo, labels_dir)
            
            for field in fields:
                save_dir = '%s/%s/%s/k%d' % (plot_root , profiles_name, algo, k)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                
                def proc_plot(plt):
                    plt.savefig('%s/%s.png' % (save_dir, field), bbox_inches='tight')

                plot_buildings(buildings_df, proportions_df, field, proc_plot, generate_plot_fn(field))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
