# Dominant Cluster Bar Chart

## Import libraries

In [1]:
# Built-in libraries
import os
import re
import time
from datetime import datetime
import pytz
from itertools import compress
from math import log
from math import log2
import random

# NumPy, SciPy and Pandas
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import iqr
import pandas as pd

# Scikit-Learn
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import sqeuclidean
from sklearn.neighbors import NearestNeighbors

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
%matplotlib inline

# JoyPy
import joypy

# Workalendar
from workalendar.europe import Switzerland
from workalendar.europe import UnitedKingdom
from workalendar.usa import Colorado
from workalendar.usa import NewYork
from workalendar.usa import California
from workalendar.usa import Arizona
from workalendar.usa import Illinois
from workalendar.asia import Singapore
from workalendar.oceania import WesternAustralia

In [2]:
combined_profiles = pd.read_csv('final_profiles.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Separate residential and non-residential buildings
is_residential = combined_profiles.Industry == 'Residential'
residential_profiles = combined_profiles.loc[is_residential, :]
non_residential_profiles = combined_profiles.loc[~is_residential, :]

# reset index
residential_profiles.reset_index(inplace = True, drop = True)
non_residential_profiles.reset_index(inplace = True, drop = True)

### helper functions

In [4]:
def get_proportions(pd_labels):
    result = {}
    vals = pd_labels.value_counts()
    total = pd_labels.shape[0]
    for i in vals.index:
        result[i] = (vals.loc[i]/total)
    result['count'] = total
    return result

def get_list_of_proportions(profiles):
    return profiles.groupby('Building')[['cluster']].transform(get_proportions)

In [5]:
def build_df(profiles, k, algo, labels_dir):
    profiles['cluster'] = np.load('./%s/%s/params[k=%d].npy' % (labels_dir, algo, k))
    profiles['proportions'] = get_list_of_proportions(profiles)
    
    final_buildings = profiles.drop_duplicates(['Dataset', 'Building'])[['Dataset', 'Building', 'proportions', 'Industry', 'PSU', 'Sqm', 'Subindustry', 'Timezone', 'EUI', 'Climatezone']]
    final_buildings.reset_index(drop=True, inplace=True)
    
    filtered_buildings = final_buildings.loc[final_buildings.proportions.map(lambda x: x['count']) >= 30, :]
    filtered_buildings.reset_index(drop=True, inplace=True)
    
    cluster_labels = profiles.cluster.unique()
    cluster_labels.sort()
    
    proportions_data = []

    for dic in filtered_buildings.proportions.tolist():
        proportions_data.append([])
        for label in cluster_labels:
            if label not in dic:
                proportions_data[-1].append(0)
            else:
                proportions_data[-1].append(dic[label])

    proportions_df = pd.DataFrame(proportions_data)
    
    buildings_df = filtered_buildings.drop('proportions', axis=1)
    return (buildings_df, proportions_df)

### View buildings_df and proportions_df

In [17]:
buildings_df, proportions_df = build_df(combined_profiles, 5, 'kmeans', 'final_labels')

In [18]:
buildings_df.head()

Unnamed: 0,Dataset,Building,Industry,PSU,Sqm,Subindustry,Timezone,EUI,Climatezone
0,pecan,3831,Residential,Single_family_house,105.909466,,America/Chicago,45.627032,2
1,pecan,3938,Residential,Single_family_house,104.051405,,America/Chicago,20.858344,2
2,pecan,5371,Residential,Single_family_house,87.050148,,America/Chicago,91.462989,2
3,pecan,9775,Residential,Single_family_house,130.528771,,America/Chicago,40.263767,2
4,pecan,5218,Residential,Single_family_house,193.14542,,America/Chicago,35.50675,2


In [19]:
proportions_df.head()

Unnamed: 0,0,1,2,3,4
0,0.207012,0.244853,0.42571,0.064552,0.057874
1,0.421888,0.152882,0.408521,0.000835,0.015873
2,0.063063,0.287087,0.487087,0.042643,0.12012
3,0.097458,0.095339,0.754944,0.014124,0.038136
4,0.290391,0.272959,0.350765,0.031888,0.053997


### Plot stacked bar charts for dominant clusters of buildings by specific fields

In [10]:
from plot_functions.stacked import plot_stacked

In [32]:
def get_dominant_clusters(proportions_mat):
    dominant_flag = np.max(proportions_mat, axis=1) > .5
    dominant_clusters = np.argmax(proportions_mat, axis=1)
    dominant_clusters[~dominant_flag] = -1
    return dominant_clusters

def plot_buildings(buildings, proportions, field, proc_plot):
    buildings['dominant_cluster'] = get_dominant_clusters(proportions.as_matrix())
    plot_stacked(field, buildings, by='dominant_cluster', proc_plot=proc_plot)

In [37]:
# Specify the root directory for all dominant cluster plots here
plot_root = './dominant_cluster'

fields = ['Industry', 'PSU', 'Timezone', 'Climatezone']
algorithms = ['kmeans', 'bisectingkmeans', 'gmm']
k_range = range(2,11)
settings_list = [
    {
        'name': 'combined_profiles',
        'labels_dir': 'final_labels',
        'profiles': combined_profiles
    },
    {
        'name': 'residential_profiles',
        'labels_dir': 'residential_labels',
        'profiles': residential_profiles
    },
    {
        'name': 'non_residential_profiles',
        'labels_dir': 'non_residential_labels',
        'profiles': non_residential_profiles
    }
]

In [39]:
for settings in settings_list:
    profiles_name = settings['name']
    labels_dir = settings['labels_dir']
    profiles = settings['profiles']
    
    for algo in algorithms:
        for k in k_range:
            buildings_df, proportions_df = build_df(profiles, k, algo, labels_dir)
            
            for field in fields:
                save_dir = '%s/%s/%s/k%d' % (plot_root , profiles_name, algo, k)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                
                def proc_plot(plt):
                    plt.savefig('%s/%s.png' % (save_dir, field), bbox_inches='tight')

                plot_buildings(buildings_df, proportions_df, field, proc_plot)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
