# Tufte Table

## Import libraries for analysis

In [1]:
# Built-in libraries
import time

# NumPy, SciPy and Pandas
import numpy as np
import pandas as pd

# Matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load every building's profile row from the CSV in the working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the recorded cell output looks like the tail of a pandas
# DtypeWarning (mixed-type columns) -- confirm, and prefer an explicit
# `dtype=` mapping once the offending columns are known.
combined_profiles = pd.read_csv('final_profiles.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Separate residential and non-residential buildings.
# Each subset is built as an independent frame with a fresh 0..n-1 index
# (reset_index returns a new object) instead of resetting the index of a
# `.loc` slice in place; a 'cluster' column is assigned to these frames later,
# and assigning into a slice is what triggers pandas' SettingWithCopy warning.
is_residential = combined_profiles.Industry == 'Residential'
residential_profiles = combined_profiles.loc[is_residential, :].reset_index(drop=True)
non_residential_profiles = combined_profiles.loc[~is_residential, :].reset_index(drop=True)

## Tufte table generation

In [31]:
def save_tufte_data(combined_profiles, plot_order, save_dir,
                    first_profile_col=3, n_profile_cols=24):
    """Save the per-cluster mean profile for one plot cell.

    For every distinct value of the ``cluster`` column (in sorted order) the
    rows belonging to that cluster are averaged column-wise over the profile
    columns, and the stacked means are written to
    ``<save_dir>/order_<plot_order>.npy``.

    Parameters
    ----------
    combined_profiles : pandas.DataFrame
        Must contain a ``cluster`` column; the profile values are taken
        positionally from ``first_profile_col`` onwards.
    plot_order : int
        Number used in the output file name.
    save_dir : str
        Existing directory the ``.npy`` file is written to.
    first_profile_col : int, optional
        Positional index of the first profile column (default 3, the
        previously hard-coded value).
    n_profile_cols : int, optional
        Number of consecutive profile columns (default 24, the previously
        hard-coded value).
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` works on old
    # and new pandas alike.
    last_profile_col = first_profile_col + n_profile_cols
    profile_values = (
        combined_profiles.iloc[:, first_profile_col:last_profile_col]
        .values.astype(float)
    )
    cluster_labels = combined_profiles['cluster'].values
    cluster_ids = sorted(combined_profiles['cluster'].unique())

    # One mean row per cluster, ordered by cluster id.
    results = [
        profile_values[cluster_labels == cid].mean(axis=0)
        for cid in cluster_ids
    ]

    np.save(save_dir + '/order_%d.npy' % plot_order, np.array(results))

In [23]:
def get_labels(plot_order, labels_dir):
    """Load the saved label array for one cell of the three-column plot grid.

    ``plot_order`` is a 1-based, row-major cell number. The column selects the
    algorithm sub-directory (kmeans, bisectingkmeans, gmm) and the row selects
    k, with the first row being k=2. The array is read from
    ``./<labels_dir>/<algorithm>/s1k<k>.npy``.
    """
    algorithms = ('kmeans', 'bisectingkmeans', 'gmm')
    grid_row, grid_col = divmod(plot_order - 1, len(algorithms))
    n_clusters = grid_row + 2
    # An earlier file-naming scheme was 'params[k=%d].npy'; 's1k%d.npy' is the
    # one in use.
    label_path = './%s/%s/s1k%d.npy' % (labels_dir, algorithms[grid_col], n_clusters)
    return np.load(label_path)

In [9]:
# Switch order vertically for bisecting k-means
def is_row_in(row, rows, TOL):
    """Return True if ``row`` is within tolerance of at least one row of ``rows``.

    Closeness is the sum of squared element-wise differences; a candidate
    matches when that sum is strictly below ``TOL``. An empty ``rows`` yields
    False.
    """
    # Short-circuits on the first match; the previous version also built an
    # unused list copy of `rows` on every call.
    return any(((row - candidate) ** 2).sum() < TOL for candidate in rows)

def compare(before, after, TOL):
    """Return the indices of the rows of ``after`` that have no match in ``before``.

    A row counts as matched when ``is_row_in`` finds a row of ``before`` within
    tolerance ``TOL``. Indices are returned in ascending order.
    """
    return [
        idx
        for idx in range(after.shape[0])
        if not is_row_in(after[idx], before, TOL)
    ]

In [10]:
def bisect_tol(func, a, b, target, max_iter=1000):
    """Bisect ``[a, b]`` for an ``x`` such that ``func(x) == target``.

    The search assumes ``func`` does not increase over the interval: it moves
    right while ``func(mid) > target > func(b)`` and left while
    ``func(a) > target > func(mid)``.

    Parameters
    ----------
    func : callable
        Function of one number whose result is compared with ``target``.
    a, b : number
        Lower and upper end of the search interval.
    target : number
        Value ``func`` has to return.
    max_iter : int, optional
        Upper bound on the number of bisection steps.

    Returns
    -------
    number
        ``a``, ``b`` or an interior midpoint at which ``func`` equals ``target``.

    Raises
    ------
    Exception
        With the message 'Should not occur' when the target is not bracketed
        by the current interval.
    RuntimeError
        When the interval can no longer shrink, or ``max_iter`` steps pass,
        without hitting ``target`` (for example when ``func`` jumps over the
        target value). The recursive implementation ended in a RecursionError,
        itself a RuntimeError, in that situation.
    """
    fa = func(a)
    fb = func(b)
    if fa == target:
        return a
    if fb == target:
        return b

    for _ in range(max_iter):
        m = (a + b) / 2.0
        fm = func(m)
        if fm == target:
            return m

        # Once the midpoint coincides with an endpoint there is no
        # representable value left strictly between a and b.
        stalled = m == a or m == b

        # Endpoint values are carried along rather than re-evaluated each step.
        if fm > target and fb < target:
            a, fa = m, fm
        elif fm < target and fa > target:
            b, fb = m, fm
        else:
            raise Exception('Should not occur')

        if stalled:
            break

    raise RuntimeError(
        'bisect_tol did not find func(x) == %r within [%r, %r]' % (target, a, b)
    )

In [11]:
def helper(before, after):
    """Return a one-argument function mapping a tolerance to a mismatch count.

    The returned callable gives ``len(compare(before, after, tol))``, i.e. how
    many rows of ``after`` have no match in ``before`` at that tolerance.
    """
    return lambda tol: len(compare(before, after, tol))

In [12]:
# run [1]
def retrieve_thicknesses(labels_dir):
    """Compute a line width per cluster for each of the 27 plot cells.

    For every plot order 1..27 the labels are loaded with ``get_labels`` and
    the size of each cluster is turned into a width: 2.5 plus the cluster
    size's z-score among that cell's cluster sizes, with the z-score clipped
    to [-1.5, 5.5]. When all clusters have the same size the width is 2.5.

    Returns a list with one entry per plot order; each entry lists the widths
    in ascending cluster-id order.
    """
    base_width = 2.5

    def width_for(count, all_counts):
        # Population standard deviation of the cluster sizes in this cell.
        spread = np.std(all_counts)
        if spread == 0:
            return base_width
        z_score = (count - np.mean(all_counts)) / spread
        clipped = max(-1.5, min(z_score, 5.5))
        return base_width + clipped

    thicknesses_data = []
    for plot_order in range(1, 3*9+1):
        labels = get_labels(plot_order, labels_dir)
        counts = pd.Series(labels).value_counts()
        size_by_cluster = dict(zip(counts.index, counts.values))
        sizes = list(size_by_cluster.values())
        thicknesses_data.append(
            [width_for(size_by_cluster[cid], sizes) for cid in sorted(set(labels))]
        )

    return thicknesses_data

In [25]:
# run [2]
def save_tufte_data_from_labels(profiles, labels_dir, save_dir):
    """Write the per-cluster mean profiles for all 27 plot cells.

    For each plot order 1..27 the matching label array is stored in
    ``profiles['cluster']`` (the frame passed in is modified) and
    ``save_tufte_data`` writes the resulting cluster means to ``save_dir``.
    """
    n_plots = 3*9
    for plot_order in range(1, n_plots + 1):
        cluster_labels = get_labels(plot_order, labels_dir)
        profiles['cluster'] = cluster_labels
        save_tufte_data(profiles, plot_order, save_dir=save_dir)

In [15]:
# bisecting k-means re-ordering

# run [3]
# returns new thicknesses_data
def vertical_reordering(thicknesses_data, save_dir):
    """Re-order saved cluster rows so consecutive cells of one column line up.

    Works on plot orders 2, 5, ..., 23 (the second grid column, which
    ``get_labels`` maps to 'bisectingkmeans'). For each such ``i`` the rows
    saved for ``i`` ("before") are compared with those saved for ``i+3``
    ("after"): the one "before" row without a counterpart is overwritten by
    the first unmatched "after" row and the second unmatched "after" row is
    appended. The result is saved back as ``order_<i+3>.npy`` and the
    thickness list for ``i+3`` is permuted to the new row order.

    Parameters
    ----------
    thicknesses_data : list of list
        One thickness list per plot order (index ``plot_order - 1``);
        modified in place.
    save_dir : str
        Directory, relative to the working directory, holding the
        ``order_<n>.npy`` files.

    Returns
    -------
    list of list
        The same ``thicknesses_data`` object after modification.
    """

    bisecting_k_means_idx = 2
    algo_num = 3
    num_k = 9

    def update_thicknesses(ref_thicknesses, old_rows, new_rows, delete_idx):
        # Greedy one-to-one matching: each row of `old_rows` takes the
        # thickness of its nearest remaining row in `new_rows` (squared
        # distance); matched candidates are removed so they cannot be reused.
        # `delete_idx` is accepted but not used.
        thicknesses = []
        ref_copy = [val for val in ref_thicknesses]
        for i in range(len(ref_thicknesses)):
            min_idx = np.argmin(((old_rows[i, :] - new_rows) ** 2).sum(axis=1))
            thicknesses.append(ref_copy[min_idx])
            del ref_copy[min_idx]
            new_rows = np.delete(new_rows, min_idx, 0)
        return thicknesses

    # i runs over 2, 5, ..., 23; each step rewrites the file for i+3, which is
    # then read as "before" by the next step.
    for i in range(bisecting_k_means_idx, algo_num * num_k - algo_num, algo_num):
        # range that contains the tolerance level to identify the different row(s)
        a, b = 0, 3
        before = np.load('./%s/order_%d.npy' % (save_dir, i))
        after = np.load('./%s/order_%d.npy' % (save_dir, i+3))

        # find the tolerance level that extracts two new rows from after
        # (helper(before, after)(tol) counts the rows of `after` that have no
        # row of `before` within tol)
        tol = bisect_tol(helper(before, after), a, b, 2)
        add_idx = compare(before, after, tol)

        # find the tolerance level that extracts one row that was broken down from before
        reverse_tol = bisect_tol(helper(after, before), a, b, 1)
        delete_idx = compare(after, before, reverse_tol)

        # replace the row to be deleted with the first row to be added
        # add the second row to the tail
        # (delete_idx is a list of indices, so this is a fancy-index assignment)
        before[delete_idx, :] = after[add_idx[0], :]
        result_arr = np.concatenate((before, after[[add_idx[1]], :]))

        # modify thickness data
        # thicknesses_data is indexed by plot_order - 1, hence i+3-1
        new_thicknesses = update_thicknesses(thicknesses_data[i+3-1], result_arr, after, delete_idx)
        thicknesses_data[i+3-1] = new_thicknesses

        np.save('./%s/order_%d.npy' % (save_dir, i+3), result_arr)
    
    return thicknesses_data

In [16]:
# Switch order based on average profile distance to bisecting k-means results

# run [4]

def horizontal_reordering(thicknesses_data, save_dir):
    """Align every cell's cluster rows with the middle cell of its grid row.

    For each plot order 1..27 other than the pivot of its row (plot orders
    2, 5, ..., 26), the saved rows are greedily matched to the pivot's rows:
    each pivot row, in order, claims the nearest still-unclaimed row by
    squared distance. The re-ordered rows overwrite ``order_<n>.npy`` and the
    thickness list of that cell is permuted the same way.

    ``thicknesses_data`` is updated in place and also returned.
    """
    path_template = './%s/order_%d.npy'

    for plot_order in range(1, 3*9+1):
        offset = plot_order - 1
        # First cell of this grid row, plus one column -> the middle cell.
        pivot_order = offset - offset % 3 + 2
        if plot_order == pivot_order:
            continue

        pivot_rows = np.load(path_template % (save_dir, pivot_order))
        remaining_rows = np.load(path_template % (save_dir, plot_order))
        remaining_widths = thicknesses_data[offset]

        matched_rows = []
        matched_widths = []
        for target in pivot_rows:
            sq_dist = np.sum((target - remaining_rows) ** 2, axis=1)
            nearest = np.argmin(sq_dist)
            matched_rows.append(remaining_rows[nearest, :])
            # pop() both reads the width and removes it from the candidates.
            matched_widths.append(remaining_widths.pop(nearest))
            remaining_rows = np.delete(remaining_rows, nearest, 0)

        np.save(path_template % (save_dir, plot_order), np.array(matched_rows))
        thicknesses_data[offset] = matched_widths

    return thicknesses_data

In [17]:
# Get ylim by finding the maximum value and minimum value for all the tufte data

# execute in final plotting procedure
def get_min_max(save_dir, n_plots=3*9):
    """Return ``(min_val, max_val)`` over all saved plot arrays.

    Reads ``<save_dir>/order_<i>.npy`` for ``i`` in 1..``n_plots``. Both
    extremes start at 0, so the returned range always contains 0: the minimum
    is never above 0 and the maximum never below it.

    Parameters
    ----------
    save_dir : str
        Directory holding the ``order_<i>.npy`` files.
    n_plots : int, optional
        Number of files to scan (default 27, the previously hard-coded value).
    """
    max_val, min_val = 0, 0
    for i in range(1, n_plots + 1):
        # Load each file once; it used to be read separately for max and min.
        values = np.load('%s/order_%d.npy' % (save_dir, i))
        max_val = max(max_val, values.max())
        min_val = min(min_val, values.min())
    return (min_val, max_val)

In [18]:
def plot_tufte(tufte_data, thicknesses, colors, plot_order, ylim):
    """Draw one frameless, tick-less cell of the 9x3 grid.

    Parameters
    ----------
    tufte_data : iterable
        One sequence of y-values per cluster; each is drawn as a line.
    thicknesses : iterable
        Line width for each cluster, paired with ``tufte_data`` by position.
    colors : iterable
        Line colour for each cluster, paired with ``tufte_data`` by position.
    plot_order : int
        1-based subplot position in the 9-row, 3-column grid.
    ylim : sequence
        ``(lower, upper)`` y-axis limits.
    """
    ax = plt.subplot(9, 3, plot_order)

    # Remove the plot frame lines. They are unnecessary chartjunk.
    for side in ("top", "bottom", "right", "left"):
        ax.spines[side].set_visible(False)

    # Limit the range of the plot to only where the data is.
    # Avoid unnecessary whitespace.
    ax.set_ylim(ylim[0], ylim[1])

    # Remove the tick marks and tick labels. These must be real booleans:
    # current Matplotlib no longer interprets the string "off", and a
    # non-empty string is truthy, so the ticks would stay visible.
    ax.tick_params(axis="both", which="both", bottom=False, top=False,
                   labelbottom=False, left=False, right=False, labelleft=False)

    # zip() stops at the shortest of the three sequences.
    for cluster_data, thickness, color in zip(tufte_data, thicknesses, colors):
        ax.plot(cluster_data, c=color, lw=thickness)

In [19]:
def add_text(plot_order, algo_names, k_range):
    """Label the current axes according to its position in the grid.

    Cells in the first grid row (``plot_order`` up to the number of
    algorithms) get the algorithm name as title. Cells in the first column get
    an unrotated ``k=<value>`` y-label, the value being the entry of
    ``k_range`` for that grid row.
    """
    n_cols = len(algo_names)

    if plot_order <= n_cols:
        plt.title(algo_names[plot_order - 1])

    grid_row, grid_col = divmod(plot_order - 1, n_cols)
    if grid_col == 0:
        k_values = list(k_range)
        label = plt.ylabel('k=%d' % k_values[grid_row])
        label.set_rotation(0)

In [34]:
# run [5]

def execute_plot(save_dir, plot_dir, thicknesses=None):
    """Render the 9x3 grid of cluster profiles and save it as one figure.

    Parameters
    ----------
    save_dir : str
        Directory (relative to the working directory) holding the
        ``order_<i>.npy`` arrays for plot orders 1..27.
    plot_dir : str
        Name the figure is saved under, inside ``./tufte_results/``.
    thicknesses : list of list, optional
        Line widths per plot order (index ``plot_order - 1``). When omitted,
        the module-level ``thicknesses_data`` is used, which is what this
        function always read before the parameter existed.
    """
    if thicknesses is None:
        # Backward compatible fallback to the notebook-level global.
        thicknesses = thicknesses_data

    # These are the "Tableau 20" colors as RGB.
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]

    # Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
    tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20]

    # Every second entry of the 20-colour list.
    tableau10 = tableau20[::2]

    # The y-limits are shared by all cells and the saved arrays do not change
    # while plotting, so scan the directory once rather than once per cell.
    ylim = get_min_max(save_dir)

    # Letter size
    plt.figure(figsize=(17, 22))

    for i in range(1, 3*9+1):
        cell_thicknesses = thicknesses[i-1]
        colors = tableau10[:len(cell_thicknesses)]
        plot_tufte(np.load('./%s/order_%d.npy' % (save_dir, i)), cell_thicknesses, colors, i, ylim)
        add_text(i, ['K-Means', 'Bisecting K-Means', 'Gaussian Mixture Model'], range(2,11))

    plt.savefig('./tufte_results/%s' % plot_dir, bbox_inches='tight')
    plt.close()

### Plotting procedures

In [29]:
# One entry per data subset to plot: the profile frame, the directory its
# label arrays are read from, the directory the per-cluster means are written
# to, and the name of the saved figure.
tufte_settings = [
    dict(
        data=combined_profiles,
        labels_dir='final_labels',
        save_dir='tufte_data',
        plot_dir='combined_plots',
    ),
    dict(
        data=residential_profiles,
        labels_dir='residential_labels',
        save_dir='residential_tufte_data',
        plot_dir='residential_tufte_plots',
    ),
    dict(
        data=non_residential_profiles,
        labels_dir='non_residential_labels',
        save_dir='non_residential_tufte_data',
        plot_dir='non_residential_tufte_plots',
    ),
]

In [35]:
# Run the whole pipeline once per entry of tufte_settings: compute line
# widths, save per-cluster means, re-order rows vertically then horizontally,
# and render the figure.
for settings in tufte_settings:
    data, labels_dir, save_dir, plot_dir = (
        settings[key] for key in ('data', 'labels_dir', 'save_dir', 'plot_dir')
    )

    # `thicknesses_data` has to stay a module-level name: execute_plot reads it
    # as a global.
    thicknesses_data = retrieve_thicknesses(labels_dir)
    save_tufte_data_from_labels(data, labels_dir, save_dir)

    thicknesses_data = horizontal_reordering(
        vertical_reordering(thicknesses_data, save_dir),
        save_dir,
    )

    execute_plot(save_dir, plot_dir)