In [1]:
import math
import os
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import cm
import numpy as np
import pandas as pd
import seaborn as sns
from doe_xstock.database import SQLiteDatabase
from doe_xstock.utilities import split_lines, read_json

In [16]:
# Scratch check of how groupby().apply() reassembles the per-group frames it returns.
df = pd.DataFrame([[1, 2, 3, 4], [2, 2, 2, 2]])
display(df)

# Per group: keep positional columns 1-2 as they are and multiply everything from
# positional column 3 onwards by 10, then flatten the resulting index into columns.
df = (
    df
    .groupby(0)
    .apply(lambda group: pd.concat([group.iloc[:, 1:3], group.iloc[:, 3:].mul(10)], axis=1))
    .reset_index()
)
display(df)

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,2,2,2,2


Unnamed: 0,0,level_1,1,2,3
0,1,0,2,3,40
1,2,1,2,2,20


In [7]:
def get_cohesion_score(x,labels):
    """Return the within-cluster sum of squares (WSS) of a clustering.

    Every row of ``x`` is compared with the mean of the rows sharing its label;
    the squared differences are summed over all rows and all columns.

    Parameters
    ----------
    x : array-like, two-dimensional
        Observations, one row per sample and one column per feature.
    labels : array-like, one-dimensional
        Cluster label of each row of ``x``, in the same order as the rows.

    Returns
    -------
    float
        Sum of squared deviations of each sample from its cluster mean.
    """

    features = pd.DataFrame(x)

    # Group on an external key instead of appending a 'label' column and slicing
    # it off again inside groupby().apply(): newer pandas releases no longer pass
    # the grouping column to the applied function, so the old `iloc[:,0:-1]`
    # slice would silently discard the last feature column instead of the label.
    # np.asarray also keeps a plain list from being read as a list of column keys.
    cluster_means = features.groupby(np.asarray(labels)).transform('mean')
    wss = ((features - cluster_means)**2).sum().sum()
    return wss

# Directory the clustering input and the stored k-means result are read from.
SCHEDULE_CLUSTER_DATA_DIRECTORY = '../schedule_cluster_data'

baths_filepath = os.path.join(SCHEDULE_CLUSTER_DATA_DIRECTORY, 'baths.pkl')
kmeans_result_filepath = os.path.join(SCHEDULE_CLUSTER_DATA_DIRECTORY, 'kmeans_result_without_norm.json')
data = pd.read_pickle(baths_filepath)
kmeans_result = read_json(kmeans_result_filepath)

# Sanity check: the score recomputed by get_cohesion_score should match the SSE
# stored alongside the first set of labels for the 'baths' schedule.
baths_result = kmeans_result['schedules']['baths']
print('sklearn inertia:', baths_result['scores']['sse'][0])
print('calculated inertia:', get_cohesion_score(data.values, baths_result['labels'][0]))

sklearn inertia: 10715.26036337541
calculated inertia: 10715.26036337541


In [2]:
# Location of the SQLite database. The default is the original author's local path;
# set the DOE_XSTOCK_DATABASE_FILEPATH environment variable to point the notebook
# at a database stored elsewhere without editing this cell.
database_filepath = os.environ.get(
    'DOE_XSTOCK_DATABASE_FILEPATH',
    '/Users/kingsleyenweye/Desktop/INTELLIGENT_ENVIRONMENT_LAB/doe_xstock/database.db',
)
figures_directory = 'figures/'
schedule_data_directory = '../schedule_data'

# Figures and schedule pickles are written into these directories further down;
# create them up front so a fresh checkout does not fail at the first save.
for output_directory in (figures_directory, schedule_data_directory):
    os.makedirs(output_directory, exist_ok=True)

database = SQLiteDatabase(database_filepath)

# Understanding the Metadata
***

In [51]:
# numeric metadata
metadata = database.get_table('metadata')

# Columns left out of both the numeric and the non-numeric overview figures.
columns_to_exclude = [
    'id', 'bldg_id', 'dataset_id', 'upgrade', 'metadata_index', 'in_county', 'in_puma',
    'in_ashrae_iecc_climate_zone_2004', 'in_building_america_climate_zone', 'in_iso_rto_region',
    'applicability', 'in_ahs_region', 'in_applicable', 'in_cec_climate_zone', 'in_census_division',
    'in_census_division_recs', 'in_census_region', 'in_geometry_building_type_acs',
    'in_geometry_building_type_height', 'in_geometry_building_type_recs', 'in_state',
    'in_weather_file_longitude', 'in_weather_file_latitude', 'in_weather_file_city',
    'in_nhgis_county_gisjoin', 'in_state_name', 'in_american_housing_survey_region',
    'in_weather_file_2018', 'in_weather_file_tmy3', 'in_resstock_county_id', 'in_vacancy_status',
]
columns = [c for c in metadata.columns if c not in columns_to_exclude and pd.api.types.is_numeric_dtype(metadata[c])]

# One histogram panel per numeric column, laid out on a fixed-width grid.
column_count = 6
row_count = math.ceil(len(columns)/column_count)
fig, axs = plt.subplots(row_count, column_count, figsize=(4*column_count,3*row_count))

for ax, column in zip(fig.axes, columns):
    ax.hist(metadata[column])
    ax.set_title(split_lines(column, line_character_limit=28,delimiter='_'))

# The grid is rounded up to a full last row; blank the panels that received no
# column so the saved figure does not contain empty framed axes.
for ax in fig.axes[len(columns):]:
    ax.set_axis_off()

fig.tight_layout()
fig.savefig(os.path.join(figures_directory,'travis_county_numeric_metadata_histogram.pdf'),transparent=True, bbox_inches='tight')
plt.close(fig)

# non-numeric metadata
columns = [c for c in metadata.columns if c not in columns_to_exclude and not pd.api.types.is_numeric_dtype(metadata[c])]

# One horizontal bar chart per non-numeric column, laid out on a fixed-width grid.
column_count = 4
row_count = math.ceil(len(columns)/column_count)
fig, axs = plt.subplots(row_count, column_count, figsize=(7*column_count,6*row_count))

for ax, column in zip(fig.axes, columns):
    # Number of rows per distinct value of the column; one bar per value.
    plot_data = metadata.groupby(column).size().reset_index(name='count')
    x, y = list(range(plot_data.shape[0])), plot_data['count']
    ax.barh(x,y)
    ax.set_yticks(x)
    ax.set_yticklabels(plot_data[column].to_list())
    ax.set_title(split_lines(column, line_character_limit=28,delimiter='_'))

# The grid is rounded up to a full last row; blank the panels that received no
# column so the saved figure does not contain empty framed axes.
for ax in fig.axes[len(columns):]:
    ax.set_axis_off()

fig.tight_layout()
fig.savefig(os.path.join(figures_directory,'travis_county_non_numeric_metadata_histogram.pdf'),transparent=True, bbox_inches='tight')
plt.close(fig)

In [3]:
# schedules
# Every column of the schedule table except the two key columns is exported.
schedule_table_info = database.query_table("""PRAGMA table_info(schedule)""")
is_key_column = schedule_table_info['name'].isin(['metadata_id','timestep'])
schedule_columns = schedule_table_info.loc[~is_key_column, 'name'].tolist()
schedule_column_count = len(schedule_columns)

for position, column in enumerate(schedule_columns, start=1):
    # Single-line progress counter, overwritten in place on each iteration.
    print(f'\r{position}/{schedule_column_count}',end='')

    # The inner query splits `timestep` into day (timestep/96) and hour (remainder/4);
    # the outer query averages the column over each (metadata_id, day, hour) group.
    query = f"""
    SELECT
        metadata_id,
        day*24 + hour AS timestep,
        day,
        hour,
        AVG({column}) AS {column}
    FROM (
        SELECT
            metadata_id,
            CAST((timestep - CAST(timestep/96 AS INTEGER)*96)/4 AS INTEGER) AS hour,
            CAST(timestep/96 AS INTEGER) AS day,
            {column}
        FROM schedule
    ) t
    GROUP BY
        metadata_id,
        day,
        hour
    """
    hourly_schedule = database.query_table(query)
    output_filepath = os.path.join(schedule_data_directory,f'{column}.pkl')
    hourly_schedule.to_pickle(output_filepath)

24/24