In [None]:
import os
import re
import glob
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.dates as mdates
import matplotlib.ticker as ticker

from matplotlib import cm

from models import estimate_knn_clusters, estimate_som_clusters, prepare_whole_year, prepare_TS
from utils.utils import generate_filename

from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from IPython.display import clear_output


from tslearn.barycenters import dtw_barycenter_averaging

import datetime

import geopandas as gpd

import plotly.express as px

# Credentials must not live in the notebook: read the Mapbox token from the
# MAPBOX_ACCESS_TOKEN environment variable instead of hardcoding it.
# The maps in this notebook use the 'carto-positron' base style, which should
# not need a token, so the call is skipped when the variable is not set.
# NOTE(review): the token that was previously committed here should be rotated.
_mapbox_token = os.environ.get("MAPBOX_ACCESS_TOKEN")
if _mapbox_token:
    px.set_mapbox_access_token(_mapbox_token)

In [None]:
# Build monthly residential energy totals per ZIP code for one year from the
# twelve per-month ComEd extracts (data/comed_month/comed_YYYYMM.csv).
year=2019
monthly_frames = []
for month in range(1, 13):
    # File names carry a zero-padded month, e.g. comed_201904.csv
    file = f'{year}{month:02d}'
    df_month = pd.read_csv(f'data/comed_month/comed_{file}.csv')
    # Keep only residential accounts. na=False treats a missing service name
    # as "not residential" instead of producing a NaN mask that cannot be used
    # for filtering; .copy() avoids assigning into a slice of the raw frame.
    df_month = df_month[df_month.service_name.str.contains('RESIDENTIAL', na=False)].copy()
    # Collapse every timestamp to its month number so rows can be summed per month
    df_month['date_time'] = pd.to_datetime(df_month.date_time).dt.month
    # Aggregate at the (zipcode, month) level before stacking to keep frames small
    monthly_frames.append(
        df_month.groupby(['zip5', 'date_time']).energy.sum().reset_index()
    )

# Concatenate once (instead of growing a frame inside the loop)
df_total = pd.concat(monthly_frames)

# Re-aggregate across files in case the same (zip5, month) pair shows up in
# more than one monthly extract.
df_data = df_total.groupby(['zip5', 'date_time']).energy.sum().reset_index()

In [None]:
# Tag every aggregated row with the year it belongs to (new column appended last)
df_data = df_data.assign(year=2019)

In [None]:
# Load the census (ACS) table and derive an integer zip5 key from NAME.
acs_path = 'data/census/Census_Clean_Zip5_IL_Sex_Age_Ethnicity_2018.csv'
acs_with_first_row = pd.read_csv(acs_path)

# The row labelled 0 is kept as a column-name -> value lookup and then removed
# from the data (presumably it holds human-readable column descriptions -
# verify against the CSV).
first_row = acs_with_first_row.loc[0]
dict_names = dict(zip(acs_with_first_row.columns, first_row.values))
df_acs = acs_with_first_row.drop(index=0)

# zip5 is everything after the first six characters of NAME, cast to int
df_acs['zip5'] = df_acs['NAME'].str[6:].astype(int)

In [None]:
# Join the ACS column DP05_0001E (exported as "population") onto the monthly
# consumption table by zip5 and write the result to CSV.
consumption_with_population = (
    df_data
    .merge(df_acs[['zip5', 'DP05_0001E']], on='zip5')
    .rename(columns={'date_time': 'month', 'DP05_0001E': 'population'})
)
export_columns = ['zip5', 'year', 'month', 'population', 'energy']
consumption_with_population[export_columns].to_csv(
    '2019_COMED_zipcode5_consumption.csv', index=False
)

In [None]:
# Export the monthly consumption table (without population) to CSV.
# NOTE(review): this writes to the same path as the population-merged export
# in the preceding cell, so running both leaves only this version on disk -
# confirm which of the two files is the intended deliverable.
consumption_only = df_data.rename(columns={'date_time': 'month'})
consumption_only = consumption_only[['zip5', 'year', 'month', 'energy']]
consumption_only.to_csv('2019_COMED_zipcode5_consumption.csv', index=False)

In [None]:
# Parameters for a single clustering run
year, month, day = 2019, 4, 7
n_clusters = 4
metric = 'euclidean'

# No month selected -> prepare the whole year; otherwise prepare the requested
# month (and day, when given).
df_data, mySeries, namesofMySeries = (
    prepare_whole_year(year)
    if month is None
    else prepare_TS(year=year, month=month, day=day)
)

# Fit k-means on the prepared series; the fixed random_state keeps runs repeatable
best_n_cluster = n_clusters
km = TimeSeriesKMeans(
    n_clusters=best_n_cluster,
    metric=metric,
    n_init=5,
    random_state=1234,
)
labels = km.fit_predict(mySeries)
filename = generate_filename(year, month, day)


In [None]:
# One figure per cluster: every member series in light gray plus the DTW
# barycenter of the cluster in red. The first sample of each series is skipped.

# x axis shared by all figures (loop-invariant, so computed once)
dates = df_data.date_time.unique()

# Output naming used by the (currently disabled) savefig call below
filename = generate_filename(year, month, day)
if day is not None:
    folder = 'day'
elif month is not None:
    folder = 'month'
else:
    folder = 'year'

for label in set(labels):
    fig, ax = plt.subplots(figsize=(12, 6))

    # Gather ALL members of this cluster before averaging. The list used to be
    # re-created for every member inside the loop, so the "barycenter" was
    # computed from the last member only.
    cluster = [mySeries[i] for i in range(len(labels)) if labels[i] == label]

    for series in cluster:
        ax.plot(dates[1:], series[1:], c="gray", alpha=0.2)

    if len(cluster) > 0:
        # assumes each series is 1-D so vstack yields (n_series, n_samples) - TODO confirm
        ax.plot(dates[1:], dtw_barycenter_averaging(np.vstack(cluster))[1:], c="red")

    ax.xaxis.set_major_formatter(
        mdates.ConciseDateFormatter(ax.xaxis.get_major_locator()))

    ax.set_ylabel(r'Normalized energy consumption')
    ax.grid(linewidth=0.25)
    #fig.savefig(f'outputs/plots/{folder}/{filename}_TimeSeries_pattern_cluster{label+1}.png')

# Inertias plot

In [None]:
# Load one of the inertia CSVs found under outputs/.
# glob returns matches in arbitrary order, so the list is sorted to make
# "the last file" the same file on every run (last in lexicographic order).
inertias_files = sorted(glob.glob('outputs/*plot.csv'))
if not inertias_files:
    # Fail with a clear message instead of an IndexError on an empty list
    raise FileNotFoundError("No inertia files matching 'outputs/*plot.csv' were found")
inertias_file = inertias_files[-1]

df = pd.read_csv(inertias_file)

In [None]:
# Elbow plot: inertia vs. number of clusters for the whole year and three months.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated and later
# removed from Matplotlib.
colors = plt.get_cmap('Dark2')

# (inertia CSV, legend label, index into the Dark2 palette)
elbow_curves = [
    ('outputs/2019_inertias_plot.csv', '2019', 1),
    ('outputs/20191_inertias_plot.csv', 'January 2019', 2),
    ('outputs/20194_inertias_plot.csv', 'April 2019', 3),
    ('outputs/20197_inertias_plot.csv', 'July 2019', 4),
]

fig, ax = plt.subplots(figsize=(12, 6))
for csv_path, curve_label, color_index in elbow_curves:
    df = pd.read_csv(csv_path)
    ax.plot(df['clusters'], df['scld_intertia_0.04'], label=curve_label,
            linestyle="-", marker=".", color=colors(color_index))
ax.grid(linewidth=0.25)
ax.legend()
ax.set_ylabel('Inertia')
ax.set_xlabel('Number of clusters')

#fig.savefig(f'outputs/plots/year_month_elbow_plot.png')

In [None]:
# Elbow plot: inertia vs. number of clusters for the three reference workdays.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated and later
# removed from Matplotlib.
colors = plt.get_cmap('Dark2')

# (inertia CSV, legend label, index into the Dark2 palette)
elbow_curves = [
    ('outputs/2019130_inertias_plot.csv', 'January 30th', 1),
    ('outputs/201948_inertias_plot.csv', 'April 8th', 2),
    ('outputs/2019710_inertias_plot.csv', 'July 10th', 3),
]

fig, ax = plt.subplots(figsize=(12, 6))
for csv_path, curve_label, color_index in elbow_curves:
    df = pd.read_csv(csv_path)
    ax.plot(df['clusters'], df['scld_intertia_0.04'], label=curve_label,
            linestyle="-", marker=".", color=colors(color_index))
ax.grid(linewidth=0.25)
ax.legend()
ax.set_ylabel('Inertia')
ax.set_xlabel('Number of clusters')

# Make sure the target directory exists before saving
os.makedirs('outputs/plots', exist_ok=True)
fig.savefig('outputs/plots/workdays_elbow_plot.png')

In [None]:
# Elbow plot: inertia vs. number of clusters for the three reference weekend days.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated and later
# removed from Matplotlib.
colors = plt.get_cmap('Dark2')

# (inertia CSV, legend label, index into the Dark2 palette)
elbow_curves = [
    ('outputs/2019126_inertias_plot.csv', 'January 26th', 1),
    ('outputs/201947_inertias_plot.csv', 'April 7th', 2),
    ('outputs/2019720_inertias_plot.csv', 'July 20th', 3),
]

fig, ax = plt.subplots(figsize=(12, 6))
for csv_path, curve_label, color_index in elbow_curves:
    df = pd.read_csv(csv_path)
    ax.plot(df['clusters'], df['scld_intertia_0.04'], label=curve_label,
            linestyle="-", marker=".", color=colors(color_index))
ax.grid(linewidth=0.25)
ax.legend()
ax.set_ylabel('Inertia')
ax.set_xlabel('Number of clusters')

# Make sure the target directory exists before saving
os.makedirs('outputs/plots', exist_ok=True)
fig.savefig('outputs/plots/weekends_elbow_plot.png')

Whole year: 2019
Important months: January, July, April
Week: July 15-21, January 28-February 3, April 15-21
Day: 
    work: January 30, July 10, April 8
    weekend: January 26, July 20, April 7






# Maps

In [None]:
# Read the ZCTA shapefile and make sure the GEOID20 key is a string so it can
# be joined against string ZIP codes later on.
zcta_shapefile = 'data/geo/tl_2020_us_zcta520/tl_2020_us_zcta520.shp'
gdf_zc = gpd.read_file(zcta_shapefile)
gdf_zc['GEOID20'] = gdf_zc['GEOID20'].astype(str)


In [None]:
# Clustering runs to execute, numbered from 1.
# Each tuple is (year, month, day, n_clusters, day_type); every run uses the
# euclidean metric. month=None means the whole year, day=None the whole month.
_run_specs = [
    (2019, None, None, 5, None),
    (2019, 1, None, 5, None),
    (2019, 4, None, 5, None),
    (2019, 7, None, 6, None),
    (2019, 1, 30, 6, 'workday'),
    (2019, 4, 8, 5, 'workday'),
    (2019, 7, 10, 5, 'workday'),
    (2019, 1, 26, 5, 'weekend'),
    (2019, 4, 7, 4, 'weekend'),
    (2019, 7, 20, 6, 'weekend'),
]

dict_runs = {
    run_id: {
        'year': run_year,
        'month': run_month,
        'day': run_day,
        'n_clusters': run_k,
        'metric': 'euclidean',
        'day_type': run_day_type,
    }
    for run_id, (run_year, run_month, run_day, run_k, run_day_type)
    in enumerate(_run_specs, start=1)
}

'''
Whole year: 2019
Important months: January, July, April
Week: July 15-21, January 28-Feb3, April 15-21
Day: 
    work: January 30, 8 April, July 10
    weekend: January 26, 7 April, July 20,
'''

In [None]:
# For every configured run: cluster the ZIP-code series, export the
# zip5 -> cluster table, and write an interactive choropleth map to HTML.
for key, run_config in dict_runs.items():
    print(key)
    year = run_config['year']
    month = run_config['month']
    day = run_config['day']
    n_clusters = run_config['n_clusters']
    metric = run_config['metric']

    # No month selected -> prepare the whole year; otherwise the month (and day)
    if month is None:
        df_data, mySeries, namesofMySeries = prepare_whole_year(year)
    else:
        df_data, mySeries, namesofMySeries = prepare_TS(year=year, month=month, day=day)

    best_n_cluster = n_clusters
    km = TimeSeriesKMeans(n_clusters=best_n_cluster, metric=metric, n_init=5, random_state=1234)
    labels = km.fit_predict(mySeries)
    filename = generate_filename(year, month, day)

    # Output sub-folder follows the granularity of the run
    if day is not None:
        folder = 'day'
    elif month is not None:
        folder = 'month'
    else:
        folder = 'year'

    # Human-readable, 1-based cluster names
    fancy_names_for_labels = [f"Cluster {label+1}" for label in labels]
    df_labels = (
        pd.DataFrame(zip(namesofMySeries, fancy_names_for_labels), columns=["zip5", "Cluster"])
        .sort_values(by="Cluster")
        .reset_index(drop=True)
    )
    # String keys so the join matches the string GEOID20 column of the shapes
    df_labels['zip5'] = df_labels.zip5.astype(str)
    gdf_data = pd.merge(gdf_zc, df_labels, left_on='GEOID20', right_on='zip5')
    gdf_data = gdf_data[['GEOID20', 'Cluster', 'geometry']]

    # Create the output folders up front so a fresh checkout does not fail on write
    clusters_dir = 'outputs/clusters'
    plots_dir = f'outputs/plots/{folder}'
    os.makedirs(clusters_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    df_labels.to_csv(f'{clusters_dir}/{filename}.csv', index=False)
    fig = px.choropleth_mapbox(
        gdf_data,
        geojson=gdf_data,
        featureidkey='properties.GEOID20',
        color="Cluster",
        locations='GEOID20',
        width=1000,
        height=700,
        center={'lat':41.6, 'lon':-88.99},
        zoom=6,
        mapbox_style='carto-positron',
        opacity=0.65,
    )
    # NOTE(review): update_geos targets `geo` subplots, while this figure is a
    # mapbox choropleth - this call probably has no visible effect; confirm.
    fig.update_geos(fitbounds="locations", visible=False)
    fig.write_html(f'{plots_dir}/{filename}_map.html')

Whole year: 2019
Important months: January, July, April
Week: July 15-21, January 28-February 3, April 15-21
Day: 
    work: January 30, April 8, July 10
    weekend: January 26, April 7, July 20



In [None]:
# Re-render the choropleth for whichever run is currently held in
# gdf_data / folder / filename (these are left over from the run loop, so this
# cell only works after that loop has been executed).
map_options = dict(
    geojson=gdf_data,
    featureidkey='properties.GEOID20',
    color="Cluster",
    locations='GEOID20',
    width=1000,
    height=700,
    center={'lat':41.6, 'lon':-88.99},
    zoom=6,
    mapbox_style='carto-positron',
    opacity=0.65,
)
fig = px.choropleth_mapbox(gdf_data, **map_options)
fig.update_geos(fitbounds="locations", visible=False)

map_path = f'outputs/plots/{folder}/{filename}_map.html'
fig.write_html(map_path)
#fig.show()