In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data and
# helper-module paths used by the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Import Required Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.axisartist.axislines import Subplot
from copy import copy, deepcopy
from pickle import dump, load
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/Load Demand Clustering Analysis/utils')
from utils import preProcessing_clustering

ModuleNotFoundError: ignored

# **Import Data**

In [None]:
# Path of the raw Austin readings CSV on the mounted Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/Load Demand Clustering Analysis/data/15minute_data_austin.csv'
# Read the whole file into memory; the pre-processing cell below groups this
# frame by its 'dataid' column.
df = pd.read_csv(file_path)
# Plain-text preview of the frame (pandas truncates long frames when printed).
print(df)

        dataid       local_15min   grid  solar  solar2
0          661  01-01-2018 00:00    NaN    NaN     NaN
1          661  01-01-2018 00:15    NaN    NaN     NaN
2          661  01-01-2018 00:30    NaN    NaN     NaN
3          661  01-01-2018 00:45    NaN    NaN     NaN
4          661  01-01-2018 01:00  1.447 -0.002     NaN
...        ...               ...    ...    ...     ...
875888    9922  31-12-2018 22:45  1.238    NaN     NaN
875889    9922  31-12-2018 23:00  1.211    NaN     NaN
875890    9922  31-12-2018 23:15  1.045    NaN     NaN
875891    9922  31-12-2018 23:30  1.022    NaN     NaN
875892    9922  31-12-2018 23:45  1.205    NaN     NaN

[875893 rows x 5 columns]


# **Pre-Process Data**

1.   Group data by Household ID
2.   Convert DateTime to Pandas DateTime for coherence - then sort by DateTime
3.   Add NaNs if DateTime is not continuous for any household (this step is currently disabled in the code below)
4.   Replace NaN with 0 in the 'grid', 'solar', and 'solar2' readings to handle missing data
5.   Add eGauge readings of 'grid', 'solar', and 'solar2' to calculate net electricity consumption reading of the household
6.   If the net electricity consumption value is negative or 0, consider it a missing value by converting it to NaN
7.   Group data for each household by date
8.   Return final rawData of shape (nHouseholds, nDays, readingsPerDay)

In [None]:
READINGS_PER_DAY = 96  # a complete day holds 96 readings at 15-minute resolution


def _household_daily_matrix(household_df):
    """Build the per-day net-consumption matrix for a single household.

    Parameters
    ----------
    household_df : pandas.DataFrame
        Rows of `df` belonging to one 'dataid'. Must contain the columns
        'local_15min' (strings formatted as "%d-%m-%Y %H:%M"), 'grid',
        'solar' and 'solar2'.

    Returns
    -------
    numpy.ndarray
        Array of shape (nDays, READINGS_PER_DAY), one row per distinct
        calendar date in ascending date order. Readings whose net consumption
        is not strictly positive are stored as NaN.

    Notes
    -----
    Gaps in the timestamp sequence are NOT filled. Any date that does not have
    exactly READINGS_PER_DAY readings is reported via `print` and its row is
    left as all zeros.
    NOTE(review): those all-zero rows differ from the NaN used for missing
    readings; confirm that preProcessing_clustering treats them as intended.
    """
    # `assign` returns a new frame, so nothing is written into the slice
    # yielded by groupby (avoids pandas' SettingWithCopyWarning).
    timestamps = pd.to_datetime(household_df['local_15min'], format="%d-%m-%Y %H:%M")
    household = household_df.assign(local_15min=timestamps).sort_values(by="local_15min", ascending=True)

    # A missing eGauge channel contributes nothing to the net reading.
    channels = household[['grid', 'solar', 'solar2']].fillna(0)
    # Vectorized sum; same left-to-right addition order as the row-wise version.
    net = channels['grid'] + channels['solar'] + channels['solar2']
    # Zero or negative net consumption is treated as a missing value.
    net = net.where(net > 0)

    dates = household['local_15min'].dt.date
    nDays = len(dates.unique())
    householdData = np.zeros((nDays, READINGS_PER_DAY))

    # Group by the Series itself (not a one-element list) so the key is a plain
    # date rather than a 1-tuple on pandas >= 2.0.
    for dayCounter, (gpdate, day_readings) in enumerate(net.groupby(dates)):
        values = day_readings.values
        if len(values) == READINGS_PER_DAY:
            householdData[dayCounter, :] = values
        else:
            # Incomplete or over-full day (e.g. DST change, duplicated
            # timestamps): report it and leave the row as zeros.
            print(gpdate, len(values))
    return householdData


# Number of distinct households; nunique ignores NaN ids, matching the groups
# that groupby iterates over below.
nHouseholds = df['dataid'].nunique()
print(nHouseholds)

rawData = []
for householdID, gp in df.groupby('dataid'):
    rawData.append(_household_daily_matrix(gp))

# Stack into (nHouseholds, nDays, READINGS_PER_DAY); this relies on every
# household covering the same number of distinct dates.
rawData = np.array(rawData)
print(rawData.shape)

25
2018-03-11 92
2018-04-09 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-12-31 20
2018-03-11 92
2018-07-06 115
2018-07-07 162
2018-09-09 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-03-11 92
2018-01-01 88
2018-03-11 92
2018-03-11 92
2018-03-11 92
(25, 365, 96)


1.   Extract median daily profiles for each household
2.   Normalize the median profiles to complete pre-processing


In [None]:
# Run the project helper imported from utils on the per-household daily array.
# Per this notebook's own description the helper extracts a median daily
# profile for each household and normalizes it; its implementation is not
# shown here, so verify against utils before relying on that.
processedData = preProcessing_clustering(rawData)
# Sanity check on the result's dimensions.
print(processedData.shape)

(25, 96)


# **Save the Preprocessed Data**

In [None]:
# Persist both arrays to Drive with pickle. The `with` blocks guarantee each
# file is flushed and closed as soon as it is written, instead of leaving an
# open handle for the garbage collector to close.
with open('/content/drive/My Drive/Colab Notebooks/Load Demand Clustering Analysis/data/preProcessed_austin.pkl', 'wb') as processed_file:
    dump(processedData, processed_file)
with open('/content/drive/My Drive/Colab Notebooks/Load Demand Clustering Analysis/data/rawData_austin.pkl', 'wb') as raw_file:
    dump(rawData, raw_file)