In [1]:
# Data handling
import pandas as pd
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.set_option('mode.chained_assignment', None)
import numpy as np

# Project-local helpers; this notebook calls fn.limited_impute and fn.plot_all
import utility_functions as fn

# Plotting setup: render figures inline and use the 'ggplot' style sheet
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# Hand-maintained building names; their order is what labels the
# columns of the meter data once it has been loaded
buildingnames = [
    'YUAG', 'Berkeley', 'Hopper', '304Elm',
    'Davenport', '38HH', '320Temple', '53Wall',
    'Sprague', 'Malone', 'Trumbull', '17HH',
]

# Consumption Data Processing

In [3]:
# read the raw meter export into a new dataframe; cells shown as '#########' are treated as missing
raw = pd.read_csv('energy_raw.csv',index_col=0,na_values=['#########'])

# parse the index labels (format like 'Mon 01/01/18 00:00') into datetimes
raw.index = pd.to_datetime(raw.index,format='%a %m/%d/%y %H:00')

# add missing rows by reindexing onto the complete hourly range
# (pd.date_range is used because building a range through the
#  DatetimeIndex(start=, end=, freq=) constructor was deprecated in pandas 0.24
#  and later removed)
correct_dt = pd.date_range(start='2018-01-01 00:00:00',end='2018-07-27 23:00:00',freq='h')
raw = raw.reindex(index=correct_dt)

# remove built-in demand values, which tend to be bugged:
# this drops every even-positioned column among the first 2*len(buildingnames) columns
raw = raw.drop(raw.columns[np.arange(0,len(buildingnames)*2,2)], axis=1)

# rename the remaining columns, one per building (order must match buildingnames)
raw.columns = buildingnames

In [4]:
# Remove impossible readings: a value is kept only where the percent change
# from the previous reading is strictly positive; everything else becomes NaN
raw_head = raw.iloc[0]                        # the first row has no predecessor, so set it aside
is_increasing = raw.pct_change(limit=1) > 0
raw = raw.where(is_increasing)
raw.iloc[0] = raw_head                        # put the first row back

# alternative outlier filter, currently disabled
# tau = 0.0005 * (raw.max()-raw.min())
# raw = raw.where(raw.pct_change(limit=1)<tau)


Unnamed: 0,YUAG,Berkeley,Hopper,304Elm,Davenport,38HH,320Temple,53Wall,Sprague,Malone,Trumbull,17HH
2018-01-01 00:00:00,7929946.5,7241231.2,2766963.5,1109119.8,4306847.8,3164052.8,94018.2,3024576.8,358345.5,6543549.3,4441711.2,8929603.5
2018-01-01 01:00:00,7930103.6,7241300.8,2767051.1,1109124.4,4306942.8,3164055.9,94019.9,,358379.5,6543660.7,4441765.7,
2018-01-01 02:00:00,7930256.8,7241359.7,2767125.7,1109129.0,4307035.7,3164058.5,94021.8,3024609.2,358411.2,6543770.8,4441818.0,8929905.7
2018-01-01 03:00:00,7930426.2,7241423.3,2767205.3,1109133.8,4307137.8,3164061.8,94023.8,3024625.3,358443.5,6543892.0,4441878.8,8930055.8
2018-01-01 04:00:00,7930598.0,7241486.5,2767284.2,1109138.8,4307237.7,3164065.0,94025.7,3024641.7,358475.8,6544010.3,4441940.3,8930205.0
2018-01-01 05:00:00,7930770.2,7241549.5,2767363.2,1109143.7,4307338.7,3164068.0,94027.8,3024658.3,358508.3,6544130.7,4441999.5,8930354.5
2018-01-01 06:00:00,7930939.0,7241613.0,2767443.8,1109148.7,4307439.5,3164071.0,94029.8,3024674.5,358540.5,6544250.2,4442061.2,8930503.5
2018-01-01 07:00:00,7931107.8,7241676.0,2767522.5,1109153.7,4307540.3,3164074.5,94031.8,3024690.3,358572.8,6544368.2,4442124.5,8930653.0
2018-01-01 08:00:00,7931280.0,7241745.8,2767602.5,1109158.3,4307641.8,3164077.5,94034.0,3024706.7,358605.2,6544489.0,4442184.8,8930803.2
2018-01-01 09:00:00,7931481.0,7241824.5,2767683.0,1109163.5,4307741.3,3164080.5,94036.2,3024722.3,358636.7,6544614.8,4442244.5,8930957.5


In [None]:
# Fill short gaps in the consumption data, one building (column) at a time.
# NOTE(review): fn.limited_impute lives in utility_functions and is not shown here;
# per the original author's note it interpolates gaps of 6 hours and shorter - confirm there.
for building in raw.columns:
    raw[building] = fn.limited_impute(raw[building], 6)


raw

# Demand Data Processing

In [None]:
# Demand is the difference between consecutive consumption readings.
# The first row is dropped from the result.
demand = raw.diff().drop(index=raw.index[0])

# keep the first four rows aside; they are written back just before export
demand_head = demand.iloc[:4]

# baseline count of missing values per building
errors = demand.isna().sum()
print(errors)

fn.plot_all(demand,'2018-01-01 01:00:00','2018-07-27 23:00:00')

In [None]:
# Remove huge statistical outliers.
# Lower bound first: anything not above (median - 2.5 std) becomes NaN
lower_bound = demand.median() - 2.5*demand.std()
demand = demand.where(demand > lower_bound)

# The upper bound is computed from the already-trimmed data: median + 5 std
upper_bound = demand.median() + 5*demand.std()
demand = demand.where(demand < upper_bound)

# report how many values this step removed, then update the running count
null_counts = demand.isnull().sum()
new_errors = null_counts - errors
print(new_errors)
errors = null_counts

fn.plot_all(demand,'2018-01-01 01:00:00','2018-07-27 23:00:00')

In [None]:
# Iterative single-spike removal, currently disabled; unsure how helpful it is.
# It is slow because demand.pct_change() is recomputed on every loop iteration.

# for k in demand.columns:
#     for i in range(len(demand)-1):
#         if (abs(demand.pct_change()[k][i]) > 0.5) & (abs(demand.pct_change()[k][i+1]) > 0.5):
#             demand[k][i] = np.NaN


# same bookkeeping as the other cleaning steps: newly missing values, then the running count
null_counts = demand.isnull().sum()
new_errors = null_counts - errors
print(new_errors)
errors = null_counts

fn.plot_all(demand,'2018-01-01 01:00:00','2018-07-27 23:00:00')

In [None]:
# Remove errors chunk by chunk, using bounds derived from rolling extremes.
# Each chunk is 360 hourly rows (15 days). Within a chunk, values are kept only if they lie
# above 0.7 x the median of the 18-row rolling minimum and
# below 1.3 x the median of the 18-row rolling maximum (computed per column).
chunk_size = 360

for i in range(0, len(demand), chunk_size):
    end = min(i + chunk_size, len(demand))   # the last chunk may be shorter
    chunk = demand.iloc[i:end]

    # lower bound pass
    chunk = chunk.where(chunk > chunk.rolling(18).min().median()*0.7)
    # upper bound pass, computed on the chunk after the lower-bound pass
    chunk = chunk.where(chunk < chunk.rolling(18).max().median()*1.3)

    # Write the filtered chunk back explicitly. Calling .where(..., inplace=True) on a
    # temporary slice such as demand[i:end] only reaches `demand` when the slice happens
    # to be a view, and does nothing at all under pandas Copy-on-Write.
    demand.iloc[i:end] = chunk


# report how many values this step removed, then update the running count
new_errors = demand.isnull().sum() - errors
print(new_errors)
errors = demand.isnull().sum()

fn.plot_all(demand,'2018-01-01 01:00:00','2018-07-27 23:00:00')

# Imputation

In [None]:
# work on an independent (deep) copy so `demand` keeps its un-imputed values
dense_energy = demand.copy()

# Fill short gaps column by column.
# NOTE(review): fn.limited_impute is defined in utility_functions (not shown);
# per the original note it interpolates gaps shorter than 6 hours - confirm there.
for building in dense_energy.columns:
    dense_energy[building] = fn.limited_impute(dense_energy[building], 6)

fn.plot_all(dense_energy,'2018-03-05 01:00:00','2018-03-10 23:00:00')

In [None]:
# Fill the gaps that are still missing (the ones longer than 6 hours) by
# linear interpolation for now; to be replaced by a squared-sinusoidal fill
dense_energy = dense_energy.interpolate(method='linear')

fn.plot_all(dense_energy,'2018-03-05 01:00:00','2018-03-10 23:00:00')

In [None]:
# put back the first four rows that were saved when the demand frame was created
dense_energy.iloc[:4] = demand_head

# round to one decimal place and write the cleaned data to csv
rounded_energy = dense_energy.round(1)
rounded_energy.to_csv('energy_clean.csv')

# unused code below

In [None]:
# # this cell removes all consumption values BELOW first value

# # creates dataframe of repeated minimum (first) values, pretty workaroundy
# raw_mins = raw.copy(deep=True)
# raw_mins.loc[:,:] = raw.loc['2018-01-01 00:00:00'].values

# # sets all violating values to NaN
# raw = raw.where(raw >= raw_mins)

# # this unused line was an attempt to find outliers using std ranges
# # raw = raw.where(raw > raw.median() - 2*raw.std()).where(raw < raw.median() + 2*raw.std())

# raw.isnull().sum()

In [None]:
# trying to plot daily average curves for buildings... matplotlib might support this somehow, or I can write a function

# raw = raw.iloc[:,[0]]

# raw['day'] = raw.index.day
# raw['hour'] = raw.index.hour

# raw_by_day = raw.resample('h').mean()
# raw_by_day = raw_by_day.set_index(['day','hour']).unstack('day')
# raw_by_day

In [None]:
# OLD ATTEMPT to find outliers using tuner values

# comp = demand.take([1], axis=1)
# comp['rolling'] = comp.loc[:,'Berkeley'].rolling(4,min_periods=1).median()
# # demand.rolling(4,min_periods=1).median()*0.5

# comp['rollmin'] = comp.loc[:,'Berkeley'].rolling(8,min_periods=6).min()

# # fn.plot_all(comp,'2018-03-14 00:00:00','2018-06-20 00:00:00')

# fn.plot_feature(comp,'rollmin','2018-01-02 00:00:00','2018-07-25 00:00:00')

# comp['rollmin'].median()-comp['rollmin'].std()

# max_tuner = 1 # here i want higher values clipping more points
# min_tuner = 1

# # demand.std()/demand.median() # here high values indicate volatile, 

# demand = demand.where(demand > demand.rolling(8,min_periods=4).min() - (min_tuner - demand.std()/demand.median()))
                      
# demand = demand.where(demand < demand.rolling(8,min_periods=4).max() + (max_tuner - demand.std()/demand.median()))

# fn.plot_all(demand,'2018-01-02 00:00:00','2018-07-25 00:00:00')
# demand.isnull().sum()

In [None]:
# iterative (bad performance) solution for single spikes, could work as supplement

# k='YUAG'

# for i in range(len(demand)-1):
#     if (abs(demand.pct_change()[k][i]) > 0.5) & (abs(demand.pct_change()[k][i+1]) > 0.5):
#         demand[k][i] = np.NaN

# fn.plot_all(demand,'2018-01-02 00:00:00','2018-07-25 00:00:00')
# demand.isnull().sum()

