In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
# NOTE(review): presumably the reason the plotting calls further down end in ';'
# (to suppress their repr output) - confirm before removing either.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Load the pickled DataFrame; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
df_prorated_imputed = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_prorated_kwh_imputed")
df_prorated_imputed.head()

In [None]:
# Keep only the rows of one hard-coded Building_Meter value; the rest of the
# notebook analyses this single series.
df_one_building = df_prorated_imputed[df_prorated_imputed['Building_Meter']=='165.0 - BLD 03_90327795']

In [None]:
# Renumber the rows 0..n-1 so that index labels equal positions. Because
# drop=True is not passed, the previous index is kept as an extra column.
df_one_building = df_one_building.reset_index()

In [None]:
def sliding_chunker(data, window_len, slide_len):
    """
    Cut `data` into overlapping windows.

    Windows start at positions 0, slide_len, 2*slide_len, ... and each one
    holds window_len consecutive elements, copied into its own numpy array.
    A window that would run past the end of `data` is discarded, not padded.

    e.g. sliding_chunker(range(6), window_len=3, slide_len=2)
    gives [ [0, 1, 2], [2, 3, 4] ]
    """
    candidates = (
        np.copy(data[start:start + window_len])
        for start in range(0, len(data), slide_len)
    )
    # keep only the windows that reached full length
    return [window for window in candidates if len(window) == window_len]

In [None]:
def plot_waves(waves, step):
    """
    Draw nine waveforms on a 3x3 grid of subplots: waves[0], waves[step],
    waves[2*step], and so on.

    Every panel gets the same y-range, taken from the min and max of
    df_one_building['Imputed_KWH'] (a notebook-level frame) padded by 10000
    on each side.
    """
    n_graph_rows, n_graph_cols = 3, 3
    plt.figure()
    for panel in range(n_graph_rows * n_graph_cols):
        # subplot positions are 1-based, hence panel + 1
        axes = plt.subplot(n_graph_rows, n_graph_cols, panel + 1)
        y_low = min(df_one_building['Imputed_KWH']) - 10000
        y_high = max(df_one_building['Imputed_KWH']) + 10000
        axes.set_ylim([y_low, y_high])
        plt.plot(waves[panel * step])
    plt.suptitle('Waveform Segments of 8 data points')
    # fix subplot sizes so that everything fits
    plt.tight_layout(pad=2, h_pad=1)
    plt.show()

In [None]:
def reconstruct(data, window, clusterer):
    """
    Reconstruct the given data using the cluster centers from the given
    clusterer.

    The data is cut into segments of len(window) samples that overlap by half
    a window. Each segment is multiplied by `window`, matched to its nearest
    cluster center with clusterer.predict(), and that center is added into the
    output at the segment's position.

    Parameters
    ----------
    data : sequence supporting len() and slicing (e.g. 1-D array or Series)
        The series to reconstruct.
    window : 1-D array-like
        Weights applied to each segment; its length sets the segment length.
    clusterer : fitted clusterer
        Must expose predict() and cluster_centers_.

    Returns
    -------
    numpy.ndarray
        len(data) floats; positions not covered by a complete segment stay 0.
    """
    window_len = len(window)
    # Floor division keeps the stride an int: range() and slice bounds reject
    # the float that "/" yields in Python 3. max() keeps the stride valid for a
    # one-sample window.
    slide_len = max(1, window_len // 2)
    segments = sliding_chunker(data, window_len, slide_len)
    reconstructed_data = np.zeros(len(data))
    for segment_n, segment in enumerate(segments):
        # window the segment so that we can find it in our clusters which were
        # formed from windowed data; multiply out of place so an integer
        # segment is not asked to hold a float product
        windowed = segment * window
        # predict() takes a 2-D (n_samples, n_features) array, so present the
        # segment as a single row
        nearest_match_idx = clusterer.predict(windowed.reshape(1, -1))[0]
        nearest_match = clusterer.cluster_centers_[nearest_match_idx]

        pos = segment_n * slide_len
        reconstructed_data[pos:pos+window_len] += nearest_match

    return reconstructed_data

In [None]:
segment_len = 8
slide_len = 1

# Slide a segment_len-wide window over the series, slide_len samples at a time.
# np.copy detaches each window from the DataFrame so later edits to 'segments'
# cannot touch the original data; the length filter drops the truncated windows
# produced at the end of the series.
segments = [
    np.copy(df_one_building['Imputed_KWH'][start:start + segment_len])
    for start in range(0, len(df_one_building['Imputed_KWH']), slide_len)
    if len(df_one_building['Imputed_KWH'][start:start + segment_len]) == segment_len
]

print("Produced %d waveform segments" % len(segments))

In [None]:
# Preview nine of the raw windows, taking every second one (indices 0, 2, ..., 16)
plot_waves(segments, step=2)

In [None]:
# Fix the seed: k-means starts from a random initialisation, so without
# random_state the centroids, and therefore the reconstruction built from them,
# can differ from one run of the notebook to the next.
clusterer = KMeans(n_clusters=12, random_state=0)
clusterer.fit(segments)

In [None]:
# Show the first nine of the fitted cluster centres (step=1 -> indices 0..8)
plot_waves(clusterer.cluster_centers_, step=1)

In [None]:
# Re-declare the stride (same value as when the training windows were built)
slide_len = 1
# NOTE(review): these windows come from the same series with the same window
# length and stride as `segments`, so the clusterer is evaluated on the data it
# was fitted to - confirm that this is intended.
test_segments = sliding_chunker(
    df_one_building['Imputed_KWH'],
    window_len=segment_len,
    slide_len=slide_len
)

In [None]:
centroids = clusterer.cluster_centers_

# Position within test_segments of the window compared against its centroid.
# Named once so the slice, the prediction and the plot title cannot drift apart
# (the title previously claimed a different index than the one plotted).
segment_idx = 16
segment = np.copy(test_segments[segment_idx])
# predict() takes a 2-D (n_samples, n_features) array and returns one label per
# row, hence the reshape to a single row and the [0]
nearest_centroid_idx = clusterer.predict(segment.reshape(1,-1))[0]
nearest_centroid = np.copy(centroids[nearest_centroid_idx])
plt.figure()
plt.plot(segment, label="Original segment");
plt.plot(nearest_centroid, label="Nearest centroid");
plt.title('Comparison of original and predicted at index %d' % segment_idx);
plt.xlabel('Index within each segment');
plt.ylabel('Imputed KWH Consumption');
plt.legend();
plt.show();

### Reconstruction

In [None]:
data = df_one_building['Imputed_KWH']
centroids = clusterer.cluster_centers_

# `reconstruction` accumulates the centroid laid over each window; `coverage`
# counts how many windows touched each sample so the sum can become a mean.
reconstruction = np.zeros(len(data))
coverage = np.zeros(len(data))

if len(test_segments) > 0:
    # predict() takes a 2-D (n_samples, n_features) array, so every window is
    # labelled in one call rather than once per loop iteration
    nearest_centroid_idxs = clusterer.predict(np.asarray(test_segments))
    for segment_n, nearest_centroid_idx in enumerate(nearest_centroid_idxs):
        # window segment_n starts segment_n strides into the series
        pos = int(segment_n * slide_len)
        reconstruction[pos:pos+segment_len] += centroids[nearest_centroid_idx]
        coverage[pos:pos+segment_len] += 1

# Divide by the per-sample window count to average the overlapping centroids.
# Samples near the start and end are covered by fewer windows, and the count
# handles them correctly for any slide_len and any series length. Samples that
# no complete window covers stay at 0.
covered = coverage > 0
reconstruction[covered] /= coverage[covered]

n_plot_samples = len(data)
# signed error (reconstruction - original); kept signed because later cells report it
error = reconstruction[0:n_plot_samples] - data[0:n_plot_samples]
# The threshold is compared against np.abs(error) in the following cells, so it
# is taken from the absolute error too; a percentile of the signed error would
# sit lower and flag more than the intended top 1 %.
error_99th_percentile = np.percentile(np.abs(error), 99)
print("Maximum reconstruction error was %.1f" % np.abs(error).max())
print("99th percentile of reconstruction error was %.1f" % error_99th_percentile)

figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k');

plt.plot(data[0:n_plot_samples], label="Original_Data");
plt.plot(reconstruction[0:n_plot_samples], label="Reconstructed_Value");
plt.plot(np.abs(error[0:n_plot_samples]), label="Abs Reconstruction_Error");
plt.title('Reconstructed Data vs. Original Data');
plt.xlabel('Index within each account');
plt.ylabel('Imputed KWH Consumption');
plt.legend();
plt.show();

In [None]:
# Plot the absolute reconstruction error for every sample of the series
plt.plot(np.abs(error[0:n_plot_samples]), label="Reconstruction Error")
# dashed horizontal line at the percentile threshold computed in the reconstruction cell
plt.axhline(y=error_99th_percentile,linestyle='--',color='gray');
plt.title('Abs Reconstruction error with 99th percentile error threshold');
plt.xlabel('Index within each account');
plt.ylabel('Reconstruction_Error');
plt.show();

In [None]:
# Rows whose absolute reconstruction error lies above the percentile threshold
anomaly_entries = df_one_building.loc[
    np.abs(error[0:n_plot_samples]) > error_99th_percentile
]
anomaly_entries

In [None]:
# Label every row 'Yes' when its absolute reconstruction error exceeds the
# threshold and 'No' otherwise. This adds a column to df_one_building in place.
df_one_building['Anomaly'] = np.where(np.abs(error[0:n_plot_samples])>error_99th_percentile, 'Yes', 'No')

In [None]:
# Select the flagged rows and the report columns in a single .loc call and take
# an explicit copy. The chained df[mask][cols] form yields a frame pandas treats
# as a possible view of df_one_building, so adding columns to `output`
# afterwards would raise SettingWithCopyWarning.
output = df_one_building.loc[
    df_one_building['Anomaly'] == 'Yes',
    ['Building_Meter', 'Month', 'Month_Type', 'Imputed_KWH', 'Anomaly'],
].copy()

In [None]:
# Attach the signed error (reconstruction - original) and the reconstructed
# value to the flagged rows. The boolean mask is the same one used to build
# `output`, so the selected entries line up with its rows.
output['Reconstruction_Error'] = error[df_one_building['Anomaly']=='Yes']
output['Reconstructed_Value'] = reconstruction[df_one_building['Anomaly']=='Yes']

In [None]:
# Final anomaly report; Month_Type is left out of the displayed columns
output[['Building_Meter','Month','Imputed_KWH','Reconstructed_Value','Reconstruction_Error','Anomaly']]