In [1]:
import re

import altair as alt
from altair import datum
import numpy as np
import pandas as pd

# Get Data 
## Extract Citation Numbers from Bibtex file 

In [2]:
# Build a lookup table from citation key to citation number by reading the
# \bibitem entries of the .bbl file in order of appearance.
# NOTE(review): assumes the order of \bibitem entries equals the printed
# reference numbering (numeric citation style) - confirm.
p2bbl_file = r'main.bbl' # path to .bbl-file
with open(p2bbl_file, 'r') as fp:
    fContent = fp.read()

# Raw string so the backslash does not need double escaping,
# see https://docs.python.org/3/howto/regex.html#the-backslash-plague
pattern = r'\\bibitem\[.*?\]\{(.+?)\}'
# re.DOTALL is important because of newlines within the [...] brackets.
# The compiled pattern is actually used now (it was compiled and discarded before).
bibitem_re = re.compile(pattern, flags=re.DOTALL)
matchObj_list = bibitem_re.findall(fContent)

# Citation numbers are 1-based and stored in their printed form, e.g. '[7]'.
df_cit = pd.DataFrame({
    'Reference: Abbreviation': matchObj_list,
    'Reference: Number': [f'[{number}]' for number in range(1, len(matchObj_list) + 1)],
})
df_cit['Reference: Abbreviation'] = df_cit['Reference: Abbreviation'].astype(str)
# df_cit.head()

## Read Performance Values

In [3]:
# Load the performance table and discard every column whose label contains
# 'Unnamed:' (presumably header-less spreadsheet columns - verify against the file).
path = r'DNN-NILM_low-freq_Performance.xlsx'
df = pd.read_excel(path)
unnamed_columns = [name for name in df.columns if 'Unnamed:' in name]
df = df.drop(columns=unnamed_columns)
# Cast the citation key to str; it is the join key for the merge with df_cit.
df['Reference: Abbreviation'] = df['Reference: Abbreviation'].astype(str)
# df.head()

In [4]:
# Sanitize data
# 'Input: Window Size' mixes numbers with placeholder strings. Every
# unknown / not-applicable marker is mapped to the sentinel -1 so that the
# column can be cast to float.
window_col = 'Input: Window Size'
placeholder_mask = df[window_col].isin(['?', 'nan', 'not appl'])
# Real missing values must be detected with isna(): a comparison `== np.nan`
# is always False because NaN never compares equal to anything.
missing_mask = df[window_col].isna()
df.loc[placeholder_mask | missing_mask, window_col] = -1
df[window_col] = df[window_col].astype(float)

In [5]:
# df.columns

In [6]:
# Attach the citation numbers to the performance table.
# The left join keeps every performance row, also those without a matching key.
# The rename only has an effect if the merge had to suffix a clashing
# 'Reference: Number' column with '_y'; otherwise it is a no-op.
df = (
    df.merge(df_cit, how='left', on='Reference: Abbreviation')
      .rename(columns={'Reference: Number_y': 'Reference: Number'})
)

## Evaluation on Metrics for noised, unseen
(Done directly in Excel):
* classification
    * F1: 12 publications
* regression
    * MAE: 20 publications
    * SAE:  12 publications
    * EstAcc: 7 publications

# Investigate: MAE for Noised, Unseen Case

In [7]:
# Keep only MAE results that were trained on noised data and evaluated on
# unseen data, restricted to the columns needed for the plots below.
is_mae = df['Performance Metric: Type'] == 'MAE'
is_noised = df['Training: Type of Data'] == 'noised'
is_unseen = df['Evaluation: Scenario'] == 'unseen'
mae_columns = [
    'Reference: Abbreviation', 'Reference: Number',
    'Appliance', 'Training: Dataset', 'Model: Basic Type', 'Model: Denomination',
    'Input: Window Size', 'Input: Sampling Rate', 'Performance Metric: Value',
]
# .copy() detaches the selection from df so later column assignments are safe.
df_mae = df.loc[is_mae & is_noised & is_unseen, mae_columns].copy()

In [8]:
# List the publications that report a matching MAE result, and how many there are.
reference_keys = df_mae['Reference: Abbreviation'].unique()
print(reference_keys)
print(len(reference_keys))

['kelly2015' 'dincecco2019' 'shin2018' 'zhang2016' 'he2016' 'murray2019'
 'morgan2017' 'krystalakos2018' 'brewitt2018' 'jiang2019' 'sirojan2018'
 'shin2019a' 'alzeidi2018' 'barber2020' 'mottahedi2016' 'yue2020'
 'chen2020e' 'massidda2020' 'pan2020' 'rafiq2020' 'kukunuri2020']
21


In [9]:
# Remove Baseline Model
# Drop the trivial baseline 'BL =0' and every model whose denomination
# contains Ke15, Zh2016 or He2016.
# Note to self: these are baseline/reference models -> remove these models from other authors.
denomination = df_mae['Model: Denomination']
# na=False: rows without a usable denomination do not count as a reference model.
is_reference_model = denomination.str.contains('Ke15|Zh2016|He2016', na=False)
df_mae = df_mae[(denomination != 'BL =0') & ~is_reference_model]

In [10]:
# Group Results
# For every (publication, appliance, dataset) combination keep only the
# row(s) with the lowest metric value; ties are all kept.
metric = 'Performance Metric: Value'
grouped = df_mae.groupby(['Reference: Abbreviation', 'Appliance', 'Training: Dataset'])
list_of_mins = [frame[frame[metric] == frame[metric].min()] for _, frame in grouped]
df_mae = pd.concat(list_of_mins)
# Show which models, appliances and datasets survive the reduction.
for column in ('Model: Denomination', 'Appliance', 'Training: Dataset'):
    print(df_mae[column].unique())

['classification' 'Reduced+Dropout LMP' 'CNN fully' 'SCAnet' 'CNN s2p'
 'CNN Incpt-LSTM' 'CNN WaveNet' 'dAE' 'LSTM-bi' 'GRU-bi'
 'STL Normally Pruning (60%)' 'STL Rank 8 Tensor D.'
 'STL Normally Pruning (90%)' 'TP-NILM' 'CNN dAE-skip' 'GASF+CNN'
 'RNN-GRU' 'CNN' 'Unet-cGAN' 'MFS-LSTM' 'CNN SGN' 'CNN SGN-sp'
 'CNN HardSGN-sp' 'CNN HardSGN' 'SGN' 'CNN VAE' 'BERT' 'CNN s2s']
['dishwasher' 'fridge' 'kettle' 'microwave' 'washing machine' 'cooker'
 'shower' 'stove/oven' '(rice) cooker']
['UK-DALE' 'REFIT' 'IDEAL' 'REDD' 'proprietary: fridge only' 'DataPort'
 'Enertalk']


In [11]:
# Create column with concatenated Publication Name and Model Denomination
# df_mae['Referece_and_Model'] = df_mae['Reference: Abbreviation'].apply( lambda x: x[0:2]+x[-2:]+'_') + df_mae['Model: Denomination'].apply(lambda x: x.replace(' ', '-'))
# Create column containing year of publication.
# Some keys carry a suffix letter after the year (e.g. 'chen2020e', 'shin2019a'),
# so the last four characters are not always the year. Take the first
# four-digit run instead; keys without one yield NaN.
df_mae['Year'] = df_mae['Reference: Abbreviation'].str.extract(r'(\d{4})', expand=False)
# Create column that combines 'Reference' and 'Number' to check that numbers are correct
df_mae['No-Ref'] = df_mae['Reference: Abbreviation'] + '_' + df_mae['Reference: Number']

In [12]:
# Number of results per training dataset, in order of first appearance.
dataset_column = df_mae['Training: Dataset']
datasets = dataset_column.unique()
for dataset in datasets:
    n_results = np.sum(dataset_column == dataset)
    print(dataset, ': ', n_results)

UK-DALE :  60
REFIT :  20
IDEAL :  6
REDD :  21
proprietary: fridge only :  1
DataPort :  2
Enertalk :  2


In [13]:
# Number of results per appliance, in order of first appearance.
appliance_column = df_mae['Appliance']
appliances = appliance_column.unique()
for app in appliances:
    n_results = np.sum(appliance_column == app)
    print(app, ': ', n_results)

dishwasher :  24
fridge :  24
kettle :  16
microwave :  22
washing machine :  22
cooker :  1
shower :  1
stove/oven :  1
(rice) cooker :  1


### Remove special Cases 

In [14]:
# remove appliances with low number of results
# (not filtered at the moment: 'tumble dryer', 'lighting')
rare_appliances = ['cooker', 'shower', 'stove/oven', '(rice) cooker']
is_rare_appliance = df_mae['Appliance'].isin(rare_appliances)

# remove Morgan2017 -> works on proprietary dataset & only fridge & best MAE ~64
is_morgan = df_mae['Reference: Abbreviation'] == 'morgan2017'

df_mae = df_mae[~is_rare_appliance & ~is_morgan]

In [15]:
# Results per training dataset after the special cases were removed.
datasets = df_mae['Training: Dataset'].unique()
for dataset in datasets:
    matches = df_mae['Training: Dataset'] == dataset
    print(dataset, ': ', np.sum(matches))

UK-DALE :  60
REFIT :  20
IDEAL :  4
REDD :  21
DataPort :  1
Enertalk :  1


In [16]:
# Average number of results per appliance, divided by 4.
# NOTE(review): the divisor 4 presumably corresponds to the "best quarter"
# used in the evaluation of the best performing models - confirm.
appliances = df_mae['Appliance'].unique()
n_apps = sum(np.sum(df_mae['Appliance'] == app) for app in appliances)
print(n_apps/len(appliances)/4)

5.35


In [17]:
# Sanity check: show any rows still trained on the proprietary fridge-only
# dataset (expected to be empty after the morgan2017 rows were dropped).
df_mae[df_mae['Training: Dataset'].eq('proprietary: fridge only')]

Unnamed: 0,Reference: Abbreviation,Reference: Number,Appliance,Training: Dataset,Model: Basic Type,Model: Denomination,Input: Window Size,Input: Sampling Rate,Performance Metric: Value,Year,No-Ref


In [18]:
# find 'na' Reference: Numbers
# Lists the citation keys in the full table that got no citation number;
# an empty array means every key was matched.
has_no_number = df['Reference: Number'].isna()
df.loc[has_no_number, 'Reference: Abbreviation'].unique()

array([], dtype=object)

## Variant Overview 

In [19]:
# Overview chart: one point per remaining result, MAE on the y-axis, training
# dataset on the x-axis and one facet column per appliance.
# The y-axis is fixed to (0, upper_bound); points outside are clipped (clip=True).
upper_bound = 40 #14
# Color encodes 'No-Ref' (citation key + citation number), which makes it
# possible to check in the legend that key and number belong together.
# The commented-out color/shape lines are alternative encodings.
sortByDataset = alt.Chart().mark_point(clip=True, size=90, filled=True).encode(
    x=alt.X('Training: Dataset:N', title='Dataset'),
    y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(0, upper_bound))),
#    color=alt.Color('Reference: Abbreviation:N', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')),
    color=alt.Color('No-Ref:N', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')),
#    color=alt.Color('Reference: Number:N', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')),
#    color=alt.Color('Input: Sampling Rate:Q', legend=alt.Legend(title='Sampling Rate'), scale=alt.Scale(domain=(0,10), type='symlog')),
#    color=alt.Color('Input: Window Size:Q', legend=alt.Legend(title='Window Size'), scale=alt.Scale(type='log')),
#    shape=alt.Shape('Model: Basic Type', sort=['recurrent', 'feedforward'], legend=alt.Legend(title='Network Type'))
)#.transform_filter(
#    alt.FieldOneOfPredicate(field='Training: Dataset', oneOf=['REDD', 'UK-DALE', 'REFIT'])
#)
# Optional text layer (disabled) that would label each point with its citation key:
# text = alt.Chart().mark_text(
#     clip=True, align='left', dy=-5, dx=-24,
#     fontSize=13
# ).encode(
#     x=alt.X('Training: Dataset:N'),
#     y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(0, upper_bound))),
#     text = 'Reference: Abbreviation:N'
# )

# The chart above is created without data; data and panel width are supplied
# on the layer, which is then faceted into one column per appliance.
alt.layer(
    sortByDataset,  
    data=df_mae, width=90#, height=500
).facet(
    column=alt.Column('Appliance:N', title=None)
).configure_facet(
    spacing=7
).configure_header(
    titleFontSize=14,
    labelFontSize=14
)

In [20]:
# Individualized focus views: panel size and y-axis range per appliance.
b = 20   # base width unit; each panel width is a multiple of it
h = 250  # height shared by all panels
# (panel title, value in the 'Appliance' column, width multiplier, upper y-limit)
_panel_specs = [
    ('Kettle',          'kettle',          3, 15),
    ('Microwave',       'microwave',       5, 15),
    ('Dishwasher',      'dishwasher',      4, 25),
    ('Washing machine', 'washing machine', 5, 25),
    ('Fridge',          'fridge',          3, 25),
]
params = {
    title: {
        'app': appliance,
        'width': b*factor,
        'height': h,
        'upper_bound': top,
        'lower_bound': 0,
    }
    for title, appliance, factor, top in _panel_specs
}

In [21]:
# Focus views: one panel per entry in `params` (all listed appliances, not
# only the dishwasher), each with its own size and y-axis range; the panels
# are concatenated horizontally at the end.
layers = []
for key, par in params.items():
    # Unpack the per-appliance settings; `key` is the panel title,
    # `app` the value matched against the 'Appliance' column.
    upper_bound = par['upper_bound']
    lower_bound = par['lower_bound']
    width = par['width']
    height = par['height']
    app = par['app']
    sortByDataset = alt.Chart().mark_point(clip=True, size=90, filled=True).encode(
        x=alt.X('Training: Dataset:N', title='Dataset'),
        #y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(lower_bound, upper_bound))),
        y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(lower_bound, upper_bound))),
        color=alt.Color('Reference: Number:N', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')), 
        # color=alt.Color('No-Ref', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')), 
    ).transform_filter(
        # Restrict this panel to the rows of the current appliance.
        alt.FieldOneOfPredicate(field='Appliance', oneOf=[app])
    ).properties(
        title=key
    )
    # Optional text layer (disabled).
    # NOTE(review): its filter uses `key` (the capitalized title) instead of
    # `app`, so it would need adjusting before being re-enabled.
#    text = alt.Chart().mark_text(
#        clip=True, align='left', dy=-7, dx=-15,
#        fontSize=13
#    ).encode(
#        x=alt.X('Training: Dataset:N'),
#        y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(lower_bound, upper_bound))),
#    #    text = 'Reference: Abbreviation:N'
#        text = 'Reference: Number:N'
#    ).transform_filter(
#        alt.FieldOneOfPredicate(field='Appliance', oneOf=[key])
#    )
    # Data and panel size are supplied on the layer, not on the chart itself.
    layers.append(alt.layer(
        sortByDataset, 
        data=df_mae, width=width, height=height
    ))
alt.hconcat(*layers, spacing=40, bounds='flush')

In [22]:
# VERSION FOR PUBLICATION
# Faceted chart (one column per appliance) with a common y-range of
# (lower_bound, upper_bound), colored by citation number.
# NOTE(review): `layers` and `app` are assigned here but not used in this cell.
layers = []
upper_bound = 25
lower_bound = 0
app = [ 'kettle', 'microwave', 'dishwasher', 'washing machine', 'fridge' ]
# Points outside the y-range are clipped (clip=True).
sortByDataset = alt.Chart(df_mae).mark_point(clip=True, size=90, filled=True).encode(
    x=alt.X('Training: Dataset:N', title='Dataset'),
    #y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(lower_bound, upper_bound))),
    y=alt.Y('Performance Metric: Value:Q', title='MAE [W]', scale=alt.Scale(domain=(lower_bound, upper_bound))),
    color=alt.Color('Reference: Number:N', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')), 
    # color=alt.Color('No-Ref', legend=alt.Legend(title='Publication'), scale=alt.Scale(scheme = 'category20')), 
)


# df_mae is passed both to the chart above and to the layer here.
alt.layer(
    sortByDataset,  
    data=df_mae, width=90#, height=500
).facet(
    column=alt.Column('Appliance:N', title=None)
).configure_facet(
    spacing=7
).configure_header(
    titleFontSize=14,
    labelFontSize=14
)

Note to self: I double checked the values for the top result in the corresponding publications. *They are correct.*

In [23]:
# Evaluation

In [24]:
# Evaluate x best performing Models
# For every appliance, sort the results by metric value (lowest first), keep
# the best quarter and report which publications, values, datasets and
# network types appear there. The citation keys are collected in `authors`.
apps = ['kettle', 'microwave', 'dishwasher', 'washing machine', 'fridge']
authors = []
for app in apps:
    df_app = df_mae[df_mae['Appliance'] == app].sort_values('Performance Metric: Value')
    # Built-in round() rounds halves to the nearest even integer (e.g. 5.5 -> 6).
    first_i_values = round(len(df_app)/4)
    # One slice is enough; the repeated slice and the bare `df_app`
    # expression (no effect inside a loop) were removed.
    df_app = df_app.iloc[:first_i_values]
    authors.extend(df_app['Reference: Abbreviation'].values)

    basic_type = df_app['Model: Basic Type']
    print(app)
    print(first_i_values)
    print('publications: ', df_app['Reference: Abbreviation'].values)
    print('values', df_app['Performance Metric: Value'].values)
    print('datasets: ', df_app['Training: Dataset'].unique())
    print('no of ff: ', np.sum(basic_type == 'feedforward'),
          'no of rnn: ',  np.sum(basic_type == 'recurrent'),
          'no of combined', np.sum(basic_type == 'combined') )
    print()

kettle
4
publications:  ['rafiq2020' 'pan2020' 'krystalakos2018' 'jiang2019']
values [1.966 3.59  4.    4.726]
datasets:  ['UK-DALE' 'REFIT']
no of ff:  3 no of rnn:  0 no of combined 1

microwave
6
publications:  ['rafiq2020' 'pan2020' 'jiang2019' 'brewitt2018' 'shin2018' 'chen2020e']
values [0.392 3.14  3.686 4.4   4.53  4.82 ]
datasets:  ['UK-DALE' 'REFIT' 'IDEAL']
no of ff:  5 no of rnn:  0 no of combined 1

dishwasher
6
publications:  ['pan2020' 'chen2020e' 'brewitt2018' 'chen2020e' 'jiang2019'
 'dincecco2019']
values [ 4.8    8.71   8.9   10.14  10.296 12.26 ]
datasets:  ['REFIT' 'UK-DALE' 'IDEAL' 'REDD']
no of ff:  6 no of rnn:  0 no of combined 0

washing machine
6
publications:  ['jiang2019' 'yue2020' 'pan2020' 'alzeidi2018' 'massidda2020' 'chen2020e']
values [3.08  6.98  7.11  7.615 8.31  8.48 ]
datasets:  ['REFIT' 'UK-DALE']
no of ff:  6 no of rnn:  0 no of combined 0

fridge
6
publications:  ['he2016' 'murray2019' 'murray2019' 'shin2018' 'pan2020' 'chen2020e']
values [ 3.22

In [25]:
# Count how often each publication appears among the collected `authors`
# entries and show the table sorted by that count, highest first.
authors = np.asarray(authors)
unique_authors = pd.Series(authors).unique()
counts = pd.DataFrame({
    'authors': unique_authors,
    'counts': [np.sum(authors == name) for name in unique_authors],
})
counts.sort_values('counts', ascending=False)

Unnamed: 0,authors,counts
1,pan2020,5
6,chen2020e,5
3,jiang2019,4
0,rafiq2020,2
4,brewitt2018,2
5,shin2018,2
12,murray2019,2
2,krystalakos2018,1
7,dincecco2019,1
8,yue2020,1
