# Contents
Tests of the data reconciliation framework that check how robust it is to missing or bad data.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from os.path import join
import os
import pandas as pd
import numpy as np
import gridemissions
from gridemissions.load import BaData
from gridemissions.viz.reports import cleaning_plot
import matplotlib.dates as mdates

In [None]:
from gridemissions.viz import set_plots

# Keep the three values returned by the project's plotting helper as notebook-wide
# constants: COLORS is indexed for line colors, PAGE_WIDTH and ROW_HEIGHT size the figures.
# NOTE(review): set_plots() presumably also applies the project's matplotlib styling —
# confirm in gridemissions.viz.
COLORS, PAGE_WIDTH, ROW_HEIGHT = set_plots()

In [None]:
def make_plot1(raw_real, raw_fake, hist):
    """Draw the CISO demand columns of three datasets on one axis labelled in GW.

    Each argument must expose a ``df`` DataFrame and a ``get_cols(ba, field)``
    method, like the ``BaData`` objects built elsewhere in this notebook.

    Args:
        raw_real: dataset shown with the legend entry "test-raw".
        raw_fake: dataset shown with the legend entry "test-fake".
        hist: dataset shown with the legend entry "historical".
    """
    fig, axis = plt.subplots(figsize=(PAGE_WIDTH, ROW_HEIGHT))

    # Plotted in this order so that the legend entries keep the same order
    labelled_datasets = (
        (raw_real, "test-raw"),
        (raw_fake, "test-fake"),
        (hist, "historical"),
    )
    for dataset, legend_label in labelled_datasets:
        demand_cols = dataset.get_cols("CISO", "D")
        # Divide by 1e3 to match the "GW" label on the y axis
        axis.plot(dataset.df.loc[:, demand_cols] / 1e3, label=legend_label)

    axis.set_ylabel("GW")
    axis.set_title("CISO demand")
    axis.legend()
    fig.autofmt_xdate()


def make_plot2(test_folder, pg_width, save=True, title="", bottom_ylim=None):
    """Plot CISO demand at each stage of the workflow for one test folder.

    Loads ``EBA_raw.csv``, ``EBA_rolling.csv`` and ``EBA_opt.csv`` from
    ``<APEN_PATH>/<test_folder>/tmp`` and the unmodified raw data from
    ``<APEN_PATH>/data/EBA_raw.csv``, then draws the four demand series on a
    single axis labelled in GW.

    Args:
        test_folder: name of the test directory under ``APEN_PATH``.
        pg_width: figure width handed to ``plt.subplots``.
        save: when true, the figure is written to
            ``<APEN_PATH>/<test_folder>/<test_folder>.pdf``.
        title: title of the axes.
        bottom_ylim: lower limit of the y axis; not set when ``None``.
    """
    root = gridemissions.config["APEN_PATH"]
    region = "CISO"
    factor = 1e-3  # applied to every series before plotting on the "GW" axis
    line_width = 1.0

    # Datasets written to the test folder: modified raw, first guess, reconciled
    template = join(root, test_folder, "tmp", "EBA_%s.csv")
    with_gaps = BaData(fileNm=template % "raw")
    first_guess = BaData(fileNm=template % "rolling")
    final = BaData(fileNm=template % "opt")

    # Unmodified raw data, used as the reference line
    untouched = BaData(fileNm=join(root, "data", "EBA_raw.csv"))

    demand = with_gaps.get_cols(region, "D")[0]
    # The reference data is clipped to the time span of the modified dataset
    first_ts, last_ts = with_gaps.df.index[0], with_gaps.df.index[-1]

    fig, axis = plt.subplots(figsize=(pg_width, ROW_HEIGHT))

    # Demand at the different steps; the call order fixes the legend order
    axis.plot(
        with_gaps.df.loc[:, demand] * factor,
        "-o",
        lw=line_width,
        ms=3,
        color=COLORS[0],
        label="raw (missing data)",
    )
    axis.plot(
        untouched.df.loc[first_ts:last_ts, demand] * factor,
        color=COLORS[0],
        lw=line_width,
        label="raw",
    )
    axis.plot(
        first_guess.df.loc[:, demand] * factor,
        color=COLORS[1],
        ls="--",
        lw=line_width,
        label="first guess",
    )
    axis.plot(
        final.df.loc[:, demand] * factor,
        color=COLORS[1],
        lw=line_width,
        label="reconciled",
    )

    if bottom_ylim is not None:
        axis.set_ylim(bottom=bottom_ylim)
    fig.autofmt_xdate()
    axis.legend(loc=3, ncol=2, handlelength=1.5, columnspacing=1.0)
    axis.set_ylabel("GW")
    axis.set_title(title)
    axis.xaxis.set_major_formatter(mdates.DateFormatter("%b-%d"))

    if not save:
        return
    fig.savefig(join(root, test_folder, f"{test_folder}.pdf"))

# Test 1 - remove two days' worth of CISO demand data
Test on the first 5 days of September 2020 (test window: 2020-09-01T00:00Z to 2020-09-05T00:00Z).

In [None]:
# Test window (UTC timestamps)
start = pd.to_datetime("2020-09-01T00:00Z")
end = pd.to_datetime("2020-09-05T00:00Z")

# Raw data, trimmed to the test window
file_name_raw = join(gridemissions.config["APEN_PATH"], "data", "EBA_raw.csv")
data_raw = BaData(fileNm=file_name_raw)
data_raw.df = data_raw.df.loc[start:end]

# Working copy of the test dataset with two days' worth of CISO demand blanked out
data_raw_copy = BaData(df=data_raw.df.copy(deep=True))
gap = slice(start + pd.Timedelta(days=1), start + pd.Timedelta(days=3))
ciso_demand_cols = data_raw_copy.get_cols("CISO", "D")
data_raw_copy.df.loc[gap, ciso_demand_cols] = np.nan

# Historical data: the 15 days that end where the test window starts
end_hist = start
start_hist = end_hist - pd.Timedelta(days=15)
file_name_basic = join(gridemissions.config["APEN_PATH"], "data", "EBA_basic.csv")
data_basic = BaData(fileNm=file_name_basic)
data_basic.df = data_basic.df.loc[start_hist:end_hist]

In [None]:
# Compare the unmodified, modified and historical CISO demand before running the workflow
make_plot1(raw_real=data_raw, raw_fake=data_raw_copy, hist=data_basic)

In [None]:
# Directory layout for this test: <APEN_PATH>/si_test1/{tmp,hist}
test_root = join(gridemissions.config["APEN_PATH"], "si_test1")
tmp_folder = join(test_root, "tmp")
folder_hist = join(test_root, "hist")
for folder in (tmp_folder, folder_hist):
    os.makedirs(folder, exist_ok=True)

# Store the modified raw data and the historical data in their folders
data_raw_copy.df.to_csv(join(tmp_folder, "EBA_raw.csv"))
data_basic.df.to_csv(join(folder_hist, "EBA_basic.csv"))

# Run the workflow on the fake dataset
from gridemissions.workflows import make_dataset

make_dataset(tmp_folder=tmp_folder, folder_hist=folder_hist, scrape=False)

In [None]:
# Plot the results of test 1 at half the page width (the figure is saved by default)
make_plot2(test_folder="si_test1", pg_width=PAGE_WIDTH / 2)

# Test 2 - Remove 5 days' worth of CISO demand data
Test on the first ten days of November 2020 (test window: 2020-11-01T00:00Z to 2020-11-10T00:00Z).

In [None]:
# Test window (UTC timestamps)
start = pd.to_datetime("2020-11-01T00:00Z")
end = pd.to_datetime("2020-11-10T00:00Z")

# Raw data, trimmed to the test window
file_name_raw = join(gridemissions.config["APEN_PATH"], "data", "EBA_raw.csv")
data_raw = BaData(fileNm=file_name_raw)
data_raw.df = data_raw.df.loc[start:end]

# Working copy of the test dataset with five days' worth of CISO demand blanked out
data_raw_copy = BaData(df=data_raw.df.copy(deep=True))
gap = slice(start + pd.Timedelta(days=1), start + pd.Timedelta(days=6))
ciso_demand_cols = data_raw_copy.get_cols("CISO", "D")
data_raw_copy.df.loc[gap, ciso_demand_cols] = np.nan

# Historical data: the 15 days that end where the test window starts
end_hist = start
start_hist = end_hist - pd.Timedelta(days=15)
file_name_basic = join(gridemissions.config["APEN_PATH"], "data", "EBA_basic.csv")
data_basic = BaData(fileNm=file_name_basic)
data_basic.df = data_basic.df.loc[start_hist:end_hist]

In [None]:
# Compare the unmodified, modified and historical CISO demand before running the workflow
make_plot1(raw_real=data_raw, raw_fake=data_raw_copy, hist=data_basic)

In [None]:
# Directory layout for this test: <APEN_PATH>/si_test2/{tmp,hist}
test_root = join(gridemissions.config["APEN_PATH"], "si_test2")
tmp_folder = join(test_root, "tmp")
folder_hist = join(test_root, "hist")
for folder in (tmp_folder, folder_hist):
    os.makedirs(folder, exist_ok=True)

# Store the modified raw data and the historical data in their folders
data_raw_copy.df.to_csv(join(tmp_folder, "EBA_raw.csv"))
data_basic.df.to_csv(join(folder_hist, "EBA_basic.csv"))

# Run the workflow on the fake dataset
from gridemissions.workflows import make_dataset

make_dataset(tmp_folder=tmp_folder, folder_hist=folder_hist, scrape=False)

In [None]:
# Plot the results of test 2 at full page width (the figure is saved by default)
make_plot2(
    test_folder="si_test2",
    pg_width=PAGE_WIDTH,
    title="(a) Deleting 5 days in CISO demand",
    bottom_ylim=13,
)

# Test 3 - Remove all data for CISO demand
Test on the first ten days of November 2020. All CISO demand data in the test window is removed, so the heuristic that provides a first guess can only rely on data from before the gap.

In [None]:
# Test window (UTC timestamps)
start = pd.to_datetime("2020-11-01T00:00Z")
end = pd.to_datetime("2020-11-10T00:00Z")

# Raw data, trimmed to the test window
file_name_raw = join(gridemissions.config["APEN_PATH"], "data", "EBA_raw.csv")
data_raw = BaData(fileNm=file_name_raw)
data_raw.df = data_raw.df.loc[start:end]

# Working copy of the test dataset with every CISO demand value blanked out
data_raw_copy = BaData(df=data_raw.df.copy(deep=True))
ciso_demand_cols = data_raw_copy.get_cols("CISO", "D")
data_raw_copy.df.loc[:, ciso_demand_cols] = np.nan

# Historical data: the 15 days that end where the test window starts
end_hist = start
start_hist = end_hist - pd.Timedelta(days=15)
file_name_basic = join(gridemissions.config["APEN_PATH"], "data", "EBA_basic.csv")
data_basic = BaData(fileNm=file_name_basic)
data_basic.df = data_basic.df.loc[start_hist:end_hist]

In [None]:
# Compare the unmodified, modified and historical CISO demand before running the workflow
make_plot1(raw_real=data_raw, raw_fake=data_raw_copy, hist=data_basic)

In [None]:
# Directory layout for this test: <APEN_PATH>/si_test3/{tmp,hist}
test_root = join(gridemissions.config["APEN_PATH"], "si_test3")
tmp_folder = join(test_root, "tmp")
folder_hist = join(test_root, "hist")
for folder in (tmp_folder, folder_hist):
    os.makedirs(folder, exist_ok=True)

# Store the modified raw data and the historical data in their folders
data_raw_copy.df.to_csv(join(tmp_folder, "EBA_raw.csv"))
data_basic.df.to_csv(join(folder_hist, "EBA_basic.csv"))

# Run the workflow on the fake dataset
from gridemissions.workflows import make_dataset

make_dataset(tmp_folder=tmp_folder, folder_hist=folder_hist, scrape=False)

In [None]:
# Plot the results of test 3 at full page width (the figure is saved by default)
make_plot2(
    test_folder="si_test3",
    pg_width=PAGE_WIDTH,
    title="(b) Deleting all new data in CISO demand",
    bottom_ylim=13,
)

# Test 4 - Add noise to CISO demand data
Add noise to the CISO demand data.

This section recreates one example and reloads results that were generated with a script.

In [None]:
import sys

# Debugging aid: print the interpreter's module search path.
# NOTE(review): presumably here to check which gridemissions installation gets imported;
# consider removing before sharing, since the output exposes local paths.
print(sys.path)

In [None]:
from gridemissions.papers.physics_informed_cleaning_test import run_test

# Run the test once and unpack its six return values; raw, rolling, opt and data_raw
# are read by the plotting cell that follows.
# NOTE(review): debug=True presumably makes run_test return the intermediate datasets in
# addition to the error — confirm against run_test.
error, raw, basic, rolling, opt, data_raw = run_test(debug=True)

In [None]:
# Figure for test 4: panel (a) shows one worked example, panel (b) summarises the
# results that were generated with the script.
ba = "CISO"
scale = 1e-3  # applied to every demand series before plotting on the "GW" axis
# Folder that holds results.csv and receives the PDF; built once instead of twice
test4_folder = join(gridemissions.config["APEN_PATH"], "si_test4")

f, ax = plt.subplots(1, 2, figsize=(PAGE_WIDTH, ROW_HEIGHT))

# --- Panel (a): demand at the different steps ---
d_col = raw.get_cols(ba, "D")[0]
# The reference series is clipped to the time span covered by `raw`
start, end = raw.df.index[0], raw.df.index[-1]
ax[0].plot(
    data_raw.df.loc[start:end, d_col] * scale, color=COLORS[0], lw=1.0, label="raw"
)
ax[0].plot(
    rolling.df.loc[:, d_col] * scale,
    lw=1.0,
    ls="--",
    color=COLORS[1],
    label="first guess",
)
ax[0].plot(opt.df.loc[:, d_col] * scale, lw=1.0, color=COLORS[1], label="reconciled")

ax[0].set_ylim(bottom=0)
# NOTE(review): autofmt_xdate() is called on the whole figure, so the tick labels of
# panel (b) are presumably rotated as well — confirm this is intended.
f.autofmt_xdate()
ax[0].legend(loc=3, ncol=2, handlelength=1.5, columnspacing=1.0)
ax[0].set_ylabel("GW")
ax[0].set_title("(a) 20% noise added to CISO demand")

# --- Panel (b): reload results generated with the script ---
# Values and column labels are both multiplied by 100 so that they can be shown on the
# "Error (%)" and "Noise (%)" axes.
results = pd.read_csv(join(test4_folder, "results.csv"), index_col=0) * 100
results.columns = results.columns.astype(float) * 100

# Column-wise mean (solid) with a band of one standard deviation either side (dashed)
ax[1].plot(results.mean(), "k")
ax[1].plot(results.mean() + results.std(), "k--")
ax[1].plot(results.mean() - results.std(), "k--")
ax[1].set_ylabel("Error (%)")
ax[1].set_xlabel("Noise (%)")
ax[1].set_title("(b) Robustness to noise in CISO demand")
f.tight_layout()

f.savefig(join(test4_folder, "si_test4.pdf"))