<a href="https://colab.research.google.com/github/jrbalderrama/a2r2/blob/main/notebooks/a2r2-03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RUDI Workshop: Introduction to Privacy-Preserving Data Publishing Techniques

Tristan ALLARD & Javier ROJAS BALDERRAMA

_Univ Rennes, CNRS, INRIA_
  
This work is licensed under a [Creative Commons Zero v1.0 Universal License](https://creativecommons.org/publicdomain/zero/1.0/)

# Notebook __THREE__: Protection with differential privacy

## Step 0 (PREAMBLE): Settings and data


 ### Download datasets


In [None]:
!wget -nv -nc https://zenodo.org/record/5509313/files/classes.parquet
!wget -nv -nc https://zenodo.org/record/5509268/files/buses.parquet

 ### Import required modules

In [None]:

import importlib
import itertools
import math
import os
from datetime import datetime
from errno import ENOENT
from pathlib import Path
from typing import Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import pyarrow.parquet as pq
import torch
from IPython import display, get_ipython
from numpy import linalg, ndarray
from pandas import NA, DataFrame, DatetimeIndex, Series, Timedelta, Timestamp
from plotly import subplots
from plotly.graph_objs import Bar, Candlestick, Figure, Scatter
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from torch import Tensor
from torch.nn import LSTM, Linear, Module, MSELoss
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset


### Setup notebook constants and running environment

In [None]:
# project base directory
BASE_DIRECTORY = Path(".")

# detect running environment: True when the string form of the active
# IPython shell mentions "google.colab"
COLAB_ON = "google.colab" in str(get_ipython())

In [None]:
# Use Plotly's "colab" renderer when the notebook runs inside Google Colab
if COLAB_ON:
    pio.renderers.default = "colab"

### Load and display raw datasets

In [None]:
# load dataset from file system
def load_data(
    path: Path,
) -> DataFrame:
    """Read the Parquet file at `path` and return it as a pandas DataFrame.

    Raises:
        FileNotFoundError: with errno ENOENT when `path` does not exist.
    """
    if path.exists():
        return pq.read_table(path).to_pandas()

    raise FileNotFoundError(ENOENT, os.strerror(ENOENT), path)


# show a dataframe as a table
def display_dataframe(
        dataframe: DataFrame,
) -> None:
    """Display `dataframe` as a table in the notebook output.

    When running in Colab (COLAB_ON) the interactive dataframe formatter of
    `google.colab.data_table` is enabled if that package can be found, and
    only the first 20000 rows are displayed.
    """
    if COLAB_ON:
        # `import importlib` alone does not guarantee that the
        # `importlib.util` submodule is loaded; import it explicitly
        import importlib.util

        spec = importlib.util.find_spec("google.colab")
        if spec:
            data_table = importlib.import_module("google.colab.data_table")
            data_table.enable_dataframe_formatter()

    display.display(dataframe[:20000] if COLAB_ON else dataframe)
  
  
# show a time-series graph of a selected attribute
def plot_dataset(
    dataframe: DataFrame,
    column: str,
) -> None:
    """Show `column` of `dataframe` as a line chart over the frame's index."""
    figure = Figure()
    figure.add_trace(
        Scatter(
            x=dataframe.index,
            y=dataframe[column],
            mode="lines",
            name="values",
        )
    )

    # the column name doubles as the chart title
    layout = dict(
        showlegend=False,
        title_text=column,
        template="simple_white",
    )
    figure.update_layout(**layout)
    figure.update_xaxes(showgrid=True)
    figure.show()

In [None]:

# buses dataset (file expected inside BASE_DIRECTORY)
buses_filename = "buses.parquet"
buses_path = BASE_DIRECTORY / buses_filename
buses_dataset = load_data(buses_path)


# classes dataset (file expected inside BASE_DIRECTORY)
classes_filename = "classes.parquet"
classes_path = BASE_DIRECTORY / classes_filename
classes_dataset = load_data(classes_path)

## Step 1 (SOLUTION): Sound protection with differential privacy

### Fourier Perturbation Algorithm (FPA)

In [None]:
# Perturb timeline series with differential privacy
def fpa(Q: ndarray, δ: float, ε: float, k: int) -> ndarray:
    """Perturb the series `Q` with the Fourier Perturbation Algorithm.

    The first `k` coefficients of the discrete Fourier transform of `Q` are
    kept, Laplace noise of scale `δ / ε` is added independently to their real
    and imaginary parts, the remaining `Q.size - k` coefficients are replaced
    by zeros, and the result is transformed back with the inverse DFT.

    Args:
        Q: one-dimensional series to perturb.
        δ: sensitivity used as numerator of the Laplace scale.
        ε: privacy budget used as denominator of the Laplace scale.
        k: number of leading DFT coefficients to keep (at most `Q.size`).

    Returns:
        An array of the same length as `Q` holding the modulus of the inverse
        transform rounded to the nearest integer (values are never negative).
    """

    # define a Laplace mechanism for perturbation
    def lpa(Q: ndarray, δ: float, ε: float) -> ndarray:
        # differential privacy scale based on the budget
        λ = δ / ε

        # Laplace mechanism applied to the whole series
        Z = np.random.laplace(scale=λ, size=Q.size)

        return Q + Z

    # discrete Fourier transform
    F = np.fft.fft(Q)

    # first k values of DFT
    F_k = F[:k]

    # lpa of F_k (real part is perturbed first, then the imaginary part)
    Fλ_k = lpa(F_k.real, δ, ε) + 1j * lpa(F_k.imag, δ, ε)

    # Fλ_k with `n - k` zero-padding
    Fλ_n = np.pad(Fλ_k, (0, Q.size - k))

    # inverse discrete Fourier transform
    Qλ = np.fft.ifft(Fλ_n)

    # modulus of complex values of IFFT
    Qλ_m = np.absolute(Qλ)

    # round perturbation to integers; a modulus is never negative, so no
    # clamping of negative values is needed afterwards
    return np.rint(Qλ_m)


# perform a noise perturbation with the Rastogi algorithm
def fourier_perturbation(
    sequence: Series,
    boundary: float,
    budget: float,
    coefficients: int,
) -> Optional[ndarray]:
    """Apply `fpa` to `sequence` with a sensitivity derived from `boundary`.

    The sensitivity is `sqrt(coefficients)` times the L2 norm of a vector as
    long as `sequence` whose entries all equal `boundary`.

    Returns:
        The perturbed values, or None when `sequence` does not hold more
        elements than `coefficients`.
    """
    length = sequence.size
    if length <= coefficients:
        return None

    # L2 norm of a uniform vector filled with the boundary value
    uniform_norm = linalg.norm(np.full((length,), boundary), 2)
    sensitivity = math.sqrt(coefficients) * uniform_norm
    return fpa(sequence.to_numpy(), sensitivity, budget, coefficients)


def bound(
    serie: Series,
    aggregate: str,
) -> float:
    """Return the per-value bound of `serie` for the given aggregate.

    Args:
        serie: values being aggregated.
        aggregate: "count" or "sum".

    Returns:
        1 for "count"; the maximum of `serie` rounded up to the next multiple
        of ten for "sum"; `pandas.NA` for any other aggregate.
    """
    # branches are evaluated lazily so that "count" never touches the data
    # (computing the maximum of an empty or all-NaN series would fail)
    if aggregate == "count":
        return 1

    if aggregate == "sum":
        return 10 * math.ceil(serie.max() / 10)

    return NA


def get_fourier_perturbations(
    dataframe: DataFrame,
    agg_sizes: Sequence[int],
    coefficients: Sequence[int],
    epsilons: Sequence[float],
    stops: Optional[Sequence[str]],
) -> DataFrame:
    """Build FPA-perturbed validation counts for a grid of parameters.

    For every size `n` in `agg_sizes`, `n` distinct values of the "id" column
    are sampled at random and their validations are counted per departure
    time. Each count series is then perturbed once for every combination of
    `coefficients` (k) and `epsilons` (ε).

    Args:
        dataframe: frame with "id", "departure_time", "stop_id" and (when
            `stops` is given) "stop_name" columns.
        agg_sizes: numbers of ids to sample.
        coefficients: numbers of Fourier coefficients to try.
        epsilons: privacy budgets to try.
        stops: optional stop names to restrict the data to.

    Returns:
        The concatenation, indexed by departure time, of one frame per
        (n, k, ε) with columns "validation", "n", "ε", "k", "fpa" and
        "noise"; an empty frame when there is nothing to compute.
    """
    dataframe_ = dataframe.copy()
    if stops:
        dataframe_ = dataframe_[dataframe_["stop_name"].isin(stops)]

    # count validations by bus stop (per user and timestamp)
    dataframe_ = (
        dataframe_.groupby(["id", "departure_time"])
        .count()["stop_id"]
        .to_frame(name="validation")
        .reset_index()
    )

    # `DataFrame.append` was removed in pandas 2.0: collect the pieces in a
    # list and concatenate them once
    unique_ids = dataframe_["id"].drop_duplicates()
    sample_parts = []
    for n in agg_sizes:
        subset = unique_ids.sample(n).values
        mask = dataframe_["id"].isin(subset)
        sample = dataframe_[mask].reset_index(drop=True)
        sample_parts.append(sample.assign(n=n).drop("id", axis=1))

    samples = pd.concat(sample_parts) if sample_parts else DataFrame()

    perturbed_parts = []
    for n in agg_sizes:
        sample = samples.query(f"n=={n}")
        reference = sample.groupby("departure_time").aggregate("count")
        boundary = bound(sample["validation"], "count")
        for k, ε in itertools.product(coefficients, epsilons):
            iteration = reference.copy()
            iteration = iteration.assign(n=n, ε=ε, k=k)
            # NOTE: fourier_perturbation returns None when the series has no
            # more than `k` points; the subtraction below then fails
            iteration["fpa"] = fourier_perturbation(
                iteration["validation"],
                boundary,
                ε,
                k,
            )

            iteration["noise"] = iteration["fpa"] - iteration["validation"]
            perturbed_parts.append(iteration)

    return pd.concat(perturbed_parts) if perturbed_parts else DataFrame()

def facet_plot(
    dataframe: DataFrame,
    size: int,
    row: str,
    col: str,
) -> None:
    """Show the FPA series for aggregate size `size` as a grid of line charts.

    Rows of the grid follow the distinct values of column `row`, columns
    those of column `col`; the unperturbed "validation" series is overlaid on
    every facet as a faint dotted line.
    """
    dataset = dataframe.query(f"n=={size}").reset_index()
    figure = px.line(
        dataset,
        x="departure_time",
        y="fpa",
        facet_row=row,
        facet_col=col,
        labels={"departure_time": "", "fpa": ""},
    )

    figure.update_yaxes(matches=None, showticklabels=False)
    figure.update_xaxes(showticklabels=False)

    # reference trace (original counts), hidden from the legend
    reference = Scatter(
        x=dataset.departure_time,
        y=dataset.validation,
        name="count",
        line=dict(color="gray", width=0.1, dash="dot"),
        opacity=0.35,
        showlegend=False,
    )

    # facet coordinates are 1-based
    n_rows = len(dataset[row].unique())
    n_cols = len(dataset[col].unique())
    for i, j in itertools.product(range(1, n_rows + 1), range(1, n_cols + 1)):
        figure.add_trace(reference, row=i, col=j)

    figure.update_layout(
        template="plotly_white",
        title=f"FPA for n={size}",
        xaxis_title="date",
        yaxis_title="count",
    )

    figure.show()

In [None]:
# target bus stops
beaulieu = [
    "Les Préales",
    "Tournebride",
    "Beaulieu Chimie",
    "Beaulieu INSA",
    "Beaulieu Restau U",
]

# aggregate size
Ν = [3500]

# Fourier coefficients
Κ = [10, 20, 30, 40] #, 50] ## colab max len=4

# perturbation budget
Ε = [0.01, 0.1] #, 1.0, 10.0] ## colab max len=2

fpas = get_fourier_perturbations(buses_dataset, Ν, Κ, Ε, stops=beaulieu)

# one facet grid per aggregate size, driven by Ν instead of a hard-coded copy
# of its content
for size in Ν:
    facet_plot(fpas, size, row="ε", col="k")

####################
# BEGIN : Observe

In [None]:
# END : Observe
####################

### Training a *safe* neural network


#### Pre-process raw data


In [None]:
# pre processing transportation data
def pre_process_by_aggregation(
    dataframe: DataFrame,
    *,
    stops: Optional[Sequence[str]],
    ignore_weekend: bool = False,
) -> DataFrame:
    """Aggregate the "count" column of `dataframe` per departure time.

    Args:
        dataframe: frame with "stop_name", "departure_time" (datetime) and
            "count" columns.
        stops: optional stop names to keep; all stops are kept when empty.
        ignore_weekend: drop rows whose departure time falls on a Saturday
            or a Sunday.

    Returns:
        A frame indexed by "departure_time" with a single "count" column
        holding the summed counts.
    """
    dataframe_ = dataframe.copy()
    # filter data from 'bus_stops' only
    if stops:
        dataframe_ = dataframe_[dataframe_["stop_name"].isin(stops)]

    # remove weekend information (dayofweek: Monday=0 ... Sunday=6)
    if ignore_weekend:
        dataframe_ = dataframe_[dataframe_["departure_time"].dt.dayofweek < 5]

    # aggregate dataset by stop name and departure time
    per_stop = (
        dataframe_.groupby(
            [
                "stop_name",
                "departure_time",
            ]
        )
        .agg({"count": "sum"})
        .reset_index()
    )

    # select "count" explicitly: since pandas 2.0 a bare `.sum()` would also
    # "sum" (concatenate) the string column "stop_name"
    return per_stop.groupby("departure_time")[["count"]].sum()
    
 
def post_processing_by_perturbation(
    dataframe: DataFrame,
    *,
    budget: float,
    coefficients: int,
) -> DataFrame:
    """Return a copy of `dataframe` with an "fpa" column added.

    The new column holds the result of `fourier_perturbation` applied to the
    "count" column with the given `budget` and number of `coefficients`.
    """
    counts = dataframe["count"]
    perturbed = fourier_perturbation(
        counts,
        bound(counts, "count"),
        budget,
        coefficients,
    )

    # `assign` leaves the caller's frame untouched
    return dataframe.assign(fpa=perturbed)

In [None]:
# counts per departure time for the target stops, then their perturbed version
aggregated_buses_dataset = post_processing_by_perturbation(
    pre_process_by_aggregation(buses_dataset, stops=beaulieu),
    budget=0.01,
    coefficients=40,
)

plot_dataset(aggregated_buses_dataset, "fpa")

#### Number of validations

In [None]:
plot_dataset(aggregated_buses_dataset, "count")

#### Number of students

In [None]:
display_dataframe(classes_dataset)
plot_dataset(classes_dataset, "nombre_etudiant")

In [None]:
# Merge datasets
def merge_datasets(
    classes: DataFrame,
    buses: DataFrame,
) -> DataFrame:
    """Outer-join `classes` and `buses` on their indexes.

    Rows of `buses` dated more than one day after the last entry of `classes`
    are discarded first; missing values of the joined frame are set to 0.
    """
    # last timestamp of `buses` that is still kept
    cutoff = classes.index.max() + Timedelta(1, unit="day")
    buses_in_range = buses.loc[buses.index <= cutoff]

    merged = classes.merge(
        buses_in_range,
        how="outer",
        left_index=True,
        right_index=True,
    )

    return merged.fillna(0)

#### Display the merged dataset (students, buses, and perturbed buses)

In [None]:
dataset = merge_datasets(classes_dataset, aggregated_buses_dataset)
display_dataframe(dataset)

#### Data preparation for the neural network

In [None]:
# Add features (motifs) to the dataset
# NOTE(review): `la_rentree` is not referenced by the code visible in this
# notebook — confirm whether it is still needed
la_rentree = Timestamp("2021-09-06")
# start of the date range that `add_features` labels as holiday
la_toussaint = Timestamp("2021-11-01")
# length of that holiday range
one_week_timedelta = Timedelta(7, unit="day")

# bucketize attribute
def onehot_encode(
    dataframe: DataFrame,
    column: str,
) -> DataFrame:
    """Replace `column` by one indicator column per distinct value.

    The indicator columns are named `<column>_<value>` and are appended after
    the remaining columns of `dataframe`.
    """
    indicators = pd.get_dummies(dataframe[column], prefix=column)
    widened = pd.concat([dataframe, indicators], axis=1)
    return widened.drop(columns=[column])


# encode (time) column as periodic wave
def periodic_encode(
    dataframe: DataFrame,
    column: str,
    period: int,
    start_num: int = 0,
) -> DataFrame:
    """Replace `column` by its sine and cosine over a cycle of `period`.

    The values are shifted by `start_num` before being mapped on the circle;
    the new columns are named `sin_<column>` and `cos_<column>`.
    """
    angle = 2 * np.pi * (dataframe[column] - start_num) / period
    waves = {
        f"sin_{column}": np.sin(angle),
        f"cos_{column}": np.cos(angle),
    }

    encoded = dataframe.assign(**waves)
    return encoded.drop(columns=[column])


# add uniform timeindex column
def set_time_index(
    dataframe: DataFrame,
    frequence: int = 15,
) -> DataFrame:
    """Return a copy of `dataframe` with an integer "time_idx" column.

    "time_idx" is the number of whole `frequence`-minute steps elapsed since
    the earliest timestamp of the (datetime) index.
    """
    dataframe_ = dataframe.copy()

    # add a time index using the frequency; floor-dividing by a Timedelta
    # avoids `astype("timedelta64[m]")`, which pandas >= 2.0 rejects
    elapsed = dataframe_.index - dataframe_.index.min()
    steps = elapsed // Timedelta(frequence, unit="min")
    dataframe_["time_idx"] = np.asarray(steps).astype("int_")
    return dataframe_


# mark dataset ranges as holidays
def label_holidays(
    dataframe: DataFrame,
    start: Timestamp,
    end: Timestamp,
    column="holiday",
) -> DataFrame:
    dataframe_ = dataframe.copy()
    dataframe_[column] = 0
    dataframe_.loc[
        (dataframe_.index >= start) & (dataframe_.index < end),
        column,
    ] = 1
    return dataframe_


# generate lags (to track interaction throughout time)
def generate_lags(
    dataframe: DataFrame,
    lags: int,
    column: str,
) -> DataFrame:
    """Return a copy of `dataframe` with `lags` shifted copies of `column`.

    Column `<column>_lag_<n>` holds `column` shifted down by `n` rows, for
    `n` from 1 to `lags`. Every missing value of the resulting frame is then
    replaced by 0.
    """
    shifted = {
        f"{column}_lag_{step}": dataframe[column].shift(step)
        for step in range(1, lags + 1)
    }

    return dataframe.assign(**shifted).fillna(0)


# add features to the dataset
def add_features(
    dataframe: DataFrame,
    bucketize_date: bool = True,
    periodic_time: bool = True,
    holidays: bool = False,
    timeindex: bool = False,
    lags: bool = False,
    n_lags: int = 50,
) -> DataFrame:
    """Return a copy of `dataframe` enriched with the selected features.

    Args:
        dataframe: frame with a datetime index.
        bucketize_date: add one-hot encoded day-of-week columns.
        periodic_time: add sine/cosine encodings of the hour and the minute.
        holidays: add a "holiday" flag for the week starting at
            `la_toussaint`.
        timeindex: add the integer "time_idx" column.
        lags: add lagged copies of "count" and "nombre_etudiant".
        n_lags: number of lagged copies per column.
    """
    enriched = dataframe.copy()
    if timeindex:
        enriched = set_time_index(enriched)

    if bucketize_date:
        enriched = onehot_encode(
            enriched.assign(dayofweek=enriched.index.dayofweek),
            "dayofweek",
        )

    if periodic_time:
        enriched = enriched.assign(
            hour=enriched.index.hour,
            minute=enriched.index.minute,
        )
        enriched = periodic_encode(enriched, "hour", 24, 0)
        enriched = periodic_encode(enriched, "minute", 60, 0)

    if holidays:
        holiday_end = la_toussaint + one_week_timedelta
        enriched = label_holidays(enriched, la_toussaint, holiday_end)

    if lags:
        for lagged_column in ("count", "nombre_etudiant"):
            enriched = generate_lags(enriched, n_lags, lagged_column)

    return enriched

In [None]:
dataset = add_features(dataset, holidays=True)

#### Split datasets to train a machine learning tool

In [None]:
# Split the data into test, validation, and train sets
def features_split(
    dataframe: DataFrame,
    target: str,
) -> Tuple[DataFrame, DataFrame]:
    """Separate `dataframe` into features and target.

    Returns:
        A pair `(X, y)` where `X` holds every column except `target` and `y`
        is a single-column frame holding `target`.
    """
    labels = dataframe[[target]]
    features = dataframe.drop(columns=[target])
    return features, labels


def get_timestamp_bound(
    dataframe: DataFrame,
    weeks: int,
) -> Timestamp:
    """Return midnight of the day `7 * weeks - 1` days after the first index entry."""
    span = Timedelta(days=7 * weeks - 1)
    return (dataframe.index.min() + span).normalize()

#### Define the neural network


In [None]:
# Define and run a RNN model
class LSTMModel(Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: number of features of every time step.
        hidden_dim: size of the LSTM hidden state.
        layer_dim: number of stacked LSTM layers.
        output_dim: number of values predicted per sample.
        dropout: dropout probability between LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout):
        super().__init__()

        # Defining the number of layers and the nodes in each layer
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # LSTM layers
        self.lstm = LSTM(
            input_dim,
            hidden_dim,
            layer_dim,
            batch_first=True,
            dropout=dropout,
        )

        # Fully connected layer
        self.fc = Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map `x` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # No explicit initial state: the LSTM then starts every call from
        # zero hidden/cell states, on the same device and dtype as `x`
        out, _ = self.lstm(x)

        # Keep the last time step only. Indexing, unlike `torch.squeeze`,
        # preserves the batch dimension when the batch holds a single sample
        # and the feature dimension when `hidden_dim` is 1
        out = out[:, -1, :]

        # Convert the final state to our desired output shape (batch_size, output_dim)
        return self.fc(out)

#### Configure the neural network

In [None]:

# size of the LSTM hidden state (passed as `hidden_dim` to LSTMModel)
HIDDEN_DIM = 64
# number of stacked LSTM layers (passed as `layer_dim` to LSTMModel)
LAYER_DIM = 3
# batch size handed to `to_dataloaders`
BATCH_SIZE = 64
# number of training epochs (passed as `n_epochs` to RunnerHelper.train)
EPOCHS = 100

#### Train the neural network

In [None]:
# Helper to train the NN model
class RunnerHelper:
    """Train, validate and evaluate a model while recording epoch losses.

    Attributes set by the constructor:
        model, loss_fn, optimizer: the objects given by the caller.
        train_losses, val_losses: mean loss of every epoch run by `train`.
    """

    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []

    def train_step(self, X, y):
        """Run one optimisation step on a batch and return its loss."""
        # set model to train mode
        self.model.train()

        # make predictions and compute loss
        prediction = self.model(X)
        loss = self.loss_fn(prediction, y)

        # compute gradients
        loss.backward()

        # update parameters
        self.optimizer.step()

        # reset to zero gradients (ready for the next step)
        self.optimizer.zero_grad()

        return loss.item()

    def val_step(self, X, y):
        """Return the loss of a batch without updating the model."""
        # set model to eval mode
        self.model.eval()

        prediction = self.model(X)
        loss = self.loss_fn(prediction, y)
        return loss.item()

    def train(self, train_loader, val_loader, n_epochs=50):
        """Train for `n_epochs`, validating after each epoch.

        Every batch of features gets a sequence axis of length one inserted
        at position 1 before being fed to the model. The mean training and
        validation losses of each epoch are appended to `train_losses` and
        `val_losses`; progress is printed for the first ten epochs and then
        every twentieth.
        """
        for epoch in range(1, n_epochs + 1):
            batch_train_losses = [
                self.train_step(torch.unsqueeze(x_train, 1), y_train)
                for x_train, y_train in train_loader
            ]

            training_loss = np.mean(batch_train_losses)
            self.train_losses.append(training_loss)

            # validation does not need gradients
            with torch.no_grad():
                batch_val_losses = [
                    self.val_step(torch.unsqueeze(x_val, 1), y_val)
                    for x_val, y_val in val_loader
                ]

            validation_loss = np.mean(batch_val_losses)
            self.val_losses.append(validation_loss)

            if epoch <= 10 or epoch % 20 == 0:
                print(
                    f"[{epoch:3d}/{n_epochs}] Training loss: {training_loss:.4f}"
                    f"\t Validation loss: {validation_loss:.4f}"
                )

    def evaluate(self, test_loader):
        """Return the model predictions and the expected values per batch.

        Returns:
            Two lists of numpy arrays, `(predictions, values)`, with one
            entry per batch of `test_loader`.
        """
        predictions = []
        values = []

        # eval mode is set once, before iterating over the batches
        self.model.eval()
        with torch.no_grad():
            for x_test, y_test in test_loader:
                x_test = torch.unsqueeze(x_test, 1)
                prediction = self.model(x_test)
                predictions.append(prediction.detach().numpy())
                values.append(y_test.detach().numpy())

        return predictions, values

    def plot_losses(self):
        """Plot the recorded training and validation losses per epoch."""
        # one tick per recorded epoch, numbered from 1 like the training log
        tics = list(range(1, len(self.train_losses) + 1))

        figure = Figure()
        series = (
            ("Training", self.train_losses),
            ("Validation", self.val_losses),
        )
        for name, losses in series:
            figure.add_trace(
                Scatter(
                    x=tics,
                    y=losses,
                    mode="lines",
                    name=name,
                    marker=dict(),
                )
            )

        figure.update_layout(title_text="Losses")
        figure.update_xaxes(title_text="epoch")
        figure.update_yaxes(title_text="loss (%)")
        figure.show()


# rescale results and align it to original time index
def inverse_transform(
    values: Sequence[ndarray],
    predictions: Sequence[ndarray],
    index: DatetimeIndex,
    scaler: MinMaxScaler,
) -> DataFrame:
    """Rescale batched results and align them with `index`.

    The batches of `values` and `predictions` are flattened into the columns
    "value" and "prediction", labelled with the leading entries of `index`,
    sorted by index, passed through `scaler.inverse_transform` and finally
    cast to integers.
    """
    flat_values = np.concatenate(values, axis=0).ravel()
    flat_predictions = np.concatenate(predictions, axis=0).ravel()

    scaled = DataFrame(
        data={
            "value": flat_values,
            "prediction": flat_predictions,
        },
        index=index[: len(flat_values)],
    ).sort_index()

    restored = DataFrame(
        scaler.inverse_transform(scaled),
        columns=scaled.columns,
        index=scaled.index,
    )

    return restored.astype("int_")


# formatting data for the neural network
def to_dataloaders(
    dataframe_train: Tuple[DataFrame, DataFrame],
    dataframe_val: Tuple[DataFrame, DataFrame],
    dataframe_test: Tuple[DataFrame, DataFrame],
    scaler: MinMaxScaler,
    batch_size,
    shuffle=False,
    drop_last=True,
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """Scale the three (features, target) pairs and wrap them in DataLoaders.

    `scaler` is fitted on the training features, then fitted again on the
    training target: when the function returns it is fitted on the target.
    The train and validation loaders use `batch_size`; the test loader
    yields one sample at a time.
    """
    splits = (dataframe_train, dataframe_val, dataframe_test)

    # features: fit on the training split, reuse the fit for the other two
    # (must happen before the scaler is refitted on the target below)
    scaled_features = [scaler.fit_transform(splits[0][0])]
    scaled_features += [scaler.transform(split[0]) for split in splits[1:]]

    # target: same scheme, refitting the same scaler object
    scaled_targets = [scaler.fit_transform(splits[0][1])]
    scaled_targets += [scaler.transform(split[1]) for split in splits[1:]]

    # tensors -> tensor datasets -> loaders, in train/val/test order
    batch_sizes = (batch_size, batch_size, 1)
    train_loader, val_loader, test_loader = (
        DataLoader(
            TensorDataset(Tensor(features), Tensor(targets)),
            batch_size=size,
            shuffle=shuffle,
            drop_last=drop_last,
        )
        for features, targets, size in zip(
            scaled_features, scaled_targets, batch_sizes
        )
    )

    return train_loader, val_loader, test_loader

In [None]:
# Train on the perturbed series: the raw "count" column is dropped so that
# "fpa" is the target and the remaining columns are the features
dataset_ = dataset.drop(labels=["count"], axis=1)
# split bounds computed from the first timestamp (weeks=9 and weeks=10)
end_train = get_timestamp_bound(dataset_, weeks=9)
end_val = get_timestamp_bound(dataset_, weeks=10)

# chronological split: train < end_train <= validation < end_val <= test
train_dataset = dataset_[dataset_.index < end_train]
val_dataset = dataset_[(dataset_.index >= end_train) & (dataset_.index < end_val)]
test_dataset = dataset_[dataset_.index >= end_val]

X_train, y_train = features_split(train_dataset, target="fpa")
X_val, y_val = features_split(val_dataset, target="fpa")
X_test, y_test = features_split(test_dataset, target="fpa")

# one model input per feature column
input_dim = len(X_train.columns) 
model = LSTMModel(
    input_dim=input_dim,
    hidden_dim=HIDDEN_DIM,
    layer_dim=LAYER_DIM,
    output_dim=1,
    dropout=0.2,
)

# the same scaler object is handed to `to_dataloaders` and later to
# `inverse_transform`
scaler = MinMaxScaler() 
loss_fn = MSELoss() 
optimizer = Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
runner = RunnerHelper(model=model, loss_fn=loss_fn, optimizer=optimizer)
train_loader, val_loader, test_loader = to_dataloaders(
    (X_train, y_train),
    (X_val, y_val),
    (X_test, y_test),    
    scaler,
    BATCH_SIZE,
)

runner.train(train_loader, val_loader, n_epochs=EPOCHS)
runner.plot_losses()
predictions, values = runner.evaluate(test_loader)
# predictions of the model trained on perturbed data, on the original scale
fpa_result = inverse_transform(values, predictions, X_test.index, scaler)

In [None]:
def plot_residuals(
    dataframe: DataFrame,
) -> None:
    """Plot reference values, predictions and their residuals.

    `dataframe` must hold "value" and "prediction" columns. Both series are
    drawn as dotted lines, and each residual is drawn as a candlestick
    spanning from the value to the prediction.
    """
    # iterate over the rows directly: integer keys such as `series[i]` are
    # label lookups on a datetime index and their positional fallback is
    # deprecated in pandas
    hovertext = [
        f"{timestamp}<br>"
        f"Real: {value}<br>"
        f"Prediction: {prediction}"
        for timestamp, value, prediction in zip(
            dataframe.index,
            dataframe["value"],
            dataframe["prediction"],
        )
    ]

    figure = Figure(
        data=[
            Scatter(
                x=dataframe.index,
                y=dataframe["value"],
                mode="lines",
                name="reference",
                line=dict(color="lightgrey", width=0.6, dash="dot"),
                showlegend=False,
            ),
            Scatter(
                x=dataframe.index,
                y=dataframe["prediction"],
                mode="lines",
                name="prediction",
                line=dict(color="lightblue", width=0.6, dash="dot"),
                showlegend=False,
            ),
            # open/close carry the value and high/low carry the prediction
            Candlestick(
                x=dataframe.index,
                open=dataframe["value"],
                high=dataframe["prediction"],
                low=dataframe["prediction"],
                close=dataframe["value"],
                text=hovertext,
                hoverinfo="text",
                name="residuals",
                increasing_line_color="lightseagreen",
                decreasing_line_color="lightsalmon",
            ),
        ]
    )

    figure.update_layout(
        title="Prediction residuals",
        template="simple_white",
        xaxis_rangeslider_visible=True,
    )

    figure.show()

### Comparing the neural network against the original dataset

In [None]:
# Baseline: train on the original series; the perturbed "fpa" column is
# dropped so that "count" is the target
dataset_ = dataset.drop(labels=["fpa"], axis=1)
# split bounds computed from the first timestamp (weeks=9 and weeks=10)
end_train = get_timestamp_bound(dataset_, weeks=9)
end_val = get_timestamp_bound(dataset_, weeks=10)

# chronological split: train < end_train <= validation < end_val <= test
train_dataset = dataset_[dataset_.index < end_train]
val_dataset = dataset_[(dataset_.index >= end_train) & (dataset_.index < end_val)]
test_dataset = dataset_[dataset_.index >= end_val]

X_train, y_train = features_split(train_dataset, target="count")
X_val, y_val = features_split(val_dataset, target="count")
X_test, y_test = features_split(test_dataset, target="count")

# one model input per feature column
input_dim = len(X_train.columns) 
model = LSTMModel(
    input_dim=input_dim,
    hidden_dim=HIDDEN_DIM,
    layer_dim=LAYER_DIM,
    output_dim=1,
    dropout=0.2,
)

# the same scaler object is handed to `to_dataloaders` and later to
# `inverse_transform`
scaler = MinMaxScaler() 
loss_fn = MSELoss() 
optimizer = Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
runner = RunnerHelper(model=model, loss_fn=loss_fn, optimizer=optimizer)
train_loader, val_loader, test_loader = to_dataloaders(
    (X_train, y_train),
    (X_val, y_val),
    (X_test, y_test),    
    scaler,
    BATCH_SIZE,
)

runner.train(train_loader, val_loader, n_epochs=EPOCHS)
runner.plot_losses()
predictions, values = runner.evaluate(test_loader)
# predictions of the model trained on original data, on the original scale
lstm_result = inverse_transform(values, predictions, X_test.index, scaler)

 ### Visualize the predictions of the two models

In [None]:
def plot_models_prediction_interval(
    dataframe: DataFrame,
    rnn_dataframe: DataFrame,
    baseline_dataframe: DataFrame,
) -> None:
    """Overlay the predictions of two models on the reference counts.

    The "count" column of `dataframe` is the reference; the "prediction"
    column of `baseline_dataframe` is labelled "Standard forecasting" and the
    one of `rnn_dataframe` "FPA forecasting". The x axis initially spans the
    index range of `rnn_dataframe`.
    """
    traces = [
        Scatter(
            x=dataframe.index,
            y=dataframe["count"],
            mode="lines",
            name="Reference",
            line=dict(color="rgba(0,0,0, 0.3)", width=1, dash="dot"),
        ),
        Scatter(
            x=baseline_dataframe.index,
            y=baseline_dataframe.prediction,
            mode="lines",
            name="Standard forecasting",
            opacity=0.8,
        ),
        Scatter(
            x=rnn_dataframe.index,
            y=rnn_dataframe.prediction,
            mode="lines",
            name="FPA forecasting",
            opacity=0.8,
        ),
    ]

    figure = Figure()
    for trace in traces:
        figure.add_trace(trace)

    # initial zoom on the period covered by the predictions
    visible_range = [rnn_dataframe.index.min(), rnn_dataframe.index.max()]
    figure.update_layout(
        showlegend=True,
        title_text="Predictions",
        template="simple_white",
        xaxis=dict(range=visible_range),
    )

    figure.update_xaxes(rangeslider_visible=True)
    figure.show()

In [None]:
plot_models_prediction_interval(dataset, fpa_result, lstm_result)

# References

- Vibhor Rastogi and Suman Nath. Differentially private aggregation of distributed time-series with transformation and encryption. Proceedings of the 2010 ACM SIGMOD International Conference on Management of data, June 2010, Indianapolis (IN) USA [[DOI]](https://doi.org/10.1145/1807167.1807247).