## Time Series Forecasting with CNN and Grad-CAM

In [None]:
import numpy as np
import torch
import pandas as pd
import os
from datetime import datetime
import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchvision.datasets.utils import download_and_extract_archive
from torch import nn
from torch.utils.data import Dataset, DataLoader
import shap

# Six-colour palette reused by the plots in this notebook.
colormap = ListedColormap(
    ["#ff595e", "#ffca3a", "#8ac926", "#52a675", "#1982c4", "#6a4c93"],
    name="Custom",
)

# Install the palette as the default colour cycle and set the axes styling in one call.
plt.rcParams.update({
    'axes.prop_cycle': plt.cycler(color=colormap.colors),
    'axes.axisbelow': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
})


In [None]:
# Fetch and unpack the Jena climate archive into the current working directory.
csv_filename = "jena_climate_2009_2016.csv"
uri = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"
download_and_extract_archive(url=uri, download_root=os.getcwd(), filename=csv_filename+".zip")

df = pd.read_csv(csv_filename)
# Parse 'Date Time' with one vectorised call instead of a Python-level strptime per row.
df['Date Time'] = pd.to_datetime(df['Date Time'], format="%d.%m.%Y %H:%M:%S")

# Encode the wind direction (degrees) as cos/sin so that 0 and 360 degrees map to the same point.
# The arithmetic is applied to the whole column at once; the formula itself is unchanged.
wind_direction_rad = df["wd (deg)"] / 360 * 2 * np.pi
df["cos(wd)"] = np.cos(wind_direction_rad)
df["sin(wd)"] = np.sin(wind_direction_rad)

df = df.drop(["wd (deg)"], axis=1)

# Every column except the first one (presumably 'Date Time' - confirm against the CSV header).
features = df.columns.values[1:]

# df = df.iloc[:20000]

display(df)

In [None]:
# check for outliers
#
# For each feature, a row is an outlier when its value lies more than three
# inter-quartile ranges below the 5% quantile or above the 95% quantile.
# Outlier rows are optionally marked on a time axis and optionally removed.

remove_outliers = True
plot_on_timeseries = True

if plot_on_timeseries:
    fig, ax = plt.subplots(figsize=(16, 3))

for i, X in enumerate(features):
    q05 = df[X].quantile(0.05)
    q95 = df[X].quantile(0.95)
    iqr = df[X].quantile(0.75) - df[X].quantile(0.25)

    lower_bound = q05 - 3 * iqr
    upper_bound = q95 + 3 * iqr

    outliers = ((df[X] < lower_bound) | (df[X] > upper_bound))
    n_outliers = outliers.sum()

    if n_outliers > 0:
        print("%s has %i outliers (lower=%.4g, upper=%.4g)" % (X, n_outliers, lower_bound, upper_bound))

    if plot_on_timeseries:
        # The palette has fewer colours than there are features; wrap the index so
        # features beyond the palette size do not all collapse onto the last colour.
        color = colormap(i % colormap.N)
        # Only the first line of each feature carries a label, so the legend has one entry per feature.
        label = X
        for obs in df.loc[outliers, "Date Time"]:
            ax.axvline(x=obs, color=color, label=label)
            label = None

    if remove_outliers:
        # Rows are dropped immediately, so later features are evaluated on the already-filtered frame.
        df = df[~outliers]


if plot_on_timeseries:
    ax.set_title("Outliers")
    ax.legend(bbox_to_anchor=(0.5, 1.2), loc='center', ncol=10)

In [None]:
# Compare every unordered pair of features once; when their correlation exceeds
# 0.98 the later column of the pair is marked for removal.
correlation_matrix = df[features].corr()
redundant_features = []
for i, col_i in enumerate(features):
    for col_j in features[i + 1:]:
        if correlation_matrix.loc[col_i, col_j] > 0.98:
            print("%s and %s are redundant" % (col_i, col_j))
            redundant_features.append(col_j)

# A column can be flagged more than once; errors='ignore' keeps drop() tolerant of that.
df = df.drop(redundant_features, axis=1, errors='ignore')

# Refresh the feature list so it matches the remaining columns (first column excluded).
features = df.columns.values[1:]

## Visualization

In [None]:
# One histogram panel per feature, laid out on a 4-column grid.
plot_cols = 4
# Ceiling division: enough rows so that every feature gets a panel.
plot_rows = -(-len(features) // plot_cols)

fig, axes = plt.subplots(nrows=plot_rows, ncols=plot_cols, figsize=(16, 3 * plot_rows))

for ax, X in zip(axes.flatten(), features):
    ax.hist(df[X], bins=50)
    ax.set_title(X)
    # Bin counts are not of interest here, only the shape of the distribution.
    ax.set_yticks([])

fig.tight_layout()

In [None]:
# One line plot per feature, laid out on a 2-column grid.
plot_cols = 2
plot_rows = len(features) // plot_cols + min(len(features) % plot_cols, 1)

fig, axes = plt.subplots(figsize=(16, 3 * plot_rows), nrows=plot_rows, ncols=plot_cols)

sample_low = 0
# None plots through the final row; a stop of -1 would silently drop the last sample.
# Use `52560 * 2` instead to show only the first two years.
sample_high = None
# Plot every 6th row to keep the figure light.
skip = 6

for ax, X in zip(axes.flatten(), features):
    ax.set_title(X)
    plot_x = df["Date Time"].iloc[sample_low:sample_high:skip]
    plot_y = df[X].iloc[sample_low:sample_high:skip]
    ax.plot(plot_x, plot_y, linewidth=1)

fig.tight_layout()

## Split, scale & transform

In [None]:
hours_future = 12
hours_past = 168
skip = 6

# Distance, in raw rows, between an input row and the row its target is read from.
# The factor 6 presumably reflects six rows per hour (10-minute sampling) - confirm.
target_offset = (hours_future + hours_past) * 6
# Drop the trailing rows that have no target and keep every `skip`-th remaining row.
usable_rows = slice(None, -target_offset, skip)

time_array = df['Date Time'].iloc[usable_rows]
X = df[features].iloc[usable_rows]
y = df['T (degC)'].shift(-target_offset).iloc[usable_rows]

# Chronological 70/30 split: shuffle=False keeps the test period strictly after the training period.
time_train, time_test, X_train, X_test, y_train, y_test = train_test_split(
    time_array, X, y, test_size=0.3, shuffle=False
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Targets leave pandas here; the features already became numpy arrays in transform().
y_train, y_test = y_train.values, y_test.values

#### Prepare data for PyTorch

In [None]:
def prepare_sequences(X, sequence_length=168):
    """Build overlapping sliding windows over the rows of a 2-D tensor.

    Parameters
    ----------
    X : torch.Tensor
        Tensor of shape ``(n_rows, n_columns)``; windows slide along dimension 0.
    sequence_length : int
        Number of consecutive rows in each window. Must be at least 1.

    Returns
    -------
    torch.Tensor
        A new tensor ``out`` of shape
        ``(max(n_rows - sequence_length, 0), n_columns, sequence_length)`` with
        ``out[t, c, i] == X[t + i, c]``. It does not share memory with ``X``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    n_windows = X.shape[0] - sequence_length
    if n_windows <= 0:
        # Not enough rows for a single window: return an empty, correctly shaped result.
        return X.new_empty((0, X.shape[1], sequence_length))

    # unfold() exposes every window as a view, avoiding `sequence_length` rolled
    # copies of the whole input. It yields one window more than we keep
    # (n_rows - sequence_length + 1), so trim to n_windows. clone() then
    # materialises the result so later in-place edits cannot touch X.
    return X.unfold(0, sequence_length, 1)[:n_windows].clone()


def prepare_data(X, y, time_array, hours_past=168, hours_future=12, batch_size=32):
    """Turn row-wise features and targets into windowed tensors, keeping only gap-free windows.

    Parameters
    ----------
    X : array-like
        Feature rows, converted with ``torch.Tensor``.
    y : array-like
        One target value per row of ``X``.
    time_array : pandas.Series
        Timestamp of every row; used to detect gaps in the series.
    hours_past : int
        Window length, in rows, passed to ``prepare_sequences``.
    hours_future : int
        Row offset used by the second gap check (see the note in the body).
    batch_size : int
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    tuple
        ``(time_ts, X_ts, y_ts)``: timestamps, windowed features and targets of
        the rows that passed both gap checks.
    """
    X = torch.Tensor(X)
    y = torch.Tensor(y)

    X_ts = prepare_sequences(X, hours_past)
    y_ts = y[:-hours_past]

    # Measure the gaps on the FULL timestamp series. Truncating first would leave
    # the last `hours_past` kept rows without a partner row for the forward
    # difference, so they would all be rejected even when the data is gap-free.
    window_span_hours = time_array.diff(-hours_past).dt.total_seconds() / 3600
    # NOTE(review): diff(hours_future) compares each row with the row `hours_future`
    # positions EARLIER, not later - confirm this is the intended check.
    lag_span_hours = time_array.diff(hours_future).dt.total_seconds() / 3600

    # we verify which observations are valid i.e. there is proper time difference between the observations
    valid_observations = (window_span_hours == -hours_past) & (lag_span_hours == hours_future)

    # Align timestamps and mask with the windowed tensors, which are `hours_past` rows shorter.
    time_array = time_array.iloc[:-hours_past]
    valid_observations = valid_observations.iloc[:-hours_past]

    time_ts = time_array[valid_observations]
    X_ts = X_ts[valid_observations.values]
    y_ts = y_ts[valid_observations.values]

    return time_ts, X_ts, y_ts

# Window the train and test splits independently, using prepare_data's default
# hours_past / hours_future, so no window spans the split boundary.
time_ts_train, X_ts_train, y_ts_train = prepare_data(X_train, y_train, time_train)
time_ts_test, X_ts_test, y_ts_test = prepare_data(X_test, y_test, time_test)

In [None]:
class WeatherDataset(Dataset):
    """Pair each feature entry with the target stored at the same position."""

    def __init__(self, features, target):
        # Both containers are indexed with the same position in __getitem__.
        self.target = target
        self.features = features

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        return self.features[idx], self.target[idx]

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.features)
    
# Wrap the windowed tensors of each split in a dataset.
train = WeatherDataset(features=X_ts_train, target=y_ts_train)
test = WeatherDataset(features=X_ts_test, target=y_ts_test)

# Both loaders use the same settings: batches of 64, served in stored order.
loader_options = {"batch_size": 64, "shuffle": False}
train_loader = DataLoader(train, **loader_options)
test_loader = DataLoader(test, **loader_options)

## Build model and train

In [None]:
class CNN(nn.Module):
    """1-D convolutional regressor mapping ``(batch, n_features, sequence_length)`` to ``(batch, 1)``.

    Parameters
    ----------
    n_features : int
        Number of input channels (one per feature).
    sequence_length : int
        Length of the input sequences. Defaults to 168, the length the network
        was originally hard-wired for. Must be at least 6 so that the pooled
        sequence is not empty.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 6.
    """

    def __init__(self, n_features, sequence_length=168):
        super().__init__()
        if sequence_length < 6:
            raise ValueError("sequence_length must be at least 6")

        self.n_features = n_features
        self.sequence_length = sequence_length

        # Each unpadded kernel-3 convolution shortens the sequence by 2;
        # MaxPool1d(2) then halves it, rounding down.
        pooled_length = (sequence_length - 4) // 2

        self.network = nn.Sequential(
            nn.Conv1d(self.n_features, 32, kernel_size=3),  # 32 x (L - 2)
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3),  # 64 x (L - 4)
            nn.ReLU(),
            nn.MaxPool1d(2),  # 64 x pooled_length

            nn.Flatten(),
            nn.Linear(64 * pooled_length, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

        # NOTE(review): none of the layers below is used by forward(). They are
        # kept only so attribute names and state_dict keys stay unchanged;
        # consider deleting them once no saved checkpoints depend on them.
        self.conv1d_1 = nn.Conv1d(n_features, 32, kernel_size=3)
        self.conv1d_2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool1d = nn.MaxPool1d(2)
        self.flat = nn.Flatten()
        self.fc1 = nn.Linear(128, 50)
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x):
        """Run ``x`` through ``self.network`` and return its output."""
        return self.network(x)

    
# Module-level training state; fit() and validate() read `device`, `optimizer`
# and `loss_function` as globals.
device = torch.device("cpu")
# One input channel per feature column.
model = CNN(len(features)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Mean squared error between predicted and target values.
loss_function = nn.MSELoss()

In [None]:
def fit(model, data_loader):
    """Train ``model`` for one pass over ``data_loader`` and return the mean batch loss.

    Uses the module-level ``device``, ``optimizer`` and ``loss_function``.

    Parameters
    ----------
    model : torch.nn.Module
        Model to train; its output is indexed with ``[:, 0]`` to get one value per sample.
    data_loader : iterable
        Sized iterable of ``(inputs, labels)`` batches.

    Returns
    -------
    float
        Average of the per-batch losses.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no batches.
    """
    n_batches = len(data_loader)
    if n_batches == 0:
        raise ValueError("data_loader yields no batches")

    running_loss = 0.0
    model.train()

    for inputs, labels in tqdm(data_loader, total=n_batches, disable=True):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        preds = model(inputs.float())[:, 0]

        loss = loss_function(preds, labels)
        loss.backward()
        optimizer.step()
        # .item() yields a plain float. Summing the loss tensors themselves would
        # chain every batch's autograd history together for the whole epoch, and
        # the final .numpy() would fail for tensors living on a non-CPU device.
        running_loss += loss.item()

    return running_loss / n_batches

def validate(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without updating it and return the mean batch loss.

    Uses the module-level ``device`` and ``loss_function``.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; its output is indexed with ``[:, 0]`` to get one value per sample.
    data_loader : iterable
        Sized iterable of ``(inputs, labels)`` batches.

    Returns
    -------
    float
        Average of the per-batch losses.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no batches.
    """
    n_batches = len(data_loader)
    if n_batches == 0:
        raise ValueError("data_loader yields no batches")

    running_loss = 0.0
    model.eval()

    # No gradients are needed for evaluation, so there is nothing for the optimizer to reset here.
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            preds = model(inputs.float())[:, 0]
            # Accumulate plain floats so the result does not depend on the tensor's device.
            running_loss += loss_function(preds, labels).item()

    return running_loss / n_batches

# Per-epoch loss history, consumed by the learning-curve plot.
train_losses = []
valid_losses = []

epochs = 30
for epoch_number in range(1, epochs + 1):
    start = time.time()

    # One optimisation pass over the training data ...
    train_loss = fit(model, train_loader)
    train_losses.append(train_loss)

    # ... followed by an evaluation pass over the test data.
    valid_loss = validate(model, test_loader)
    valid_losses.append(valid_loss)

    time_elapsed = time.time() - start

    print('Epochs %i/%i (%.3g seconds)\n    Train loss = %.3g \n    Valid loss = %.3g' % (epoch_number, epochs, time_elapsed, train_loss, valid_loss))

In [None]:
# Learning curves: one point per completed epoch, epochs numbered from 1.
epochs_passed = len(train_losses)
epoch_axis = range(1, epochs_passed + 1)

fig, ax = plt.subplots(figsize=(8, 5))
for losses, name in ((train_losses, "Train"), (valid_losses, "Validation")):
    ax.plot(epoch_axis, losses, label=name, linewidth=2)
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_xlim(1, epochs_passed)
ax.legend()
# ax.set_yscale('log')
# ax.set_yticks([1, 10, 100])
ax.grid(axis='y', alpha=0.5)
fig.tight_layout()

## Predict and evaluate

In [None]:
# Inference only: switch to eval mode and disable autograd so no computation
# graph is built for the whole test set. Call the module itself rather than
# .forward() so registered hooks run as usual.
model.eval()
with torch.no_grad():
    y_test_pred = model(X_ts_test.to(device)).cpu().numpy()

In [None]:
# Predicted vs. target values; points on the dashed diagonal are perfect predictions.
axis_limits = [-15, 35]

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_ts_test, y_test_pred, s=1)
ax.plot(axis_limits, axis_limits, linewidth=1, color="black", linestyle="--")
ax.set(
    xlim=(-15, 35),
    ylim=(-15, 35),
    xlabel="Target values",
    ylabel="Predicted values",
)
fig.tight_layout()


In [None]:
# Compare target and prediction over the last 600 test samples.
last_samples = slice(-600, None)

fig, ax = plt.subplots(figsize=(12, 4))
for series, name in ((y_ts_test, "Target"), (y_test_pred, "Prediction")):
    ax.plot(time_ts_test[last_samples], series[last_samples], label=name)
ax.set_ylabel("Temperature [C]")
ax.legend()
fig.tight_layout()

## Explanations

In [None]:
# batch of data for explanations

# Take the first batch served by the test loader ...
first_batch = next(iter(test_loader))
x_batch, y_batch = first_batch
# ... and keep only its first sample; slicing (not indexing) preserves the batch dimension.
x, y = x_batch[:1], y_batch[:1]

### SHAP

In [None]:
# TODO
# explainer = shap.GradientExplainer((model, model.network[0]), X_ts_test)

### Grad-CAM

In [None]:
# references
# https://github.com/jacobgil/pytorch-grad-cam/issues/233
# https://medium.com/@stepanulyanin/implementing-grad-cam-in-pytorch-ea0937c31e82
# https://arxiv.org/pdf/2001.07582.pdf

In [None]:
class CNN_GradCAM(nn.Module):
    """Wrap a model with a ``network`` Sequential to capture gradients at its last pooling input.

    The wrapped ``model.network`` is split at ``last_pooling_idx`` into three parts:
    the layers before the pooling layer, the pooling layer itself, and the layers after it.

    Parameters
    ----------
    model : object
        Object exposing a sliceable ``network`` attribute (an ``nn.Sequential``).
    last_pooling_idx : int
        Index of the pooling layer inside ``model.network``.
    """

    def __init__(self, model, last_pooling_idx):
        super().__init__()

        self.model = model
        # Despite the name, this part is not detached from autograd; it is simply
        # the slice of layers that precedes the pooling layer.
        self.detached_model = self.model.network[:last_pooling_idx]
        self.last_pool = self.model.network[last_pooling_idx]
        self.fc = self.model.network[last_pooling_idx+1:]
        # Filled by activations_hook during backward(); None until then.
        self.gradients = None

    def activations_hook(self, grad):
        """Store the gradient flowing into the pre-pooling activations."""
        self.gradients = grad

    def forward(self, x):
        """Run the full network, hooking the pre-pooling activations when possible."""
        x = self.detached_model(x)
        # register_hook raises on tensors that do not require grad (e.g. inside
        # torch.no_grad()), so only attach it when a gradient can actually arrive.
        if x.requires_grad:
            x.register_hook(self.activations_hook)
        x = self.last_pool(x)
        x = self.fc(x)

        return x

    def get_activations_gradients(self):
        """Return the gradient captured by the most recent backward pass, or None."""
        return self.gradients

    def get_activation(self, x):
        """Return the pre-pooling activations for ``x``."""
        return self.detached_model(x)
    

# Index 4 is the nn.MaxPool1d layer inside model.network (conv, relu, conv, relu, pool, ...).
cam = CNN_GradCAM(model, 4)
cam.eval()

# Forward pass on the single-sample batch `x`; this also registers the gradient hook.
pred = cam(x)

In [None]:
# Backpropagate the single prediction so the hook registered during cam's
# forward pass captures the gradient with respect to the pre-pooling activations.
pred.backward()

# Both tensors are laid out as (batch, channels, time): they are the output of
# the convolutional part of the network.
gradients = cam.get_activations_gradients()
activations = cam.get_activation(x).detach()

# Grad-CAM channel weights: average the gradient over batch and time (dims 0
# and 2), leaving one importance weight per channel.
pooled_gradients = torch.mean(gradients, dim=[0, 2])

# Scale every channel's activation map by its weight. Broadcasting replaces the
# explicit loop, so no sequence length or channel count is hard-coded.
weighted_activations = activations * pooled_gradients[None, :, None]

# Collapse the channel dimension to get one relevance value per time step and
# keep only positive contributions.
heatmap = torch.relu(torch.mean(weighted_activations, dim=1).squeeze()).numpy()

# Relevance along the (convolved) time axis as a line plot ...
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(heatmap)

# ... and as a one-row image; matshow needs 2-D input, hence the added axis.
plt.matshow(heatmap[np.newaxis, :], aspect="auto")