# Sequential deep learning models

In [1]:
#import numpy as np
#import matplotlib.pyplot as plt
#import pandas as pd

In [1]:
from copy import deepcopy
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from datetime import datetime

### Vanilla RNN

In [87]:
class RNNModel(nn.Module):
    """Vanilla RNN followed by a linear read-out of the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Number of hidden units in each recurrent layer.
        layer_dim: Number of stacked recurrent layers.
        output_dim: Size of the model output.
        dropout_prob: Dropout probability forwarded to ``nn.RNN``.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super().__init__()

        # Kept as attributes because forward() needs them to size the
        # initial hidden state.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # Recurrent layers; batch_first=True means inputs are indexed as
        # (batch, seq_len, input_dim).
        self.rnn = nn.RNN(
            input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
        )
        # Fully connected read-out layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return predictions of shape ``(batch, output_dim)``.

        ``x`` is read as ``(batch, seq_len, input_dim)``.
        """
        # A fresh zero hidden state is built on every call, so nothing is
        # carried over between batches. It follows the input's device AND
        # dtype so the model keeps working after e.g. ``model.double()``.
        h0 = torch.zeros(
            self.layer_dim, x.size(0), self.hidden_dim,
            device=x.device, dtype=x.dtype,
        )

        # Per the PyTorch docs this only has an effect on GPU with cuDNN and
        # is a no-op otherwise.
        self.rnn.flatten_parameters()
        out, _ = self.rnn(x, h0)

        # Keep only the last time step: (batch, seq_len, hidden) -> (batch, hidden)
        last_step = out[:, -1, :]

        # Map the last hidden output to (batch, output_dim)
        return self.fc(last_step)

### LSTM

In [88]:
class LSTMModel(nn.Module):
    """LSTM followed by a linear read-out of the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Number of hidden units in each LSTM layer.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Size of the model output.
        dropout_prob: Dropout probability forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super().__init__()

        # Kept as attributes because forward() needs them to size the
        # initial hidden and cell states.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # LSTM layers; batch_first=True means inputs are indexed as
        # (batch, seq_len, input_dim).
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
        )

        # Fully connected read-out layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return predictions of shape ``(batch, output_dim)``.

        ``x`` is read as ``(batch, seq_len, input_dim)``.
        """
        # Hidden and cell state start from zeros on every call, so no state
        # (and no gradient history) is carried over between batches. They
        # follow the input's device AND dtype so the model keeps working
        # after e.g. ``model.double()``.
        state_shape = (self.layer_dim, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        # Per the PyTorch docs this only has an effect on GPU with cuDNN and
        # is a no-op otherwise.
        self.lstm.flatten_parameters()
        out, _ = self.lstm(x, (h0, c0))

        # Keep only the last time step: (batch, seq_len, hidden) -> (batch, hidden)
        last_step = out[:, -1, :]

        # Map the last hidden output to (batch, output_dim)
        return self.fc(last_step)

### GRU

In [89]:
class GRUModel(nn.Module):
    """GRU followed by a linear read-out of the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Number of hidden units in each GRU layer.
        layer_dim: Number of stacked GRU layers.
        output_dim: Size of the model output.
        dropout_prob: Dropout probability forwarded to ``nn.GRU``.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super().__init__()

        # Kept as attributes because forward() needs them to size the
        # initial hidden state.
        self.layer_dim = layer_dim
        self.hidden_dim = hidden_dim

        # GRU layers; batch_first=True means inputs are indexed as
        # (batch, seq_len, input_dim).
        self.gru = nn.GRU(
            input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
        )

        # Fully connected read-out layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return predictions of shape ``(batch, output_dim)``.

        ``x`` is read as ``(batch, seq_len, input_dim)``.
        """
        # A fresh zero hidden state is built on every call, so nothing is
        # carried over between batches. It follows the input's device AND
        # dtype so the model keeps working after e.g. ``model.double()``.
        h0 = torch.zeros(
            self.layer_dim, x.size(0), self.hidden_dim,
            device=x.device, dtype=x.dtype,
        )

        # Per the PyTorch docs this only has an effect on GPU with cuDNN and
        # is a no-op otherwise.
        self.gru.flatten_parameters()
        out, _ = self.gru(x, h0)

        # Keep only the last time step: (batch, seq_len, hidden) -> (batch, hidden)
        last_step = out[:, -1, :]

        # Map the last hidden output to (batch, output_dim)
        return self.fc(last_step)

### Model initialization

In [90]:
def get_model(model, model_params):
    """Instantiate one of the recurrent models by name.

    Args:
        model: Model name, case-insensitive: ``"rnn"``, ``"lstm"`` or ``"gru"``.
        model_params: Keyword arguments passed unchanged to the model
            constructor.

    Returns:
        A new instance of the selected model class.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    models = {
        "rnn": RNNModel,
        "lstm": LSTMModel,
        "gru": GRUModel,
    }
    try:
        model_cls = models[model.lower()]
    except KeyError:
        # Fail with the offending name instead of the opaque
        # "'NoneType' object is not callable" of a failed dict.get().
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(models)}"
        ) from None
    return model_cls(**model_params)

### Optimisation

In [91]:
class Optimization:
    """Bundle a model, loss function and optimizer for training and evaluation.

    Per-epoch mean losses are accumulated in ``train_losses`` and
    ``val_losses`` by :meth:`train`.
    """

    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []

    @staticmethod
    def _device_of(model):
        """Return the device of the model's first parameter (CPU if it has none)."""
        first_param = next(model.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def train_step(self, x, y):
        """Run one optimisation step on a single batch and return its loss as a float."""
        # Sets model to train mode
        self.model.train()

        # Makes predictions
        yhat = self.model(x)

        # Computes loss; PyTorch losses take (prediction, target) in that
        # order, which matters for any non-symmetric loss.
        loss = self.loss_fn(yhat, y)

        # Computes gradients
        loss.backward()

        # Updates parameters and zeroes gradients
        self.optimizer.step()
        self.optimizer.zero_grad()

        # Returns the loss
        return loss.item()

    def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, n_features=1,
              patience=100, stop_criterion=1e-5):
        """Train the model, keeping a copy of the state with the lowest validation loss.

        Args:
            train_loader: Iterable of ``(x, y)`` training batches.
            val_loader: Iterable of ``(x, y)`` validation batches.
            batch_size: Kept for backward compatibility only; the size of
                each batch is read from the batch itself, so a shorter last
                batch is handled.
            n_epochs: Maximum number of epochs.
            n_features: Size of the last input axis; each batch is viewed as
                ``(batch, -1, n_features)`` before it is passed to the model.
            patience: Stop after this many consecutive epochs in which the
                training loss did not improve by more than ``stop_criterion``.
            stop_criterion: Minimum decrease of the training loss that counts
                as an improvement.

        Returns:
            ``(best_model, best_loss)``: a deep copy of the model at the epoch
            with the lowest mean validation loss, and that loss. If no epoch
            ever improved (e.g. ``n_epochs=0`` or NaN losses), a copy of the
            current model is returned together with ``inf``.
        """
        # Batches are moved to wherever the model lives instead of assuming CUDA.
        device = self._device_of(self.model)

        best_loss = float('inf')
        best_train_loss = float('inf')
        best_model = None
        stalled_epochs = 0

        for _ in range(n_epochs):
            batch_losses = []
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.view(x_batch.size(0), -1, n_features).to(device)
                y_batch = y_batch.to(device)
                batch_losses.append(self.train_step(x_batch, y_batch))
            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)

            self.model.eval()
            batch_val_losses = []
            with torch.no_grad():
                for x_val, y_val in val_loader:
                    x_val = x_val.view(x_val.size(0), -1, n_features).to(device)
                    y_val = y_val.to(device)
                    yhat = self.model(x_val)
                    batch_val_losses.append(self.loss_fn(yhat, y_val).item())
            val_loss = np.mean(batch_val_losses)
            self.val_losses.append(val_loss)

            # save model with smallest validation loss
            if val_loss < best_loss:
                best_loss = val_loss
                best_model = deepcopy(self.model)

            # early stopping on the *training* loss
            if (best_train_loss - training_loss) > stop_criterion:
                best_train_loss = training_loss
                stalled_epochs = 0
            else:
                stalled_epochs += 1
            if stalled_epochs >= patience:
                break

        if best_model is None:
            # No epoch produced a comparable validation loss; fall back to
            # the current weights instead of raising UnboundLocalError.
            best_model = deepcopy(self.model)
        return best_model, best_loss

    def evaluate(self, best_model, test_loader, batch_size=1, n_features=1):
        """Predict every batch of ``test_loader`` with a copy of ``best_model``.

        Args:
            best_model: Model to evaluate; it is deep-copied so the caller's
                instance is left untouched.
            test_loader: Iterable of ``(x, y)`` batches.
            batch_size: Kept for backward compatibility only; the size of
                each batch is read from the batch itself.
            n_features: Size of the last input axis; each batch is viewed as
                ``(batch, -1, n_features)``.

        Returns:
            ``(predictions, values)``: two lists with one NumPy array per
            batch, holding the model outputs and the targets respectively.
        """
        model = deepcopy(best_model)
        model.eval()
        # Batches are moved to wherever the model lives instead of assuming CUDA.
        device = self._device_of(model)

        predictions = []
        values = []
        with torch.no_grad():
            for x_test, y_test in test_loader:
                x_test = x_test.view(x_test.size(0), -1, n_features).to(device)
                yhat = model(x_test)
                predictions.append(yhat.detach().cpu().numpy())
                values.append(y_test.detach().cpu().numpy())

        return predictions, values

    def plot_losses(self):
        """Plot the recorded training and validation loss per epoch.

        NOTE(review): relies on ``plt`` (matplotlib.pyplot) being available
        in the enclosing namespace; its import is commented out in this file.
        """
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")
        plt.show()
        plt.close()

### Evaluation

In [92]:
def inverse_transform(scaler, df, columns):
    """Undo ``scaler``'s transformation on the selected columns of ``df`` in place.

    Args:
        scaler: Object exposing ``inverse_transform`` for 2-D input.
        df: DataFrame to modify; it is changed in place and also returned.
        columns: Iterable of column selections. Each entry is either a list
            of column names, inverse-transformed together as one 2-D block,
            or a single column name.

    Returns:
        The same ``df`` object with the selected columns rescaled.
    """
    for col in columns:
        block = df[col]
        if block.ndim == 1:
            # A single column name yields a 1-D Series; widen it to one
            # column so the scaler receives 2-D input.
            df[[col]] = scaler.inverse_transform(block.to_frame())
        else:
            df[col] = scaler.inverse_transform(block)
    return df

In [93]:
def format_predictions_dl(predictions, values, df_test, scaler):
    """Collect per-batch targets and predictions into one rescaled DataFrame.

    Args:
        predictions: Sequence of per-batch prediction arrays.
        values: Sequence of per-batch target arrays.
        df_test: Not used by this function.
        scaler: Scaler handed to ``inverse_transform`` to map both columns
            back to the original scale.

    Returns:
        DataFrame with the columns ``"value"`` and ``"prediction"``.
    """
    # Stack the batches and flatten each side to one value per row.
    flat_columns = {
        "value": np.concatenate(values, axis=0).ravel(),
        "prediction": np.concatenate(predictions, axis=0).ravel(),
    }
    frame = pd.DataFrame(data=flat_columns).sort_index()
    # Both columns are passed as one block to the scaler.
    return inverse_transform(scaler, frame, [["value", "prediction"]])

#def calculate_metrics(df):
#    return {'rmse' : mean_squared_error(df.value, df.prediction) ** 0.5,
#            'mae' : mean_absolute_error(df.value, df.prediction),
#            'mape': mean_absolute_percentage_error(df.value, df.prediction),
#            'r2' : r2_score(df.value, df.prediction)}

### Training 

In [1]:
def _make_loader(features, targets, batch_size):
    """Wrap two arrays in an unshuffled DataLoader of float32 tensors that drops a short last batch."""
    dataset = TensorDataset(
        torch.as_tensor(features, dtype=torch.float32),
        torch.as_tensor(targets, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=True)


def train_loop(model_name, model_params, num_sim, str_sims, path, seed,
               lags=1, batch_size=64, n_epochs=500):
    """Train one model per seed on every simulated dataset and score it on the test split.

    For each dataset the archive ``<path>Temp/sim<num_sim>_<name>.npz`` is
    loaded, features and targets are min-max scaled (fitted on the training
    split only), and one model per entry of ``seed`` is trained and evaluated.

    NOTE(review): this function reads ``learning_rate``, ``weight_decay`` and
    ``calculate_metrics`` from the enclosing namespace, and ``np``/``pd`` are
    not imported in this file; they must be defined before calling.

    Args:
        model_name: Model name understood by ``get_model``.
        model_params: Constructor keyword arguments for the model; its
            ``'input_dim'`` entry is also used as the number of features per
            time step when batches are reshaped.
        num_sim: Simulation number used in the data file name.
        str_sims: Dataset name suffixes, one per dataset.
        path: Directory prefix; ``'Temp/'`` is appended by plain string
            concatenation, so it should end with a path separator.
        seed: Iterable of seeds; one model is trained per seed.
        lags: Number of leading feature columns of ``X`` to keep.
        batch_size: Batch size for training and validation.
        n_epochs: Maximum number of training epochs per model.

    Returns:
        ``(results, preds, best_seeds)``: the per-dataset metric tables
        (columns rmse, mae, mape, r2; one row per seed) and the per-dataset
        test predictions, both converted with ``np.asarray``, plus for each
        dataset the position in ``seed`` of the model with the lowest
        validation loss.
    """
    # Prefer the GPU but fall back to the CPU when CUDA is unavailable.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The model consumes `input_dim` features per step, so reshape batches
    # with that same number rather than relying on a separate global.
    n_features = model_params['input_dim']
    metric_names = ['rmse', 'mae', 'mape', 'r2']

    best_seeds = np.zeros(len(str_sims))
    results = []
    preds = []

    # loop over all datasets in the simulation setup
    for j, sim_name in enumerate(str_sims):

        # print dataset
        print('No. of dataset: ', j+1,'/',len(str_sims))

        # load data
        data_name = 'sim' + str(num_sim) + '_' + sim_name
        with np.load(path + 'Temp/' + data_name + '.npz') as data:
            # extract data and the given number of lagged features
            # assumes X_* are 2-D (samples, lags) and y_* are 2-D — TODO confirm
            X_train = data['X_train'][:, :lags]
            X_val = data['X_val'][:, :lags]
            X_test = data['X_test'][:, :lags]
            y_train = data['y_train']
            y_val = data['y_val']
            y_test = data['y_test']

        # scale the data to 0-1; separate scalers keep the feature and target
        # scalings apart, and only the target scaler is needed later to map
        # predictions back to the original scale
        x_scaler = MinMaxScaler()
        X_train_arr = x_scaler.fit_transform(X_train)
        X_val_arr = x_scaler.transform(X_val)
        X_test_arr = x_scaler.transform(X_test)

        y_scaler = MinMaxScaler()
        y_train_arr = y_scaler.fit_transform(y_train)
        y_val_arr = y_scaler.transform(y_val)
        y_test_arr = y_scaler.transform(y_test)

        # get batched data
        train_loader = _make_loader(X_train_arr, y_train_arr, batch_size)
        val_loader = _make_loader(X_val_arr, y_val_arr, batch_size)
        test_loader_one = _make_loader(X_test_arr, y_test_arr, 1)

        metric_rows = []
        test_preds = []
        best_loss = np.inf

        # loop over all model seeds
        for model_num, model_seed in enumerate(seed):

            # print model number
            if (model_num + 1) % 5 == 0:
                print('Model seed: ',model_num + 1,'/',len(seed))

            # initialise model
            torch.manual_seed(model_seed)
            model = get_model(model_name, model_params).to(device)

            # create loss function and optimizer
            loss_fn = nn.MSELoss(reduction="mean")
            optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
            opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)

            # train the model; checkpoint and seed selection use the
            # validation split so the test split stays unseen until evaluation
            best_model_out, val_loss = opt.train(
                train_loader, val_loader,
                batch_size=batch_size, n_epochs=n_epochs, n_features=n_features,
            )
            if val_loss < best_loss:
                best_loss = val_loss
                best_seeds[j] = model_num

            # evaluate on test set
            predictions, values = opt.evaluate(
                best_model_out, test_loader_one, batch_size=1, n_features=n_features
            )
            df_result = format_predictions_dl(predictions, values, X_test, y_scaler)
            result_metrics = calculate_metrics(df_result)

            # collect metrics on test set
            metric_rows.append([result_metrics[name] for name in metric_names])

            # collect test predictions
            test_preds.append(np.asarray(df_result))

        # save results (one row per seed)
        results.append(pd.DataFrame(metric_rows, columns=metric_names))

        # save test predictions
        preds.append(np.asarray(test_preds))

    return np.asarray(results), np.asarray(preds), best_seeds

In [173]:
#path = 'C:/Users/Meier/Dropbox (Institut für Statistik)/Structural Breaks + DL/Simulation/Python Code/' 

In [174]:
# set data parameters
#batch_size = 64           # batch size
#lags = 1                  # number of lagged features in X

In [175]:
# set simulation parameters
#model_name = 'gru'                # 'rnn', 'lstm' or 'gru'
#seed = np.arange(2)              # number of models to run per dataset
#n_epochs = 500                    # number of epochs

In [176]:
# set model/training parameters
#input_dim = lags                  # number of lagged features in X
#hidden_dim = 10                   # number of hidden nodes per layer
#layer_dim = 1                     # number of layers
#output_dim = 1                    # output dimension (1 for univariate output)
#dropout = 0                       # dropout proportion (only before the last sequential layer)
#learning_rate = 1e-3              # learning rate for Adam optimizer
#weight_decay = 1e-6               # weight decay for Adam optimizer

# save model parameters in dict
#model_params = {'input_dim': input_dim, 'hidden_dim' : hidden_dim,'layer_dim' : layer_dim, 'output_dim' : output_dim, 'dropout_prob' : dropout}

In [177]:
#str_sims = ['Ref','Tau0.1Ref','Tau0.1Beta0.5','Tau0.1Beta1.0','Tau0.1Beta2.0','Tau0.2Ref','Tau0.2Beta0.5','Tau0.2Beta1.0','Tau0.2Beta2.0','Tau0.3Ref','Tau0.3Beta0.5','Tau0.3Beta1.0','Tau0.3Beta2.0','Tau0.4Ref','Tau0.4Beta0.5','Tau0.4Beta1.0','Tau0.4Beta2.0','Tau0.5Ref','Tau0.5Beta0.5','Tau0.5Beta1.0','Tau0.5Beta2.0','Tau0.6Ref','Tau0.6Beta0.5','Tau0.6Beta1.0','Tau0.6Beta2.0','Tau0.7Ref','Tau0.7Beta0.5','Tau0.7Beta1.0','Tau0.7Beta2.0'] 

In [None]:
#results, preds, best_seeds = train_loop(model_name=model_name, model_params=model_params,num_sim=1,str_sims=str_sims,path=path,seed=seed,lags=1,batch_size = 64,n_epochs = 500)