In [1]:
# Import libraries
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torchvision.models as models
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import time

In [85]:
# Mount onto Google Drive
# from google.colab import drive
# drive.mount('drive')

# # Move to Google Drive and create folder
# %cd /content/drive/My\ Drive/Deep_Learning_Project_Data

In [2]:
for folder_name in ["GRU_Graphs", "LSTM_Graphs"]:
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(folder_name + " Created!")

In [3]:
# Fetch the train/test data from the csv file
train_csv = 'Datasets/train_data.csv'
test_csv = 'Datasets/test_data.csv'

df_train = pd.read_csv(train_csv, index_col = 'Date')
df_test = pd.read_csv(test_csv, index_col = 'Date')

print('successfully fetched train and test datasets')

successfully fetched train and test datasets


In [4]:
# Normalise data using mean and SD from train_df
scaler = StandardScaler()

# Scale the training dataset
scaler.fit(df_train)
scaled_train_df = pd.DataFrame(columns=df_train.columns)
scaled_train_df[scaled_train_df.columns] = scaler.transform(df_train)
scaled_train_df

# Test dataset is scaled later as it requires for the end of the 
# training dataset to be prepended to it before scaling.

scaled_train_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,^NDXT_Close,QTEC_Close,^SP500-45_Close
0,-0.917821,-0.907476,-0.917195,-0.914544,-0.914544,2.194687,-0.837896,-0.840978,-0.832003
1,-0.912616,-0.894926,-0.904533,-0.887362,-0.887362,2.272368,-0.816892,-0.8198,-0.808579
2,-0.889194,-0.888814,-0.887463,-0.880544,-0.880544,1.836779,-0.822993,-0.824687,-0.817037
3,-0.88019,-0.884733,-0.880309,-0.886807,-0.886807,0.928411,-0.820503,-0.820343,-0.813875
4,-0.884076,-0.887523,-0.882058,-0.883476,-0.883476,1.197357,-0.817606,-0.815999,-0.807513


In [5]:
# Hyper parameters
input_size = len(df_train.columns)
hidden_size = 128
num_layers = 1
num_outputs = 1 # Regression problem
epochs = 100
batch_size = 16
learning_rate = 0.001
model_name = 'GRU_model'

In [6]:
# Define dataset class
class StockDataSet (Dataset):
    
    def __init__(self,raw_data,sequence_length):        
        self.target = torch.Tensor([np.array(raw_data['Close']).astype(np.float32)]).T
        self.features = torch.Tensor(np.array(raw_data).astype(np.float32))
        self.seq_len = sequence_length
    
    def __getitem__(self,index): # allows us to index our instance
        
        input_feat = self.features[index:index+ self.seq_len]
        label = self.target[index +self.seq_len]  
        return input_feat,label 
    
    def __len__(self):
        return len(self.features)- self.seq_len

In [7]:
# Define LSTM class
class LSTM_NN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers,num_outputs):
        super(LSTM_NN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
       
         # When set batch_first = True input sample dims shoud be:
         #(batch_size, seq_len , input_size)
        
        if model_name == "GRU_model":
            # For GRU (Gated Recurrent Unit)
            self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        else:
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_outputs)
        
    def forward(self, x):
        # Set initial hidden states and memory states:
        h0 = torch.zeros(self.num_layers, x.size()[0], self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers, x.size()[0], self.hidden_size).to(device) 
        # Note:  x.size()[0] = batch_size

#         
        # or:
    
        if model_name == "GRU_model":
            output, _ = self.gru(x, h0)  
        else:
            output, _ = self.lstm(x, (h0,c0))
        
        # output: tensor of shape (batch_size, seq_length, hidden_size)
        
        # The output of the last cell is of our concern
        output = output[:, -1, :] # output.size : (batch_size, hidden_size)
        
        output = self.fc(output) # output.size: (batch_size , 1)
        
        return output

In [8]:
# Train the model
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs, model_name):
    os.makedirs('checkpoint',exist_ok = True)
    checkpoint_path=f'{model_name}.pth'
    n_total_steps = len(train_loader)
    train_loss_list = []
    test_loss_list = []
    
    # Record the number of times since improvement for early stopping
    num_since_impr = 0
    stop_after = 50 # Stop after 5 epochs if no improvements to loss 
    
    for epoch in range(epochs):
        model.train()
        train_running_loss = 0 
        counter = 0

        for i, (features, labels) in enumerate(train_loader):  

            features = features.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
        
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
            train_running_loss+=loss.item() * len(labels)
            counter += len(labels)
        
        train_loss = (train_running_loss/counter)    
        print (f" Epoch [{epoch+1}/{epochs}],"
                   +  f" Training loss: {train_loss :.3f}")
 
        test_loss, _ , _ = test_model(model,test_loader,criterion)
        train_loss_list.append(train_loss)
        test_loss_list.append(test_loss)

        if epoch == 0:
          least_test_loss = test_loss 
        
        elif epoch > 0 and test_loss_list[-1] < least_test_loss:
          least_test_loss = test_loss_list[-1]
          torch.save(model.state_dict(), 'checkpoint'+'/'+ checkpoint_path)
            #print(test_loss_list[-1])
          # Reset counter
          num_since_impr = 0
        else:
            num_since_impr +=1
        # If no
        if num_since_impr >= stop_after:
            print("EARLY STOPPING")
#             return model, train_loss_list, test_loss_list  
            
    return model, train_loss_list, test_loss_list     

In [9]:
# Test the model
def test_model(model, test_loader, criterion):
    model.eval()
    test_running_loss = 0
    counter = 0
    predicted = []
    true_labels = []
    
    with torch.no_grad():
        for (features, labels) in test_loader:
            features = features.to(device)
            labels = labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)
            test_running_loss += loss.item()*len(labels)
            counter += len(labels)
            predicted.append(outputs.detach().cpu().numpy())
            true_labels.append(labels.detach().cpu().numpy())

        test_loss = (test_running_loss/counter)
        print(f' Test loss of the network: {test_loss:.3f} ')
    
    return test_loss, predicted, true_labels

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Using cuda")
seed = 2022
torch.cuda.manual_seed_all(seed)

Using cuda


In [11]:
seq_len_list = [5, 10, 20, 30, 50, 70]

epochs=1

# Go through each of the sequence lengths
for seq_len in seq_len_list: 
    start_time = time.time()
    print("SEQUENCE LENGTH IS:", str(seq_len))
    
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    # = = = = = = = = = = = = = = = = Set up Data Loaders = = = = = = = = = = = = = = = = 
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    scaled_df_test = pd.DataFrame(columns=df_test.columns)
    # Prepend end of training dataset to the start of test dataset and then scale it
    new_df_test = pd.concat([df_train[-seq_len:].copy(), df_test])
    scaled_df_test[scaled_df_test.columns] = scaler.transform(new_df_test)
    
    # Make a copy of the data
    raw_train_data = scaled_train_df.copy()
    raw_test_data = scaled_df_test.copy()

    # Instantiate train and test dataset objects
    train_dataset = StockDataSet(raw_train_data, seq_len) 
    test_dataset = StockDataSet(raw_test_data, seq_len)
    
    # Define train and test loaders
    train_loader = DataLoader(dataset = train_dataset,
                             batch_size = batch_size,
                             num_workers = 2,
                             shuffle = False) # shuffle= True??

    test_loader = DataLoader(dataset = test_dataset,
                             batch_size = batch_size,
                             num_workers = 2,
                             shuffle = False)
    
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    #  = = = = = = = = = = = = = Set Up and Train the Model = = = = = = = = = = = = = = =
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    
    # Instantiate LSTM model object
    lstm_model = LSTM_NN(input_size, hidden_size, num_layers, num_outputs).to(device)

    # Instantiate loss and optimizer objects:
    criterion = nn.MSELoss() # Not cross entropy because regression problem
    optimizer = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate) 
    
    # RUN THE MODEL
    model, train_loss_list, test_loss_list = train_model(lstm_model, train_loader, test_loader, criterion, optimizer, epochs, model_name)
    
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    #  = = = = = = = = = = = = = = = = Test the Model = = = = = = = = = = = = = = = = = =
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    
    # Predicted vs true stock prices
    best_model = LSTM_NN(input_size, hidden_size, num_layers, num_outputs).to(device)

    # Load best model weights
    best_model.load_state_dict(torch.load('checkpoint'+'/'+ model_name + '.pth'))
    _ , predicted, true_labels = test_model(best_model, test_loader, criterion)

    predicted = np.concatenate(predicted)
    true_labels = np.concatenate(true_labels)

    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    # = = = = = = = = = = = = = = = = = Save Graph Plot = = = = = = = = = = = = = = = = =
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    
    # Revert back to normal
    pred_unscaled = predicted*np.sqrt(scaler.var_[3])+ scaler.mean_[3]
    true_unscaled = true_labels*np.sqrt(scaler.var_[3])+ scaler.mean_[3]
    x_day = range(len(predicted))
    plt.plot(x_day, pred_unscaled, label='Predicted')
    plt.plot(x_day, true_unscaled, label='True')

    plt.title('Predicted vs True Stock Prices',fontsize=14, fontweight='bold')
    plt.xlabel('Day', fontsize=14, fontweight='bold')
    plt.ylabel('Stock Price',fontsize=14, fontweight='bold')
    plt.legend(loc='best')
    
    if model_name == "GRU_model":
        plt.savefig("GRU_Graphs/GRU_Graph_" + str(seq_len) + ".png")
    else:
        plt.savefig("LSTM_Graphs/LSTM_Graph_" + str(seq_len) + ".png")
    
    # Clear plot
    plt.clf()
    
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
    # = = = = = = = = = = = = = = = = = Save Metrics = = = = = = = = = = = = = = = = = =
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    
    # Get result for each number of forecast steps
    for fore_step in [10,20,30,40,50,60,753]:
        rmse = mean_squared_error(true_unscaled[:fore_step], pred_unscaled[:fore_step], squared=False)
    
        # Record rmse values in csv file
        print("The RMSE value is:", rmse)
        with open(model_name + ".csv", "a") as res_file:
            res_file.write(str(seq_len) + ", " + str(fore_step)+ ", " + str(rmse)+"\n") # + ",epoch_num:"+str(num_epochs)+"\n")

SEQUENCE LENGTH IS: 5


  self.target = torch.Tensor([np.array(raw_data['Close']).astype(np.float32)]).T


 Epoch [1/1], Training loss: 0.028
 Test loss of the network: 3.529 
 Test loss of the network: 3.669 
The RMSE value is: 32.90625
The RMSE value is: 26.969566
The RMSE value is: 23.67039
The RMSE value is: 23.264822
The RMSE value is: 21.047108
The RMSE value is: 23.354504
The RMSE value is: 550.0472
SEQUENCE LENGTH IS: 10
 Epoch [1/1], Training loss: 0.031
 Test loss of the network: 3.685 
 Test loss of the network: 3.463 
The RMSE value is: 36.032112
The RMSE value is: 29.238943
The RMSE value is: 25.700165
The RMSE value is: 24.426968
The RMSE value is: 22.251534
The RMSE value is: 23.139328
The RMSE value is: 534.36206
SEQUENCE LENGTH IS: 20
 Epoch [1/1], Training loss: 0.028
 Test loss of the network: 3.333 
 Test loss of the network: 3.429 
The RMSE value is: 35.907494
The RMSE value is: 29.149239
The RMSE value is: 25.621
The RMSE value is: 24.37124
The RMSE value is: 22.193316
The RMSE value is: 23.12657
The RMSE value is: 531.7441
SEQUENCE LENGTH IS: 30
 Epoch [1/1], Training

<Figure size 432x288 with 0 Axes>

In [108]:
# # Predicted vs true stock prices
# best_model = LSTM_NN(input_size, hidden_size, num_layers, num_outputs).to(device)

# # Load best model weights
# best_model.load_state_dict(torch.load('checkpoint'+'/'+ model_name + '.pth'))
# _ , predicted, true_labels = test_model(best_model, test_loader, criterion)

# predicted = np.concatenate(predicted)
# true_labels = np.concatenate(true_labels)

# # Revert back to normal
# pred_unscaled = predicted*np.sqrt(scaler.var_[3])+ scaler.mean_[3]
# true_unscaled = true_labels*np.sqrt(scaler.var_[3])+ scaler.mean_[3]
# x_day= range(len(predicted))
# plt.plot(x_day, pred_unscaled, label='predicted')
# plt.plot(x_day, true_unscaled, label='True')

    
# plt.title('Predicted vs True Stock Prices',fontsize=14, fontweight='bold')
# plt.xlabel('Day', fontsize=14, fontweight='bold')
# plt.ylabel('Stock Price',fontsize=14, fontweight='bold')
# plt.legend(loc='best')
# plt.show()