# Time Series Prediction with LSTM Using PyTorch
In this notebook we will use PyTorch to build an LSTM model to help us forecast the number of traffic collisions.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler

import boto3
import awswrangler
# Name of the S3 bucket that holds the raw collision CSV files; it is
# interpolated into the S3 URI used when the data is read.
s3_bucket = 'traffic-data-bucket'

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir

# current working directory of the notebook, and its parent directory
path = Path.cwd()
root = path.parent.absolute()

root

## 1. Connect to AWS Services

In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Open a boto3 session with the credentials loaded from the local
# aws_secrets module; this session is passed to the S3 read below.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

## 2. Import data from S3
AWS Wrangler is used to read all files in the S3 Bucket with a .csv suffix into a single Pandas dataframe.

In [None]:
# S3 prefix (folder) that holds the raw crash files
raw_path_dir = 'TIMS_raw_crashes/'

# full S3 URI of that prefix inside the bucket
raw_path = 's3://' + s3_bucket + '/' + raw_path_dir

# load every object under the prefix whose key ends in .csv into one dataframe
collision_df = awswrangler.s3.read_csv(
    path=raw_path,
    path_suffix=['.csv'],
    dataset=True,
    boto3_session=my_session,
    use_threads=True,
    low_memory=False,
)

In [None]:
# preview the first rows of the raw collision records
collision_df.head()

In [None]:
# one row per collision record, so the row count is the number of collisions
print('Number of collisions:', len(collision_df))

In [None]:
# distinct accident years present in the data, shown in ascending order
accident_years = collision_df['ACCIDENT_YEAR'].unique().tolist()
sorted(accident_years)

We can see that our dataset contains records for years 2014 to 2021.

## 3. Data preprocessing

In [None]:
# inspect the dtypes of the two date-related columns before converting them
collision_df[['ACCIDENT_YEAR','COLLISION_DATE']].dtypes

Convert the collision date to a datetime type and derive year-month and year-week labels from it.

In [None]:
# parse the collision date once, then reuse the parsed series for both labels
collision_dates = collision_df['COLLISION_DATE'].astype('datetime64[ns]')
collision_df['COLLISION_DATE'] = collision_dates

# '%Y-%m' -> year and zero-padded month
collision_df['Year-Month'] = collision_dates.dt.strftime('%Y-%m')
# '%Y-%U' -> year and zero-padded week of the year (weeks start on Sunday;
# days before the first Sunday of the year fall in week 00)
collision_df['Year-Week'] = collision_dates.dt.strftime('%Y-%U')

Next, we group by year-week to count the number of accidents in each week.

In [None]:
# Weekly collision counts: one row per (Year-Week, ACCIDENT_YEAR) pair, where
# the count is the number of non-null CASE_ID values in the group.
df = (
    collision_df
    .groupby(['Year-Week', 'ACCIDENT_YEAR'], as_index=False)['CASE_ID']
    .count()
    .rename(columns={'CASE_ID': 'Accidents', 'ACCIDENT_YEAR': 'Accident_Year'})
)

In [None]:
# preview the aggregated weekly counts
df.head()

In [None]:
# (number of year-week groups, number of columns)
df.shape

Let's create a simple chart to visualize the number of weekly collisions over time.

In [None]:
# 'Year-Week' holds '%Y-%U' labels such as '2014-13'. These are not parseable
# dates, so the x axis is encoded as an ordered category (:O) rather than as a
# temporal field (:T). The zero-padded labels sort in chronological order.
alt.Chart(df).mark_line().encode(
    x=alt.X('Year-Week:O', title='Year-Week',
            axis=alt.Axis(grid=False, labelOverlap=True)),
    y=alt.Y('Accidents:Q', title='Collisions per week',
            axis=alt.Axis(grid=False))
).properties(width=600, title='2014 to 2021 Collisions')

In [None]:
# training_set = df[df['Accident_Year'] <= 2019]
# test_set = df[df['Accident_Year'] >= 2020]

### Select features and perform feature scaling

In [None]:
# weekly collision counts as a single-column 2-D array, the shape the scaler expects
training_data = df['Accidents'].values.reshape(-1, 1)

# rescale the counts to the [0, 1] range (MinMaxScaler default)
# NOTE(review): the scaler is fit on the whole series, including the weeks that
# are later held out for testing -- confirm this is intended.
sc = MinMaxScaler()
training_data = sc.fit_transform(training_data)

In [None]:
# number of weekly observations in the scaled series
len(training_data)

### Define a sliding window

In [None]:
def sliding_windows(data, seq_length):
    """Turn a series into (window, next value) pairs for one-step-ahead forecasting.

    Sample ``i`` pairs the input window ``data[i : i + seq_length]`` with the
    target ``data[i + seq_length]``. Every position that has a following value
    is used, giving ``len(data) - seq_length`` samples (zero if the series is
    not longer than the window).

    Parameters
    ----------
    data : sequence or np.ndarray
        Observations in chronological order, indexed along the first axis.
    seq_length : int
        Number of consecutive observations in each input window (>= 1).

    Returns
    -------
    tuple of np.ndarray
        ``(x, y)`` where ``x[i]`` is the i-th window and ``y[i]`` is the
        observation that immediately follows it.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    # The last valid start index is len(data) - seq_length - 1, so the range
    # stops at len(data) - seq_length and the final window is included.
    n_windows = len(data) - seq_length

    x = [data[i:i + seq_length] for i in range(n_windows)]
    y = [data[i + seq_length] for i in range(n_windows)]

    return np.array(x), np.array(y)

### Split data into train and test sets
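The data is split chronologically: the earlier part of the series (approximately years 2014 through 2019) forms the training set, and the remainder (approximately 2020 and 2021) serves as the held-out test set. Because the series is aggregated by week and then turned into sliding-window samples, the split is expressed as a fraction of the samples: the first 74% are used for training and the rest for testing.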

## 4-week window

In [None]:
# number of consecutive weekly values fed to the model for each prediction
seq_length = 4

# build the (window, next value) samples from the scaled series
x, y = sliding_windows(training_data, seq_length)

# chronological split: the first 74% of the samples train the model,
# the remaining samples are held out for testing
n_samples = len(y)
train_size = int(n_samples * 0.74)
test_size = n_samples - train_size

In [None]:
# Each sample is one sliding window whose target is a weekly collision count,
# so the sizes are reported in weekly samples rather than months.
print('Number of weekly samples in training set:', train_size)
print('Number of weekly samples in test set:', test_size)

### Convert the full, training, and test data to tensors

In [None]:
# `x` and `y` are already NumPy arrays, and plain tensors replace the
# deprecated `Variable` wrapper. float32 matches the default dtype of the
# model parameters.

# full series (used later to plot predictions over the whole period)
dataX = torch.tensor(x, dtype=torch.float32)
dataY = torch.tensor(y, dtype=torch.float32)

# first `train_size` samples, in chronological order, for training
trainX = torch.tensor(x[:train_size], dtype=torch.float32)
trainY = torch.tensor(y[:train_size], dtype=torch.float32)

# remaining samples, held out for testing
testX = torch.tensor(x[train_size:], dtype=torch.float32)
testY = torch.tensor(y[train_size:], dtype=torch.float32)

## 4. Build LSTM model

### Define the LSTM Model

In [None]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer that maps the final hidden state to the output.

    Parameters
    ----------
    num_classes : int
        Size of the output vector produced for each input sequence.
    input_size : int
        Number of features per time step.
    hidden_size : int
        Number of hidden units in each LSTM layer.
    num_layers : int
        Number of stacked LSTM layers.
    seq_length : int, optional
        Window length, stored on the instance for reference only (the forward
        pass does not use it). When omitted, a module-level ``seq_length`` is
        used if one exists, otherwise ``None``.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers,
                 seq_length=None):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Backward compatible with the earlier behaviour of reading the
        # notebook-level `seq_length`, but no longer raises NameError when
        # that global has not been defined yet.
        if seq_length is None:
            seq_length = globals().get('seq_length')
        self.seq_length = seq_length

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return one prediction per sequence.

        Parameters
        ----------
        x : torch.Tensor
            Batch of sequences laid out as ``(batch, seq_len, input_size)``
            (the LSTM is created with ``batch_first=True``).

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(batch, num_classes)``.
        """
        # When no initial state is passed, nn.LSTM starts from zero hidden and
        # cell states created on the same device and dtype as the input.
        _, (h_n, _) = self.lstm(x)

        # h_n has shape (num_layers, batch, hidden_size). Only the top layer's
        # final hidden state feeds the output layer, so the result has one row
        # per sequence regardless of num_layers.
        last_hidden = h_n[-1]

        return self.fc(last_hidden)

## 5. Model training
### 5.1 Train model

Check if GPU is available

In [None]:
# detect whether a CUDA device is available and report the result
train_on_gpu = torch.cuda.is_available()
gpu_status_message = (
    'Training on GPU!'
    if train_on_gpu
    else 'No GPU available, training on CPU; consider making n_epochs very small.'
)
print(gpu_status_message)

In [None]:
# Fix the RNG seed so the weight initialisation, and therefore the loss
# curve, is reproducible across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

num_epochs = 8000
learning_rate = 0.001

input_size = 1     # one feature per time step (the scaled collision count)
hidden_size = 2
num_layers = 1

num_classes = 1    # size of the model output: a single predicted value

lstm = LSTM(num_classes, input_size, hidden_size, num_layers)

criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Train the model: each epoch is one forward/backward pass over the whole
# training set (no mini-batching).
for epoch in range(num_epochs):
    # clear gradients left over from the previous step before the new pass
    optimizer.zero_grad()

    outputs = lstm(trainX)

    # obtain the loss function
    loss = criterion(outputs, trainY)

    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

### 5.2 Generate predictions and evaluate model performance

In [None]:
# Switch to evaluation mode and predict over the whole series (training and
# test samples). Gradients are not needed for inference, so tracking is
# disabled to avoid building an autograd graph.
lstm.eval()
with torch.no_grad():
    train_predict = lstm(dataX)

data_predict = train_predict.numpy()
actuals_plot = dataY.numpy()

# undo the min-max scaling so both series are in collision counts again
data_predict = sc.inverse_transform(data_predict)
actuals_plot = sc.inverse_transform(actuals_plot)

In [None]:
# number of predicted values (one per sliding-window sample)
print(len(data_predict))

In [None]:
# number of actual values; the model is called on dataX and these come from dataY
print(len(actuals_plot))

In [None]:
# Actual vs. predicted collision counts over the whole series. The dashed
# vertical line marks the first test sample; everything to its left was used
# for training.
fig, ax = plt.subplots(figsize=(20, 8))
ax.axvline(x=train_size, c='blue', linestyle='--', label='Train/test split')
ax.plot(actuals_plot, label='Actuals')
ax.plot(data_predict, label='Predictions')
ax.set_ylim(0, 1500)
ax.set_xlabel('Sample index')
ax.set_ylabel('Number of collisions')
ax.set_title('Time-Series Prediction')
ax.legend(loc="upper right")
plt.show()