# A toy example  

In this notebook, we will use vanilla LSTM recurrent neural networks to learn our model.  

*Note: This notebook uses the TensorFlow Probability library, which must be installed separately because it is not bundled with TensorFlow.*

In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp

import sys; sys.path.insert(0, '..')
from data.data_generator import *
from preprocess import *
from window_data import *

# Short aliases for the TensorFlow Probability submodules used in the model cells below.
tfd, tfpl = tfp.distributions, tfp.layers

## Data  

We generate data using the first model (also the simplest), and we only use `10` samples.

In [2]:
# Simulation settings, gathered in one place so they are easy to tune.
N = 10               # passed to model1 as 'no. of prey'
NUM_SAMPLES = 10     # value of generateData's `num_data` (the "10 samples" mentioned above)
TIME_SPAN = (0, 20)  # value of generateData's `times`
NUM_STEPS = 1000     # value of generateData's `steps`

# Parameters forwarded to model1; the keys are the names generateData expects.
model1_params = {
    'no. of prey': N,
    'kappa for prey': 0.5,
    'attraction of prey a': 1,
    'repulsion of prey b_1': 1,
    'repulsion of pred b_2': 0.07,
    'attraction of pred c': 10,
    'exponent of dist pred p': 1.2,
}

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

total = generateData(
    model1,
    num_data=NUM_SAMPLES,
    init_sty='random',
    times=TIME_SPAN,
    params=model1_params,
    steps=NUM_STEPS,
    second_order=False,
    method='rk2',
    return_vel=False,
    cores=8,
    flattened=False,
)

end = time.perf_counter()
print(f"Time taken: {end-start} seconds.")

Trying to use multiprocessing...
Multiprocessing successful.
Time taken: 1.225876808166504 seconds.


A plot showing the data

In [3]:
# multiPlot([total[0][1], 20/1000, 10], sample_points =[0,0.5,2,4,6,8,10],
#             axis_lim = None, second_order = False, quiver=True)

The data has the shape `(batch, times, individuals, coordinates)`

In [4]:
# Collect entry 1 of every generated sample into one array and display its shape.
data = np.array([sample[1] for sample in total])
data.shape

(10, 1000, 21, 2)

We will only use one initial condition for this experiment, as our naive implementation only works well with one time series

In [5]:
# Keep only the first generated time series and split it into train/validation/test parts.
# NOTE: `data` is rebound here, so re-running this cell on its own indexes the
# already-reduced array again; re-run the previous cell first.
data = data[0]
splits = getDatasets(data, scaling=False, return_ndarray=True)
train_ds, valid_ds, test_ds = splits

(800, 21, 2) (100, 21, 2) (100, 21, 2)


For an experiment, change to `input_width=900, label_width=100, shift=100`

In [6]:
# Windowing configuration: 10 input steps followed by 5 label steps
# (see the printed summary for the resulting index layout).
window1 = WindowData(
    input_width=10,
    label_width=5,
    shift=5,
    train_ds=train_ds,
    val_ds=valid_ds,
    test_ds=test_ds,
)
print(window1)

Total window size: 15
Input indices: [0 1 2 3 4 5 6 7 8 9]
Label indices: [10 11 12 13 14]
Label start: 10
Input slice: slice(0, 10, None)
Label slice: slice(10, None, None)


In [7]:
# Replace the raw splits with the windowed datasets produced by window1.
# NOTE: the names train_ds / valid_ds / test_ds are reused for a different kind
# of object from here on (objects exposing `element_spec` rather than the raw arrays).
train_ds, valid_ds, test_ds = (
    window1.make_train(),
    window1.make_val(),
    window1.make_test(),
)

# Show the (inputs, labels) element structure and the number of points per time step.
print(train_ds.element_spec)
print(window1.num_points)

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB

(TensorSpec(shape=(None, 10, 21, 2), dtype=tf.float32, name=None), TensorSpec(shape=(None, 5, 21, 2), dtype=tf.float32, name=None))
21


2022-09-14 23:50:49.859946: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-09-14 23:50:49.860048: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## A naive model

The idea is as follows. Given data of shape `(batch, times, individuals)`, we first `embed` it in some way (the idea is similar to one-hot encoding of integer/categorical values), which gives a tensor of shape `(batch, times, length of concatenated embeddings)`, and pass the result to an LSTM layer. From the LSTM output we can then produce a prediction at a single future time step, of shape `(batch, 1, individuals)`.

In [9]:
from rnn import *

# Wrap the embedder alone in a model so its output shape and parameter count
# can be inspected via summary().
input_shape = (10, 21, 2)
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    embedder(input_shape, 64, batch_size=32),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedder (embedder)         (32, 10, 1344)            861504    
                                                                 
Total params: 861,504
Trainable params: 861,504
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Demo: tfp.math.fill_triangular packs n*(n+1)/2 coefficients into an
# n x n lower-triangular matrix.
n = 4
num_coeffs = n * (n + 1) // 2  # integer arithmetic; no float round-trip needed

# Seeded local generator so the displayed matrix is reproducible on re-run
# and the global NumPy random state is left untouched.
demo_rng = np.random.default_rng(0)
coeffs = tf.constant(demo_rng.normal(0, 1, num_coeffs), dtype=tf.float64)

lower_diag = tfp.math.fill_triangular(coeffs)
lower_diag


<tf.Tensor: shape=(4, 4), dtype=float64, numpy=
array([[ 1.31127106,  0.        ,  0.        ,  0.        ],
       [-0.46805656, -1.09843779,  0.        ,  0.        ],
       [ 0.99597515, -0.07251197,  0.03541874,  0.        ],
       [ 1.08537411,  0.17744395,  0.07478798,  0.46986297]])>

In [36]:
embedding_size = 64

input_shape = (10, 21, 2)  # (input time steps, individuals, coordinates)
label_width = 5            # number of predicted time steps; must match window1's label width
event_size = 2             # each individual's position is a 2-D point
# A bivariate normal needs 2 mean entries plus the 3 entries of a
# lower-triangular 2x2 scale factor, i.e. 5 parameters per individual per step.
params_per_point = event_size + event_size * (event_size + 1) // 2


def _bivariate_normal(params):
    """Turn the last axis of `params` (size 5) into a bivariate normal.

    The first 2 entries of the last axis are the mean; the remaining 3 are
    mapped to a lower-triangular 2x2 scale factor.

    The raw network outputs are unconstrained, so they are passed through
    FillScaleTriL, which fills the triangle and makes the diagonal strictly
    positive (softplus plus a small shift). Using fill_triangular alone could
    yield zero or negative diagonal entries, giving an invalid scale factor
    and an unstable log-likelihood.
    """
    loc = params[..., :event_size]
    scale_tril = tfp.bijectors.FillScaleTriL(diag_shift=1e-5).forward(
        params[..., event_size:]
    )
    return tfd.MultivariateNormalTriL(loc=loc, scale_tril=scale_tril)


lstm_model_1 = tf.keras.models.Sequential([
    tf.keras.layers.Input(input_shape),
    embedder(input_shape, embedding_size, batch_size=32),
    tf.keras.layers.LSTM(2*embedding_size, return_sequences=False),
    # The Dense layer emits one flat parameter vector per batch element:
    # label_width time steps x window1.num_points individuals x 5 parameters
    # (5 * 21 * 5 = 525 values in this notebook).
    tf.keras.layers.Dense(label_width*window1.num_points*params_per_point,
                          activation='linear'),
    # Regroup so that the last axis holds the 5 parameters of one individual
    # at one predicted time step.
    tf.keras.layers.Reshape((label_width, window1.num_points, params_per_point)),
    # Resulting distribution: loc has shape (batch, 5, 21, 2) and
    # scale_tril has shape (batch, 5, 21, 2, 2).
    tfpl.DistributionLambda(_bivariate_normal),
])

lstm_model_1.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedder_9 (embedder)       (32, 10, 1344)            861504    
                                                                 
 lstm_8 (LSTM)               (32, 128)                 754176    
                                                                 
 dense_5 (Dense)             (32, 525)                 67725     
                                                                 
 reshape_7 (Reshape)         (32, 5, 21, 5)            0         
                                                                 
 distribution_lambda (Distri  ((32, 5, 21, 2),         0         
 butionLambda)                (32, 5, 21, 2))                    
                                                                 
Total params: 1,683,405
Trainable params: 1,683,405
Non-trainable params: 0
____________________________________________

Before going on, we take a sample from our dataset and pass it through the model to verify the output shape

In [44]:
# Push one batch through the model to check the shapes involved in the loss.
for x, y in train_ds.take(1):
    # Run the forward pass and log-probability once and reuse the result,
    # instead of evaluating the model twice on the same batch.
    log_prob = lstm_model_1(x).log_prob(y)

    print("The log probability shape is:")
    print(log_prob.shape)
    print("The true value's shape is:")
    print(y.shape)
    print("After reduction the probability shape is:")
    # Reducing over axis 1 collapses the predicted time steps.
    print(tf.reduce_mean(log_prob, axis=1).shape)

The log probability shape is:
(32, 5, 21)
The true value's shape is:
(32, 5, 21, 2)
After reduction the probability shape is:
(32, 21)


We also define a custom loss function that computes the negative log-likelihood over the predicted time steps.  

Recall that we would like to sum over the prediction time steps, which is the second axis in this case

In [12]:
def negLog(y_true, y_pred):
    """Negative log-likelihood of `y_true` under `y_pred`, summed over axis 1.

    Args:
        y_true: observed values, passed to `y_pred.log_prob`.
        y_pred: object exposing a `log_prob` method (here, the distribution
            produced by the model's final layer).

    Returns:
        The negated sum of the log-probabilities along axis 1, which is the
        prediction time-step axis for the model in this notebook.
    """
    log_likelihood = y_pred.log_prob(y_true)
    summed_over_steps = tf.reduce_sum(log_likelihood, axis=1)
    return tf.negative(summed_over_steps)

Now we can use `RMSProp` as in the paper to train our model.

In [45]:
# Train with RMSprop, using the negative log-likelihood defined above as the loss.
rmsprop = tf.keras.optimizers.RMSprop(learning_rate=0.003)
lstm_model_1.compile(loss=negLog, optimizer=rmsprop)

In [None]:
# Fit on the windowed training set for 10 epochs, validating on valid_ds.
lstm_model_1.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=10,
)