In [29]:
import numpy as np
import sklearn.ensemble
import torch

from matplotlib import pyplot as plt
from lib import model

# Introduction

This notebook provides a brief walkthrough of the public code release for our KDD 2018 paper: Deep Multi-Output Forecasting: Learning to Accurately Predict Blood Glucose Trajectories. The full paper is available on arXiv: https://arxiv.org/abs/1806.05357. We hope to release our glucose data to the general public soon. In the meantime, readers working on blood glucose forecasting may find the recently released OhioT1DM dataset useful: http://smarthealth.cs.ohio.edu/OhioT1DM-dataset.html.

## Model Types

Our paper considers 8 classes of models:

Shallow Baselines
* Extrapolation
* Recursive Random Forest
* Multi-Output Random Forest

Deep Baselines
* Recursive RNN
* Multi-Output RNN

Our Approaches
* Sequential Multi-Output RNN
* Polynomial Multi-Output RNN
* Polynomial Sequential Multi-Output RNN

We will walk through how we implemented, trained, and evaluated each model.

## Shallow Baselines

### Extrapolation

This is a simple linear extrapolation baseline implemented via Numpy. We extrapolate using the last 30 minutes (6 samples as our data was sampled at 5 minute intervals) to predict 30 minutes into the future.

In [45]:
# Seed NumPy's global RNG so the synthetic data below is identical on every
# Restart & Run All. NOTE(review): scikit-learn estimators left at
# random_state=None are documented to draw from this same global RNG, so the
# random-forest fits further down should become repeatable too -- confirm.
np.random.seed(0)

# Synthetic random-walk series: cumulative sums of standard-normal steps along
# the time axis (axis=1).
#   data_tr: 1000 series x 16 samples
#   data_ts:  100 series x 10 samples
data_tr = np.cumsum(np.random.randn(1000, 16), axis=1)
data_ts = np.cumsum(np.random.randn(100, 10), axis=1)

In [17]:
n_input = 6   # number of most recent samples used to fit the polynomial
horizon = 6   # number of future samples to predict
degree = 1    # polynomial degree (1 = straight line)

# Time indices: the fit uses steps 0..n_input-1, the forecast continues from n_input.
fit_steps = np.arange(n_input)
future_steps = np.arange(horizon) + n_input

# For every test series, fit a polynomial to its last n_input samples and
# evaluate it at the future steps. polyfit returns coefficients from lowest to
# highest degree, whereas np.polyval expects highest first, hence the reversal.
extrap_pred = [
    np.polyval(
        p=np.polynomial.polynomial.polyfit(
            x=fit_steps, y=series[-n_input:], deg=degree
        )[::-1],
        x=future_steps,
    )
    for series in data_ts
]

### Recursive and Multi-Output Random Forest

Implemented using scikit-learn. Note the scikit-learn implementation automatically infers output size during the fitting step. 

#### Recursive

In [37]:
# Random forest used as the one-step-ahead model for recursive forecasting.
# n_jobs=-1 asks scikit-learn to use all available cores.
rf_rec = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
)

In [40]:
# Note: when training a recursive model for real, use all of the data by
# sliding an input_size-wide window over each series instead of a single split.
rec_input_cols = slice(0, 10)    # first 10 samples of each series are the inputs
rec_target_col = slice(10, 11)   # the single sample that follows is the target
X_rec_tr = data_tr[:, rec_input_cols]
y_rec_tr = data_tr[:, rec_target_col].ravel()  # flatten the (n, 1) column to 1-D

In [41]:
# Fit the one-step-ahead forest; the fitted estimator is the cell's output.
rf_rec.fit(X=X_rec_tr, y=y_rec_tr)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [54]:
# Recursive prediction: predict one step ahead, drop the oldest input sample,
# append the prediction as the newest input, and repeat for 6 steps.
X_mod = data_ts.copy()  # working copy so data_ts itself is never modified
p_rec_arr = []
for step in range(6):
    p = rf_rec.predict(X_mod)
    p_col = p.reshape(-1, 1)  # column vector, one prediction per series
    p_rec_arr.append(p_col)
    shifted_inputs = X_mod[:, 1:]
    X_mod = np.concatenate([shifted_inputs, p_col], axis=1)

#### Multi-Output

In [61]:
# Note: when training a multi-output model for real, use all of the data by
# sliding an input_size-wide window over each series instead of a single split.
mo_input_cols = slice(0, 10)      # first 10 samples of each series are the inputs
mo_target_cols = slice(10, None)  # every remaining sample is a target
X_mo_tr = data_tr[:, mo_input_cols]
y_mo_tr = data_tr[:, mo_target_cols]

In [62]:
# Random forest that predicts the whole forecast window in a single call.
# n_jobs=-1 asks scikit-learn to use all available cores.
rf_mo = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
)

In [63]:
# Fit the multi-output forest; the output size is taken from y_mo_tr's shape.
rf_mo.fit(X=X_mo_tr, y=y_mo_tr)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [65]:
# Predict all future steps for every test series in one call.
p_mo_arr = rf_mo.predict(X=data_ts)

## Deep Models
Our deep models are all implemented in PyTorch and are a bit more involved to train. The basic training procedure is outlined in the ExperimentTrainer class in lib/trainer.py; its train_sup function is used to fit the provided model. The use of TensorboardX is not required, but it is convenient for monitoring losses. The data is assumed to be a PyTorch dataset structured like the one in lib/glucose_dataset.py (though the specifics can vary greatly).

Note that the dataset code requires precomputed polynomial coefficients for the PolyMO setting. This can be done using Numpy's polyfit function on your training data. 

The cuda flag should be set to True if a GPU is available.

### Recursive Baseline

In [None]:
# Recursive RNN baseline. Set cuda=True if a GPU is available.
# NOTE(review): the meaning of output_dim=361 is defined in lib/model.py -- confirm there.
rec_rnn = model.RecursiveRNN(
    input_dim=1,
    output_dim=361,
    hidden_size=512,
    depth=2,
    cuda=False,
)

### Multi-Output Baseline

In [None]:
# Multi-output RNN baseline with a 6-step output (output_len=6).
# Set cuda=True if a GPU is available.
mo_rnn = model.MultiOutputRNN(
    input_dim=1,
    output_dim=361,
    output_len=6,
    hidden_size=512,
    depth=2,
    cuda=False,
)

### Sequential Multi-Output

In [None]:
# Sequential multi-output variant: same class as the multi-output baseline,
# switched on with sequence=True. Set cuda=True if a GPU is available.
seqmo_rnn = model.MultiOutputRNN(
    input_dim=1,
    output_dim=361,
    output_len=6,
    hidden_size=512,
    depth=2,
    cuda=False,
    sequence=True,
)

### Polynomial Multi-Output

In [None]:
# Polynomial multi-output variant: polynomial=True with a degree-1 polynomial.
# Set cuda=True if a GPU is available.
polymo_rnn = model.MultiOutputRNN(
    input_dim=1,
    output_dim=361,
    output_len=6,
    hidden_size=512,
    depth=2,
    cuda=False,
    polynomial=True,
    degree=1,
)

### Polynomial Sequential Multi-Output

In [None]:
# Polynomial sequential multi-output variant: both sequence=True and
# polynomial=True (degree-1 polynomial). Bound to its own name so it does not
# overwrite `polymo_rnn`, the polynomial multi-output model built in the
# previous section. Set cuda=True if a GPU is available.
polyseqmo_rnn = model.MultiOutputRNN(
    input_dim=1,
    output_dim=361,
    output_len=6,
    hidden_size=512,
    depth=2,
    cuda=False,
    sequence=True,
    polynomial=True,
    degree=1,
)