In [None]:
# Adapted from Robert Guthrie https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
# And: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
import sklearn
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import json
import glob
import h5py
import time

# Pin the global RNG state of both libraries so repeated runs draw the same
# random numbers. The two generators are independent and seeded separately.
torch.manual_seed(seed=25)
np.random.seed(seed=55)
from temperature_scaling import ModelWithTemperature

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
# `dev` keeps the plain string name; `device` is the torch.device built from it.
dev = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)

In [None]:
# Complete list of numeric feature names. A boolean mask derived from this
# list is applied to the last axis of the HDF `*_num` arrays, so the order here
# presumably mirrors the column order stored in that file -- TODO confirm.
all_cols = [
    'precip', 'temp', 'vpd', 'acres',
    'winddevyr', 'meanwetfrq', 'cluster', 'healthy',
    'farmed', 'hydromod', 'fttoroad', 'sthick2013',
    'lcf13', 'lcf11', 'lcf14', 'lcf15',
    'lcf2', 'lcf7', 'lcf6', 'lcf1',
    'lcf12', 'lcf9', 'lcf16', 'lcf8',
    'lcf10', 'lcf3',
]

In [None]:
# Column selection: every entry of `all_cols` except 'winddevyr'.
select_cols = [
    'precip', 'temp', 'vpd', 'acres',
    'cluster', 'healthy', 'farmed', 'hydromod',
    'fttoroad', 'meanwetfrq', 'sthick2013',  # 'winddevyr' intentionally left out
    'lcf13', 'lcf11', 'lcf14', 'lcf15',
    'lcf2', 'lcf7', 'lcf6', 'lcf1',
    'lcf12', 'lcf9', 'lcf16', 'lcf8',
    'lcf10', 'lcf3',
]

# Boolean mask with one entry per element of `all_cols`, True where that column
# is selected. The mask follows the order of `all_cols`, not of `select_cols`.
which_cols_from_hdf = np.isin(all_cols, select_cols)

# Output path for the model dict.
model_path = './model_weights.pytorch'

In [None]:
# How many playas to model; None means all playas will be used.
num_playas = None

# Layer sizes.
hidden_dim = 128
id_embed_dim, huc_embed_dim, author_embed_dim = 16, 8, 4
num_layers = 1

batch_size = 64

early_stopping = 16

# Input HDF file. Currently points at a directly mounted SSD on AWS EC2;
# change this path when running elsewhere.
hdf_path = '/data/all_prepped_data.h5'

# Hard-coded upper bounds: the torch embedding layers need to know the largest
# value they can expect for each categorical input.
max_id, max_author, max_huc = 71852, 4, 140

# One LSTM input feature per selected numeric column.
lstm_input_size = len(select_cols)

# Set Up Dataloader 

In [None]:
def _scale_split(raw, scaler, col_mask, n_features, fit=False):
    """Select feature columns from a 3-D array and standardize them.

    StandardScaler operates on 2-D input, so the first two axes are flattened
    into rows, scaled, and then restored to the original 3-D layout.

    Args:
        raw: 3-D array; the last axis holds the feature columns.
        scaler: StandardScaler instance (fitted already unless ``fit`` is True).
        col_mask: boolean mask applied to the last axis of ``raw``.
        n_features: number of columns expected after masking.
        fit: when True, fit the scaler on this split before transforming it.

    Returns:
        The scaled array with the same first two dimensions as ``raw`` and
        ``n_features`` columns on the last axis.

    Raises:
        ValueError: from ``reshape`` if the mask does not select exactly
            ``n_features`` columns.
    """
    picked = raw[:, :, col_mask]
    n_rows, seq_len = picked.shape[:2]
    # Reshaping to exactly `n_features` columns doubles as a sanity check on
    # the column mask.
    flat = picked.reshape([n_rows * seq_len, n_features])
    flat = scaler.fit_transform(flat) if fit else scaler.transform(flat)
    return flat.reshape([n_rows, seq_len, n_features])


scaler = StandardScaler()

# Every dataset is copied into memory with `[()]`, so the file handle is only
# needed inside this block; the context manager closes it afterwards instead
# of leaving it open for the lifetime of the kernel.
with h5py.File(hdf_path, 'r') as f:
    # Get some params for reshaping
    n_playas, train_seq_len = f['train_num'].shape[:2]
    val_seq_len = f['val_num'].shape[1]
    test_seq_len = f['test_num'].shape[1]

    # The scaler is fitted on the training split only and then reused,
    # unchanged, for the validation and test splits.
    train_num = _scale_split(
        f['train_num'][()], scaler, which_cols_from_hdf, lstm_input_size, fit=True)
    train_cat = f['train_cat'][()].astype(int)
    train_y = f['train_y'][()]

    val_num = _scale_split(
        f['val_num'][()], scaler, which_cols_from_hdf, lstm_input_size)
    val_cat = f['val_cat'][()].astype(int)
    val_y = f['val_y'][()]

    test_num = _scale_split(
        f['test_num'][()], scaler, which_cols_from_hdf, lstm_input_size)
    test_cat = f['test_cat'][()].astype(int)
    test_y = f['test_y'][()]

In [None]:
def _to_tensors(num, cat, y):
    """Convert one split's arrays to tensors with explicit dtypes.

    The categorical array is converted straight to int64. Building it with
    ``torch.Tensor(...)`` first would route the ids through float32, which
    cannot represent every integer above 2**24 and allocates a throwaway copy.

    Args:
        num: numeric feature array for the split.
        cat: integer categorical array for the split.
        y: target array for the split.

    Returns:
        Tuple ``(num, cat, y)`` of float32, int64 and float32 tensors.
    """
    return (
        torch.tensor(num, dtype=torch.float32),
        torch.tensor(cat, dtype=torch.long),
        torch.tensor(y, dtype=torch.float32),
    )


# All three splits live in one dataset, so each item yields nine tensors:
# (num, cat, y) for train, then validation, then test. TensorDataset requires
# every tensor to have the same size along the first dimension.
train_val_test_ds = torch.utils.data.TensorDataset(
    *_to_tensors(train_num, train_cat, train_y),
    *_to_tensors(val_num, val_cat, val_y),
    *_to_tensors(test_num, test_cat, test_y),
)

# Fixed iteration order (no shuffling); the final, possibly smaller, batch is kept.
train_val_test_loader = torch.utils.data.DataLoader(
    train_val_test_ds,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False)

In [None]:
# TODO: both names below are still placeholders (the Ellipsis object) and must
# be replaced with real objects before they can be used.
orig_model = Ellipsis  # create an uncalibrated model somehow
valid_loader = Ellipsis  # Create a DataLoader from the SAME VALIDATION SET used to train orig_model



### Model calibration