In [None]:
# Adapted from Robert Guthrie https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
# And: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
import sklearn
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import json
import glob
import h5py
import time

# Fix both RNG streams up front so repeated runs of the notebook are comparable.
NUMPY_SEED = 55
TORCH_SEED = 25

np.random.seed(NUMPY_SEED)
torch.manual_seed(TORCH_SEED)

In [None]:
# Prefer the GPU when torch can see one, otherwise run on the CPU.
# `dev` keeps the plain string; `device` is the torch.device built from it.
dev = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)

In [None]:
# Full list of column names; the order matters because a boolean mask built
# from this list is used to index the last axis of the numeric HDF5 arrays.
all_cols = [
    'precip', 'temp', 'vpd', 'acres', 'winddevyr', 'meanwetfrq', 'cluster',
    'healthy', 'farmed', 'hydromod', 'fttoroad', 'sthick2013', 'mean_inun',
    'lcf13', 'lcf11', 'lcf14', 'lcf15', 'lcf2', 'lcf7', 'lcf6', 'lcf1',
    'lcf12', 'lcf9', 'lcf16', 'lcf8', 'lcf10', 'lcf3',
]

# Set Params

In [None]:
# Column selection: the numeric features actually fed to the model.
# Relative to all_cols, only 'meanwetfrq' and 'winddevyr' are left out.
select_cols = [
    'precip', 'temp', 'vpd', 'acres',
    'cluster', 'healthy', 'farmed', 'hydromod',
    'fttoroad', 'sthick2013', 'mean_inun',  # 'meanwetfrq', 'winddevyr',
    'lcf13', 'lcf11', 'lcf14', 'lcf15', 'lcf2', 'lcf7', 'lcf6', 'lcf1',
    'lcf12', 'lcf9', 'lcf16', 'lcf8', 'lcf10', 'lcf3',
]

# Boolean mask aligned with all_cols (True where the column is selected).
# Because it is a mask, the selected columns keep the order of all_cols,
# not the order in which they are written in select_cols.
which_cols_from_hdf = np.isin(all_cols, select_cols)

# Path of the saved model state dict (read by torch.load further down).
model_path = './model_weights_earlyshift_morereg.pytorch'

In [None]:
# ---- Data ----
# Number of playas to model; None means all playas will be used.
num_playas = None

# Input HDF5 file holding the prepared train/val/test arrays; change this to
# wherever the data lives on the current machine.
hdf_path = '../data/all_prepped_data_earlyshift.h5'

batch_size = 64

# ---- Network size ----
hidden_dim = 128
num_layers = 1
# One numeric input per selected column.
lstm_input_size = len(select_cols)

# ---- Embeddings ----
# Width of each learned embedding vector.
id_embed_dim = 16
huc_embed_dim = 8
author_embed_dim = 4

# Hard coded: the torch embedding layers need to know the largest index
# they can expect (each table is created with max + 1 rows).
max_id = 71852
max_huc = 140
max_author = 4

# ---- Training control ----
early_stopping = 16

# Set Up Dataloader

In [None]:
def _load_scaled(h5_file, name, scale_fn):
    """Read one numeric dataset, keep the selected columns and scale it.

    Args:
        h5_file: open h5py.File to read from.
        name: dataset key, e.g. 'train_num'; the dataset is indexed as
            [playa, timestep, column].
        scale_fn: either ``scaler.fit_transform`` or ``scaler.transform``;
            it is applied to a 2-D [playa * timestep, column] view.

    Returns:
        The scaled array, reshaped back to [playa, timestep, column].
    """
    selected = h5_file[name][()][:, :, which_cols_from_hdf]
    n_rows, seq_len, n_features = selected.shape
    flat = selected.reshape(n_rows * seq_len, n_features)
    return scale_fn(flat).reshape(n_rows, seq_len, n_features)


# Open the file in a context manager so the HDF5 handle is always released,
# even if one of the reads below raises. Every array is fully read into
# memory inside the block, so nothing depends on the file afterwards.
with h5py.File(hdf_path, 'r') as f:
    # Get some params for reshaping
    n_playas, train_seq_len = f['train_num'].shape[:2]
    val_seq_len = f['val_num'].shape[1]
    test_seq_len = f['test_num'].shape[1]

    # The scaler is fitted on the training split only and then re-used,
    # unchanged, for the validation and test splits.
    scaler = StandardScaler()
    train_num = _load_scaled(f, 'train_num', scaler.fit_transform)
    train_cat = f['train_cat'][()].astype(int)
    train_y = f['train_y'][()]

    val_num = _load_scaled(f, 'val_num', scaler.transform)
    val_cat = f['val_cat'][()].astype(int)
    val_y = f['val_y'][()]

    test_num = _load_scaled(f, 'test_num', scaler.transform)
    test_cat = f['test_cat'][()].astype(int)
    test_y = f['test_y'][()]

In [None]:
def _as_float(array):
    """Copy a numpy array into a float32 tensor."""
    return torch.tensor(array, dtype=torch.float32)


def _as_index(array):
    """Copy an integer numpy array into an int64 tensor.

    The conversion is done directly rather than through a float32 tensor,
    which cannot represent every integer above 2**24 exactly and would
    silently change large ids.
    """
    return torch.tensor(array, dtype=torch.long)


# Train, val and test arrays are stored side by side in one dataset so each
# batch yields the same rows (playas) from all three splits.
train_val_test_ds = torch.utils.data.TensorDataset(
    _as_float(train_num), _as_index(train_cat), _as_float(train_y),
    _as_float(val_num), _as_index(val_cat), _as_float(val_y),
    _as_float(test_num), _as_index(test_cat), _as_float(test_y)
    )
# shuffle=False and drop_last=False: every row is visited once, in stored order.
train_val_test_loader = torch.utils.data.DataLoader(
    train_val_test_ds,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False)

# Prep Model

In [None]:
# Here we define our model as a class
class LSTM(nn.Module):
    """LSTM over numeric features concatenated with three learned embeddings.

    At every timestep the numeric inputs are concatenated with the embedding
    of a playa id, a huc index and an author index, passed through an LSTM,
    then through a linear layer and a sigmoid, so each output lies in (0, 1).

    The hidden and cell states live on the instance (``self.h``, ``self.c``).
    ``init_hidden`` must be called before ``forward``, and again whenever the
    batch size changes or the state should be reset; ``forward`` overwrites
    both with the state returned by the LSTM.

    Args:
        input_dim: number of numeric features per timestep.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        output_dim: number of outputs per timestep.
        id_embed_dim, huc_embed_dim, author_embed_dim: embedding widths.
        max_id, max_huc, max_author: largest index each embedding must
            accept (each table has ``max + 1`` rows).
        device: torch.device on which layers and hidden state are created.
    """

    def __init__(self,
                 input_dim, hidden_dim, num_layers, output_dim,
                 id_embed_dim, max_id, huc_embed_dim, max_huc, author_embed_dim, max_author,
                 device):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim

        self.id_embed_dim = id_embed_dim
        self.max_id = max_id
        self.huc_embed_dim = huc_embed_dim
        self.max_huc = max_huc
        self.author_embed_dim = author_embed_dim
        self.max_author = max_author

        # Width of the tensor the LSTM actually sees: numeric features plus
        # the three embedding vectors.
        self.full_input_dim = self.input_dim + self.id_embed_dim + self.huc_embed_dim + self.author_embed_dim

        self.device = device

        # Define embedding layers (+1 so that index == max is valid)
        self.id_embedding = nn.Embedding(self.max_id+1, self.id_embed_dim).to(self.device)
        self.huc_embedding = nn.Embedding(self.max_huc+1, self.huc_embed_dim).to(self.device)
        self.author_embedding = nn.Embedding(self.max_author+1, self.author_embed_dim).to(self.device)

        # Define the LSTM layer
        self.lstm = nn.LSTM(self.full_input_dim, self.hidden_dim, self.num_layers, batch_first=True).to(self.device)

        # Define output head; placed on the same device as the other layers
        # so the module is usable without a separate .cuda()/.to() call.
        self.linear = nn.Linear(self.hidden_dim, self.output_dim).to(self.device)
        self.sigmoid = nn.Sigmoid()


    def init_hidden(self, batch_size):
        """Reset hidden and cell state to zeros for a batch of ``batch_size``."""
        self.h = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=self.device)
        self.c = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=self.device)
        return


    def forward(self, input_numeric, playa_ids, hucs, auths):
        """Run one forward pass.

        Args:
            input_numeric: float tensor [batch_size, timesteps, input_dim].
            playa_ids, hucs, auths: integer index tensors
                [batch_size, timesteps] for the three embeddings.

        Returns:
            y_pred: tensor [batch_size, timesteps, output_dim] of sigmoid
            outputs.

        Raises:
            AssertionError: if the stored hidden state does not match the
                batch size of ``input_numeric`` or an intermediate shape is
                not the expected one.
        """
        batch, timesteps = input_numeric.size(0), input_numeric.size(1)
        state_shape = torch.Size([self.num_layers, batch, self.hidden_dim])

        # Check that hidden layers have expected shape
        assert self.h.shape == state_shape
        assert self.c.shape == state_shape

        # Every input goes to the model's device, numeric features included,
        # so callers may pass CPU tensors.
        input_numeric = input_numeric.to(self.device)

        # Run categorical data through embeddings
        self.id_emb = self.id_embedding(playa_ids.to(self.device))
        self.huc_emb = self.huc_embedding(hucs.to(self.device))
        self.author_emb = self.author_embedding(auths.to(self.device))

        # Concat embedding and inputs along the feature axis and run through LSTM
        lstm_out, (self.h, self.c) = self.lstm(
            torch.cat((input_numeric, self.id_emb, self.huc_emb, self.author_emb), 2),
            (self.h, self.c)
        )

        # Assert that shapes are still as expected
        assert self.h.shape == state_shape
        assert self.c.shape == state_shape
        assert lstm_out.shape == torch.Size([batch, timesteps, self.hidden_dim])  # batch, seq_len, hidden

        # Run activation and get outputs
        lin_act = self.linear(lstm_out)
        y_pred = self.sigmoid(lin_act)

        # Check that outputs are expected shape [batch_size, seq_len, output_dim]
        assert lin_act.shape == torch.Size([batch, timesteps, self.output_dim])
        assert lin_act.shape == y_pred.shape

        return y_pred


# Collect the constructor arguments in one place, then build the network.
model_kwargs = dict(
    input_dim=lstm_input_size,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_dim=1,
    id_embed_dim=id_embed_dim,
    max_id=max_id,
    huc_embed_dim=huc_embed_dim,
    max_huc=max_huc,
    author_embed_dim=author_embed_dim,
    max_author=max_author,
    device=device,
)
model = LSTM(**model_kwargs)

# When running on the GPU, move every parameter of the module there.
if dev == 'cuda':
    model.cuda()

# Run model

In [None]:
# map_location remaps the stored tensors onto the active device, so weights
# saved from a GPU session also load on a CPU-only machine (without it,
# torch.load tries to restore each tensor to the device it was saved from).
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(state_dict)
# Switch the module to evaluation mode before running inference.
model.eval()

In [None]:
bceloss = torch.nn.BCELoss()

#####################---------------------------------------------------------------------------
# Run model
#####################
y_ordered = []
all_pred = []
all_cats_list = []

# Inference only: no_grad stops autograd from recording a graph for every
# batch (and for the hidden state kept on the model), which would otherwise
# cost memory for gradients that are never used.
with torch.no_grad():
    for (x_batch, cat_batch, y_batch,
         val_x_batch, val_cat_batch, val_y_batch,
         test_x_batch, test_cat_batch, test_y_batch) in train_val_test_loader:
        # Stitch the train, val and test segments together along dim 1 (time)
        # so the model sees each playa's full sequence in one pass.
        all_x_batch = torch.cat([x_batch, val_x_batch, test_x_batch], 1).to(device)
        all_cat_batch = torch.cat([cat_batch, val_cat_batch, test_cat_batch], 1).to(device)
        all_y_batch = torch.cat([y_batch, val_y_batch, test_y_batch], 1).to(device)

        # Ground truth, flattened in the same order as the predictions below
        y_ordered.append(all_y_batch.view(-1))

        # Categorical variables (3 per timestep) to help with analysis
        all_cats_list.append(all_cat_batch.view(-1, 3))

        # Fresh zero state for every batch; the last batch may be smaller
        model.init_hidden(batch_size=x_batch.size(0))

        # Predict; categorical columns 0, 1, 2 feed the id, huc and author embeddings
        pred = model(all_x_batch, all_cat_batch[:,:,0], all_cat_batch[:,:,1], all_cat_batch[:,:,2])

        all_pred.append(pred.view(-1))


best_pred = torch.cat(all_pred, dim=0).cpu().numpy()
# Save ordered ground truth
y_ordered = torch.cat(y_ordered).cpu().numpy()

all_cats = torch.cat(all_cats_list).cpu().numpy()


In [None]:
# Binary cross-entropy of all flattened predictions against the ground truth.
bceloss(input=torch.tensor(best_pred), target=torch.tensor(y_ordered))

# Save as csv

In [None]:
# One row per flattened prediction: label, model output and playa id
# (the id is the first categorical column).
pred_columns = dict(
    true=y_ordered.astype(np.int8),
    pred=best_pred,
    id=all_cats[:, 0].astype(np.uint32),
)
pred_df = pd.DataFrame(pred_columns)

In [None]:
# Write the predictions to disk; index=False leaves the row index out of the file.
pred_df.to_csv(path_or_buf='./all_preds_earlyshift_morereg.csv', index=False)