In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import math
import gc

  from .autonotebook import tqdm as notebook_tqdm


# Configuration

### Inputs

In [2]:
process_out_dir = '01_process/out/'

train_data_fpath = process_out_dir + 'train_data.npz'
valid_data_fpath = process_out_dir + 'valid_data.npz'
# not doing any test set stuff until the very, very end

In [3]:
extended_dir = '/caldera/projects/usgs/water/iidd/datasci/lake-temp/lake_ice_prediction/'

process_out_dir = extended_dir + process_out_dir

train_data_fpath = extended_dir + train_data_fpath
valid_data_fpath = extended_dir + valid_data_fpath

### Values

In [4]:
# 10000 is same epochs as first LSTM
epochs = 10000
# different, coarser printing compared to other models that
# early stop much sooner
coarse_epoch_printing = 50

# model hyperparams
random_seed = 4 # change for different 'random' initializations
model_dim = 512
ff_dim = 2048
nheads = 8
n_enc_layers = 6

# data loader hyperparams
bs = 100
shuffle = True
pin_memory = True # supposedly faster for CPU->GPU transfers

# training hyperparams
early_stop_patience = 50

### Outputs

In [5]:
train_out_dir = '02_train/out/'

# note that file names are adjusted with seed value
data_scalars_fpath =  train_out_dir + 'massive_transformer_min_max_scalars_' + str(random_seed) + '_.pt'
model_weights_fpath = train_out_dir + 'massive_transformer_weights_' + str(random_seed) + '_.pth'
train_predictions_fpath = train_out_dir + 'massive_transformer_train_preds_' + str(random_seed) + '_.npy'
valid_predictions_fpath = train_out_dir + 'massive_transformer_valid_preds_' + str(random_seed) + '_.npy'
loss_lists_fpath = train_out_dir + 'massive_transformer_loss_lists_' + str(random_seed) + '_.npz'

In [6]:
data_scalars_fpath = extended_dir + data_scalars_fpath
model_weights_fpath = extended_dir + model_weights_fpath
train_predictions_fpath = extended_dir + train_predictions_fpath
valid_predictions_fpath = extended_dir + valid_predictions_fpath
loss_lists_fpath = extended_dir + loss_lists_fpath

# Import

In [7]:
train_data = np.load(train_data_fpath, allow_pickle = True)
valid_data = np.load(valid_data_fpath, allow_pickle = True)

In [8]:
train_x = train_data['x']
train_y = train_data['y']
train_dates = train_data['dates']
train_DOW = train_data['DOW']
train_variables = train_data['features']

In [9]:
valid_x = valid_data['x']
valid_y = valid_data['y']
valid_dates = valid_data['dates']
valid_DOW = valid_data['DOW']
valid_variables = valid_data['features']

# Prepare data for `torch`

In [10]:
train_y = torch.from_numpy(train_y).float().unsqueeze(2) # adding a feature dimension to Ys
train_x = torch.from_numpy(train_x).float()

valid_y = torch.from_numpy(valid_y).float().unsqueeze(2)
valid_x = torch.from_numpy(valid_x).float()

# min-max scale the data

In [11]:
min_max_scalars = torch.zeros(train_x.shape[2], 2)

for i in range(train_x.shape[2]):
    min_max_scalars[i, 0] = train_x[:, :, i].min()
    min_max_scalars[i, 1] = train_x[:, :, i].max()

In [12]:
for i in range(train_x.shape[2]):
    # scale train set with train min/max
    train_x[:, :, i] = ((train_x[:, :, i] - min_max_scalars[i, 0]) /
                        (min_max_scalars[i, 1] - min_max_scalars[i, 0]))
    # scale valid set with train min/max
    valid_x[:, :, i] = ((valid_x[:, :, i] - min_max_scalars[i, 0]) /
                        (min_max_scalars[i, 1] - min_max_scalars[i, 0]))

# Define a simple transformer (encoder-only)

In [13]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 365):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        pe = torch.moveaxis(pe, [0, 1, 2], [1, 0, 2]) # BATCH FIRST
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [14]:
# recycled model code
class BasicTransformer(nn.Module):
    def __init__(self, input_dim, seq_len, model_dim, nheads, ff_dim, n_encoder_layers):
        super().__init__()
        
        self.embedding = nn.Sequential(
            torch.nn.Linear(input_dim, model_dim),
            torch.nn.ReLU()
        )
        self.pos_encoding = PositionalEncoding(model_dim)
        
        self.mask = (torch.triu(torch.ones((seq_len, seq_len))) == 0)
        self.mask = self.mask.transpose(0, 1)
        self.mask = self.mask.cuda()
        
        encoder_layers = nn.TransformerEncoderLayer(d_model = model_dim,
                                                    nhead = nheads,
                                                    batch_first = True,
                                                    dim_feedforward = ff_dim)
        self.encoder = nn.TransformerEncoder(encoder_layers, n_encoder_layers)
        
        self.dense = nn.Linear(model_dim, 1)
        self.dense_activation = nn.Sigmoid()
        
    def forward(self, x):
        """Assumes x is of shape (batch, sequence, feature)"""
        
        embed = self.embedding(x)
        positioned_embed = self.pos_encoding(embed)
        # mask = src_mask for nn.TransformerEncoder
        encoded = self.encoder(positioned_embed, mask = self.mask)
        out = self.dense_activation(self.dense(encoded))
        
        return out

In [15]:
torch.cuda.is_available(), torch.cuda.device_count()

(True, 1)

In [16]:
# initialize the model with a seed
torch.manual_seed(random_seed)
# initialize random mini batches with numpy seed
np.random.seed(random_seed)

model = BasicTransformer(train_x.shape[2], train_x.shape[1], model_dim, nheads, ff_dim, n_enc_layers).cuda()

# print number of model params
sum(param.numel() for param in model.parameters())

18920961

In [17]:
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())
loss_ls = []
valid_loss_ls = []

In [18]:
train_dataset = []
for i in range(len(train_x)):
    train_dataset.append([train_x[i], train_y[i]])
    
valid_dataset = []
for i in range(len(valid_x)):
    valid_dataset.append([valid_x[i], valid_y[i]])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=bs, shuffle=shuffle, pin_memory=pin_memory)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=bs, shuffle=shuffle, pin_memory=pin_memory)

In [19]:
%%time
worsened_valid_loss_count = 0

for i in range(epochs):
    # for each epoch, perform multiple training updates with the random mini batches of the whole training set
    cur_loss = 0
    for batch_x, batch_y in train_loader:
        # Move data to GPU
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
        # Predict and compute loss
        batch_y_hat = model(batch_x)
        batch_loss = loss_fn(batch_y_hat, batch_y)
        # Improve model
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        # Track set-wide loss
        cur_loss += batch_loss.item() * batch_x.shape[0]/train_x.shape[0]
        
    # Likewise, generate predictions for the entire validation set
    cur_val_loss = 0
    # No gradient to maximize hardware use (and not needed for validation)
    with torch.no_grad():
        # For all random mini batches in the validation set...
        for batch_val_x, batch_val_y in valid_loader:
            # Again, move to GPU
            batch_val_x, batch_val_y = batch_val_x.cuda(), batch_val_y.cuda()
            # Predict and eval loss
            batch_val_y_hat = model(batch_val_x)
            batch_val_loss = loss_fn(batch_val_y_hat, batch_val_y)
            # Track set-wide loss
            cur_val_loss += batch_val_loss.item() * batch_val_x.shape[0]/valid_x.shape[0]
    
    # Store new set-wide losses
    loss_ls.append(cur_loss)
    valid_loss_ls.append(cur_val_loss)
    
    # Early stopping: determine if validation set performance is degrading
    if cur_val_loss > min(valid_loss_ls):
        worsened_valid_loss_count += 1
        # Break after our patience has been exhausted
        if worsened_valid_loss_count == early_stop_patience:
            break
    # Only save model weights if validation set performance is improving
    else:
        worsened_valid_loss_count = 0
        torch.save(model.state_dict(), model_weights_fpath)
        
    # Occasionally print the current state
    if i % coarse_epoch_printing == 0:
        # epoch, current train loss, current valid loss, best valid loss
        print(i, batch_loss.item(), cur_val_loss, min(valid_loss_ls))
        
# Final print of the current state
print(i, cur_loss, cur_val_loss, min(valid_loss_ls))

0 0.6644801497459412 0.6771087334426817 0.6771087334426817
50 0.5309036374092102 0.5237719137582753 0.5181432453879359
100 0.458225280046463 0.46543416777145824 0.45245703254049835
150 0.45827680826187134 0.4599741347278584 0.43024161903811947
200 0.43358466029167175 0.4400907417248491 0.429577471311733
250 0.44539788365364075 0.4593915569485059 0.41843229078189814
300 0.4675615429878235 0.41380347728399025 0.40698383323373555
350 0.4190325140953064 0.43362935733597036 0.40346955493546593
381 0.42458941539128625 0.4119205119041855 0.40346955493546593
CPU times: user 53min 27s, sys: 56min 24s, total: 1h 49min 52s
Wall time: 1h 16min 50s


# Save stuff

In [20]:
batch_loss = None
batch_x = None
batch_y = None
batch_y_hat = None

gc.collect()

torch.cuda.empty_cache()

In [21]:
# Reload the best weights and stop performing dropout
model.load_state_dict(torch.load(model_weights_fpath))
model.eval()

BasicTransformer(
  (embedding): Sequential(
    (0): Linear(in_features=11, out_features=512, bias=True)
    (1): ReLU()
  )
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj

In [22]:
# Not using the data loader is simpler for variable Shuffle=True/False
# (and I implemented this prior to using formal data loaders)
def generate_all_preds_via_batch(x_tensor, batch_size):
    # make empty array for predictions
    y_hat_tensor = torch.zeros([x_tensor.shape[0], x_tensor.shape[1], 1])
    
    # until we use all the possible sequential batches...
    count = 1
    loop_max = int(np.ceil(x_tensor.shape[0] / batch_size))
    for i in range(loop_max):
        min_i = (count-1)*bs
        max_i = count*bs
        # generate batch-sized predictions
        if i != (loop_max - 1):
            with torch.no_grad():
                y_hat_tensor[min_i:max_i] = model(x_tensor[min_i:max_i].cuda()).cpu()
        # or remaining-sized predictions
        else:
            with torch.no_grad():
                y_hat_tensor[min_i:] = model(x_tensor[min_i:].cuda()).cpu()
        # update batch count
        count += 1
        
    return y_hat_tensor

In [23]:
train_y_hat = generate_all_preds_via_batch(train_x, bs)
valid_y_hat = generate_all_preds_via_batch(valid_x, bs)

In [24]:
train_y_hat = train_y_hat.numpy()
valid_y_hat = valid_y_hat.numpy()

In [25]:
np.save(train_predictions_fpath, train_y_hat)
np.save(valid_predictions_fpath, valid_y_hat)

In [26]:
torch.save(min_max_scalars, data_scalars_fpath)
torch.save(model.state_dict(), model_weights_fpath)

In [27]:
data = {'train_loss':loss_ls, 'valid_loss':valid_loss_ls}
np.savez_compressed(loss_lists_fpath, **data)