In [1]:
# Add the parent directory to the import path so packages located one level up
# can be imported (presumably where the project-local `tool` package lives — confirm).
import sys
sys.path.append('..')
# Install a global 'ignore' filter so warnings do not clutter the cell outputs.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import numpy as np
import xarray as xr
import random as rd
import platform
 
from tool.train_evaluate import Trainer, Evaluator
from tool.dataset import NetCDFDataset
from tool.loss import RMSELoss
from tool.utils import Util

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch import optim
%matplotlib inline

Loading specific settings (some defined by the "run-notebooks.sh" script)

In [3]:
# Runs settings.py and loads the names it defines into the notebook namespace.
# The cells below use names that are not defined anywhere in this notebook
# (model_name, version, dataset, cuda, dataset_file, small_dataset, test_split,
# validation_split, batch_size, models, model_param, dropout_rate, lr, epochs,
# is_chirps, step, only_training), so they are presumably provided by this
# script — verify against settings.py.
%run settings.py

In [4]:
# Project helper object, configured from values supplied by the settings.
util = Util(model_name, version=version, prefix=dataset)

# Expose only the configured GPU(s) to CUDA, then pick CUDA when it is
# available and fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = cuda
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

Seed (reproducibility)

In [5]:
# Single seed shared by every random number generator used in the notebook.
seed = 1000

# Seed NumPy, Python's `random`, and PyTorch (CPU and CUDA), in that order.
for seed_fn in (np.random.seed, rd.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [6]:
def init_seed(seed):
    """Seed NumPy's global random number generator with ``seed``.

    Note: this function is handed to the data loaders as ``worker_init_fn``.
    PyTorch invokes that hook with the worker index, so inside a worker
    process ``seed`` is the worker id rather than the notebook-level seed.
    """
    np.random.seed(seed=seed)

Loading the dataset

In [8]:
# Open the NetCDF file(s) referenced by the settings as a single dataset.
ds = xr.open_mfdataset(dataset_file)

if small_dataset:
    # Quick-experimentation mode: keep only the first 500 entries along `sample`.
    ds = ds.isel(sample=slice(0, 500))

In [8]:
# The three splits are built from the same data with the same split ratios;
# only the selector flag differs between them.
split_kwargs = dict(test_split=test_split, validation_split=validation_split)

train_dataset = NetCDFDataset(ds, **split_kwargs)
val_dataset = NetCDFDataset(ds, is_validation=True, **split_kwargs)
test_dataset = NetCDFDataset(ds, is_test=True, **split_kwargs)

In [9]:
# Report the input (X) and target (y) tensor shapes of every split.
shape_report = [
    ('[X_train] Shape:', train_dataset.X),
    ('[y_train] Shape:', train_dataset.y),
    ('[X_val] Shape:', val_dataset.X),
    ('[y_val] Shape:', val_dataset.y),
    ('[X_test] Shape:', test_dataset.X),
    ('[y_test] Shape:', test_dataset.y),
]
for label, tensor in shape_report:
    print(label, tensor.shape)

[X_train] Shape: torch.Size([32427, 1, 5, 32, 32])
[y_train] Shape: torch.Size([32427, 1, 5, 32, 32])
[X_val] Shape: torch.Size([10810, 1, 5, 32, 32])
[y_val] Shape: torch.Size([10810, 1, 5, 32, 32])
[X_test] Shape: torch.Size([10810, 1, 5, 32, 32])
[y_test] Shape: torch.Size([10810, 1, 5, 32, 32])


In [10]:
# Loader options shared by all splits.
params = {
    'batch_size': batch_size,
    'num_workers': 4,
    'worker_init_fn': init_seed,
}

# Only the training split is shuffled; validation and test keep their order.
train_loader, val_loader, test_loader = (
    DataLoader(dataset=split, shuffle=reshuffle, **params)
    for split, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

Creating the model

In [11]:
# Look up the model class registered under the configured name and show it.
model_bulder = models[model_name]
print(model_bulder)

# Instantiate the network from the training input shape and the configured
# hyper-parameters, then move it to the selected device.
model = model_bulder(
    train_dataset.X.shape,
    model_param['num_layers'],
    model_param['hidden_dim'],
    model_param['kernel_size'],
    device,
    dropout_rate,
)
model.to(device)

# Training objective and optimizer.
criterion = RMSELoss()
optimizer = optim.RMSprop(model.parameters(), lr=lr, alpha=0.9, eps=1e-6)

<class 'model.stconvs2s.STConvS2S_R'>


Training the model

In [None]:
%%time
checkpoint_filename = util.get_checkpoint_filename()

print('Train on {} samples, validate on {} samples'.format(len(train_dataset), len(val_dataset)))

trainer = Trainer(model, criterion, optimizer, train_loader, val_loader, epochs, 
                  device, util, verbose=True, patience=None, no_stop=True)

train_losses, val_losses = trainer.fit(checkpoint_filename, is_chirps)

Train on 32427 samples, validate on 10810 samples
Epoch: 1/50 - loss: 2.1977 - val_loss: 2.1770
=> Saving a new best
Epoch: 2/50 - loss: 1.9021 - val_loss: 1.9421
=> Saving a new best
Epoch: 3/50 - loss: 1.8563 - val_loss: 1.8128
=> Saving a new best
Epoch: 4/50 - loss: 1.8308 - val_loss: 1.9399
Epoch: 5/50 - loss: 1.8022 - val_loss: 1.7186
=> Saving a new best
Epoch: 6/50 - loss: 1.7846 - val_loss: 1.8335
Epoch: 7/50 - loss: 1.7686 - val_loss: 1.7380
Epoch: 8/50 - loss: 1.7523 - val_loss: 1.8093
Epoch: 9/50 - loss: 1.7418 - val_loss: 1.8017
Epoch: 10/50 - loss: 1.7282 - val_loss: 1.7254
Epoch: 11/50 - loss: 1.7205 - val_loss: 1.6929
=> Saving a new best
Epoch: 12/50 - loss: 1.7123 - val_loss: 1.6816
=> Saving a new best
Epoch: 13/50 - loss: 1.7030 - val_loss: 1.7083
Epoch: 14/50 - loss: 1.6982 - val_loss: 1.6530
=> Saving a new best
Epoch: 15/50 - loss: 1.6930 - val_loss: 1.6654
Epoch: 16/50 - loss: 1.6872 - val_loss: 1.6814
Epoch: 17/50 - loss: 1.6826 - val_loss: 1.7317
Epoch: 18/50 

In [None]:
# Persist the per-epoch losses, then draw the learning curve inline.
util.save_loss(train_losses, val_losses)

curves = [train_losses, val_losses]
curve_labels = ['Training', 'Validation']
util.plot(curves, curve_labels, 'Epochs', 'Loss', 'Learning curve', inline=True)

Loading and Evaluating the model

In [None]:
# Default both metrics to None so they stay defined when evaluation is skipped.
test_rmse = test_mae = None

evaluator = Evaluator(model, criterion, optimizer, test_loader, device, util, step)
best_epoch, val_loss = evaluator.load_checkpoint(checkpoint_filename)

# Evaluate on the test split unless the run is configured as training-only.
if not only_training:
    test_rmse, test_mae = evaluator.eval(is_chirps=is_chirps)
    print(f'Test RMSE: {test_rmse:.4f}')
    print(f'Test MAE: {test_mae:.4f}')

In [None]:
# Release cached GPU memory held by PyTorch; nothing to do on CPU-only hosts.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Notification via email

In [None]:
# Summary of the run: results, hyper-parameters, dataset and host name.
model_info = {
    'Notebook': 'yes',
    'best_epoch': best_epoch,
    'val_error': val_loss,
    'test_rmse': test_rmse,
    'test_mae': test_mae,
    'step': step,
    # Architecture hyper-parameters copied from the model configuration.
    **{key: model_param[key] for key in ('num_layers', 'kernel_size', 'hidden_dim')},
    'dropout_rate': dropout_rate,
    'learning_rate': lr,
    'dataset': dataset,
    'hostname': platform.node(),
}

In [None]:
#util.send_email(model_info)