In [21]:
import sys
sys.path.insert(0, '/users/junzheyin/iris/src')
print(sys.path)

['/users/junzheyin/iris/src', '/users/junzheyin/iris/src', '/users/junzheyin/iris/src/models', '/users/junzheyin/anaconda3/envs/iris/lib/python38.zip', '/users/junzheyin/anaconda3/envs/iris/lib/python3.8', '/users/junzheyin/anaconda3/envs/iris/lib/python3.8/lib-dynload', '', '/users/junzheyin/.local/lib/python3.8/site-packages', '/users/junzheyin/anaconda3/envs/iris/lib/python3.8/site-packages']


In [22]:
from typing import Any, Optional, Tuple

from einops import rearrange
import torch
import torch.nn as nn
import torch.nn.functional as F

import hydra
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf
import torch
import tqdm
import torch.nn as nn
from tqdm import tqdm

from dataset import Batch
from models.kv_caching import KeysValues
from models.slicer import Embedder, Head
from models.tokenizer import Tokenizer
from models.transformer import Transformer, TransformerConfig
from utils import init_weights, LossWithIntermediateLosses

In [23]:
# Define dataset
import torch
import sys

import h5py
from PIL import Image
#import matplotlib.pyplot as plt
from datetime import datetime, timedelta
def eventGeneration(start_time, obs_time = 3 ,lead_time = 6, time_interval = 30):
    # Generate event based on starting time point, return a list: [[t-4,...,t-1,t], [t+1,...,t+72]]
    # Get the start year, month, day, hour, minute
    year = int(start_time[0:4])
    month = int(start_time[4:6])
    day = int(start_time[6:8])
    hour = int(start_time[8:10])
    minute = int(start_time[10:12])
    #print(datetime(year=year, month=month, day=day, hour=hour, minute=minute))
    times = [(datetime(year, month, day, hour, minute) + timedelta(minutes=time_interval * (x+1))) for x in range(lead_time)]
    lead = [dt.strftime('%Y%m%d%H%M') for dt in times]
    times = [(datetime(year, month, day, hour, minute) - timedelta(minutes=time_interval * x)) for x in range(obs_time)]
    obs = [dt.strftime('%Y%m%d%H%M') for dt in times]
    obs.reverse()
    return lead, obs

from torch.utils.data import Dataset, DataLoader
import h5py
import numpy as np
from torchvision.transforms import ToTensor, Compose, CenterCrop
class radarDataset(Dataset):
    def __init__(self, root_dir, event_times, obs_number = 3, pred_number = 6, transform=None):
        # event_times is an array of starting time t(string)
        # transform is the preprocessing functions
        self.root_dir = root_dir
        self.transform = transform
        self.event_times = event_times
        self.obs_number = obs_number
        self.pred_number = pred_number
    def __len__(self):
        return len(self.event_times)
    def __getitem__(self, idx):
        start_time = str(self.event_times[idx])
        time_list_pre, time_list_obs = eventGeneration(start_time, self.obs_number, self.pred_number)
        output = []
        time_list = time_list_obs + time_list_pre
        #print(time_list)
        for time in time_list:
            year = time[0:4]
            month = time[4:6]
            #path = self.root_dir + year + '/' + month + '/' + 'RAD_NL25_RAC_MFBS_EM_5min_' + time + '_NL.h5'
            path = self.root_dir + year + '/' + month + '/' + 'RAD_NL25_RAP_5min_' + time + '.h5'
            image = np.array(h5py.File(path)['image1']['image_data'])
            #image = np.ma.masked_where(image == 65535, image)
            image = image[264:520,242:498]
            image[image == 65535] = 0
            image = image.astype('float32')
            image = image/100*12
            image = np.clip(image, 0, 128)
            image = image/40
            #image = 2*image-1 #normalize to [-1,1]
            output.append(image)
        output = torch.permute(torch.tensor(np.array(output)), (1, 2, 0))
        output = self.transform(np.array(output))
        return output
#root_dir = '/users/hbi/data/RAD_NL25_RAC_MFBS_EM_5min/'
#dataset = radarDataset(root_dir, ["200808031600"], transform = Compose([ToTensor(),CenterCrop(256)]))

In [24]:
# develop dataset
from torch.cuda.amp import autocast
#from torch.autograd import Variable
import pandas as pd
root_dir = '/home/hbi/RAD_NL25_RAP_5min/' 

df_train = pd.read_csv('/users/hbi/taming-transformers/training_Delfland08-14_20.csv', header = None)
event_times = df_train[0].to_list()
dataset_train = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))  

df_train_s = pd.read_csv('/users/hbi/taming-transformers/training_Delfland08-14.csv', header = None)
event_times = df_train_s[0].to_list()
dataset_train_del = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))  

df_test = pd.read_csv('/users/hbi/taming-transformers/testing_Delfland18-20.csv', header = None)
event_times = df_test[0].to_list()
dataset_test = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))

df_vali = pd.read_csv('/users/hbi/taming-transformers/validation_Delfland15-17.csv', header = None)
event_times = df_vali[0].to_list()
dataset_vali = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))

df_train_aa = pd.read_csv('/users/hbi/taming-transformers/training_Aa08-14.csv', header = None)
event_times = df_train_aa[0].to_list()
dataset_train_aa = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))  

df_train_dw = pd.read_csv('/users/hbi/taming-transformers/training_Dwar08-14.csv', header = None)
event_times = df_train_dw[0].to_list()
dataset_train_dw = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))    

df_train_re = pd.read_csv('/users/hbi/taming-transformers/training_Regge08-14.csv', header = None)
event_times = df_train_re[0].to_list()
dataset_train_re = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))   

data_list = [dataset_train_aa, dataset_train_dw, dataset_train_del, dataset_train_re]
train_aadedwre = torch.utils.data.ConcatDataset(data_list)

print(len(dataset_train), len(dataset_test), len(dataset_vali))
loaders = { 'train' :DataLoader(train_aadedwre, batch_size=1, shuffle=True, num_workers=8),
            'test' :DataLoader(dataset_test, batch_size=1, shuffle=False, num_workers=8), 
           'valid' :DataLoader(dataset_vali, batch_size=1, shuffle=False, num_workers=8),
          
          'train_aa5' :DataLoader(dataset_train_aa, batch_size=1, shuffle=False, num_workers=8),
          'train_dw5' :DataLoader(dataset_train_dw, batch_size=1, shuffle=False, num_workers=8),
          'train_del5' :DataLoader(dataset_train_del, batch_size=1, shuffle=True, num_workers=8),
          'train_re5' :DataLoader(dataset_train_re, batch_size=1, shuffle=False, num_workers=8),
          }

32183 3493 3560


In [28]:
from utils import configure_optimizer, EpisodeDirManager, set_seed
import hydra
from omegaconf import OmegaConf
config =OmegaConf.load('/users/junzheyin/iris/config/trainer.yaml')
cfg=config
# Access the configuration and perform further operations
# For example, print the contents of the configuration
print(config)


if config.common.seed is not None:
        set_seed(config.common.seed)

print(set_seed)


@hydra.main(config_path='/users/junzheyin/iris/config/', config_name='trainer.yaml')
def config_function(cfg):
    # Access the configuration
    config_file = OmegaConf.load(cfg)
  
    return config_file


cfg_worldmodel = config_function('/users/junzheyin/iris/config/world_model/default.yaml')
cfg_tokenizer = config_function('/users/junzheyin/iris/config/tokenizer/default.yaml')


{'defaults': ['_self_', {'tokenizer': 'default'}, {'world_model': 'default'}, {'actor_critic': 'default'}, {'env': 'default'}, {'datasets': 'default'}], 'wandb': {'mode': 'disabled', 'project': 'iris', 'entity': None, 'name': None, 'group': None, 'tags': None, 'notes': None}, 'initialization': {'path_to_checkpoint': None, 'load_tokenizer': False, 'load_world_model': False, 'load_actor_critic': False}, 'common': {'epochs': 600, 'device': 'cuda:0', 'do_checkpoint': True, 'seed': 0, 'sequence_length': '${world_model.max_blocks}', 'resume': False}, 'collection': {'train': {'num_envs': 1, 'stop_after_epochs': 500, 'num_episodes_to_save': 10, 'config': {'epsilon': 0.01, 'should_sample': True, 'temperature': 1.0, 'num_steps': 200, 'burn_in': '${training.actor_critic.burn_in}'}}, 'test': {'num_envs': 8, 'num_episodes_to_save': '${collection.train.num_episodes_to_save}', 'config': {'epsilon': 0.0, 'should_sample': True, 'temperature': 0.5, 'num_episodes': 16, 'burn_in': '${training.actor_critic

In [29]:
device = torch.device(cfg.common.device)
print(device)


cuda:0


In [30]:
from collections import defaultdict
from functools import partial
from pathlib import Path
import shutil
import sys
import time
from typing import Any, Dict, Optional, Tuple

import hydra
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf
import torch
import tqdm
import torch.nn as nn
from tqdm import tqdm

from agent import Agent
from collector import Collector
from make_reconstructions import make_reconstructions_from_batch

from utils import configure_optimizer, EpisodeDirManager, set_seed
from torch.utils.data import DataLoader
from datetime import datetime

In [32]:
from models.transformer import Transformer, TransformerConfig

block_mask = torch.cat([torch.ones(768), torch.zeros(1536)])

config = TransformerConfig(tokens_per_block=768, max_blocks=3, attention="causal", num_layers=6, num_heads=8, embed_dim=256, embed_pdrop=0.1, resid_pdrop=0.1, attn_pdrop=0.1)

transformer = Transformer(config)
head_observations = nn.Linear(256, 1024)
print(head_observations)
tokenizer = instantiate(cfg_tokenizer)
            



Linear(in_features=256, out_features=1024, bias=True)
Tokenizer : shape of latent is (256, 16, 16).


In [34]:
print(transformer)

Transformer(
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): SelfAttention(
        (key): Linear(in_features=256, out_features=256, bias=True)
        (query): Linear(in_features=256, out_features=256, bias=True)
        (value): Linear(in_features=256, out_features=256, bias=True)
        (attn_drop): Dropout(p=0.1, inplace=False)
        (resid_drop): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=256, out_features=256, bias=True)
      )
      (mlp): Sequential(
        (0): Linear(in_features=256, out_features=1024, bias=True)
        (1): GELU()
        (2): Linear(in_features=1024, out_features=256, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
    (1): Block(
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((256,), eps=1e

In [18]:
from utils import configure_optimizer, EpisodeDirManager, set_seed
from models.world_model import WorldModel
tokenizer = instantiate(cfg_tokenizer)

world_model = WorldModel(obs_vocab_size=tokenizer.vocab_size,config=instantiate(cfg_worldmodel))
world_model.to(device)

optimizer_trans = configure_optimizer(world_model, cfg.training.learning_rate, cfg.training.world_model.weight_decay)


Tokenizer : shape of latent is (256, 16, 16).


In [19]:
ckpt_opt = torch.load('/space/junzheyin/vqvae_checkpoint_epoch58', map_location=device)

# Load model's state dict
model_state_dict = ckpt_opt['model_state_dict']
tokenizer.load_state_dict(model_state_dict)
tokenizer.to(device)



tokenizer

In [20]:
for epoch in range(10):
    save_epoch = epoch in [0,2,4,9,19,29,39,49,59,79]
    #save_epoch = epoch in [2]
    loss_total_epoch=0.0
    optimizer_trans.zero_grad()
    intermediate_losses = {}
    print("epoch {}".format(epoch)) 
    loss_total_step=0
    
    i=0
    for images in loaders['train']:
        input_image= images.to(device) 

        losses = world_model.compute_loss(input_image,tokenizer)
        loss_total_step += losses.loss_total 
        loss_total_epoch += loss_total_step.item()
        if (i+1) % 16 == 0:
            (loss_total_step/16).backward()
            optimizer_trans.step()
            optimizer_trans.zero_grad()
            print("Losses: Total = {:.4f}".format(loss_total_step.item()))
        
        for loss_name, loss_value in losses.intermediate_losses.items():
            intermediate_losses[f"{str(world_model)}/train/{loss_name}"] = loss_value/16
    
        
    metrics = {f'{str(world_model)}/train/total_loss': loss_total_epoch, **intermediate_losses}
    print("Epoch {}: Total Loss = {:.4f}".format(epoch, metrics[f'{str(world_model)}/train/total_loss']))

    if epoch in save_epoch:
        torch.save({
        'model_state_dict': world_model.state_dict(),
        'optimizer_state_dict': optimizer_trans.state_dict(),
        }, '/space/zboucher/iris_1/src/checkpoint/transformer_{}'.format(epoch+1))



epoch 0


RuntimeError: The size of tensor a (576) must match the size of tensor b (768) at non-singleton dimension 1

In [None]:
class Trans_Trainer:
    def __init__(self, cfg_tokenizer, trainloader, device):
        
        self.trainloader= trainloader
        self.device = device
        self.tokenizer = instantiate(cfg_tokenizer)
        self.world_model = WorldModel(obs_vocab_size=self.tokenizer.vocab_size, config=instantiate(cfg_worldmodel))
        self.agent = Agent(self.tokenizer, self.world_model).to(self.device)
        self.epochs_to_save = [84, 89, 94, 99, 104, 109]
        self.all_metrics = []
        print(f'{sum(p.numel() for p in self.agent.tokenizer.parameters())} parameters in agent.tokenizer')
        print(f'{sum(p.numel() for p in self.agent.world_model.parameters())} parameters in agent.world_model')

        self.optimizer_tokenizer = torch.optim.Adam(self.agent.tokenizer.parameters(), lr=0.0001)
        self.optimizer_world_model = configure_optimizer(self.agent.world_model, 0.0001, 0.01)
    
    def run(self) -> None:
        start_epoch=80
        end_epoch=110
        for epoch in range(start_epoch, end_epoch):
            #print(f"\nEpoch {epoch} / {self.cfg.common.epochs}\n")
            print(f"\nEpoch {epoch} / {80}\n")
            start_time = time.time()
            to_log = []
            to_log += self.train_agent(epoch)
            to_log.append({'duration': (time.time() - start_time) / 3600})
       

    def train_agent(self, epoch: int) -> None:
        self.agent.train()
        self.agent.zero_grad()

        metrics_worldmodel= {}

        #cfg_tokenizer = self.cfg.training.tokenizer
        #w = self.cfg.training.sampling_weights
        #if epoch > cfg_tokenizer.start_after_epochs:

        if epoch >= 0:
            
            print("Start Tokenizer training")
            metrics_worldmodel = self.train_component(self.agent.world_model, self.optimizer_world_model, self.trainloader, epoch)

        return [{'epoch': epoch, **metrics_tokenizer}]

    def train_component(self, component: nn.Module, optimizer: torch.optim.Optimizer, trainloader: DataLoader,epoch: int):
        loss_total_epoch = 0.0
        loss_total_epoch = 0.0
        intermediate_losses = defaultdict(float)
        optimizer.zero_grad()
        i=0
        for images in trainloader:
            input_image= images.to(device) 
            print(device)

            losses = component.compute_loss(input_image,tokenizer)
            loss_total_step += losses.loss_total 
            loss_total_epoch += loss_total_step.item()
            if (i+1) % 16 == 0:
                (loss_total_step/16).backward()
                optimizer.step()
                optimizer.zero_grad()
                print("Losses: Total = {:.4f}".format(loss_total_step.item()))
            
            for loss_name, loss_value in losses.intermediate_losses.items():
                intermediate_losses[f"{str(component)}/train/{loss_name}"] = loss_value/64
        
            
        metrics = {f'{str(component)}/train/total_loss': loss_total_epoch, **intermediate_losses}
        print("Epoch {}: Total Loss = {:.4f}".format(epoch, metrics[f'{str(component)}/train/total_loss']))

        if epoch in self.epochs_to_save:
            torch.save({
            'model_state_dict': component.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, '/space/zboucher/iris_1/src/checkpoint/transformer_{}'.format(epoch+1))


        metrics = {f'{str(component)}/train/total_loss': loss_total_epoch, **intermediate_losses}
        self.all_metrics.append(metrics)  # Save metrics for the current epoch to the list
        np.save('/space/zboucher/iris_1/src/checkpoint/all_metric.npy', self.all_metrics)
        return metrics




In [None]:
train_dataset=loaders['train']
trainer = Trans_Trainer(cfg_tokenizer,train_dataset, device)
trainer.run()

Tokenizer : shape of latent is (256, 16, 16).
torch.Size([2304])
positional torch.Size([768, 256])
torch.Size([4, 768, 256])
positional torch.Size([4, 1536, 256])
86853249 parameters in agent.tokenizer
5264384 parameters in agent.world_model

Epoch 80 / 80

Start Tokenizer training
cuda:1
torch.Size([4, 9, 1, 256, 256])


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same