In [33]:
import torch
import random
import torch.nn as nn
import os
import sys
import numpy as np
import argparse
import yaml
sys.path.append('../models/')
sys.path.append('../dataloader/')
from transformer_decoder import TransformerDecoder, PositionalEncoding
from dataloader import low_rank
%load_ext autoreload
%autoreload 2

In [26]:
class DataLoader:

    def __init__(self, 
                 seed,
                 dir_path, 
                 target_id,
                 time_range,
                 feature_dim,
                 seq_len,
                 cont_dim,
                 lowrank_approx = False,
                 sing_to_keep =55):

        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        self.feature_dim = feature_dim
        self.cont_dim = cont_dim
        self.target_id = target_id
        self.seq_len = seq_len
        self.data_init = np.float32(np.load(dir_path+'data.npy',allow_pickle=True))
        self.mask = np.load(dir_path+'mask.npy',allow_pickle=True).astype(bool)
        self.data_init[self.mask] = 0
        self.target_data = self.data_init[target_id] 
        red_data = np.delete(self.data_init,self.target_id,0)
        if lowrank_approx:
            red_data[:,:,:self.cont_dim] = low_rank(red_data[:,:,:self.cont_dim],sing_to_keep)
            #fraction adjust estimator
            data_min = np.amin(red_data.reshape(-1,self.feature_dim),0)[:self.cont_dim]
            data_max = np.amax(red_data.reshape(-1,self.feature_dim),0)[:self.cont_dim]
            self.data = np.insert(red_data,target_id,target_data,0)	
            self.data[:,:,:self.cont_dim] = (self.data[:,:,:self.cont_dim] - data_min)/(data_max - data_min)

        else:
            self.data_min = np.amin(red_data.reshape(-1,self.feature_dim),0)
            self.data_max = np.amax(red_data.reshape(-1,self.feature_dim),0)
            self.data = self.data_init
            self.data[:,:,:self.cont_dim] = (self.data[:,:,:self.cont_dim] - self.data_min)/(self.data_max - self.data_min)
        
        self.seqs = self.data.shape[0]
        self.time_range = time_range
        self.time_ids = np.arange(self.time_range)
    
    def get_batch(self,batch_size):
        
        seqs = torch.zeros(batch_size,self.seq_len,self.feature_dim)
    
        for i in range(batch_size):
            
            seq_id = np.random.randint(self.seqs)
            interv_time = np.random.randint(self.seq_len, self.time_range)
            seq = self.data[seq_id,interv_time - self.seq_len:interv_time]
            seqs[i] = torch.from_numpy(seq)

        seqs = seqs.to(dtype=torch.float32)
        return seqs


In [27]:
#Setup dataloader
seed  = 0
dir_path  = '../datasets/synthetic_data_N_11_1/' 
target_id = 0
time_range = 2000
feature_dim = 2
seq_len  = 256
cont_dim = 2
inp_feature = 2
num_blocks = 3
d_model = 32
num_heads = 1
lr = 1e-4
weight_decay = 1e-4
device = torch.device('cuda:0' if torch.cuda.is_available else "cpu")



In [28]:
def train_model(model, dataloader, batch_size):
    
    warmup_steps = 5000
    optimizer = torch.optim.AdamW(model.parameters(),lr = lr, weight_decay = weight_decay)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
                    optimizer,
                    lambda steps: min((steps+1)/warmup_steps,1))
    iters = 20000
    criterion = nn.MSELoss()
    
    for i in range(iters):
        
        
        optimizer.zero_grad()
        seq = dataloader.get_batch(batch_size)
        seq = seq.to(device)
        pred = model(seq[:,:-1])
        target = seq[:,1:].detach()
        loss = criterion(pred,target)
        loss.backward()
        optimizer.step()
        if i%5000 == 0:
            print(f'Iters: {i}',loss.item())
        if scheduler is not None:
            scheduler.step()
    
    return model

        
    

In [35]:
#training for syntehtic+noise 

ids = [1,2,3,4]
noise = [0.5,1,2]
for n in noise:
    for id in ids:
        op_dir = f'../logs_dir/txf_baseline/noise_{n}_{id}/'
        if not os.path.exists(op_dir):
            os.makedirs(op_dir)
        
        datapath = f'../datasets/synthetic_noise{n}_{id}/'
        dataloader = DataLoader(seed,
                 datapath, 
                 target_id,
                 time_range,
                 feature_dim,
                 seq_len,
                 cont_dim,
                lowrank_approx=False,
                    )
        model = TransformerDecoder(inp_feature, num_blocks, seq_len, d_model, num_heads)
        model = model.to(device)
        
        print(f'Training for noise {n} and id {id}')
        model = train_model(model, dataloader, 64)
        #save model
        torch.save(model.state_dict(),op_dir+'model.pt')
        #generating

        interv_time = 1600
        target_data = dataloader.target_data
        data_min = dataloader.data_min
        data_max = dataloader.data_max
        target_data = (target_data - data_min )/(data_max - data_min)
        target_data = torch.from_numpy(target_data).unsqueeze(0)
        target_data = target_data.to(device)
        op = model.generate(target_data[:,:interv_time],2000)
        op = op.cpu().numpy()
        mean = np.load(datapath+'mean1.npy')
        test_mean = mean[0,1600:]
        op = op*(data_max - data_min) + data_min
        pred_mean = np.squeeze(op[:,interv_time:,0])
        error_pred = np.sqrt(np.mean((pred_mean - test_mean )**2))
        print(f'RMSE for noise {n} and {id} is {error_pred}')

Training for noise 0.5 and id 1
Iters: 0 1.9597982168197632
Iters: 5000 0.013714583590626717
Iters: 10000 0.0054515814408659935
Iters: 15000 0.0038313153199851513
RMSE for noise 0.5 and 1 is 7.66731422052138
Training for noise 0.5 and id 2
Iters: 0 1.9669302701950073
Iters: 5000 0.013386721722781658
Iters: 10000 0.005320874508470297
Iters: 15000 0.0036650991532951593
RMSE for noise 0.5 and 2 is 7.582814516779606
Training for noise 0.5 and id 3
Iters: 0 1.9618945121765137
Iters: 5000 0.013463648967444897
Iters: 10000 0.005284096579998732
Iters: 15000 0.0037042638286948204
RMSE for noise 0.5 and 3 is 7.646450764781434
Training for noise 0.5 and id 4
Iters: 0 1.9768751859664917
Iters: 5000 0.013210228644311428
Iters: 10000 0.005325885023921728
Iters: 15000 0.003645914141088724
RMSE for noise 0.5 and 4 is 7.257323241551812
Training for noise 1 and id 1
Iters: 0 1.9539872407913208
Iters: 5000 0.012496361508965492
Iters: 10000 0.008739481680095196
Iters: 15000 0.006585079710930586
RMSE for n

In [36]:
#training for syntehtic+donors 

ids = [1,2,3,4]
donors = [6,11,16,21]
for n in donors:
    for id in ids:
        op_dir = f'../logs_dir/txf_baseline/donors_{n}_{id}/'
        if not os.path.exists(op_dir):
            os.makedirs(op_dir)
        
        datapath = f'../datasets/synthetic_data_N_{n}_{id}/'
        dataloader = DataLoader(seed,
                 datapath, 
                 target_id,
                 time_range,
                 feature_dim,
                 seq_len,
                 cont_dim,
                lowrank_approx=False,
                    )
        model = TransformerDecoder(inp_feature, num_blocks, seq_len, d_model, num_heads)
        model = model.to(device)
        
        print(f'Training for donors {n-1} and id {id}')
        model = train_model(model, dataloader, 64)
        #save model
        torch.save(model.state_dict(),op_dir+'model.pt')
        #generating

        interv_time = 1600
        target_data = dataloader.target_data
        data_min = dataloader.data_min
        data_max = dataloader.data_max
        target_data = (target_data - data_min )/(data_max - data_min)
        target_data = torch.from_numpy(target_data).unsqueeze(0)
        target_data = target_data.to(device)
        op = model.generate(target_data[:,:interv_time],2000)
        op = op.cpu().numpy()
        mean = np.load(datapath+'mean1.npy')
        test_mean = mean[0,1600:]
        op = op*(data_max - data_min) + data_min
        pred_mean = np.squeeze(op[:,interv_time:,0])
        error_pred = np.sqrt(np.mean((pred_mean - test_mean )**2))
        print(f'RMSE for donors {n-1} and {id} is {error_pred}')
                

Training for donors 5 and id 1
Iters: 0 1.9469525814056396
Iters: 5000 0.013315978460013866
Iters: 10000 0.009211961179971695
Iters: 15000 0.007027772720903158
RMSE for donors 5 and 1 is 5.151158332824707
Training for donors 5 and id 2
Iters: 0 1.9584866762161255
Iters: 5000 0.013258269056677818
Iters: 10000 0.009417972527444363
Iters: 15000 0.007134291809052229
RMSE for donors 5 and 2 is 5.274689197540283
Training for donors 5 and id 3
Iters: 0 1.9573441743850708
Iters: 5000 0.012900927104055882
Iters: 10000 0.008938752114772797
Iters: 15000 0.006723702419549227
RMSE for donors 5 and 3 is 5.455399036407471


FileNotFoundError: [Errno 2] No such file or directory: '../datasets/synthetic_data_N_6_4/data.npy'