In [1]:
# import library

import sys
import os
import d4rl
import gym
import numpy as np
import collections
import pickle
import csv

import torch
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from utils import D4RLTrajectoryDataset
from model import DecisionTransformer


No module named 'flow'
No module named 'carla'


In [2]:
# set environment
# sys.path.append(r'C:\Develop\offlineRL-with-diffusion') 

In [3]:
# test mujoco, d4rl

!python ./test/mujoco_test.py

mujoco-py check passed
d4rl check passed


No module named 'flow'
No module named 'carla'
pybullet build time: Apr 30 2024 12:01:25
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [4]:
# data download
# if you downloaded, don't re-start.

# !python ./data/download_d4rl_datasets.py


In [5]:
# parameter setting

env_name = 'halfcheetah'
dataset = 'medium'

if env_name == 'hopper':
    env = gym.make('Hopper-v2')
    max_ep_len = 1000
    # env_targets = [3600, 1800]  # evaluation conditioning targets
    # scale = 1000.  # normalization for rewards/returns
elif env_name == 'halfcheetah':
    env = gym.make('HalfCheetah-v2')
    max_ep_len = 1000
    # env_targets = [12000, 6000]
    # scale = 1000.
elif env_name == 'walker2d':
    env = gym.make('Walker2d-v2')
    max_ep_len = 1000
    # env_targets = [5000, 2500]
    # scale = 1000.

DATA_PATH = f'data/train/{env_name}-{dataset}-v2.pkl'
VAL_DATA_PATH = f'data/val/val_{env_name}-{dataset}-v2.pkl'
TEMP_DATA_PATH = f'data/temp/{env_name}-{dataset}-v2.pkl'
LOG_PATH = "./log/"
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')

  logger.warn(


In [6]:
# env dataset check
check_env = gym.make('halfcheetah-medium-v2')
dataset = check_env.get_dataset()

# print(dataset['observations'][1]) # trajectory 단위로 뽑힘.


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile: 100%|██████████| 21/21 [00:02<00:00,  8.04it/s]


In [7]:
# print("overall len: ", dataset.shape)

In [8]:
print("state shape: ", dataset['observations'].shape)
print("action shape: ", dataset['actions'].shape)
print("reward shape: ", dataset['rewards'].shape)
print("N: ", dataset['rewards'].shape[0])
print("train_size: ", int(0.8 * dataset['rewards'].shape[0]))


state shape:  (1000000, 17)
action shape:  (1000000, 6)
reward shape:  (1000000,)
N:  1000000
train_size:  800000


In [9]:
# data check
# check three trajectories

with open(DATA_PATH, 'rb') as f:
    trajectories = pickle.load(f)
n=0
max_rewards_list = []
for traj in trajectories:
    # print(f"{n+1}번째 trajectory")
    # print("traj: ", traj)
    print("state: ", traj['observations'], "\n")
    # print("action: ", traj['actions'], "\n")
    # print("next_state: ", traj['next_observations'], "\n")
    # print("reward: ", traj['rewards'], "\n")
    # print("max_rewards: ", max(traj['rewards']))
    # max_rewards_list.append(max(traj['rewards']))
    # print("")
    n+=1
    
# print(max(max_rewards_list))

    if n==3:
        break

state:  [[ 1.9831914e-02 -8.9501314e-02 -3.1969063e-03 ...  1.1365079e-01
   6.8424918e-02 -1.3811582e-01]
 [-3.8486063e-03 -5.2394319e-02  8.3050327e-03 ...  4.5068407e+00
  -9.2885571e+00  4.7328596e+00]
 [-5.5298433e-02 -7.7850236e-05 -2.3952831e-01 ... -7.0811687e+00
  -1.4037068e+00  7.5524049e+00]
 ...
 [-3.1975684e-01  5.3305399e-01 -4.8704177e-01 ...  1.5455554e+00
   2.6812897e+00  8.7905388e+00]
 [-3.2200974e-01  3.5745117e-01  1.0463273e-02 ... -6.3428599e-01
   1.6292539e+00  9.7356015e-01]
 [-3.0673215e-01  1.9843711e-01  6.9996923e-01 ...  5.0098950e-01
   1.5680059e+00  9.4733723e-02]] 

state:  [[ 4.7026437e-02 -2.1588113e-02  4.9151547e-02 ...  5.5219561e-02
  -1.5351681e-01 -4.6239123e-02]
 [ 4.1392505e-02  5.3802542e-02 -1.5022255e-01 ...  6.1133021e-01
  -7.4645710e+00  7.9509692e+00]
 [ 9.8547200e-04  8.8533267e-02 -4.3876743e-01 ...  8.5824745e-04
   5.9796906e+00  4.9521341e+00]
 ...
 [-1.4081973e-01 -7.7957302e-02 -2.6429656e-01 ...  1.0316861e+00
  -7.5645506e-

In [10]:
# check original data shape
with open(TEMP_DATA_PATH, 'rb') as f:
    temp_trajectories = pickle.load(f)
    
print("length: ", len(temp_trajectories)*len(temp_trajectories[0]['observations']))
print("n of epi: ", len(temp_trajectories))
print("n of traj in one epi: ", len(temp_trajectories[0]['observations']))

length:  1000000
n of epi:  1000
n of traj in one epi:  1000


In [11]:
# check original dataset
cnt = 0


for ori in temp_trajectories:
    print(ori['observations'].shape)
    
    if cnt >= 5:
        break
    
    cnt+=1

(1000, 17)
(1000, 17)
(1000, 17)
(1000, 17)
(1000, 17)
(1000, 17)


In [12]:
states, next_states, rewards = [], [], []
for traj in temp_trajectories:
    # print(traj)
    traj_len = traj['observations'].shape[0]
    states.append(traj['observations'])
    next_states.append(traj['next_observations'])
    rewards.append(traj['rewards'])
    # # calculate returns to go and rescale them
    # traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale
    
states = np.concatenate(states, axis=0)
print("state shape: ", states.shape)

state shape:  (1000000, 17)


In [13]:
# check train data shape
with open(DATA_PATH, 'rb') as f:
    train_trajectories = pickle.load(f)

print("length: ", len(train_trajectories)*len(train_trajectories[0]['observations']))
print("n of epi: ", len(train_trajectories))
print("n of traj in one epi: ", len(train_trajectories[0]['observations']))
# print("train state shape: ", train_trajectories['observations'].shape)
# print("train action shape: ", train_trajectories['actions'].shape)
# print("train reward shape: ", train_trajectories['rewards'].shape)


length:  800000
n of epi:  800
n of traj in one epi:  1000


In [14]:
# check valid data shape
with open(VAL_DATA_PATH, 'rb') as f:
    val_trajectories = pickle.load(f)

print("length: ", len(val_trajectories)*len(val_trajectories[0]['observations']))
print("n of epi: ", len(val_trajectories))
print("n of traj in one epi: ", len(val_trajectories[0]['observations']))
# print("val state shape: ", val_trajectories['observations'].shape)
# print("val action shape: ", val_trajectories['actions'].shape)
# print("val reward shape: ", val_trajectories['rewards'].shape)

length:  200000
n of epi:  200
n of traj in one epi:  1000


In [15]:
# train parameter
batch_size = 64
embed_dim = 128
activation = 'relu'
drop_out = 0.1
k = 31 # content len
n_blocks = 3
n_heads = 1 # transformer head

# total updates = max_train_iters x num_updates_per_iter
max_train_iters = 400
num_updates_per_iter = 100
# num_val_iter = 100
total_updates = 0
min_total_loss = 1e10

wt_decay = 1e-4             # weight decay
lr = 1e-4                   # learning rate
warmup_steps = 10000        # warmup steps for lr scheduler

# weight of mse loss
state_weight = 1
reward_weight = 1

# evaluation parameter
# max_eval_ep_len = 1000      # max len of one evaluation episode
# num_eval_ep = 10            # num of evaluation episodes per iteration

In [16]:
# check dim

state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

print("state dim: ", state_dim)
print("action dim: ", act_dim)

state dim:  17
action dim:  6


In [17]:
# test data
temp_dataset = D4RLTrajectoryDataset(DATA_PATH, 2)
temp_data_loader = DataLoader(temp_dataset,
						batch_size=32,
						shuffle=True,
						pin_memory=True,
						drop_last=True)
                        
temp_data_iter = iter(temp_data_loader)

timesteps, states, next_states, actions, rewards, traj_mask = next(temp_data_iter)

timesteps = timesteps.to(DEVICE)	# B x T
states = states.to(DEVICE)			# B x T x state_dim
next_states = next_states.to(DEVICE) # B X T X state_dim
actions = actions.to(DEVICE)		# B x T x act_dim
rewards = rewards.to(DEVICE).unsqueeze(dim=-1) # B x T x 1

print("timesteps shape: ", timesteps.shape)
print("rewards shape: ", rewards.shape)
print("states shape: ", states.shape)
print("actions shape: ", actions.shape)

# print("state: ", states)
# print("action: ", actions)
# print("rewards: ", rewards)



timesteps shape:  torch.Size([32, 2])
rewards shape:  torch.Size([32, 2, 1])
states shape:  torch.Size([32, 2, 17])
actions shape:  torch.Size([32, 2, 6])


In [18]:
# test model

temp_model = DecisionTransformer(
			state_dim=state_dim,
			act_dim=act_dim,
			# reward 포함 + r0 제외
			n_blocks=n_blocks,
			h_dim=16,
			context_len=2,
			n_heads=n_heads,
			drop_p=drop_out,
		).to(DEVICE)
		
next_state_preds, rewards_preds = temp_model.forward(
												rewards=rewards,
												timesteps=timesteps,
												states=states,
												actions=actions,
											)

In [19]:
# continue train test
test_traj_dataset = D4RLTrajectoryDataset(DATA_PATH, k)
test_traj_data_loader = DataLoader(test_traj_dataset,
						batch_size=batch_size,
						shuffle=True,
						pin_memory=True,
						drop_last=True)
                        
test_data_iter = iter(test_traj_data_loader)


for i_train_iter in tqdm(range(max_train_iters)):
	
	for _ in range(num_updates_per_iter):
		try:
			timesteps, states, next_states, actions, rewards, traj_mask = next(test_data_iter)
		except StopIteration:
			test_traj_data_loader = DataLoader(test_traj_dataset,
						batch_size=batch_size,
						shuffle=True,
						pin_memory=True,
						drop_last=True)
						
			test_data_iter = iter(test_traj_data_loader)
			timesteps, states, next_states, actions, rewards, traj_mask = next(test_data_iter)

100%|██████████| 400/400 [01:34<00:00,  4.25it/s]


In [20]:
# load train preprocessing(normalization, fit padding) data

traj_dataset = D4RLTrajectoryDataset(DATA_PATH, k)
traj_data_loader = DataLoader(traj_dataset,
						batch_size=batch_size,
						shuffle=True,
						pin_memory=True,
						drop_last=True)
                        
data_iter = iter(traj_data_loader)

## get state stats from dataset
state_mean, state_std = traj_dataset.get_state_stats()

In [21]:
# load validate preprocessing(normalization, fit padding) data

val_traj_dataset = D4RLTrajectoryDataset(DATA_PATH, k, val=True, val_dataset_path=VAL_DATA_PATH)
val_traj_data_loader = DataLoader(val_traj_dataset,
						batch_size=batch_size,
						shuffle=True,
						pin_memory=True,
						drop_last=True)
                        


In [22]:
# define model

model = DecisionTransformer(
			state_dim=state_dim,
			act_dim=act_dim,
			n_blocks=n_blocks,
			h_dim=embed_dim,
			context_len=k,
			n_heads=n_heads,
			drop_p=drop_out,
		).to(DEVICE)
  
optimizer = torch.optim.AdamW(
					model.parameters(), 
					lr=lr, 
					weight_decay=wt_decay
				)

scheduler = torch.optim.lr_scheduler.LambdaLR(
		optimizer,
		lambda steps: min((steps+1)/warmup_steps, 1)
	)
	


In [23]:
start_time = datetime.now().replace(microsecond=0)

start_time_str = start_time.strftime("%y-%m-%d-%H-%M-%S")

prefix = "dt_" + env_name

save_model_name =  prefix + "_model_" + str(max_train_iters) + "_" + str(batch_size) + ".pt"
save_model_path = os.path.join(LOG_PATH, save_model_name)
save_best_model_path = save_model_path[:-3] + "_best.pt"

log_csv_name = prefix + "_log_" + start_time_str + ".csv"
log_csv_path = os.path.join(LOG_PATH, log_csv_name)


csv_writer = csv.writer(open(log_csv_path, 'a', 1))
csv_header = (["duration", "num_updates", "total_loss", "state_loss", "reward_loss", "val_total_loss", "val_state_loss", "val_reward_loss"])

csv_writer.writerow(csv_header)


print("=" * 60)
print("start time: " + start_time_str)
print("=" * 60)

print("device set to: " + str(DEVICE))
print("dataset path: " + DATA_PATH)
print("model save path: " + save_model_path)
print("log csv save path: " + log_csv_path)

# train
for i_train_iter in tqdm(range(max_train_iters)):


	log_state_losses, log_reward_losses, log_total_losses = [], [], []
	val_log_state_losses, val_log_reward_losses, val_log_total_losses = [], [], []
	model.train()
	
	for _ in range(num_updates_per_iter):
		try:
			timesteps, states, next_states, actions, rewards, traj_mask = next(data_iter)
		except StopIteration:
			traj_data_loader = DataLoader(traj_dataset,
						batch_size=batch_size,
						shuffle=True,
						pin_memory=True,
						drop_last=True)
			data_iter = iter(traj_data_loader)
			timesteps, states, next_states, actions, rewards, traj_mask = next(data_iter)

		timesteps = timesteps.to(DEVICE)	# B x T
		states = states.to(DEVICE)			# B x T x state_dim
		next_states = next_states.to(DEVICE) # B X T X state_dim
		actions = actions.to(DEVICE)		# B x T x act_dim
		rewards = rewards.to(DEVICE).unsqueeze(dim=-1) # B x T x 1
		traj_mask = traj_mask.to(DEVICE)	# B x T

		next_states_target = torch.clone(next_states).detach().to(DEVICE)
		rewards_target = torch.clone(rewards).detach().to(DEVICE)
	
		next_state_preds, rewards_preds = model.forward(
														timesteps=timesteps,
														states=states,
														actions=actions,
														rewards=rewards,
													)

		# only consider non padded elements
		next_state_preds = next_state_preds.view(-1, state_dim)[traj_mask.view(-1,) > 0]
		next_states_target = next_states_target.view(-1, state_dim)[traj_mask.view(-1,) > 0]
		
		rewards_preds = rewards_preds.view(-1, 1)[traj_mask.view(-1,) > 0]
		rewards_target = rewards_target.view(-1, 1)[traj_mask.view(-1,) > 0]

		state_loss = F.mse_loss(next_state_preds, next_states_target, reduction='mean') * state_weight
		reward_loss = F.mse_loss(rewards_preds, rewards_target, reduction='mean') * reward_weight
		
		total_loss = state_loss.add(reward_loss)
		total_loss = torch.mean(total_loss)

		optimizer.zero_grad()
		total_loss.backward()
		torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
		optimizer.step()
		scheduler.step()
		
		
		#save loss
		log_state_losses.append(state_loss.detach().cpu().item())
		log_reward_losses.append(reward_loss.detach().cpu().item())
		
		log_total_losses.append(total_loss.detach().cpu().item())
		
	# validation
	model.eval()
	for val_timesteps, val_states, val_next_states, val_actions, val_rewards, val_traj_mask in val_traj_data_loader:
		
		val_timesteps = val_timesteps.to(DEVICE)	# B x T
		val_states = val_states.to(DEVICE)			# B x T x state_dim
		val_next_states = val_next_states.to(DEVICE) # B X T X state_dim
		val_actions = val_actions.to(DEVICE)		# B x T x act_dim
		val_rewards = val_rewards.to(DEVICE).unsqueeze(dim=-1) # B x T x 1
		val_traj_mask = val_traj_mask.to(DEVICE)	# B x T
				
		val_next_states_target = torch.clone(val_next_states).detach().to(DEVICE)
		val_rewards_target = torch.clone(val_rewards).detach().to(DEVICE)
		
		val_next_state_preds, val_rewards_preds = model.forward(
														timesteps=val_timesteps,
														states=val_states,
														actions=val_actions,
														rewards=val_rewards,
													)
													
		# only consider non padded elements
		val_next_state_preds = val_next_state_preds.view(-1, state_dim)[traj_mask.view(-1,) > 0]
		val_next_states_target = val_next_states_target.view(-1, state_dim)[traj_mask.view(-1,) > 0]
		
		val_rewards_preds = val_rewards_preds.view(-1, 1)[traj_mask.view(-1,) > 0]
		val_rewards_target = val_rewards_target.view(-1, 1)[traj_mask.view(-1,) > 0]

		val_state_loss = F.mse_loss(val_next_state_preds, val_next_states_target, reduction='mean') * state_weight
		val_reward_loss = F.mse_loss(val_rewards_preds, val_rewards_target, reduction='mean') * reward_weight

		# todo: try to use mae
		
		val_total_loss = val_state_loss.add(val_reward_loss)
		val_total_loss = torch.mean(val_total_loss)
		
		# save val loss
		val_log_state_losses.append(val_state_loss.detach().cpu().item())
		val_log_reward_losses.append(val_reward_loss.detach().cpu().item())
		
		val_log_total_losses.append(val_total_loss.detach().cpu().item())
	
	mean_total_log_loss = np.mean(log_total_losses)
	mean_state_log_loss = np.mean(log_state_losses)
	mean_reward_log_loss = np.mean(log_reward_losses)
	
	mean_val_total_log_loss = np.mean(val_log_total_losses)
	mean_val_state_log_loss = np.mean(val_log_state_losses)
	mean_val_reward_log_loss = np.mean(val_log_reward_losses)

	time_elapsed = str(datetime.now().replace(microsecond=0) - start_time)

	total_updates += num_updates_per_iter

	log_str = ("=" * 60 + '\n' +
			"time elapsed: " + time_elapsed  + '\n' +
			"num of updates: " + str(total_updates) + '\n' +
			"train total loss: " + format(mean_total_log_loss, ".5f") + '\n' +
			"train state loss: " + format(mean_state_log_loss, ".5f") + '\n' +
			"train reward loss: " +  format(mean_reward_log_loss, ".5f") + '\n' +
			"val total loss: " + format(mean_val_total_log_loss, ".5f") + '\n' +
			"val state loss: " + format(mean_val_state_log_loss, ".5f") + '\n' +
			"val reward loss: " +  format(mean_val_reward_log_loss, ".5f")
			)

	print(log_str)

	log_data = [time_elapsed, total_updates, mean_total_log_loss, mean_state_log_loss, mean_reward_log_loss, \
		 mean_val_total_log_loss, mean_val_state_log_loss, mean_val_reward_log_loss]

	csv_writer.writerow(log_data)
	
	# save model
	if mean_val_total_log_loss <= min_total_loss:
		print("saving min loss model at: " + save_best_model_path)
		torch.save(model.state_dict(), save_best_model_path)
		min_total_log_loss = mean_val_total_log_loss

	print("saving current model at: " + save_model_path)
	torch.save(model.state_dict(), save_model_path)


print("=" * 60)
print("finished training!")
print("=" * 60)
end_time = datetime.now().replace(microsecond=0)
time_elapsed = str(end_time - start_time)
end_time_str = end_time.strftime("%y-%m-%d-%H-%M-%S")
print("started training at: " + start_time_str)
print("finished training at: " + end_time_str)
print("total training time: " + time_elapsed)
print("saved min loss model at: " + save_best_model_path)
print("saved last updated model at: " + save_model_path)
print("=" * 60)

start time: 24-05-19-18-07-32
device set to: cpu
dataset path: data/train/halfcheetah-medium-v2.pkl
model save path: ./log/dt_halfcheetah_model_400_64.pt
log csv save path: ./log/dt_halfcheetah_log_24-05-19-18-07-32.csv


  0%|          | 1/400 [00:27<3:05:17, 27.86s/it]

time elapsed: 0:00:28
num of updates: 100
train total loss: 2.42268
train state loss: 1.36661
train reward loss: 1.05607
val total loss: 2.56406
val state loss: 1.42128
val reward loss: 1.14278
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  0%|          | 2/400 [00:54<3:00:42, 27.24s/it]

time elapsed: 0:00:54
num of updates: 200
train total loss: 2.32456
train state loss: 1.32182
train reward loss: 1.00274
val total loss: 2.44359
val state loss: 1.34692
val reward loss: 1.09667
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  1%|          | 3/400 [01:21<2:59:26, 27.12s/it]

time elapsed: 0:01:21
num of updates: 300
train total loss: 2.18691
train state loss: 1.27678
train reward loss: 0.91014
val total loss: 2.16324
val state loss: 1.31858
val reward loss: 0.84466
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  1%|          | 4/400 [01:48<2:57:41, 26.92s/it]

time elapsed: 0:01:48
num of updates: 400
train total loss: 1.97501
train state loss: 1.19889
train reward loss: 0.77612
val total loss: 1.99127
val state loss: 1.18875
val reward loss: 0.80252
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  1%|▏         | 5/400 [02:17<3:01:46, 27.61s/it]

time elapsed: 0:02:17
num of updates: 500
train total loss: 1.81728
train state loss: 1.16591
train reward loss: 0.65138
val total loss: 1.71986
val state loss: 1.09280
val reward loss: 0.62707
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  2%|▏         | 6/400 [02:54<3:23:57, 31.06s/it]

time elapsed: 0:02:55
num of updates: 600
train total loss: 1.64821
train state loss: 1.11867
train reward loss: 0.52953
val total loss: 1.54799
val state loss: 1.10220
val reward loss: 0.44579
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  2%|▏         | 7/400 [03:26<3:25:09, 31.32s/it]

time elapsed: 0:03:26
num of updates: 700
train total loss: 1.50106
train state loss: 1.06689
train reward loss: 0.43416
val total loss: 1.48773
val state loss: 1.08354
val reward loss: 0.40419
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  2%|▏         | 8/400 [03:58<3:25:14, 31.41s/it]

time elapsed: 0:03:58
num of updates: 800
train total loss: 1.42158
train state loss: 1.03447
train reward loss: 0.38711
val total loss: 1.35423
val state loss: 1.01275
val reward loss: 0.34147
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  2%|▏         | 9/400 [04:31<3:27:39, 31.87s/it]

time elapsed: 0:04:31
num of updates: 900
train total loss: 1.35314
train state loss: 1.00105
train reward loss: 0.35209
val total loss: 1.29039
val state loss: 0.98157
val reward loss: 0.30882
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  2%|▎         | 10/400 [05:01<3:24:45, 31.50s/it]

time elapsed: 0:05:02
num of updates: 1000
train total loss: 1.30901
train state loss: 0.98494
train reward loss: 0.32407
val total loss: 1.24030
val state loss: 0.94753
val reward loss: 0.29277
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt


  3%|▎         | 11/400 [05:31<3:20:28, 30.92s/it]

time elapsed: 0:05:31
num of updates: 1100
train total loss: 1.26553
train state loss: 0.96569
train reward loss: 0.29984
val total loss: 1.28595
val state loss: 0.99871
val reward loss: 0.28723
saving min loss model at: ./log/dt_halfcheetah_model_400_64_best.pt
saving current model at: ./log/dt_halfcheetah_model_400_64.pt
