In [2]:
import numpy as np

import gym
import quanser_robots
from quanser_robots import GentlyTerminating
import time

ENV_NAME= "Qube-v0"
sampling_type="uniform"
env=GentlyTerminating(gym.make(ENV_NAME))

sampling_type="uniform"

print("Sampling env:")
print(ENV_NAME)

env=GentlyTerminating(gym.make(ENV_NAME))


states=[]
actions=[]
rewards=[]
next_states=[]

num_samples=15000
do_render=False

arange=(env.action_space.high-env.action_space.low)[0]


assert(env.action_space.low.shape==(1,))

def random_action(mu,sigma):
  if sampling_type=="uniform":
    a=np.random.uniform(env.action_space.low[0],env.action_space.high[0],size=(1,))
  elif sampling_type=="discrete":
    a=np.random.choice([env.action_space.low[0],0,env.action_space.high[0]]).reshape(-1)
  else:
    a=np.random.normal(mu,sigma,size=(1,))
    a=np.clip(a,env.action_space.low,env.action_space.high)
  return a

while len(states)<num_samples:
  s=env.reset()
  #sample initial action uniformly from the action space
  mu=np.random.uniform(env.action_space.low[0],env.action_space.high[0])
  #exponential sampling for sigma of our markov chain of random actions
  sigma=np.exp(np.random.uniform(0,np.log(arange*2)))
  done=False
  step=0
  while not done:
    if do_render:
        env.render()
        time.sleep(0.016)
    #do a step
    
    a=random_action(mu,sigma)
    
    s_,r,done,info=env.step(a)
    #record data
    if not done:
      states.append(s)
      actions.append(a)
      rewards.append(r)
      next_states.append(s_)
    
    
    #update our Markov chain
    mu=a[0]
    #update current state
    del s
    s=s_
    step+=1


states=np.array(states)
actions=np.array(actions)
rewards=np.array(rewards)
next_states=np.array(next_states)

print("Observations Min:")
print(np.min(states,axis=0,keepdims=True))
print("Observations Max:")
print(np.max(states,axis=0,keepdims=True))
print("Observations Mean:")
print(np.mean(states,axis=0,keepdims=True))
print("Observations Std:")
print(np.std(states,axis=0,keepdims=True))

print("Actions Min:")
print(np.min(actions,axis=0))
print("Actions Max:")
print(np.max(actions,axis=0))
print("Actions Mean:")
print(np.mean(actions,axis=0))
print("Actions Std:")
print(np.std(actions,axis=0))

print("Rewards Min:")
print(np.min(rewards))
print("Rewards Max:")
print(np.max(rewards))
print("Rewards Mean:")
print(np.mean(rewards))
print("Rewards Std:")
print(np.std(rewards))

print(states.shape)
print(actions.shape)
print(rewards.shape)
print(next_states.shape)

np.save(f"dynamics_samples/{ENV_NAME}_states.npy",states)
np.save(f"dynamics_samples/{ENV_NAME}_actions.npy",actions)
np.save(f"dynamics_samples/{ENV_NAME}_rewards.npy",rewards)
np.save(f"dynamics_samples/{ENV_NAME}_next_states.npy",next_states)

Sampling env:
Qube-v0
Observations Min:
[[ -2.460008  -3.141512 -30.       -40.      ]]
Observations Max:
[[ 2.836503  3.141543 30.       40.      ]]
Observations Mean:
[[-0.003227  0.068037  0.03961   0.120729]]
Observations Std:
[[ 1.187617  2.292686  7.602608 10.660522]]
Actions Min:
[-14.998947]
Actions Max:
[14.997537]
Actions Mean:
[-0.013937]
Actions Std:
[8.636296]
Rewards Min:
-0.36041084
Rewards Max:
-0.0016006415
Rewards Mean:
-0.073501684
Rewards Std:
0.03843061
(15469, 4)
(15469, 1)
(15469,)
(15469, 4)


In [3]:
import torch
import torch.nn.functional as F
import numpy as np
import torch.utils.data as Data
import torch.optim as optim
# from collections import OrderedDict


traing_size = int(num_samples*0.85)
validation_size = num_samples - traing_size

env_names={0:"Levitation-v0",1:"CartpoleSwingShort-v0",2:"Qube-v0",3:"Pendulum-v2"}
ENV_NAME=env_names[2]

states=np.load(f"dynamics_samples/{ENV_NAME}_states.npy")[:traing_size+validation_size]
actions=np.load(f"dynamics_samples/{ENV_NAME}_actions.npy")[:traing_size+validation_size]
rewards=np.load(f"dynamics_samples/{ENV_NAME}_rewards.npy")[:traing_size+validation_size].reshape(-1,1)
next_states=np.load(f"dynamics_samples/{ENV_NAME}_next_states.npy")[:traing_size+validation_size]

#----------------change the state---------------
states = np.concatenate([
    np.cos(states[:,0]).reshape(-1,1), np.sin(states[:,0]).reshape(-1,1), 
    np.cos(states[:,1]).reshape(-1,1), np.sin(states[:,1]).reshape(-1,1), 
    states[:,2].reshape(-1,1), states[:,3].reshape(-1,1)
], axis = 1)

next_states = np.concatenate([
    np.cos(next_states[:,0]).reshape(-1,1), np.sin(next_states[:,0]).reshape(-1,1), 
    np.cos(next_states[:,1]).reshape(-1,1), np.sin(next_states[:,1]).reshape(-1,1), 
    next_states[:,2].reshape(-1,1), next_states[:,3].reshape(-1,1)
], axis = 1)

#-----------Normalization
# Data Normalization
# st_mean = states.mean(axis=0)
# st_std = states.std(axis=0)
# a_mean = actions.mean()
# a_std = actions.std()
# r_mean = rewards.mean()
# r_std = states.std()

#Actually it's range
st_mean = states.min(axis=0)
st_std = np.max(states - st_mean, axis=0)
a_mean = actions.min(axis=0)
a_std = np.max(actions-a_mean)
r_mean = rewards.min(axis=0)# np.min(rewards)
r_std = np.max(rewards-r_mean)

states = (states-st_mean)/(st_std)
next_states = (next_states-st_mean)/st_std
rewards = (rewards-r_mean)/r_std
actions = (actions-a_mean)/a_std

np.savez("dynamics_models/Data_para.npz", st_mean=st_mean, st_std=st_std, r_mean= r_mean, r_std=r_std,
         a_mean=a_mean, a_std=a_std)

random_index = np.random.permutation(traing_size+validation_size)
traing_index = random_index[:traing_size]
testing_index = random_index[traing_size:traing_size+validation_size]

test_states, test_actions, test_rewards, test_next_states = \
     states[traing_index], actions[traing_index], \
    rewards[traing_index], next_states[traing_index]

states, actions, rewards, next_states = \
    states[traing_index], actions[traing_index], rewards[traing_index], next_states[traing_index]

In [4]:
# Train Reward Model

epochs = 64
batch_size = 64

inputs, outputs = np.concatenate((states,actions), axis = 1), rewards
inputs, outputs = torch.tensor(inputs), torch.tensor(outputs, dtype=torch.float64)
test_inputs, test_outputs = np.concatenate((test_states,test_actions), axis = 1), test_rewards
test_inputs, test_outputs = torch.tensor(test_inputs), torch.tensor(test_outputs, dtype=torch.float64)

reward_model = torch.nn.Sequential(
            torch.nn.Linear(7, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 30),
            torch.nn.ReLU(),
            torch.nn.Linear(30, 1)
).double()   # Attention the double here!!
print("NN Model")
print(reward_model)

dataset = Data.TensorDataset(inputs, outputs)
loader = Data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
) 

criterion = torch.nn.MSELoss()
optimizer = optim.RMSprop(reward_model.parameters(), lr=0.001, centered=True)
lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=(0.9))

for epoch in range(epochs):
    epoch_loss = 0.0
    lr_scheduler.step()
    for step, (batch_x, batch_y) in enumerate(loader):        
        batch_y_p = reward_model(batch_x)
        loss = criterion(batch_y_p, batch_y)
        
        epoch_loss += batch_y.shape[0] * loss.item()
        #Backward
        optimizer.zero_grad()
        loss.backward()    
        optimizer.step()
    
    vali_loss = criterion(reward_model(test_inputs), test_outputs).item()
    print("epoch: ", epoch+1, "traing_loss: %e"%(epoch_loss/traing_size), "validation_loss: %e"%vali_loss) 

NN Model
Sequential(
  (0): Linear(in_features=7, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=30, bias=True)
  (3): ReLU()
  (4): Linear(in_features=30, out_features=1, bias=True)
)
epoch:  1 traing_loss: 1.703752e-02 validation_loss: 3.944747e-03
epoch:  2 traing_loss: 2.665099e-03 validation_loss: 2.619633e-03
epoch:  3 traing_loss: 1.757165e-03 validation_loss: 1.362897e-03
epoch:  4 traing_loss: 1.281352e-03 validation_loss: 1.025940e-03
epoch:  5 traing_loss: 1.046962e-03 validation_loss: 1.614854e-03
epoch:  6 traing_loss: 9.087934e-04 validation_loss: 6.618214e-04
epoch:  7 traing_loss: 7.629573e-04 validation_loss: 6.885218e-04
epoch:  8 traing_loss: 6.365075e-04 validation_loss: 5.721254e-04
epoch:  9 traing_loss: 5.651330e-04 validation_loss: 4.133467e-04
epoch:  10 traing_loss: 4.942822e-04 validation_loss: 6.936902e-04
epoch:  11 traing_loss: 4.621497e-04 validation_loss: 3.502514e-04
epoch:  12 traing_loss: 3.964572e-04 validation_l

In [5]:
# Train State Model

epochs = 60
batch_size = 64

inputs, outputs = np.concatenate((states,actions), axis = 1), next_states
inputs, outputs = torch.tensor(inputs, dtype=torch.float64), torch.tensor(outputs, dtype=torch.float64)
test_inputs, test_outputs = np.concatenate((test_states,test_actions), axis = 1), test_next_states
test_inputs, test_outputs = torch.tensor(test_inputs, dtype=torch.float64), torch.tensor(test_outputs, dtype=torch.float64)

state_model = torch.nn.Sequential(
            torch.nn.Linear(7, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 30),
            torch.nn.ReLU(),
            torch.nn.Linear(30, 6)
).double()   # Attention the double here!!
print("NN Model")
print(state_model)

dataset = Data.TensorDataset(inputs, outputs)
loader = Data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
) 

criterion = torch.nn.MSELoss()
optimizer = optim.RMSprop(state_model.parameters(), lr=0.0001, centered=True)
lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=(0.9))

for epoch in range(epochs):
    lr_scheduler.step()
    epoch_loss = 0.0
    for step, (batch_x, batch_y) in enumerate(loader):        
        batch_y_p = state_model(batch_x)
        loss = criterion(batch_y_p, batch_y)
        
        epoch_loss += batch_y.shape[0] * loss.item()
        #Backward
        optimizer.zero_grad()
        loss.backward()    
        optimizer.step()
    
    vali_loss = criterion(state_model(test_inputs), test_outputs).item()
    print("epoch: ", epoch, "traing_loss: %e"%(epoch_loss/traing_size), "validation_loss: %e"%vali_loss)       

NN Model
Sequential(
  (0): Linear(in_features=7, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=30, bias=True)
  (3): ReLU()
  (4): Linear(in_features=30, out_features=6, bias=True)
)
epoch:  0 traing_loss: 1.287443e-01 validation_loss: 5.714211e-02
epoch:  1 traing_loss: 4.594796e-02 validation_loss: 3.438751e-02
epoch:  2 traing_loss: 2.415894e-02 validation_loss: 1.477940e-02
epoch:  3 traing_loss: 1.007754e-02 validation_loss: 6.742672e-03
epoch:  4 traing_loss: 5.263503e-03 validation_loss: 4.085829e-03
epoch:  5 traing_loss: 3.392670e-03 validation_loss: 2.799833e-03
epoch:  6 traing_loss: 2.402021e-03 validation_loss: 2.064345e-03
epoch:  7 traing_loss: 1.794971e-03 validation_loss: 1.572998e-03
epoch:  8 traing_loss: 1.392248e-03 validation_loss: 1.251383e-03
epoch:  9 traing_loss: 1.138925e-03 validation_loss: 1.055131e-03
epoch:  10 traing_loss: 9.829593e-04 validation_loss: 9.288090e-04
epoch:  11 traing_loss: 8.931297e-04 validation_lo

In [5]:
reward_model.eval()
state_model.eval()
# Save the model
torch.save(reward_model, f"dynamics_models/Pendulum_rewards.pth")
torch.save(state_model, f"dynamics_models/Pendulum_states.pth")