In [1]:
import numpy as np

import gym
import quanser_robots
from quanser_robots import GentlyTerminating
import time

#MagLev, CartPoleSwingUp, FurutaPend
#MagLev=Levitation-v0 ???
#CartPoleSwingUp=CartpoleSwingShort-v0
#FuturaPend=Qube-v0

env_names={0:"Levitation-v0",1:"CartpoleSwingShort-v0",2:"Qube-v0",3:"Pendulum-v2"}


ENV_NAME=env_names[3]
sampling_type="uniform"

print("Sampling env:")
print(ENV_NAME)

env=GentlyTerminating(gym.make(ENV_NAME))
print("Observation space:")
print(env.observation_space)
print("Low:")
print(env.observation_space.low)
print("High:")
print(env.observation_space.high)
print("Action space:")
print(env.action_space)
print("Low:")
print(env.action_space.low)
print("High:")
print(env.action_space.high)

states=[]
actions=[]
rewards=[]
next_states=[]

num_samples=50000
do_render=False

arange=(env.action_space.high-env.action_space.low)[0]
print("Action Range:")
print(arange)

assert(env.action_space.low.shape==(1,))

def random_action(mu,sigma):
  if sampling_type=="uniform":
    a=np.random.uniform(env.action_space.low[0],env.action_space.high[0],size=(1,))
  elif sampling_type=="discrete":
    a=np.random.choice([env.action_space.low[0],0,env.action_space.high[0]]).reshape(-1)
  else:
    a=np.random.normal(mu,sigma,size=(1,))
    a=np.clip(a,env.action_space.low,env.action_space.high)
  return a

while len(states)<num_samples:
  s=env.reset()
  #sample initial action uniformly from the action space
  mu=np.random.uniform(env.action_space.low[0],env.action_space.high[0])
  #exponential sampling for sigma of our markov chain of random actions
  sigma=np.exp(np.random.uniform(0,np.log(arange*4)))
  done=False
  step=0
  while not done:
    if do_render:
        env.render()
        time.sleep(0.016)
    #do a step
    if step>-1:
        a=random_action(mu,sigma)
    else:
        #Bring Pendulum to bottom stand still.
        a=np.clip(-1*s[-1:],env.action_space.low[0],env.action_space.high[0])
    s_,r,done,info=env.step(a)
    #record data
    if not done:
      states.append(s)
      actions.append(a)
      rewards.append(r)
      next_states.append(s_)
      
    #update our Markov chain
    mu=a[0]
    #update current state
    del s
    s=s_
    step+=1


states=np.array(states)
actions=np.array(actions)
rewards=np.array(rewards)
next_states=np.array(next_states)

print("Observations Min:")
print(np.min(states,axis=0,keepdims=True))
print("Observations Max:")
print(np.max(states,axis=0,keepdims=True))
print("Observations Mean:")
print(np.mean(states,axis=0,keepdims=True))
print("Observations Std:")
print(np.std(states,axis=0,keepdims=True))

print("Actions Min:")
print(np.min(actions,axis=0))
print("Actions Max:")
print(np.max(actions,axis=0))
print("Actions Mean:")
print(np.mean(actions,axis=0))
print("Actions Std:")
print(np.std(actions,axis=0))

print("Rewards Min:")
print(np.min(rewards))
print("Rewards Max:")
print(np.max(rewards))
print("Rewards Mean:")
print(np.mean(rewards))
print("Rewards Std:")
print(np.std(rewards))

print(states.shape)
print(actions.shape)
print(rewards.shape)
print(next_states.shape)

np.save(f"dynamics_samples/{ENV_NAME}_states.npy",states)
np.save(f"dynamics_samples/{ENV_NAME}_actions.npy",actions)
np.save(f"dynamics_samples/{ENV_NAME}_rewards.npy",rewards)
np.save(f"dynamics_samples/{ENV_NAME}_next_states.npy",next_states)


Sampling env:
Pendulum-v2
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Observation space:
Box(2,)
Low:
[-3.141593 -8.      ]
High:
[3.141593 8.      ]
Action space:
Box(1,)
Low:
[-2.]
High:
[2.]
Action Range:
4.0


  result = entry_point.load(False)


Observations Min:
[[-3.141579 -8.      ]]
Observations Max:
[[3.141583 8.      ]]
Observations Mean:
[[ 0.002675 -0.014893]]
Observations Std:
[[2.230247 3.484919]]
Actions Min:
[-1.999993]
Actions Max:
[1.999979]
Actions Mean:
[0.000722]
Actions Std:
[1.155866]
Rewards Min:
-16.26697384780663
Rewards Max:
-0.0010147630483370793
Rewards Mean:
-6.189834311790467
Rewards Std:
3.617299854074057
(50148, 2)
(50148, 1)
(50148,)
(50148, 2)


In [2]:
import torch
import torch.nn.functional as F
import numpy as np
import torch.utils.data as Data
import torch.optim as optim
# from collections import OrderedDict

traing_size = 8000
validation_size = 2000



env_names={0:"Levitation-v0",1:"CartpoleSwingShort-v0",2:"Qube-v0",3:"Pendulum-v2"}
ENV_NAME=env_names[3]

states=np.load(f"dynamics_samples/{ENV_NAME}_states.npy")[:traing_size+validation_size]
actions=np.load(f"dynamics_samples/{ENV_NAME}_actions.npy")[:traing_size+validation_size]
rewards=np.load(f"dynamics_samples/{ENV_NAME}_rewards.npy")[:traing_size+validation_size].reshape(-1,1)
next_states=np.load(f"dynamics_samples/{ENV_NAME}_next_states.npy")[:traing_size+validation_size]

#----------------change the state---------------
states = np.concatenate([
    np.sin(states[:,0]).reshape(-1,1), np.cos(states[:,0]).reshape(-1,1), 
    states[:,1].reshape(-1,1)
], axis = 1)

next_states = np.concatenate([
    np.sin(next_states[:,0]).reshape(-1,1), np.cos(next_states[:,0]).reshape(-1,1), 
    next_states[:,1].reshape(-1,1)
], axis = 1)

#-----------Normalization
# Data Normalization
# st_mean = states.mean(axis=0)
# st_std = states.std(axis=0)
# a_mean = actions.mean()
# a_std = actions.std()
# r_mean = rewards.mean()
# r_std = states.std()

#Actually it's range
st_mean = states.min(axis=0)
st_std = np.max(states - st_mean, axis=0)
a_mean = -2
a_std = 4
r_mean = -20# np.min(rewards)
r_std = 20 #np.max(rewards-r_mean)

states = (states-st_mean)/(st_std)
next_states = (next_states-st_mean)/st_std
rewards = (rewards-r_mean)/r_std
actions = (actions-a_mean)/a_std

np.savez("dynamics_models/Data_para.npz", st_mean=st_mean, st_std=st_std, r_mean= r_mean, r_std=r_std,
         a_mean=a_mean, a_std=a_std)

random_index = np.random.permutation(traing_size+validation_size)
traing_index = random_index[:traing_size]
testing_index = random_index[traing_size:traing_size+validation_size]

test_states, test_actions, test_rewards, test_next_states = \
     states[traing_index], actions[traing_index], \
    rewards[traing_index], next_states[traing_index]

states, actions, rewards, next_states = \
    states[traing_index], actions[traing_index], rewards[traing_index], next_states[traing_index]

In [3]:
# Train Reward Model

epochs = 40
batch_size = 32

inputs, outputs = np.concatenate((states,actions), axis = 1), rewards
inputs, outputs = torch.tensor(inputs), torch.tensor(outputs)
test_inputs, test_outputs = np.concatenate((test_states,test_actions), axis = 1), test_rewards
test_inputs, test_outputs = torch.tensor(test_inputs), torch.tensor(test_outputs)

reward_model = torch.nn.Sequential(
            torch.nn.Linear(4, 64),
            torch.nn.ReLU(),
#             torch.nn.Linear(64, 64),
#             torch.nn.ReLU(),
            torch.nn.Linear(64, 1)
).double()   # Attention the double here!!
print("NN Model")
print(reward_model)

dataset = Data.TensorDataset(inputs, outputs)
loader = Data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
) 

criterion = torch.nn.MSELoss()
optimizer = optim.RMSprop(reward_model.parameters(), lr=0.001, centered=True)
lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=(0.985))

for epoch in range(epochs):
    epoch_loss = 0.0
    lr_scheduler.step()
    for step, (batch_x, batch_y) in enumerate(loader):        
        batch_y_p = reward_model(batch_x)
        loss = criterion(batch_y_p, batch_y)
        
        epoch_loss += batch_y.shape[0] * loss.item()
        #Backward
        optimizer.zero_grad()
        loss.backward()    
        optimizer.step()
    
    vali_loss = criterion(reward_model(test_inputs), test_outputs).item()
    print("epoch: ", epoch+1, "traing_loss: %e"%(epoch_loss/traing_size), "validation_loss: %e"%vali_loss) 

NN Model
Sequential(
  (0): Linear(in_features=4, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=1, bias=True)
)
epoch:  1 traing_loss: 1.355860e-02 validation_loss: 2.466834e-03
epoch:  2 traing_loss: 1.380993e-03 validation_loss: 7.928850e-04
epoch:  3 traing_loss: 6.317037e-04 validation_loss: 4.072150e-04
epoch:  4 traing_loss: 4.238627e-04 validation_loss: 2.649629e-04
epoch:  5 traing_loss: 3.544210e-04 validation_loss: 2.669881e-04
epoch:  6 traing_loss: 3.164559e-04 validation_loss: 2.457882e-04
epoch:  7 traing_loss: 2.773187e-04 validation_loss: 7.848275e-04
epoch:  8 traing_loss: 2.584831e-04 validation_loss: 2.216999e-04
epoch:  9 traing_loss: 2.242447e-04 validation_loss: 1.303898e-04
epoch:  10 traing_loss: 2.140485e-04 validation_loss: 1.512313e-04
epoch:  11 traing_loss: 1.948346e-04 validation_loss: 1.134746e-04
epoch:  12 traing_loss: 1.816300e-04 validation_loss: 4.375205e-04
epoch:  13 traing_loss: 1.690918e-04 validation_loss: 

In [4]:
# Train State Model

epochs = 60
batch_size = 64

inputs, outputs = np.concatenate((states,actions), axis = 1), next_states
inputs, outputs = torch.tensor(inputs), torch.tensor(outputs)
test_inputs, test_outputs = np.concatenate((test_states,test_actions), axis = 1), test_next_states
test_inputs, test_outputs = torch.tensor(test_inputs), torch.tensor(test_outputs)

state_model = torch.nn.Sequential(
            torch.nn.Linear(4, 64),
            torch.nn.ReLU(),
#             torch.nn.Linear(64, 64),
#             torch.nn.ReLU(),
            torch.nn.Linear(64, 3)
).double()   # Attention the double here!!
print("NN Model")
print(state_model)

dataset = Data.TensorDataset(inputs, outputs)
loader = Data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
) 

criterion = torch.nn.MSELoss()
optimizer = optim.RMSprop(state_model.parameters(), lr=0.001, centered=True)
lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=(0.985))

for epoch in range(epochs):
    lr_scheduler.step()
    epoch_loss = 0.0
    for step, (batch_x, batch_y) in enumerate(loader):        
        batch_y_p = state_model(batch_x)
        loss = criterion(batch_y_p, batch_y)
        
        epoch_loss += batch_y.shape[0] * loss.item()
        #Backward
        optimizer.zero_grad()
        loss.backward()    
        optimizer.step()
    
    vali_loss = criterion(state_model(test_inputs), test_outputs).item()
    print("epoch: ", epoch, "traing_loss: %e"%(epoch_loss/traing_size), "validation_loss: %e"%vali_loss)       

NN Model
Sequential(
  (0): Linear(in_features=4, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=3, bias=True)
)
epoch:  0 traing_loss: 1.303230e-02 validation_loss: 1.611085e-03
epoch:  1 traing_loss: 1.211684e-03 validation_loss: 8.883705e-04
epoch:  2 traing_loss: 7.338384e-04 validation_loss: 5.277916e-04
epoch:  3 traing_loss: 4.679534e-04 validation_loss: 3.912612e-04
epoch:  4 traing_loss: 3.567985e-04 validation_loss: 2.512761e-04
epoch:  5 traing_loss: 2.921936e-04 validation_loss: 1.642805e-04
epoch:  6 traing_loss: 2.381221e-04 validation_loss: 2.210274e-04
epoch:  7 traing_loss: 2.029079e-04 validation_loss: 1.303139e-04
epoch:  8 traing_loss: 2.019543e-04 validation_loss: 1.632487e-04
epoch:  9 traing_loss: 1.716168e-04 validation_loss: 1.234586e-04
epoch:  10 traing_loss: 1.749630e-04 validation_loss: 7.781458e-05
epoch:  11 traing_loss: 1.594830e-04 validation_loss: 1.608269e-04
epoch:  12 traing_loss: 1.477561e-04 validation_loss: 2

In [5]:
reward_model.eval()
state_model.eval()
# Save the model
torch.save(reward_model, f"dynamics_models/Pendulum_rewards.pth")
torch.save(state_model, f"dynamics_models/Pendulum_states.pth")