In [1]:
# Model-Based RL - Using a model-based random shooting algorithm to solve HalfCheetah

# Code was used from: https://github.com/ZhengXinyue/Model-Predictive-Control/blob/master/Learning_MPC/Learning_MPC_Pendulum.py

### SANITY TEST ##############################################################

# ENVIRONMENT SUMMARY: -> Reacher-v2
# -> Continuous action space

# Observation -> 11 items
#    cos(theta) (first joint)        
#    cos(theta) (second joint)        
#    sin(theta) (first joint)        
#    sin(theta) (second joint)        
#    qpos (the x coordinate of the target )        
#    qpos (the y coordinate of the target )        
#    qvel (the velocity of the fingertip in the x direction )        
#    qvel (the velocity of the fingertip in the y direction )        
#    the x-axis component of the vector from the target to the fingertip        
#    the y-axis component of the vector from the target to the fingertip        
#    the z-axis component of the vector from the target to the fingertip    

# Action -> 2 items
# range from -1 to 1
# most likely correspond to torque of two joints

# Reward -> 1 item

### MAIN #######################################################################

# ENVIRONMENT SUMMARY: -> Cheetah-v2
# -> Continuous action space

# Observation -> 17 items

# Action -> 6 items

# Reward -> 1 item

#################################################################################

import os
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.optimizers import Adam
import pickle

# NOTES ##########################################################################

# the principle is that: more rollouts and longer horizon will make better performance.



##################################################################################


# Neural Network Design
def OurModel(input_shape, action_space):
    X_input = Input(input_shape)
    X = X_input    
    
    X = Dense(250, activation='relu')(X)
    X = Dense(250, activation='relu')(X)
    X = Dense(action_space)(X)
    
    model = Model(inputs = X_input, outputs = X)
    model.compile(optimizer=Adam(lr=0.001), loss='mse')

    return model

# Action Agent
class Agent:
    def __init__(self, env_name):
        
        # environmental variables 
        self.env_name = env_name       
        self.env = gym.make(env_name)
        # self.env.seed(0)  
        self.action_size = self.env.action_space.shape[0] 
        self.state_size = self.env.observation_space.shape[0]
        
        # takes action and previous state as input
        self.input_size = self.action_size + self.state_size
        self.output_size = self.state_size
        self.render = True
        
        # global variables
        self.rollout = 5
        self.horizon = 5        
        
        # training variables
        self.train_episodes = 10  
        self.batch_size = 10_000        
        
        # test variables
        self.test_episodes = 10  
        
        # initialise memory
        replay_length = 1_000_000
        self.memory = deque(maxlen=replay_length)
        self.memory_status = False
        
        # model
        self.system_model = OurModel(self.input_size, self.output_size)
    
    # decide action from state
    def act(self, state): 
        
        best_action = None
        max_trajectory_value = -float('inf')
        
        for trajectory in range(self.rollout):
            trajectory_value = 0
            
            for h in range(self.horizon):
                
                # select a random action
                action = np.array([random.uniform(-1, 1), random.uniform(-1, 1)], dtype=np.float32) 
                if h == 0:
                    first_action = action 
                                        
                # predict the next state            
                combined_state = np.expand_dims(np.concatenate((state, action)), axis=0) 
                next_state = (self.system_model(combined_state).numpy()).reshape(self.output_size,)
                
                # compute the reward                
                vec = next_state[8:]
                reward_dist = - np.linalg.norm(vec)
                reward_ctrl = - np.square(action).sum()
                costs = reward_dist + reward_ctrl
                
                # update trajectory
                trajectory_value += -costs
                state = next_state
                
            # check if this trajectory's value is higher.
            if trajectory_value > max_trajectory_value:
                max_trajectory_value = trajectory_value
                best_action = first_action
                
        return best_action
            
    
    # reset the state
    def reset(self):  
        return self.env.reset()        
    
    # accessing the replay buffer
    def replay(self):        
        pass
    
    # load the model weights
    def load(self, name):
        self.model.load_weights(name) 
    
    # save the model weights
    def save(self, name):
        self.model.save_weights(name)  
    
    # update the state and reward
    def step(self, action):
        next_state, reward, done, info  = self.env.step(action)        
        return next_state, reward, done, info
    
    # test the agent    
    def test(self):
        
        for e in range(1, self.test_episodes + 1):
            
            print('Starting Episode: ', e)
            state = self.reset()
            done = False
            episode_reward = 0
            frame = 0  
        
            while not done:
                
                # render the environment                
                if self.render:
                    self.env.render()
                              
                action = self.act(state) 
                next_state, reward, done, _ = self.step(action)
                episode_reward += reward
                frame += 1
                state = next_state

                if done: 
                    print('Episode {} Score: {}'.format(e, score))
                    pass    
                
        self.env.close()
        
    # train the agent    
    def train(self):
        
        for e in range(1, self.train_episodes + 1):
            
            print('Starting Episode: ', e)
            state = self.reset()
            done = False
            episode_reward = 0
            frame = 0  
        
            while not done:
                
                # render the environment                
                if self.render:
                    self.env.render()
                    
                """
                TO DO:
                
                Add to this section. 
                Code is not exactly as you would like it. 
                Needs to be on policy once buffer is filled  
                Perform a random action and then apply .act
                
                
                """
                              
                action = self.act(state) 
                next_state, reward, done, _ = self.step(action)
                episode_reward += reward
                frame += 1
                state = next_state

                if done: 
                    print('Episode {} Score: {}'.format(e, score))
                    pass    
                
        self.env.close()
        

if __name__ == "__main__":
    env_name = 'Reacher-v2'
    agent = Agent(env_name)
    agent.train()

Starting Episode:  1
Creating window glfw
Episode 1 Score: -55.453989757140874
Starting Episode:  2
Creating window glfw
Episode 2 Score: -55.92492618206044
Starting Episode:  3
Creating window glfw
Episode 3 Score: -53.1478078894905
Starting Episode:  4
Creating window glfw
Episode 4 Score: -54.95081056182205
Starting Episode:  5
Creating window glfw
Episode 5 Score: -53.47750105471826
Starting Episode:  6
Creating window glfw
Episode 6 Score: -47.5989546910159
Starting Episode:  7
Creating window glfw
Episode 7 Score: -58.52276361415398
Starting Episode:  8
Creating window glfw
Episode 8 Score: -50.27988602348038
Starting Episode:  9
Creating window glfw
Episode 9 Score: -48.402605400019475
Starting Episode:  10
Creating window glfw
Episode 10 Score: -55.55149981334971
