In [1]:
import gym
import simple_driving
#import pybullet_envs
import pybullet as p
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import math
from collections import defaultdict
import pickle
from IPython.display import clear_output
import torch
import random

#display = Display(visible=0, size=(400, 300))
#display.start()

def display_video(frames, framerate=30):
  """Generates video from `frames`.

  Args:
    frames (ndarray): Array of shape (n_frames, height, width, 3).
    framerate (int): Frame rate in units of Hz.

  Returns:
    Display object.
  """
  height, width, _ = frames[0].shape
  dpi = 70
  orig_backend = matplotlib.get_backend()
  matplotlib.use('Agg')  # Switch to headless 'Agg' to inhibit figure rendering.
  fig, ax = plt.subplots(1, 1, figsize=(width / dpi, height / dpi), dpi=dpi)
  matplotlib.use(orig_backend)  # Switch back to the original backend.
  ax.set_axis_off()
  ax.set_aspect('equal')
  ax.set_position([0, 0, 1, 1])
  im = ax.imshow(frames[0])
  def update(frame):
    im.set_data(frame)
    return [im]
  interval = 1000/framerate
  anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                  interval=interval, blit=True, repeat=False)
  return HTML(anim.to_html5_video())

In [2]:
def epsilon_greedy(env, state, episodes, episode, model):
    """Selects an action to take based on a uniformly random sampled number. 
    If this number is greater than epsilon then returns action with the largest
    Q-value at the current state. Otherwise it returns a random action. This
    version decays epsilon from a large number to a small number over the duration
    of training. This results in highly likely random actions at the start and
    eventually biasing actions towards those with high q-values towards the end.

    Args:
        env: gym object.
        state: current state
        episodes: maximum number of episodes
        episode: number of episodes played so far
        model: Q-function approximator

    Returns:
        Action to be executed for next step.
    """
    EPS_START = 0.99
    EPS_END = 0.15
    EPS_DECAY = episodes
    sample = np.random.uniform(0, 1)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * episode / EPS_DECAY)
    if sample > eps_threshold:
        q_values = model(torch.tensor([state], dtype=torch.float32))    # predict q-values for state
        return q_values.argmax().item()                                 # return action with highest q-value
    else:
        return np.random.choice(np.array(range(8)))  # incorporate prior here to prevent flapping too much during exploration (agent will always keep trying to fly into the sky otherwise)
        #return np.random.choice([6,7,8])

def simulate(env, max_episode_length, episodes, episode, model):
    """Rolls out an episode of actions to be used for learning.

    Args:
        env: gym object.
        episodes: maximum number of episodes
        episode: number of episodes played so far

    Returns:
        Dataset of episodes for training the RL agent containing states, actions and rewards.
    """
    D = []
    state = env.reset()                                                   # line 2
    done = False
    env.getExtendedObservation()
    prev_reward = 0
    #while True:                                                             # here I chose to not let episode end but you could replace with the line below
    for step in range(max_episode_length):                                  # line 3
        action = epsilon_greedy(env, state, episodes, episode, model)  # line 4
        next_state, reward, done,info = env.step(action)                   # line 5

        D.append([state, action, reward, next_state])                       # line 7
        state = next_state                                                  # line 8
        if done:                                                            # if we fall into
            break
    return D                                                                # line 10

def approx_q_learning(env, gamma, episodes, max_episode_length, model, loss_fn, optimizer):
    """Main loop of Approximate Q-learning algorithm.

    Args:
        env: gym object.
        gamma: discount factor - determines how much to value future actions
        episodes: number of episodes to play out
        max_episode_length: maximum number of steps for episode roll out
        model: Q-function approximator
        loss_fn: the loss function for our function approximator
        optimizer: for backpropagating the gradient of our loss

    Returns:
        Q-function which is used to derive policy.
    """
    total_reward = 0
    for episode in range(episodes):                                             # slightly different to line 3, we just run until maximum episodes played out
        D = simulate(env, max_episode_length, episodes, episode, model)    # line 4
        q_values_batch = []
        target_batch = []
        for data in D:                                                          # data = [state, action, reward, next_state]  (line 5)
            ####################### update Q values (line 6-7) #########################
            state  = data[0]
            action = data[1]
            reward = data[2]
            #print(f"this is reward = {reward}")
            next_state = data[3]
            q_values = model( torch.tensor([state], dtype=torch.float32) )            # predict Q-value for current state
            q_values_next = model(torch.tensor([next_state], dtype=torch.float32))  # predict Q-value for next state
            target = q_values.clone().detach()
            target[0][action] = reward + gamma * q_values_next.max().item()         # loss between prediction and true Q-value we just found from interacting with the env
            total_reward += data[2]
            loss = loss_fn(q_values, target)                                        # compute loss
            optimizer.zero_grad()
            loss.backward()                                                         # backpropogate the loss (compute gradients)
            optimizer.step()                                                        # update model using gradients
            #########################################################################    
        if episode % 100 == 0:
            print("average total reward per episode batch since episode ", episode, ": ", total_reward/ float(100))
            total_reward = 0
    return model  # line 9

In [3]:
######################### renders image from third person perspective for validating policy ##############################
#env = gym.make("SimpleDriving-v0", apply_api_compatibility=True, renders=False, isDiscrete=True, render_mode='tp_camera') 
##########################################################################################################################

######################### renders image from onboard camera ###############################################################
# env = gym.make("SimpleDriving-v0", apply_api_compatibility=True, renders=False, isDiscrete=True, render_mode='fp_camera') 
##########################################################################################################################

######################### if running locally you can just render the environment in pybullet's GUI #######################
#env = gym.make("SimpleDriving-v0", apply_api_compatibility=True, renders=False, isDiscrete=True,render_mode='fp_camera') 
##########################################################################################################################

env = gym.make("SimpleDriving-v0", apply_api_compatibility=False, isDiscrete = True) 

state = env.reset()



# Neural network params
l1 = 4   # this is set as two as we are just inputting x and y , this environment has more available inputs                    
l2 = 32                         # hidden layer dimension (we choose this)
l3 = env.action_space.n                         # output layer dimension - same as action space
alpha = 0.0001                    # learning rate for updating the neural network weights

# assemble neural network based on above params 
model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(l2, l3)
)

loss_fn = torch.nn.MSELoss()                                # mean square error loss function
optimizer = torch.optim.Adam(model.parameters(), lr=alpha)  # optimizer for updating neural network weights


gamma = 0.4              # discount factor - determines how much to value future actions
episodes = 301           # number of episodes to play out
max_episode_length = 400

approx_q_learning(env, gamma, episodes, max_episode_length, model, loss_fn, optimizer)
env.close()

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  q_values = model( torch.tensor([state], dtype=torch.float32) )            # predict Q-value for current state


average total reward per episode batch since episode  0 :  0.6348607445165831
average total reward per episode batch since episode  100 :  -45.24560492335605
average total reward per episode batch since episode  200 :  -57.9082404780888
average total reward per episode batch since episode  300 :  -68.75795970895335


In [8]:
import time
env = gym.make("SimpleDriving-v0", apply_api_compatibility=True, renders=True, isDiscrete=True,render_mode = 'fp_camera')
state, info = env.reset()

for i in range(100):
    q_values = model(torch.tensor([state], dtype=torch.float32))  # predict q-values using learnt model
    action = q_values.argmax().item()                             # select action with highest predicted q-value
    state, reward, done,_, info = env.step(action)
    #time.sleep(1)
    print(reward)
    if done:
        break

env.close()

3.9811888194754665
3.2768865893445716
2.4640061204741817
1.589009623020785
0.6793648916151049
-0.24860983058794328
-0.9875846069721764
-1.731221800977117
-2.476097496127644
-3.220040110446491
-3.9616009932366785
-4.69975636208003
-5.057676832480474
-5.414583602965048
-5.770871905529714
-5.665376994730155
-5.421814823613974
-5.166858070041346
-4.906835996092503
-4.644954350969075
-4.383298071445009
-4.12317866431717
-4.020186298733366
-3.7930786672084564
-3.4845122936290074
-3.1239855439556115
-2.729509154070061
-2.317885108278509
-2.0827495969251193
-1.842904509525678
-1.5931262316280521
-1.3325577201890408
-1.0643247510441527
-0.7851550891256553
-0.4961121359431122
-0.19929447687518087
0.1034929732585248
0.41799876361223276
0.74593614058097
1.089169378684018
1.446922508329755
1.818962118026405
2.2053619284294217
2.6063528222850096
3.0222459111561424
3.453378695829977
3.5560113982685913
3.010723769483361
2.9339814121345413
2.9968500884227476
3.0686823643019405
3.132412741613148
3.18465