<a href="https://colab.research.google.com/github/karthikg92/learn-rl/blob/main/2dnav.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import numpy as np                        # numpy functionality
from itertools import count
from collections import deque
import random
import copy

import torch
from torch.distributions import Normal
import torch.autograd as autograd         # computation graph
from torch import Tensor                  # tensor node in the computation graph
import torch.nn as nn                     # neural networks
import torch.nn.functional as F           # layers, activations and more
import torch.optim as optim               # optimizers e.g. gradient descent, ADAM, 


import matplotlib.pyplot as plt
import time

%matplotlib inline


In [32]:
"""
Environment for 2d navigation

Objective: move a robot in a 2d plane from a start location to a goal

State: 4 dimensional vector (x position, y position, goal x position, goal y position). The robot lives in [0, size]^2 world

Action: velocity vector (v_x, v_y) with fixed speed of max_speed

Rewards: -5 if out of bounds, 10 if it reaches the goal, -1 for all other cases
"""

class nav2d():

    def __init__(self, size=1):
        # The size of the square grid [0, self.size]^2
        self.size = size  

        # Initialize the state and the goal
        self.state = self.reset()

        # parameters
        self.max_speed= 0.05 # max robot speed
        self.goal_dist = 0.1 # threshold within which robot reaches goal


    def reset(self):

        # start and end location drawn uniformly at random
        # self.state = [
        #     np.random.rand() * self.size, # x
        #     np.random.rand() * self.size, # y
        #     np.random.rand() * self.size, # goal_x
        #     np.random.rand() * self.size  # goal_y
        # ]

        # deterministic start and end location
        self.state = [0.1, 0.1, 0.6, 0.8, np.sqrt((0.1 - 0.6)**2 + (0.1 - 0.8)**2 )]

        # state = [x_loc, y_loc, goal x_loc, goal y_loc, dist to goal]

        return self.state


    def step(self, action):
      """
      action is a list/ array of length 2
      action[0]:  x velocity
      action[1]:  y velocity
      """

      # robot kinematics
      self.state[0] += self.max_speed * action[0]
      self.state[1] += self.max_speed * action[1]

      # computing rewards basedon distance to goal
      dist2goal = np.sqrt( (self.state[0] - self.state[2]) **2 + (self.state[1] - self.state[3]) **2 )

      self.state[4] = dist2goal

      # default info
      info = {'reached_goal': False}

      if dist2goal < self.goal_dist:
        # robot within goal
        reward = 10
        done = 1
        info['reached_goal'] = True
      elif self.state[0] < 0 or self.state[0] > self.size or self.state[1] < 0 or self.state[1] > self.size:
        # robot out of bounds of the 2-d environment
        reward = -5
        done = 0
      else:
        # robot moving legally within the 2d world
        reward = -2 * dist2goal
        done = 0
      

      return self.state, reward, done, info

    # def render(self):
    #   plt.scatter(self.state[0], self.state[1], c='black')
    #   plt.scatter(self.state[2], self.state[3], c='green')
    #   plt.xlim([0, self.size])
    #   plt.ylim([0, self.size])



In [34]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class PolicyNetwork(nn.Module):
    def __init__(self, input=4, output=2):
        super(PolicyNetwork, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc_mu = nn.Linear(256, output)
        self.fc_std = nn.Linear(256, output)
        self.tanh = nn.Tanh()
        self.softplus = nn.Softplus()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        mu = 2 * self.tanh(self.fc_mu(x))
        std = self.softplus(self.fc_std(x)) + 1e-3
        return mu, std

    def select_action(self, state):
        with torch.no_grad():
            mu, std = self.forward(state)
            n = Normal(mu, std)
            action = n.sample()
            action = torch.clip(action, min=-1, max=1)
            action = action / torch.norm(action)
        action = action.tolist()[0]
        return action


class ValueNetwork(nn.Module):
    def __init__(self, input=4, output=1):
        super(ValueNetwork, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc3 = nn.Linear(256, output)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class Memory(object):
    def __init__(self, memory_size: int) -> None:
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience) -> None:
        self.buffer.append(experience)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size: int, continuous: bool = True):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        if continuous:
            rand = random.randint(0, len(self.buffer) - batch_size)
            return [self.buffer[i] for i in range(rand, rand + batch_size)]
        else:
            indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
            return [self.buffer[i] for i in indexes]

    def clear(self):
        self.buffer.clear()


env = nav2d()

state_dim = 5
action_dim = 2
policy = PolicyNetwork(input=state_dim, output=action_dim).to(device)
value = ValueNetwork(input=state_dim).to(device)
optim = torch.optim.Adam(policy.parameters(), lr=1e-5)
value_optim = torch.optim.Adam(value.parameters(), lr=3e-5)
gamma = 0.99
episode_length = 20
max_epochs = 1000000
epochs_per_batch = 1
batch_size = episode_length * epochs_per_batch
memory = Memory(batch_size)
steps = 0

saved_rewards = []
reached_goal = []

for epoch in range(max_epochs):

    state = env.reset()
    episode_reward = 0
  
    time_step = 0
    while True:

        time_step += 1

        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = policy.select_action(state_tensor)
        next_state, reward, done, info = env.step(action)
        episode_reward += reward

        memory.add((state, next_state, action, reward, done))

        state = next_state

        # dist to goal:
        #dist_to_goal = np.sqrt((state[0] - state[2])**2 +(state[1] - state[3])**2)


        if done:
          if info['reached_goal']:
            reached_goal.append(1)
          else:
            reached_goal.append(0)
          break

        if time_step == episode_length:
          reached_goal.append(0)
          break
        

    if epoch % epochs_per_batch == 0:

        experiences = memory.sample(batch_size)
        batch_state, batch_next_state, batch_action, batch_reward, batch_done = zip(*experiences)
        batch_state = torch.FloatTensor(batch_state).to(device)
        batch_next_state = torch.FloatTensor(batch_next_state).to(device)
        batch_action = torch.FloatTensor(batch_action).unsqueeze(1).to(device)
        batch_reward = torch.FloatTensor(batch_reward).unsqueeze(1).to(device)
        batch_done = torch.FloatTensor(batch_done).unsqueeze(1).to(device)

        with torch.no_grad():
            value_target = batch_reward + gamma * (1 - batch_done) * value(batch_next_state)
            advantage = value_target - value(batch_state)

        mu, std = policy(batch_state)
        n = Normal(mu, std)
        log_prob = n.log_prob(batch_action)
        loss = - log_prob * advantage
        loss = loss.mean()
        optim.zero_grad()
        loss.backward()
        optim.step()

        value_loss = F.mse_loss(value_target, value(batch_state))
        value_optim.zero_grad()
        value_loss.backward()
        value_optim.step()

        memory.clear()
      
        
    saved_rewards.append(episode_reward)
    if epoch % 1000 == 0:
      if len(saved_rewards)>1000:
        print('Epoch:{}, average episode rewards of last 1000 epochs is {}'.format(epoch, np.mean(saved_rewards[-1000:])))
        print("reached_goal out of last 1000 episodes: ", np.sum(reached_goal[-1000:]))

    #print('\n\n\n')


Epoch:1000, average episode rewards of last 1000 epochs is -46.9702374746413
reached_goal out of last 1000 episodes:  0
Epoch:2000, average episode rewards of last 1000 epochs is -38.19837844357277
reached_goal out of last 1000 episodes:  0
Epoch:3000, average episode rewards of last 1000 epochs is -37.263997375815016
reached_goal out of last 1000 episodes:  0
Epoch:4000, average episode rewards of last 1000 epochs is -33.91256131178801
reached_goal out of last 1000 episodes:  0
Epoch:5000, average episode rewards of last 1000 epochs is -31.979222308713446
reached_goal out of last 1000 episodes:  0
Epoch:6000, average episode rewards of last 1000 epochs is -31.836076101737333
reached_goal out of last 1000 episodes:  0
Epoch:7000, average episode rewards of last 1000 epochs is -32.32426883683571
reached_goal out of last 1000 episodes:  0


KeyboardInterrupt: ignored