In [94]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tqdm
from replay_memory import ReplayMemory
import random
import pdb

## Policy Network
* We have 7 ray angles per ray scan. Each ray angle contributes a length 5 sublist containing [hit_block, hit_goal, hit_wall, hit_anything, distance_if_hit]
* We have 2 ray scans, so the agent observes a total of 70 data elements every timestep
* Vector observation space (size 70)
* Action space (size 7): 0, 1, ..., 6

Pytorch backpropagation notes:
`loss.backward()` computes `dloss/dx` for every parameter `x` which has `requires_grad=True`, and *only* computes gradients! These are accumulated into `x.grad` for every parameter `x`. In pseudo-code:

```
x.grad += dloss/dx
```
`optimizer.step` updates the value of `x` using the gradient `x.grad`. For example, the SGD optimizer performs:

```
x += -lr * x.grad
```
`optimizer.zero_grad()` clears `x.grad` for every parameter `x` in the optimizer. It’s important to call this before `loss.backward()`, otherwise you’ll accumulate the gradients from multiple passes.

In [50]:
class DQN(nn.Module):
    
    def __init__(self, vector_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=vector_size, out_features=20)
        self.fc2 = nn.Linear(in_features=20, out_features=10)
        self.out = nn.Linear(in_features=10, out_features=action_size)
    
    def forward(self, t):
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = self.out(t)
        return t

In [37]:
policynet = DQN(10, 4)

In [62]:

# zero out gradient buffers
policynet.zero_grad()

In [73]:
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU

<MseLossBackward object at 0x129e4f0d0>
<AddBackward0 object at 0x12a0106d0>
<SqueezeBackward3 object at 0x129e4f0d0>


In [89]:
# example gradient buffers
print(policynet.fc1.bias.grad)
tensor = torch.zeros((10,))
out = policynet(tensor)

# compute loss wrt target
criterion = nn.MSELoss()
target = torch.ones((4,))
loss = criterion(out, target)

# perform gradient descent
loss.backward()
print(policynet.fc1.bias.grad)

tensor([ 0.0391,  0.1244, -0.4158,  0.0000,  0.0000,  0.0000,  0.2490,  0.2530,
        -0.2754,  0.3704,  0.0065,  0.0000, -0.0622, -0.0616,  0.0000,  0.3700,
        -0.0037,  0.0000, -0.2078,  0.0000])
tensor([ 0.0469,  0.1493, -0.4989,  0.0000,  0.0000,  0.0000,  0.2988,  0.3036,
        -0.3304,  0.4444,  0.0078,  0.0000, -0.0747, -0.0739,  0.0000,  0.4440,
        -0.0045,  0.0000, -0.2494,  0.0000])


## Hyperparameters

In [93]:
num_episodes = 10
max_timesteps = 10
discount = 0.99
exploration_rate = 0.3

## Training

In [48]:
rewards_all_episodes = []

    rewards_current_episode = 0
    info = env.step()["PushBlock"]
    for _ in range(max_timestep):
        reward = info.rewards[agent]
        rewards_current_episode += reward
        state = info.vector_observations[agent]
        agent.give_feedback(reward, state)
        action = agent.get_action(state)
        info = env.step(action)["PushBlock"]
    rewards_all_episodes.append(rewards_current_episode)
    env.reset(train_mode=True)

NameError: name 'tqdm' is not defined

In [None]:
env = UnityEnvironment(file_name="environment-binaries/PushBlock")

In [95]:
actions = [0, 1, 2, 3, 4, 5, 6]

In [96]:
dqn = DQN(70, 7)

In [None]:
for _ in tqdm.tqdm(range(num_episodes)):
    # initialize start state
    info = env.step()
    for _ in range(max_timesteps):
        # select an action (epsilon greedy)
        if random.random() < exploration_rate:
            return random.choice(actions)
        else:
            output = dqn()
            return torch.argmax()