In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tqdm
from replay_memory import ReplayMemory
import random
import pdb
import math
from mlagents.envs.environment import UnityEnvironment

ModuleNotFoundError: No module named 'torch'

## Policy Network
* We have 7 ray angles per ray scan. Each ray angle contributes a length 5 sublist containing [hit_block, hit_goal, hit_wall, hit_anything, distance_if_hit]
* We have 2 ray scans, so the agent observes a total of 70 data elements every timestep
* Vector observation space (size 70)
* Action space (size 7): 0, 1, ..., 6

Pytorch backpropagation notes:
`loss.backward()` computes `dloss/dx` for every parameter `x` which has `requires_grad=True`, and *only* computes gradients! These are accumulated into `x.grad` for every parameter `x`. In pseudo-code:

```
x.grad += dloss/dx
```
`optimizer.step` updates the value of `x` using the gradient `x.grad`. For example, the SGD optimizer performs:

```
x += -lr * x.grad
```
`optimizer.zero_grad()` clears `x.grad` for every parameter `x` in the optimizer. It’s important to call this before `loss.backward()`, otherwise you’ll accumulate the gradients from multiple passes.

In [2]:
class DQN(nn.Module):
    
    def __init__(self, vector_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=vector_size, out_features=20)
        self.fc2 = nn.Linear(in_features=20, out_features=10)
        self.out = nn.Linear(in_features=10, out_features=action_size)
    
    def forward(self, t):
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = self.out(t)
        return t

In [None]:
policynet = DQN(10, 4)

In [None]:

# zero out gradient buffers
policynet.zero_grad()

In [None]:
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU

In [None]:
# example gradient buffers
print(policynet.fc1.bias.grad)
tensor = torch.zeros((10,))
out = policynet(tensor)

# compute loss wrt target
criterion = nn.MSELoss()
target = torch.ones((4,))
loss = criterion(out, target)

# perform gradient descent
loss.backward()
print(policynet.fc1.bias.grad)

## Hyperparameters

In [3]:
num_episodes = 10
max_timesteps = 10
discount = 0.99
exploration_rate = 0.3

## Training

In [39]:
def extract_state(brain_info, brain_name):
    """Extract state from BrainInfo object."""
    s = brain_info[brain_name].vector_observations[0]
    s = torch.from_numpy(s)
    s.reshape((210,))
    s = s.float()
    return s

def extract_reward(brain_info, brain_name):
    """Extract reward from BrainInfo object."""
    return brain_info[brain_name].rewards[0]

In [28]:
env = UnityEnvironment(file_name="environment-binaries/PushBlock.app")
brain_name = "PushBlock"

INFO:mlagents.envs:
'Academy (1)' started successfully!
Unity Academy name: Academy (1)
        Number of Training Brains : 0
        Reset Parameters :
		static_friction -> 0.0
		dynamic_friction -> 0.0
		block_drag -> 0.5
		block_scale -> 2.0



In [29]:
actions = [0, 1, 2, 3, 4, 5, 6]

In [30]:
dqn = DQN(210, 7)

In [40]:
episode_loss = []
for _ in tqdm.tqdm(range(num_episodes)):
    # initialize start state
    info = env.step()
    pdb.set_trace()
    timestep_loss[]
    for _ in range(max_timesteps):
        # select an action (epsilon greedy)
        if random.random() < exploration_rate:
            action = random.choice(actions)
        else:
            state = extract_state(info, brain_name)
            output = dqn(state)
            action = torch.argmax(output).item()
        # execute selected action
        info = env.step(action)
        # calculate loss for the state-action pair
        prediction = dqn(state)
        next_state = extract_state(info, brain_name)
        reward = extract_reward(info, brain_name)
        next_qs = dqn(next_state)
        target = reward + max(next_qs)
        # log loss for state-action pair
        loss = math.fabs(target - prediction[action])
        timestep_loss.append(loss)
        # gradient descent update weights in policy network
    # log average loss for episode
    avg_loss = sum(timestep_loss) / len(timestep_loss)
    episode_loss.append(avg_loss)
            












  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

> <ipython-input-40-8519ed924d74>(5)<module>()
-> for _ in range(max_timesteps):


(Pdb)  n


> <ipython-input-40-8519ed924d74>(7)<module>()
-> if random.random() < exploration_rate:


(Pdb)  


> <ipython-input-40-8519ed924d74>(8)<module>()
-> action = random.choice(actions)


(Pdb)  


> <ipython-input-40-8519ed924d74>(14)<module>()
-> info = env.step(action)


(Pdb)  


> <ipython-input-40-8519ed924d74>(16)<module>()
-> prediction = dqn(state)


(Pdb)  


> <ipython-input-40-8519ed924d74>(17)<module>()
-> next_state = extract_state(info, brain_name)


(Pdb)  


> <ipython-input-40-8519ed924d74>(18)<module>()
-> reward = extract_reward(info, brain_name)


(Pdb)  


> <ipython-input-40-8519ed924d74>(19)<module>()
-> next_qs = dqn(next_state)


(Pdb)  reward


-0.0009999999310821295


(Pdb)  n


> <ipython-input-40-8519ed924d74>(20)<module>()
-> target = reward + max(next_qs)


(Pdb)  


> <ipython-input-40-8519ed924d74>(21)<module>()
-> pdb.set_trace()


(Pdb)  target


tensor(0.3317, grad_fn=<AddBackward0>)


(Pdb)  next_qs


tensor([ 0.3327, -0.1679,  0.2786, -0.3485,  0.0121,  0.2104, -0.2012],
       grad_fn=<AddBackward0>)


(Pdb)  max(next_qs)


tensor(0.3327, grad_fn=<SelectBackward>)


(Pdb)  q


BdbQuit: 

In [24]:
env.close()

INFO:mlagents.envs:Environment shut down with return code 0.
