## Cart Pole 

* The agent has to decide between two actions $\mathcal{A}=\{r,l\}$ moving to the right or moving to the left.
* Rewards: Better performing scenarios will run longer duration accumulating larger return.
  $$r_{t} = +1, s\in\mathcal{Q}$$
 where $\mathcal{Q}$ is the desired range of motion that is considered ideal for the agent. This ranges is not moving more than 2.4 units away from the center and not letting the pole fall over too far.
 * Agent inputs: 4 real values represent the environment state (position, velocty)

In [2]:
#Packages
import gym
import math
import random
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

env = gym.make('CartPole-v0').unwrapped

#setting up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
    
plt.ion()

#setup torch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Replay Buffer
* `Transition`: a named tuple that represents a transition in the environment $(s_t,a_t,r_t,s_{t+1})$
* `State`: since it is from pixels, the state is the screen difference image from the previous frame to the current frame
* `ReplayMemory`: is the stack that accumulates the transitions

In [3]:
Transition = namedtuple('Transition', ('state','action','next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen = capacity)
        
    def push(self, *args):
        self.memory.append(Tansitions(*args))
        
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

## DQN Algorithm


In [7]:
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32,32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)
        
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride + 1
        
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))# ??
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)
        
    
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))#functional ??
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1)) #??
    
dqn = DQN(10,10,2)
print(dqn)

DQN(
  (conv1): Conv2d(3, 16, kernel_size=(5, 5), stride=(2, 2))
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(2, 2))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 32, kernel_size=(5, 5), stride=(2, 2))
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (head): Linear(in_features=128, out_features=2, bias=True)
)
