In [9]:
import torch 
import torch.nn as nn
import numpy as np
import time 
from collections import namedtuple
import random
import gym

In [16]:
# Create the CartPole environment and keep it exactly as gym.make returns it.
# NOTE(review): a bare `env.unwrapped` expression has no effect, so it is not
# used here; write `env = env.unwrapped` explicitly if the raw environment
# (without gym's wrappers) is actually wanted.
env = gym.make('CartPole-v1')

# Network input/output sizes. Despite their names, both are plain ints, not gym spaces.
observation_space = env.observation_space.shape[0]     # observation = Box(4,)
action_space = env.action_space.n                      # 2 actions : 0 means left, 1 means right

In [8]:
# Record type for a single environment transition: (state, action, reward, nextstate).
xp = namedtuple('Experience', 'state action reward nextstate')

# Small example showing how a transition is built and displayed.
e = xp(state=2, action=1, reward=3, nextstate=4)
e

Experience(state=2, action=1, reward=3, nextstate=4)

The `nn` package contains all the components needed to build neural networks. Within `nn` there is a class called `Module`, which is the base class for all neural-network modules.
We therefore define our network as a class that extends `nn.Module`.

In [18]:
class DQN(nn.Module):
    """Two-layer fully connected Q-network.

    Maps a state vector to one Q-value per action:
    ``Linear(state_size, hidden_size) -> ReLU -> Linear(hidden_size, action_size)``.
    """

    def __init__(self, state_size, action_size, hidden_size=100):
        """Build the network layers.

        Args:
            state_size: number of features in a state vector.
            action_size: number of discrete actions (one output per action).
            hidden_size: width of the hidden layer (defaults to 100).
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear1 = nn.Linear(state_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return the predicted Q-values for ``state``.

        Args:
            state: float tensor whose last dimension is ``state_size``.

        Returns:
            Tensor whose last dimension is ``action_size``.
        """
        hidden = nn.functional.relu(self.linear1(state))
        # The output layer is `linear2`; no attribute named `linear` exists.
        q_prediction = self.linear2(hidden)
        return q_prediction

In [22]:
class Replaybuffer:
    """Fixed-capacity store of experiences with circular overwrite.

    Experiences are appended until ``capacity`` is reached; after that each
    new experience replaces the slot at ``xp_count % capacity``.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` experiences."""
        self.capacity = capacity
        self.memory = []
        self.xp_count = 0  # total number of experiences ever added

    def add_xp(self, xp):
        """Store ``xp``, overwriting an existing slot once the buffer is full."""
        if len(self.memory) >= self.capacity:
            slot = self.xp_count % self.capacity
            self.memory[slot] = xp
        else:
            self.memory.append(xp)
        self.xp_count += 1

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def can_provide_sample(self, batch_size):
        """Return True when at least ``batch_size`` experiences are stored."""
        return batch_size <= len(self.memory)

In [23]:
def e_greedy(behaviour_dqn, epsilon, state):
    """Choose an action with an epsilon-greedy policy.

    With probability ``epsilon`` a random action is drawn from the global
    ``env``'s action space; otherwise the action with the highest predicted
    Q-value is returned.

    Args:
        behaviour_dqn: callable (e.g. an ``nn.Module``) mapping a state
            tensor to a tensor of Q-values.
        epsilon: exploration probability in [0, 1].
        state: current observation (array-like or tensor); it is converted
            to a float32 tensor before being passed to the network.

    Returns:
        The chosen action: a random sample when exploring, otherwise the
        greedy action index as a Python ``int``.
    """
    # Explore with probability epsilon (not 1 - epsilon).
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    # Exploit: no gradients are needed just to pick an action.
    with torch.no_grad():
        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        qvalues = behaviour_dqn(state_tensor)
    # Return a plain int so the result can be passed straight to env.step.
    return int(torch.argmax(qvalues).item())

In [19]:
# Behaviour network: used for action selection and passed to the optimizer.
behaviour_dqn = DQN(state_size=observation_space, action_size=action_space)
# Second network with the same architecture, kept as the target network.
target_dqn = DQN(state_size=observation_space, action_size=action_space)

# Start the target network from the behaviour network's weights.
target_dqn.load_state_dict(behaviour_dqn.state_dict())

<All keys matched successfully>

In [20]:
# Learning rate for Adam; only the behaviour network's parameters are optimised.
alpha = 3e-3
optimizer = torch.optim.Adam(params=behaviour_dqn.parameters(), lr=alpha)

In [None]:
n_episodes = 1000
epsilon = 0.9

# A single replay buffer shared by all episodes, so stored experience is not
# thrown away each time a new episode starts.
exp_replay = Replaybuffer(50)


for episode in range(n_episodes):

    # NOTE(review): assumes the classic gym API, where reset() returns only the
    # observation and step() returns a 4-tuple — confirm against the installed gym.
    state = env.reset()
    done = False

    # roll out one complete episode, storing every transition
    while not done:

        action = e_greedy(behaviour_dqn, epsilon, state)
        # the chosen action must be passed to the environment
        nextstate, reward, done, _ = env.step(action)
        exp_replay.add_xp(xp(state, action, reward, nextstate))
        state = nextstate
        
        
        
        