In [2]:
import random, copy
import numpy as np
import gym
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from collections import namedtuple

In [3]:
from IPython.core.debugger import set_trace

In [4]:
class ReplayBuffer:
    def __init__(self, size):
        self.size = size
        self.content = []
    
    def __len__(self):
        return len(self.content)
    
    def insert(self, item):
        self.content.append(item)
        if len(self) > self.size:
            self.content.pop(0)
    
    def insert_list(self, items):
        for item in items:
            self.insert(item)
    
    def can_sample(self, N):
        return len(self) >= N
    
    def sample(self, N):
        assert self.can_sample(N)
        return random.sample(self.content, N)

In [5]:
Experience = namedtuple('Experience', field_names = [
    'state', 'action', 'reward', 'next_state', 'done'
])

In [6]:
def generate_episode(env, agent):
    episode = []
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        episode.append(Experience(state=state, action=action, reward=reward,
                                 next_state=next_state, done=done))
        state = next_state
    return episode

In [7]:
class Scheduler:
    def __init__(self, start, stop, decay=0.99):
        self.stop  = stop
        self.decay = decay
        self.value = start
    
    def __call__(self):
        self.value *= self.value * self.decay
        return max(self.value, self.stop)

In [8]:
class Agent(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.env = env
        self.net = nn.Sequential(
            nn.Linear(env.observation_space.shape[0], N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, env.action_space.n)
        )
        self.target = copy.deepcopy(self.net)
        
        self.optimizer = optim.Adam(self.net.parameters(), lr=LR)
        self.epsilon_sched = Scheduler(start=1.0, stop=0.01, decay=0.99)
    
    def sync(self):
        self.target.load_state_dict(self.net.state_dict())
        
    def forward(self, x):
        x = x.float()
        qvals = self.net(x)
        return qvals
    
    def act(self, state):
        epsilon = self.epsilon_sched()
        if random.random() < epsilon:
            return self.env.action_space.sample()
        else:
            with torch.no_grad():
                qvals = self(torch.tensor([state]))[0]
                action = qvals.max(0)[1].item()
            return action
        
    
    def update(self, batch):
        states, actions, rewards, next_states, dones = zip(*batch)
        states_v = torch.tensor(states)
        current_q = self(states_v)[range(len(batch)), actions]
        rewards_v = torch.tensor(rewards)
        dones_v = torch.FloatTensor(dones)
        #set_trace()
        
        
        next_states_v = torch.tensor(next_states)
        next_q = self(next_states_v)
        next_qmax = next_q.max(1)[0]
        targets = rewards_v + GAMMA * (1.-dones_v) * next_qmax
        
        self.optimizer.zero_grad()
        loss = F.mse_loss(current_q, targets.detach())
        loss.backward()
        self.optimizer.step()

In [15]:
N_HIDDEN    = 128 #24
LR          = 0.01
N_STEPS     = 5000
BUFFER_SIZE = 512
BATCH_SIZE  = 32
ALPHA       = 0.9
GAMMA       = 0.99
SYNC_RATE   = 100

In [16]:
env    = gym.make('CartPole-v0')
agent  = Agent(env)
buffer = ReplayBuffer(size=BUFFER_SIZE)
avg_len = 10 # bookkeeping

for idx in range(N_STEPS):
    episode = generate_episode(env, agent)
    avg_len = ALPHA * avg_len + (1.-ALPHA) * len(episode)
    buffer.insert_list(episode)
    if not buffer.can_sample(BATCH_SIZE):
        continue
    batch = buffer.sample(BATCH_SIZE)
    agent.update(batch)
    if idx % SYNC_RATE == 0:
        print(f"Episode {idx}: Average length = {avg_len}")
        agent.sync()

Episode 100: Average length = 24.971665495158266
Episode 200: Average length = 34.78876528805114
Episode 300: Average length = 40.78865997197225
Episode 400: Average length = 46.88018176375325
Episode 500: Average length = 26.2813359551414
Episode 600: Average length = 140.71216403963194
Episode 700: Average length = 100.02822232694066
Episode 800: Average length = 124.99257685450074
Episode 900: Average length = 140.90898291557437
Episode 1000: Average length = 135.1066908595139
Episode 1100: Average length = 172.87360736411028
Episode 1200: Average length = 193.1844585228062
Episode 1300: Average length = 195.24356170213778
Episode 1400: Average length = 199.53666430579233
Episode 1500: Average length = 49.22004734396908
Episode 1600: Average length = 134.21291862054915
Episode 1700: Average length = 198.9894327308019
Episode 1800: Average length = 178.45415689228196
Episode 1900: Average length = 164.33875880679597
Episode 2000: Average length = 185.92732621951285
Episode 2100: Aver