In [24]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import gym
import torch.nn.functional as F

In [25]:
class ReplayBuffer():
    """Fixed-capacity circular store of (obs, act, rew, next_obs, done) transitions.

    All fields live in preallocated NumPy arrays. Once ``size`` transitions have
    been stored, each new transition overwrites the oldest slot.
    """

    def __init__(self, size, obs_dim):
        """
        size :: int, maximum number of transitions kept
        obs_dim :: int, length of one flat observation vector
        """
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        # The np.int / np.bool aliases were removed in NumPy 1.24, so explicit
        # dtypes are used here instead.
        self.act_buf = np.zeros(size, dtype=np.int64)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.next_obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.bool_)
        self.buf_size = size
        self.cntr = 0  # total number of transitions ever stored (not capped)

    def store(self, obs, act, rew, next_obs, done):
        """
        Write one transition into the next slot, wrapping around when full.

        obs :: array-like assignable to a row of shape (obs_dim,)
        act :: int
        rew :: number (stored as float32)
        next_obs :: array-like assignable to a row of shape (obs_dim,)
        done :: bool
        """
        idx = self.cntr % self.buf_size
        self.obs_buf[idx] = obs
        self.act_buf[idx] = act
        self.rew_buf[idx] = rew
        self.next_obs_buf[idx] = next_obs
        self.done_buf[idx] = done
        self.cntr += 1

    def sample(self, batch_size):
        """
        Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns a dict of torch tensors with keys 'obs', 'act', 'rew',
        'next_obs' and 'done' (float32, long, float32, float32, bool).

        Raises ValueError when fewer than ``batch_size`` transitions are stored.
        """
        max_idx = min(self.buf_size, self.cntr)
        if batch_size > max_idx:
            raise ValueError(
                'cannot sample %d transitions, only %d stored' % (batch_size, max_idx)
            )
        # replace=False: a transition appears at most once per batch.
        idxs = np.random.choice(max_idx, batch_size, replace=False)
        return dict(
            obs=torch.tensor(self.obs_buf[idxs], dtype=torch.float32),
            act=torch.tensor(self.act_buf[idxs], dtype=torch.long),
            rew=torch.tensor(self.rew_buf[idxs], dtype=torch.float32),
            next_obs=torch.tensor(self.next_obs_buf[idxs], dtype=torch.float32),
            done=torch.tensor(self.done_buf[idxs], dtype=torch.bool),
        )

In [26]:
class DeepQNetwork(nn.Module):
    """Fully connected network: obs_dim -> 256 -> 64 -> num_acts, ReLU between layers."""

    def __init__(self, obs_dim, num_acts, lr=1e-3):
        # `lr` is accepted for signature compatibility; nothing in this class reads it.
        super().__init__()

        hidden1, hidden2 = 256, 64
        self.linear1 = nn.Linear(obs_dim, hidden1)
        self.linear2 = nn.Linear(hidden1, hidden2)
        self.linear3 = nn.Linear(hidden2, num_acts)

    def forward(self, obs):
        """Return the raw output of the last linear layer (one value per action)."""
        hidden = F.relu(self.linear1(obs))
        hidden = F.relu(self.linear2(hidden))
        return self.linear3(hidden)

In [27]:
class DQNAgent(object):
    """Epsilon-greedy DQN agent with a replay buffer and a periodically synced target network."""

    def __init__(self, obs_dim, num_acts, gamma=0.99, lr=0.001, buf_size=100000, batch_size=64, eps_min=0.01, eps_dec=1e-5,
        target_replace=100, chkpt_dir='tmp/dqn'):
        """
        obs_dim :: int, length of one flat observation vector
        num_acts :: int, number of discrete actions
        gamma :: float, discount factor used in the TD target
        lr :: float, Adam learning rate for the online network
        buf_size :: int, replay buffer capacity
        batch_size :: int, number of transitions per learning step
        eps_min :: float, lower bound for epsilon
        eps_dec :: float, amount subtracted from epsilon per learning step
        target_replace :: int, copy online weights into the target net every
            this many learning steps
        chkpt_dir :: str, stored on the instance; not used by any method yet
        """
        self.gamma = gamma
        self.lr = lr

        self.num_acts = num_acts
        self.act_space = list(range(num_acts))

        self.obs_dim = obs_dim

        self.eps = 1
        self.eps_min = eps_min
        self.eps_dec = eps_dec

        self.target_replace = target_replace
        self.learn_cntr = 0

        self.buf = ReplayBuffer(buf_size, obs_dim)
        self.batch_size = batch_size

        self.q_eval = DeepQNetwork(obs_dim, num_acts, lr=lr)
        self.q_next = DeepQNetwork(obs_dim, num_acts, lr=lr)
        # Start the target net as an exact copy of the online net rather than
        # an independent random initialisation.
        self.q_next.load_state_dict(self.q_eval.state_dict())
        self.q_next.eval()

        self.loss = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)

        self.chkpt_dir = chkpt_dir


    def choose_act(self, obs):
        """
        Pick an action epsilon-greedily.

        obs :: array-like observation of shape (obs_dim,)

        With probability ``self.eps`` a uniformly random action is returned;
        otherwise the argmax of the online network's output. Returns an int.
        """
        if np.random.random() > self.eps:
            obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            # Inference only: no autograd graph is needed for action selection.
            with torch.no_grad():
                vals = self.q_eval(obs)
            act = torch.argmax(vals).item()
        else:
            act = np.random.choice(self.act_space)

        return int(act)


    def store_transition(self, obs, act, rew, next_obs, done):
        """Forward one transition to the replay buffer."""
        self.buf.store(obs, act, rew, next_obs, done)


    def sample_memory(self):
        """Return one batch of ``self.batch_size`` transitions from the replay buffer."""
        return self.buf.sample(self.batch_size)


    def replace_target_network(self):
        """Copy the online weights into the target net every ``target_replace`` learning steps."""
        if self.learn_cntr % self.target_replace == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())


    def decrement_epsilon(self):
        """Lower epsilon by ``eps_dec``, never going below ``eps_min``."""
        # max() clamps the result; a plain subtract-then-check can undershoot
        # eps_min for one step.
        self.eps = max(self.eps - self.eps_dec, self.eps_min)


    def learn(self):
        """
        Run one gradient step on a sampled batch.

        Does nothing until the buffer has received at least ``batch_size``
        transitions. Each step also advances ``learn_cntr`` and decays epsilon.
        """
        if self.buf.cntr < self.batch_size:
            return

        self.optimizer.zero_grad()

        self.replace_target_network()

        data = self.sample_memory()
        obs, act, rew, next_obs, done = data['obs'], data['act'], data['rew'], data['next_obs'], data['done']

        # Value of the action that was actually taken, one per batch row.
        q_pred = self.q_eval(obs).gather(1, act.unsqueeze(1)).squeeze(1)

        # The target is a constant for this update: without no_grad, backward()
        # would accumulate gradients on the target network's parameters, which
        # are never zeroed or used.
        with torch.no_grad():
            q_next = self.q_next(next_obs).max(dim=1)[0]
            q_next[done] = 0.0  # no bootstrap value past a terminal transition
            q_target = rew + self.gamma * q_next

        loss = self.loss(q_pred, q_target)
        loss.backward()
        self.optimizer.step()
        self.learn_cntr += 1

        self.decrement_epsilon()


    # TODO: saving/loading of network parameters (chkpt_dir) is not implemented yet.

In [28]:
def train(agent, env_name, epochs=500):
    """
    Train `agent` on the gym environment `env_name` for `epochs` episodes.

    Uses the classic gym interface: `env.reset()` returns the observation and
    `env.step(act)` returns `(next_obs, rew, done, info)`.

    Every step stores the transition on the agent and calls `agent.learn()`.
    A progress line is printed every 10 episodes.

    Returns a list with one entry per finished episode: the mean score over
    the most recent (up to) 100 episodes.
    """
    scores, avg_scores = [], []
    best_score = -np.inf
    env = gym.make(env_name)

    # try/finally: the environment is closed even if an episode raises or the
    # run is interrupted from the notebook.
    try:
        for epoch in range(epochs):

            obs = env.reset()

            score = 0
            done = False
            while not done:

                act = agent.choose_act(obs)
                next_obs, rew, done, _ = env.step(act)

                agent.store_transition(obs, act, rew, next_obs, done)
                agent.learn()

                # Advance the state; otherwise every action would be chosen
                # from the episode's first observation.
                obs = next_obs

                score += rew

            scores.append(score)

            avg_score = np.mean(scores[-100:])
            best_score = max(best_score, score)
            avg_scores.append(avg_score)

            if (epoch+1) % 10 == 0:
                print('Epoch: ', epoch+1, 'average score %.3f' % avg_score, 'best score %.2f' % best_score,
                    'epsilon %.2f' % agent.eps)
    finally:
        env.close()

    return avg_scores

In [29]:
ENV_NAME = "LunarLander-v2"
# obs_dim=8 and num_acts=4 are presumably the observation/action sizes of
# LunarLander-v2 — confirm against env.observation_space / env.action_space.
agent = DQNAgent(obs_dim=8, num_acts=4)

In [30]:
# Train with the default number of episodes; keeps the running average scores for plotting.
avg_scores = train(agent=agent, env_name=ENV_NAME)

Epoch:  10 average score -171.064 best score -57.83 epsilon 0.96
Epoch:  20 average score -151.328 best score -57.83 epsilon 0.91
Epoch:  30 average score -136.044 best score -57.83 epsilon 0.87
Epoch:  40 average score -136.143 best score -57.83 epsilon 0.82
Epoch:  50 average score -135.534 best score -57.83 epsilon 0.77
Epoch:  60 average score -137.466 best score -33.68 epsilon 0.73
Epoch:  70 average score -129.632 best score 42.79 epsilon 0.67
Epoch:  80 average score -120.187 best score 42.79 epsilon 0.62
Epoch:  90 average score -113.514 best score 56.66 epsilon 0.56
Epoch:  100 average score -116.282 best score 56.66 epsilon 0.49
Epoch:  110 average score -113.019 best score 56.66 epsilon 0.21
Epoch:  120 average score -104.553 best score 56.66 epsilon 0.01
Epoch:  130 average score -101.292 best score 56.66 epsilon 0.01
Epoch:  140 average score -91.265 best score 61.54 epsilon 0.01
Epoch:  150 average score -84.725 best score 61.54 epsilon 0.01
Epoch:  160 average score -77.

KeyboardInterrupt: 

In [None]:
# Learning curve: running average return against the number of episodes played.
games_played = np.arange(len(avg_scores))
plt.plot(games_played, avg_scores)
plt.xlabel('No. of games played')
plt.ylabel('Avg. returns')
plt.show()
print('done')

In [8]:
def test(agent, env_name, num_games=5):
    env = gym.make(env_name)

    for game in range(num_games):

        obs = env.reset()

        done = False
        while not done:
            act = agent.choose_act(obs)
            obs, rew, done, _ = env.step(act)
            env.render()
            

    env.close()

In [9]:
# Watch the agent play the default number of rendered games.
test(agent=agent, env_name=ENV_NAME)