In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import gym
import torch.nn.functional as F

In [2]:
class ReplayBuffer():
    """Fixed-size circular store of (obs, act, rew, next_obs, done) transitions.

    Transitions live in preallocated NumPy arrays. Once ``size`` transitions
    have been written, each new one overwrites the oldest slot.
    """

    def __init__(self, size, obs_dim):
        """
        size :: int, maximum number of transitions kept
        obs_dim :: tuple, shape of a single observation
        """
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        # The np.int / np.bool aliases were removed from NumPy; use concrete
        # dtypes. int64 matches the torch.long conversion done in sample().
        self.act_buf = np.zeros(size, dtype=np.int64)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.next_obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.bool_)
        self.buf_size = size
        # Total number of transitions ever stored (not capped at buf_size).
        self.cntr = 0

    def store(self, obs, act, rew, next_obs, done):
        """
        Write one transition into the next slot, wrapping around when full.

        obs :: array-like with shape == obs_dim
        act :: int
        rew :: number (stored as float32)
        next_obs :: array-like with shape == obs_dim
        done :: bool
        """
        idx = self.cntr % self.buf_size
        self.obs_buf[idx] = obs
        self.act_buf[idx] = act
        self.rew_buf[idx] = rew
        self.next_obs_buf[idx] = next_obs
        self.done_buf[idx] = done
        self.cntr += 1

    def sample(self, batch_size):
        """
        Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns a dict of torch tensors with keys 'obs', 'act', 'rew',
        'next_obs' and 'done'. Sampling is without replacement, so
        np.random.choice raises ValueError if fewer than ``batch_size``
        transitions have been stored.
        """
        # Only slots that have actually been written are eligible.
        max_idx = min(self.buf_size, self.cntr)
        idxs = np.random.choice(max_idx, batch_size, replace=False)
        return dict(
            obs=torch.tensor(self.obs_buf[idxs], dtype=torch.float32),
            act=torch.tensor(self.act_buf[idxs], dtype=torch.long),
            rew=torch.tensor(self.rew_buf[idxs], dtype=torch.float32),
            next_obs=torch.tensor(self.next_obs_buf[idxs], dtype=torch.float32),
            done=torch.tensor(self.done_buf[idxs], dtype=torch.bool),
        )

In [3]:
class DeepQNetwork(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    obs_dim :: tuple (channels, height, width) of a single observation
    num_acts :: int, size of the output layer (one value per action)
    lr :: float, accepted for interface compatibility; not used in this class
    """

    def __init__(self, obs_dim, num_acts, lr=1e-3):
        super().__init__()

        self.conv1 = nn.Conv2d(obs_dim[0], 32, kernel_size=8, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Size of the flattened conv output decides the first linear layer.
        flat_features = self._calc_conv_output_dims(obs_dim)

        self.linear1 = nn.Linear(flat_features, 100)
        self.linear2 = nn.Linear(100, num_acts)

    def _calc_conv_output_dims(self, input_dims):
        """Return how many values the conv stack emits for one input of shape input_dims."""
        probe = torch.zeros((1, *input_dims))
        for conv in (self.conv1, self.conv2, self.conv3):
            probe = conv(probe)
        return int(np.prod(probe.size()))

    def forward(self, obs):
        """Map a batch of observations to one value per action."""
        feats = obs
        for conv in (self.conv1, self.conv2, self.conv3):
            feats = F.relu(conv(feats))

        # Flatten to (batch, n_filters * H * W) before the linear layers.
        batch = feats.size()[0]
        feats = feats.view(batch, -1)

        hidden = F.relu(self.linear1(feats))
        return self.linear2(hidden)

In [4]:
import numpy as np

class DQNAgent(object):
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    ``q_eval`` is the network being optimised. ``q_next`` is the target
    network; its weights are copied from ``q_eval`` every ``target_replace``
    learning steps.
    """

    def __init__(self, obs_dim, num_acts, gamma=0.9, lr=1e-3, buf_size=10000, batch_size=50, eps=1, eps_min=0.01, eps_dec=1e-5,
        target_replace=10, chkpt_dir='tmp/dqn'):
        """
        obs_dim :: tuple, shape of one observation (channels first)
        num_acts :: int, number of discrete actions
        gamma :: float, discount factor
        lr :: float, Adam learning rate
        buf_size :: int, replay buffer capacity
        batch_size :: int, transitions per learning step
        eps, eps_min, eps_dec :: exploration rate, its floor, and the amount
            subtracted after every learning step
        target_replace :: int, learning steps between target-network syncs
        chkpt_dir :: str, stored on the agent; not used by the methods here
        """
        self.gamma = gamma
        self.lr = lr

        self.num_acts = num_acts
        self.act_space = list(range(num_acts))

        self.obs_dim = obs_dim

        self.eps = eps
        self.eps_min = eps_min
        self.eps_dec = eps_dec

        self.target_replace = target_replace
        self.learn_cntr = 0

        self.buf = ReplayBuffer(buf_size, obs_dim)
        self.batch_size = batch_size

        self.q_eval = DeepQNetwork(obs_dim, num_acts, lr=lr)
        self.q_next = DeepQNetwork(obs_dim, num_acts, lr=lr)
        # Start with identical weights so the target is never an unrelated
        # random network, even before the first learn() call.
        self.q_next.load_state_dict(self.q_eval.state_dict())
        self.q_next.eval()

        self.loss = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)

        self.chkpt_dir = chkpt_dir

    def choose_act(self, obs):
        """
        Pick an action epsilon-greedily.

        obs :: numpy array holding a single observation (no batch axis)
        returns :: int, index of the chosen action
        """
        if np.random.random() > self.eps:
            obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            # Pure inference: no autograd graph is needed to pick an action.
            with torch.no_grad():
                vals = self.q_eval(obs)
            act = torch.argmax(vals).item()
        else:
            act = np.random.choice(self.act_space)

        return int(act)

    def store_transition(self, obs, act, rew, next_obs, done):
        """Append one transition to the replay buffer."""
        self.buf.store(obs, act, rew, next_obs, done)

    def sample_memory(self):
        """Return one batch of ``batch_size`` transitions from the replay buffer."""
        return self.buf.sample(self.batch_size)

    def replace_target_network(self):
        """Copy q_eval's weights into q_next every ``target_replace`` learning steps."""
        if self.learn_cntr % self.target_replace == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Lower epsilon by ``eps_dec`` without ever dropping below ``eps_min``."""
        # Clamp after subtracting; the previous form could undershoot the
        # floor by up to eps_dec for one step.
        self.eps = max(self.eps - self.eps_dec, self.eps_min)

    def learn(self):
        """Run one optimisation step on a sampled batch.

        Does nothing until the buffer has received at least ``batch_size``
        transitions.
        """
        if self.buf.cntr < self.batch_size:
            return

        self.replace_target_network()

        data = self.sample_memory()
        obs, act, rew, next_obs, done = data['obs'], data['act'], data['rew'], data['next_obs'], data['done']
        idxs = torch.arange(self.batch_size)

        # Value predicted for the action that was actually taken.
        q_pred = self.q_eval(obs)[idxs, act]

        # The bootstrap target is a constant for the optimiser. Computing it
        # without autograd avoids building a graph through the target network
        # and accumulating never-cleared gradients on its parameters.
        with torch.no_grad():
            q_next = self.q_next(next_obs).max(dim=1)[0]
            q_next[done] = 0.0  # no bootstrapping past a terminal transition
            q_target = rew + self.gamma * q_next

        self.optimizer.zero_grad()
        loss = self.loss(q_pred, q_target)
        loss.backward()
        self.optimizer.step()
        self.learn_cntr += 1

        self.decrement_epsilon()


In [5]:
def get_obs(screen):
    """Turn an RGB frame into a cropped, mean-centred grayscale observation.

    screen :: array-like with shape (height, width, channels)
    returns :: 2-D numpy array holding rows 0..65 and columns 15..80 of the
        grayscale frame, shifted to zero mean and scaled so its maximum is 1
    """
    # grayscale: average over the colour channels
    gray = np.asarray(screen).mean(axis=2)

    # cut track features from image
    obs = gray[:66, 15:81]
    obs = obs - obs.mean()

    # A uniform crop is (numerically) all zeros after centring. Dividing by
    # its maximum would fill the observation with NaN/inf, so scale only when
    # there is real contrast.
    peak = obs.max()
    if peak > 1e-8:
        obs = obs / peak

    return obs


In [12]:
def train(agent, env_name, epochs=1000):
    """Train ``agent`` on the gym environment ``env_name`` for ``epochs`` episodes.

    agent :: object providing obs_dim, eps, choose_act, store_transition and learn
    env_name :: str, id passed to gym.make
    epochs :: int, number of episodes to play

    One progress line is printed per episode. A new environment is created
    for every episode and is always closed, even if the episode raises or is
    interrupted.
    """

    # Discrete action index -> command handed to env.step.
    # NOTE(review): presumably [steering, gas, brake] for CarRacing; confirm
    # against the environment's documentation.
    ACTION_SPACE = [[-1,0,0],[0,0,0],[1,0,0],[0,1,0],[0,0,1]]

    # One agent step repeats the action for this many frames and stacks them.
    n_frames = agent.obs_dim[0]

    def step(env, act):
        """Repeat ``act`` for up to n_frames frames.

        Returns (stacked observation, summed reward, done, info).
        """
        acc_rew = 0
        frames = []
        for _ in range(n_frames):
            screen, rew, done, info = env.step(ACTION_SPACE[act])
            frames.append(get_obs(screen))
            acc_rew += rew
            if done:
                # Never step an environment that has reported done.
                break

        # If the episode ended early, repeat the last frame so the stacked
        # observation keeps the channel count the network expects.
        frames.extend([frames[-1]] * (n_frames - len(frames)))
        return np.stack(frames, axis=0), acc_rew, done, info

    scores = []
    best_score = -np.inf

    for epoch in range(epochs):
        env = gym.make(env_name)
        try:
            env.reset()

            # wait for zoom
            for _ in range(12):
                obs = step(env, 1)[0]

            score = 0
            done = False
            while not done:
                act = agent.choose_act(obs)
                next_obs, rew, done, _ = step(env, act)

                agent.store_transition(obs, act, rew, next_obs, done)
                agent.learn()

                obs = next_obs
                score += rew
        finally:
            # Release the environment even on errors or a KeyboardInterrupt.
            env.close()

        scores.append(score)

        avg_score = np.mean(scores[-10:])
        best_score = max(best_score, score)

        print('Epoch: ', epoch, 'average score %.3f' % avg_score, 'best score %.2f' % best_score,
            'epsilon %.2f' % agent.eps)

In [15]:
# Environment id, and an agent for observations of shape (4, 66, 66) with 5 discrete actions.
ENV_NAME = "CarRacing-v0"
agent = DQNAgent(obs_dim=(4, 66, 66), num_acts=5, eps_dec=5e-6)

In [16]:
# First training run: 1200 episodes.
train(agent, env_name=ENV_NAME, epochs=1200)

ilon 0.41
Track generation: 1008..1268 -> 260-tiles track
Epoch:  499 average score -76.218 best score 26.42 epsilon 0.41
Track generation: 1208..1514 -> 306-tiles track
Epoch:  500 average score -76.599 best score 26.42 epsilon 0.40
Track generation: 1136..1424 -> 288-tiles track
Epoch:  501 average score -76.834 best score 26.42 epsilon 0.40
Track generation: 1208..1514 -> 306-tiles track
Epoch:  502 average score -77.215 best score 26.42 epsilon 0.40
Track generation: 1135..1423 -> 288-tiles track
Epoch:  503 average score -77.540 best score 26.42 epsilon 0.40
Track generation: 1376..1724 -> 348-tiles track
Epoch:  504 average score -77.652 best score 26.42 epsilon 0.40
Track generation: 996..1251 -> 255-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1347..1688 -> 341-tiles track
Epoch:  505 average score -78.241 best score 26.42 epsilon 0.40
Track generation: 1227..1538 -> 311-tiles track
Epoch:  506 average score -78.2

In [None]:
# Put exploration back to fully random, then keep training the same agent.
agent.eps = 1
train(agent, env_name=ENV_NAME, epochs=1200)

In [None]:
# Put exploration back to fully random again for a longer 1500-episode run.
agent.eps = 1
train(agent, env_name=ENV_NAME, epochs=1500)

In [26]:
def test(agent, env_name, num_games=5, random=False):
    """Play ``num_games`` rendered episodes on the gym environment ``env_name``.

    agent :: object providing obs_dim and choose_act
    env_name :: str, id passed to gym.make
    num_games :: int, number of episodes to play
    random :: bool, if True act by sampling env.action_space each frame
        instead of asking the agent

    A new environment is created for every game and is always closed, even if
    the game raises or is interrupted.
    """

    # Discrete action index -> command handed to env.step.
    # NOTE(review): presumably [steering, gas, brake] for CarRacing; confirm
    # against the environment's documentation.
    ACTION_SPACE = [[-1,0,0],[0,0,0],[1,0,0],[0,1,0],[0,0,1],[-0.5,0,0],[0.5,0,0],[-0.5,1,0],[0.5,1,0],[-0.5,0,1],[0.5,0,1]]

    # One agent step repeats the action for this many frames and stacks them.
    n_frames = agent.obs_dim[0]

    def step(env, act):
        """Repeat ``act`` for up to n_frames rendered frames.

        Returns (stacked observation, summed reward, done, info).
        """
        acc_rew = 0
        frames = []
        for _ in range(n_frames):
            screen, rew, done, info = env.step(ACTION_SPACE[act])
            env.render()
            frames.append(get_obs(screen))
            acc_rew += rew
            if done:
                # Never step an environment that has reported done.
                break

        # If the episode ended early, repeat the last frame so the stacked
        # observation keeps the channel count the network expects.
        frames.extend([frames[-1]] * (n_frames - len(frames)))
        return np.stack(frames, axis=0), acc_rew, done, info

    for game in range(num_games):
        env = gym.make(env_name)
        try:
            env.reset()

            # wait for zoom
            for _ in range(12):
                obs = step(env, 1)[0]

            done = False
            while not done:
                if random:
                    # Keep the done flag from env.step; discarding it made
                    # this loop run forever.
                    _, _, done, _ = env.step(env.action_space.sample())
                    env.render()
                else:
                    act = agent.choose_act(obs)
                    obs, _, done, _ = step(env, act)
        finally:
            # Release the environment (and its window) even on errors.
            env.close()

In [36]:
# Watch the trained agent play three rendered games.
test(agent, env_name=ENV_NAME, num_games=3, random=False)

Track generation: 1087..1363 -> 276-tiles track
