In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim

## Big problem here!! CarRacing's action space is continuous, so DQN (which needs a discrete set of actions) cannot be applied to it directly.

In [2]:
class ReplayBuffer():
    """Fixed-capacity circular store of (obs, act, rew, next_obs, done) transitions.

    Every field lives in its own pre-allocated torch tensor; once ``size``
    transitions have been pushed, new ones overwrite the oldest slots.
    """

    def __init__(self, size, obs_dims):
        """Allocate zero-filled storage for ``size`` transitions.

        size     :: int, maximum number of transitions kept
        obs_dims :: iterable of ints, shape of a single observation
        """
        frame_store_shape = (size, *obs_dims)

        self.mem_size = size
        self.cntr = 0  # total number of pushes so far (not wrapped)

        self.obs_mem = torch.zeros(frame_store_shape)
        self.next_obs_mem = torch.zeros(frame_store_shape)
        self.act_mem = torch.zeros(size, dtype=torch.int64)
        self.rew_mem = torch.zeros(size, dtype=torch.float32)
        self.done_mem = torch.zeros(size, dtype=torch.bool)

    def push(self, obs, act, rew, next_obs, done):
        """Write one transition into the next slot, wrapping around when full.

        obs      :: torch tensor shaped like one row of ``obs_mem``
        act      :: value stored into the int64 action tensor
        rew      :: value stored into the float32 reward tensor
        next_obs :: torch tensor shaped like one row of ``next_obs_mem``
        done     :: value stored into the bool terminal-flag tensor
        """
        slot = self.cntr % self.mem_size

        writes = (
            (self.obs_mem, obs),
            (self.act_mem, act),
            (self.rew_mem, rew),
            (self.next_obs_mem, next_obs),
            (self.done_mem, done),
        )
        for store, value in writes:
            store[slot] = value

        self.cntr += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns a tuple ``(obs, act, rew, next_obs, done)`` of batched tensors.
        Sampling is without replacement, so numpy raises ``ValueError`` when
        fewer than ``batch_size`` transitions have been stored.
        """
        filled = min(self.mem_size, self.cntr)
        picks = np.random.choice(filled, batch_size, replace=False)

        stores = (self.obs_mem, self.act_mem, self.rew_mem,
                  self.next_obs_mem, self.done_mem)
        return tuple(store[picks] for store in stores)

In [3]:
class DeepQNetwork(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    The module also owns its RMSprop optimizer (``self.optimizer``) and its
    MSE criterion (``self.loss``).
    """

    def __init__(self, obs_dims, num_acts, lr=1e-3):
        """Build the layers, the optimizer and the loss.

        obs_dims :: (channels, height, width) of one observation
        num_acts :: number of outputs of the final linear layer
        lr       :: learning rate handed to RMSprop
        """
        super().__init__()

        in_channels = obs_dims[0]
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # The first linear layer's width depends on the observation size, so
        # it is measured by probing the conv stack with a dummy input.
        flat_features = self._calc_conv_output_dims(obs_dims)
        self.linear1 = nn.Linear(flat_features, 512)
        self.linear2 = nn.Linear(512, num_acts)

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

    def _calc_conv_output_dims(self, input_dims):
        """Return the element count of the conv stack's output for one input."""
        probe = torch.zeros((1, *input_dims))
        for conv in (self.conv1, self.conv2, self.conv3):
            probe = conv(probe)
        return int(np.prod(probe.size()))

    def forward(self, obs):
        """Map a batch of observations (BS x C x H x W) to one value per action."""
        features = obs
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))

        # Flatten to BS x (n_filters * H * W) before the linear layers.
        flat = features.view(features.size()[0], -1)
        hidden = F.relu(self.linear1(flat))
        return self.linear2(hidden)
        
        

In [4]:
class DQNAgent(object):
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    ``q_eval`` is the network being trained; ``q_next`` is a copy that is
    refreshed from ``q_eval`` every ``replace`` learning steps and is used
    only to build the bootstrap targets.
    """

    def __init__(self, gamma, epsilon, lr, num_acts, obs_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, chkpt_dir='tmp/dqn'):
        """
        gamma      :: discount factor applied to the bootstrapped next-state value
        epsilon    :: initial probability of picking a random action
        lr         :: learning rate passed to both networks
        num_acts   :: number of discrete actions
        obs_dims   :: (channels, height, width) of one observation
        mem_size   :: replay buffer capacity
        batch_size :: number of transitions sampled per learning step
        eps_min    :: lower bound for epsilon
        eps_dec    :: amount subtracted from epsilon after each learning step
        replace    :: number of learning steps between target-network syncs
        chkpt_dir  :: stored on the instance; not used by the methods of this class
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.num_acts = num_acts
        self.obs_dims = obs_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.act_space = list(range(num_acts))
        self.learn_cnt = 0  # number of completed learning steps

        self.memory = ReplayBuffer(mem_size, obs_dims)
        self.q_eval = DeepQNetwork(self.obs_dims, self.num_acts, self.lr)
        self.q_next = DeepQNetwork(self.obs_dims, self.num_acts, self.lr)

    def choose_act(self, obs):
        """Pick an action index epsilon-greedily.

        obs :: torch tensor for a single observation (no batch dimension)

        Returns a Python int in ``range(num_acts)``.
        """
        if np.random.random() > self.epsilon:
            # Pure inference: no autograd graph is needed to take an argmax.
            with torch.no_grad():
                acts = self.q_eval.forward(obs.unsqueeze(0))
            act = torch.argmax(acts).item()
        else:
            act = np.random.choice(self.act_space)

        return int(act)

    def store_transition(self, obs, act, rew, next_obs, done):
        """Append one transition to the replay buffer."""
        self.memory.push(obs, act, rew, next_obs, done)

    def sample_memory(self):
        """Return a batch ``(obs, act, rew, next_obs, done)`` from the replay buffer."""
        obs, act, rew, next_obs, done = self.memory.sample(self.batch_size)
        return obs, act, rew, next_obs, done

    def replace_target_network(self):
        """Copy ``q_eval``'s weights into ``q_next`` every ``replace`` learning steps."""
        if self.learn_cnt % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Lower epsilon by ``eps_dec`` without ever going below ``eps_min``."""
        # max() clamps the final step too, so an eps_dec that overshoots
        # cannot leave epsilon under the floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self):
        """Run one gradient step on ``q_eval`` from a sampled batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if self.memory.cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        obs, act, rew, next_obs, done = self.sample_memory()
        indxs = torch.arange(self.batch_size)

        # Q-value of the action that was actually taken in each transition.
        q_pred = self.q_eval.forward(obs)[indxs, act]

        # The target is a constant for this step: build it without autograd
        # so no gradients are pushed into (or accumulated on) the target net.
        with torch.no_grad():
            q_next = self.q_next.forward(next_obs).max(dim=1)[0]
            q_next[done] = 0.0  # terminal states have no future value
            q_target = rew + self.gamma * q_next

        loss = self.q_eval.loss(q_pred, q_target)
        loss.backward()
        self.q_eval.optimizer.step()
        # This counter drives replace_target_network(); it must be the same
        # attribute that __init__ creates.
        self.learn_cnt += 1

        self.decrement_epsilon()

In [7]:
def _frame_to_chw(frame):
    """Convert an (H, W, C) environment frame into a float (C, H, W) tensor."""
    # permute(2, 0, 1) moves the channel axis to the front; (0, 2, 1) would
    # only swap W and C and yield (H, C, W), which does not match obs_dims.
    return torch.tensor(np.array(frame), dtype=torch.float).permute(2, 0, 1).contiguous()


def train(n_games=250):
    """Train a DQN agent on CarRacing-v0 and print per-episode statistics.

    n_games :: number of episodes to play (default 250)

    The agent chooses among a small fixed set of control vectors, because the
    environment takes a continuous action while DQN needs discrete ones.
    Returns None; progress is reported through ``print``.
    """
    # Discrete index -> continuous control vector handed to env.step().
    # Per the gym CarRacing docs the components are [steer, gas, brake].
    discrete_acts = np.array([
        [0.0, 0.0, 0.0],   # coast
        [-1.0, 0.0, 0.0],  # steer left
        [1.0, 0.0, 0.0],   # steer right
        [0.0, 1.0, 0.0],   # accelerate
        [0.0, 0.0, 0.8],   # brake
    ], dtype=np.float32)

    env = gym.make('CarRacing-v0')

    best_score = -np.inf
    load_checkpoint = False

    agent = DQNAgent(gamma=0.99, epsilon=1, lr=0.0001,
                     obs_dims=(3, 96, 96),
                     num_acts=len(discrete_acts), mem_size=50000, eps_min=0.1,
                     batch_size=32, replace=1000, eps_dec=1e-5,
                     chkpt_dir='models/')

    if load_checkpoint:
        # NOTE(review): the DQNAgent class in this notebook does not define
        # load_models(); add it before switching load_checkpoint on.
        agent.load_models()

    n_steps = 0
    scores = []

    try:
        for i in range(n_games):
            done = False
            obs = _frame_to_chw(env.reset())

            score = 0
            while not done:
                act = agent.choose_act(obs)
                # The agent reasons in discrete indices; the environment is
                # driven by the matching continuous control vector.
                next_obs, rew, done, info = env.step(discrete_acts[act])
                next_obs = _frame_to_chw(next_obs)
                score += rew

                agent.store_transition(obs, act, rew, next_obs, done)
                agent.learn()

                obs = next_obs
                n_steps += 1

            scores.append(score)

            avg_score = np.mean(scores[-100:])
            # Highest single-episode score seen so far, this episode included.
            best_score = max(best_score, score)

            print('episode: ', i,'score: ', score,
                 ' average score %.1f' % avg_score, 'best score %.2f' % best_score,
                'epsilon %.2f' % agent.epsilon, 'steps', n_steps)
    finally:
        # Release the simulator/render resources even if training aborts.
        env.close()

In [8]:
# Run the training loop defined by train() in the previous cell.
train()

TypeError: 'Box' object cannot be interpreted as an integer

In [None]:
# Scratch cell: create a standalone CarRacing-v0 environment for inspection.
env = gym.make('CarRacing-v0')

In [None]:
# Reset the environment and keep what reset() returns as the first observation.
obs = env.reset()

In [None]:
# Display the observation's shape to check the axis layout.
obs.shape

In [None]:
# Wrap the raw observation in a torch tensor (dtype is inferred from obs).
x = torch.tensor(obs)