In [None]:
%matplotlib inline

In [2]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

env = gym.make('CarRacing-v0')

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np

In [2]:
class DQN(nn.Module): 
    def __init__(self, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size = 7, stride = 3)
        self.bn1 = nn.BatchNorm2d(8)
        self.conv2 = nn.Conv2d(8, 32, kernel_size = 3)
        self.bn2 = nn.BatchNorm2d(32)
        self.fc1 = nn.Linear(32 * 3 * 3, 256)
        self.fc2 = nn.Linear(256, outputs)
    
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.max_pool2d(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.max_pool2d(x)
        # Flatten the input
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        # Softmax activation for the last layer         
        x = F.softmax(self.fc2(x))
        
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:]   # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [3]:
model = DQN(10)

print(model)

DQN(
  (conv1): Conv2d(1, 8, kernel_size=(7, 7), stride=(3, 3))
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=288, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=10, bias=True)
)


In [4]:
from collections import namedtuple, deque

class History:
    def __init__(self, capacity, batch_size, seed):
        self.capacity = capacity
        self.memory = deque(maxlen = capacity)
        self.transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))
        self.batch_size = batch_size
        self.seed = random.seed(seed)

    def push(self, state, action, reward, next_state, done):
        """Saves a transition."""
        t = self.transition(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self, batch_size):
        transitions = random.sample(self.memory, self.batch_size)
        
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        
        return (states, actions, reqrds, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [7]:
import numpy as np
import itertools as it
import random
from skimage import color, transform
from collections import namedtuple, deque
from model import DQN
from experience_history import History

import torch
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.global_counter = 0

        # Training Parameters
        self.batch_size = 64
        self.image_size = (96, 96)
        self.gamma = 0.95 
        self.initial_epsilon = 1.0
        self.min_epsilon = 0.1
        self.epsilon_decay_steps = int(1e6)
        self.learning_rate = 4e-4
        self.tau = 1e-3
        self.network_update_frequency = 4

        # Enviroment
        self.render = True
        self.seed = 7    # Seed to random 

        # Possible Actions and their corresponding weights
        left_right = [-1, 0, 1]
        acceleration = [1, 0]
        brake = [0.3, 0]
        all_actions = np.array([action for action in it.product(left_right, acceleration, brake)])
        self.action_map = all_actions
        self.num_actions = len(self.action_map)
        gas_actions = [a[1] == 1 and a[2] == 0 for a in self.action_map]
        # Increase the weight of gas actions for the car.
        self.action_weights = 14 * gas_actions + 1
        self.action_weights /= np.sum(self.action_weights)


        # Model (Neural Network)
        self.training_model = DQN(self.num_actions)
        self.target_model = DQN(self.num_actions)
        self.optimizer = optim.Adam(self.training_model.parameters(), lr = self.learning_rate)

        # Negative Reward
        # To check if we want to end the episode earlier
        self.neg_reward_counter = 0
        self.max_neg_rewards = 100

        # History
        self.experience_capacity = int(1e5)
        self.memory = History(self.experience_capacity, self.batch_size, self.seed)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.network_update_frequency
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def get_action(self, state):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.training_model.eval()
        with torch.no_grad():
            action_values = self.training_model(state)

        self.training_model.train()

        # Epsilon-greedy action selection
        if random.random() > self.get_epsilon():
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return self.get_random_action()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_model(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.training_model(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.training_model, self.target_model)                     

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
          
    def get_epsilon(self):
        if self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r
          
    def play_episode(self):
        state = self.env.reset()
        score = 0
        scores = []
        self.global_counter = 0
        while True:
            self.global_counter += 1
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            self.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break 

        scores.append(score)              # save most recent score
        return 
    
  
  # Convert RGB Image to grayscale and de
    def process_image(self, img):
        return 2 * color.rgb2gray(img) - 1

  # Returns a random action.
    def get_random_action(self):
        return np.random.choice(self.num_actions, p=self.action_weights)
  
    def check_early_stop(self, reward, total_reward):
    if reward < 0:
        self.neg_reward_counter += 1
        done = (self.neg_reward_counter > self.max_neg_rewards)

        if done and total_reward <= 500:
            punishment = -20.0
        else:
            punishment = 0.0

        if done:
            self.neg_reward_counter = 0

        return done, punishment
    else:
        self.neg_reward_counter = 0
        return False, 0.0

        
        

SyntaxError: 'break' outside loop (cell_name, line 154)

In [7]:
env.reset()

for _ in range(100):
    env.render()
    observation, reward, done, info = env.step(env.action_space.sample()) # take a random action

Track generation: 1199..1503 -> 304-tiles track


In [10]:
env.close()

In [12]:
from skimage import color, transform

In [13]:
observation = 2 * color.rgb2gray(observation) - 1

In [19]:
print(observation.shape)

new_ob = observation[np.newaxis, ...]
print(new_ob.shape)
new_ob

(96, 96)
(1, 96, 96)


array([[[ 0.37232   ,  0.37232   ,  0.37232   , ...,  0.37232   ,
          0.37232   ,  0.37232   ],
        [-0.2       , -0.2       ,  0.37232   , ...,  0.37232   ,
          0.37232   ,  0.37232   ],
        [-0.2       , -0.2       , -0.16078431, ...,  0.37232   ,
          0.37232   ,  0.37232   ],
        ...,
        [-1.        , -1.        , -1.        , ..., -1.        ,
         -1.        , -1.        ],
        [-1.        , -1.        , -1.        , ..., -1.        ,
         -1.        , -1.        ],
        [-1.        , -1.        , -1.        , ..., -1.        ,
         -1.        , -1.        ]]])

In [None]:
info

In [None]:
env.close()

In [None]:
plt.imshow(b_w_image)

In [None]:
transformed_image = transform.rescale(observation, 0.5)

In [None]:
plt.imshow(transformed_image)

In [None]:
img = process_image(observation)
img.shape

In [None]:
img

In [None]:
plt.imshow(img, cmap=plt.cm.gray)

In [None]:
observation[]

In [None]:
a = [[[1,2],[3,4]],[[5,6],[7,8]]]

In [None]:
a

In [None]:
a[1:7]

In [None]:
-np.ones(100, dtype="int32")

In [None]:
int(1e5)

In [None]:
env.action_space

In [3]:
import itertools as it
all_actions = np.array([k for k in it.product([-1, 0, 1], [1, 0], [0.2, 0])])

In [4]:
gas_actions = np.array([a[1] == 1 and a[2] == 0 for a in all_actions])

In [5]:
gas_actions

array([False,  True, False, False, False,  True, False, False, False,
        True, False, False])

In [6]:
action_weights = 14.0 * gas_actions + 1.0
print(action_weights)
action_weights /= np.sum(action_weights)
np.random.choice(len(all_actions), p=action_weights)

[ 1. 15.  1.  1.  1. 15.  1.  1.  1. 15.  1.  1.]


9

In [None]:
action_weights

In [None]:
possible_actions

In [None]:
#     def __init__(self):
#         self.capacity = int(1e5)
#         self.image_size = (96, 96)
        
#         self.mem_idx = 0
#         self.num_frame_in_stack = 4
#         self.sliding_window = None
#         self.expecting_new_episode = True
        
#         self.max_frame_cache = self.capacity + 2 * self.num_frame_in_stack + 1
#         self frames = -np.ones((self.max_frame_cache,) + self.image_size, dtype = 'float32')
        
#         self.memory = []
        
#     def push(self, frame, action, done, reward):
#         if len(self.memory) < self.capacity:
#             self.memory.append(None)
        
#         self.state =
#         self.memory[self.mem_idx] = Transition()
#         # Allocate memory