In [1]:
import collections
import os
import random
from collections import deque
from time import sleep

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
lr_mu        = 0.0005
lr_q         = 0.001
gamma        = 0.99
batch_size   = 32
buffer_limit = 100000
tau          = 0.005 # for target network soft update
num_frames = 4

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_prime, done = transition
            s_lst.append(s)
            a_lst.append(a)
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask = 0.0 if done else 1.0 
            done_mask_lst.append([done_mask])
            
        r_lst = torch.tensor(r_lst, dtype=torch.double, device=device)
        a_lst = torch.tensor(a_lst, dtype=torch.double, device=device)
        s_lst = torch.tensor(s_lst, dtype=torch.double, device=device)
        s_prime_lst = torch.tensor(s_prime_lst, dtype=torch.double, device=device)
        done_mask_lst = torch.tensor(done_mask_lst, dtype=torch.double, device=device)
        
        return s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst
    
    def size(self):
        return len(self.buffer)

    
    
class MuNet(torch.nn.Module):
    def __init__(self, num_frames= 4, action_dim=3):
        super(MuNet, self).__init__()
   
        self.cnn_base = nn.Sequential(  # input shape (4, 96, 96)
            nn.Conv2d(num_frames, 8, kernel_size=4, stride=2),
            nn.ReLU(),  # activation
            nn.Conv2d(8, 16, kernel_size=3, stride=2),  # (8, 47, 47)
            nn.ReLU(),  # activation
            nn.Conv2d(16, 32, kernel_size=3, stride=2),  # (16, 23, 23)
            nn.ReLU(),  # activation
            nn.Conv2d(32, 64, kernel_size=3, stride=2),  # (32, 11, 11)
            nn.ReLU(),  # activation
            nn.Conv2d(64, 128, kernel_size=3, stride=1),  # (64, 5, 5)
            nn.ReLU(),  # activation
            nn.Conv2d(128, 256, kernel_size=3, stride=1),  # (128, 3, 3)
            nn.ReLU(),  # activation
        )  # output shape (256, 1, 1)
        self.steer = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1))
        self.gas = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1),nn.Softplus())
        self.break_ = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1),nn.Softplus())        
    
    def forward(self, x):
        
        # Forward pass
        x = self.cnn_base(x)
        x = x.view(-1, 256) 
        steer = self.steer(x)
        gas = self.gas(x)
        break_ = self.break_(x)
        x=torch.cat([steer, gas, break_], dim=1)
        x = torch.tanh(x)*1
     
        return x    
    

    

class QNet(torch.nn.Module):
    def __init__(self, num_frames= 4, action_dim=3):
        super(QNet, self).__init__()
       
        self.cnn_base = nn.Sequential(  # input shape (4, 96, 96)
            nn.Conv2d(num_frames, 8, kernel_size=4, stride=2),
            nn.ReLU(),  # activation
            nn.Conv2d(8, 16, kernel_size=3, stride=2),  # (8, 47, 47)
            nn.ReLU(),  # activation
            nn.Conv2d(16, 32, kernel_size=3, stride=2),  # (16, 23, 23)
            nn.ReLU(),  # activation
            nn.Conv2d(32, 64, kernel_size=3, stride=2),  # (32, 11, 11)
            nn.ReLU(),  # activation
            nn.Conv2d(64, 128, kernel_size=3, stride=1),  # (64, 5, 5)
            nn.ReLU(),  # activation
            nn.Conv2d(128, 256, kernel_size=3, stride=1),  # (128, 3, 3)
            nn.ReLU(),  # activation
        )  # output shape (256, 1, 1)
        
        self.v = nn.Sequential(nn.Linear(256+action_dim, 200), nn.ReLU(), nn.Linear(200, 1))
    
    def forward(self, x,a):
        
        # Forward pass
        x = self.cnn_base(x)
        x = x.view(-1, 256) 
        x = torch.cat([x,a], dim=1)
        x = self.v(x)
       
        
        return x    
        
    
    
      
def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer):
    s,a,r,s_prime,done_mask  = memory.sample(batch_size)
    
    target = r + gamma * q_target(s_prime, mu_target(s_prime)) * done_mask
    q_loss = F.smooth_l1_loss(q(s,a), target.detach())
    q_optimizer.zero_grad()
    q_loss.backward()
    q_optimizer.step()
    
    mu_loss = -q(s,mu(s)).mean() # That's all for the policy loss.
    mu_optimizer.zero_grad()
    mu_loss.backward()
    mu_optimizer.step()
    
def soft_update(net, net_target):
    for param_target, param in zip(net_target.parameters(), net.parameters()):
        param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau)
    
    
def rgb2gray(rgb, norm=True):
    # rgb image -> gray [0, 1]
    gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114])
    if norm:
        # normalize
        gray = gray / 128. - 1.
    return gray    
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")      
    
env = gym.make('CarRacing-v0')
#env = gym.wrappers.TimeLimit(env, max_episode_steps = 1000)
env.seed(0)

reward_history = deque(maxlen=100)
for _ in range(100):
    reward_history.append(0.0)
    
    
    
memory = ReplayBuffer()


q, q_target = QNet().double().to(device), QNet().double().to(device)
q_target.load_state_dict(q.state_dict())
mu, mu_target = MuNet().double().to(device), MuNet().double().to(device)
mu_target.load_state_dict(mu.state_dict())

mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)


print_interval = 20
score = 0.0
scores = []
for n_epi in range(3000):
    rgb_image = env.reset()
    gray_image = rgb2gray(rgb_image)
    stack = [gray_image] * num_frames
    state = stack
    s = np.array(stack)
    s = torch.from_numpy(s).double().to(device).unsqueeze(0)
    die = False
        
    while True:
        env.render()
        
        a = mu(s)[0]
        a = a.cpu().detach().numpy() + 0.2*np.random.normal(3)
        action = a

        next_rgb_image, r, die, _ = env.step(a)
        # don't penalize "die state"
        if die:
            r =r + 100 - 0.05
        # green penalty
        if np.mean(rgb_image[:, :, 1]) > 185.0:
            r =r - 0.05

        reward_history.append(r)
        avg_rewards = sum(reward_history) / len(reward_history)
        # if no reward recently, end the episode
        done = True if avg_rewards <= -0.1 else False
        
        score = score + r
        if die or done:
            scores.append(score)
            score = 0.0

            break        
        
        next_gray_image = rgb2gray(next_rgb_image)
        stack.pop(0)
        stack.append(next_gray_image)
        next_state = stack
        next_s = np.array(stack)
        next_s = torch.from_numpy(next_s).double().to(device).unsqueeze(0)
        memory.put((state,action,r,next_state,done))
        
        s = next_s
        state = next_state
        rgb_image = next_rgb_image
                
    if memory.size()>2000:
        for i in range(10):
            train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer)
            soft_update(mu, mu_target)
            soft_update(q,  q_target)
        
    #clear_output(wait=True)
    print("# of episode :{}, avg score : {:.1f}".format(n_epi, avg_rewards))

env.close()

print(sum(scores))



if not os.path.exists("./param"):
    os.makedirs("./param")
torch.save(mu.state_dict(), 'param/DDPG_net_params.pkl')







Track generation: 1143..1442 -> 299-tiles track
# of episode :0, avg score : -0.1
Track generation: 1087..1369 -> 282-tiles track
# of episode :1, avg score : -0.1
Track generation: 964..1212 -> 248-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1176..1474 -> 298-tiles track
# of episode :2, avg score : -0.1
Track generation: 1283..1608 -> 325-tiles track
# of episode :3, avg score : -0.1
Track generation: 1217..1526 -> 309-tiles track
# of episode :4, avg score : -0.1
Track generation: 1096..1374 -> 278-tiles track
# of episode :5, avg score : -0.1
Track generation: 1198..1501 -> 303-tiles track
# of episode :6, avg score : -0.1
Track generation: 1159..1453 -> 294-tiles track
# of episode :7, avg score : -0.1
Track generation: 957..1205 -> 248-tiles track
# of episode :8, avg score : -0.1
Track generation: 1181..1480 -> 299-tiles track
# of episode :9, avg score : -0.1
Track generation: 979..1234 -> 255-tiles track
# of ep

# of episode :83, avg score : 1.0
Track generation: 1224..1534 -> 310-tiles track
# of episode :84, avg score : -0.1
Track generation: 1140..1429 -> 289-tiles track
# of episode :85, avg score : -0.1
Track generation: 1114..1400 -> 286-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1052..1319 -> 267-tiles track
# of episode :86, avg score : -0.1
Track generation: 1045..1313 -> 268-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1131..1418 -> 287-tiles track
# of episode :87, avg score : -0.1
Track generation: 1122..1412 -> 290-tiles track
# of episode :88, avg score : 1.0
Track generation: 1239..1553 -> 314-tiles track
# of episode :89, avg score : -0.1
Track generation: 1022..1283 -> 261-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1123..1408 -> 285-tiles track
# of episode :90, avg score : -0.1
Tr

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

plt.plot(range(len(scores)), np.array(scores), 'b', linewidth = 2, label = 'DDGP')
plt.legend(prop={'size':12})
plt.xlabel('Episode')
plt.ylabel('Total rewards')
#plt.xlim(0, no_of_episodes)
#plt.ylim(0, 20000)
#plt.legend(['Double DQN', 'Dueling DQN', 'D3QN'], loc=4)
plt.grid(True)

In [3]:
# Test
from time import sleep

for n_epi in range(2):
    s = env.reset()
    done = False
        
    while not done:
        a = mu(torch.from_numpy(s).float()) 
        a = a.item() + ou_noise()[0]
        s_prime, r, done, info = env.step([a])
        env.render()
        sleep(0.01)
        score +=r
        s = s_prime
                
        
    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
        score = 0.0

env.close()