In [None]:
"""Implementing Rainbow-IQN
   a Reinforcement Learning algorithm
"""

In [None]:
# Resume switch read by the cells below: when True, the replay buffer and the
# step counter are read from the "store/" files and load_model() is called.
load = False # set load = True to resume training

In [None]:
import torch
torch.manual_seed(1337)
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
import numpy as np
np.random.seed(1337)
import gc
import io
import cv2
import imageio
from PIL import Image
from PIL import ImageFile
import matplotlib.pyplot as plt
%matplotlib inline
ImageFile.LOAD_TRUNCATED_IMAGES = True
from IPython.display import clear_output
import socket
import time
from math import sqrt

In [None]:
class PrioritizedReplayBuffer(object):
    """Fixed-capacity ring buffer with priority-proportional sampling.

    Transitions are kept as rows ``(state, action, next_state, reward)`` of a
    pre-allocated ``(capacity, 4)`` object array, so ``buffer[i]`` and
    ``priorities[i]`` always describe the same transition. Rows that have not
    been written yet hold ``None``.

    Attributes:
        capacity:   maximum number of stored transitions.
        alpha:      exponent applied to priorities before normalising them.
        buffer:     ``(capacity, 4)`` object array of transitions.
        priorities: ``(capacity,)`` float32 array of priorities.
        position:   index of the row the next ``push`` writes to.
        count:      number of rows currently holding a transition.
    """

    def __init__(self, capacity, load, alpha=0.6):
        """Create an empty buffer, or restore one from ``store/*.npy`` when ``load`` is true."""
        self.capacity   = capacity
        self.alpha      = alpha
        self.buffer     = np.empty((capacity, 4), dtype=object)
        self.priorities = np.zeros((capacity,), dtype=np.float32)
        self.position   = 0
        self.count      = 0
        if load:
            self._restore()

    def _restore(self):
        """Fill the buffer from the ``store/*.npy`` files."""
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # load files that this notebook wrote itself.
        stored     = np.load("store/buffer.npy", allow_pickle=True)
        priorities = np.load("store/priorities.npy", allow_pickle=True)
        position   = int(np.load("store/position.npy", allow_pickle=True))
        # Keep only rows that really hold a transition (state is an array);
        # this skips unwritten ``None`` rows and placeholder rows of scalars.
        rows = [row for row in stored if isinstance(row[0], np.ndarray)][:self.capacity]
        for idx, row in enumerate(rows):
            for col in range(4):
                self.buffer[idx, col] = row[col]
        self.count = len(rows)
        kept = min(len(priorities), self.capacity)
        self.priorities[:kept] = np.nan_to_num(np.asarray(priorities[:kept], dtype=np.float32))
        # While the ring is not full the next free row is simply ``count``.
        self.position = position % self.capacity if self.count == self.capacity else self.count

    def push(self, current_state, action, new_state, reward):
        """Store one transition, overwriting the oldest one once the buffer is full.

        New transitions get the largest priority currently stored (1.0 when
        there is none yet) so that they are sampled at least once.
        """
        max_priority = 0.0
        if self.count:
            max_priority = float(np.nan_to_num(self.priorities[:self.count]).max())
        if max_priority <= 0.0:
            max_priority = 1.0
        slot = self.position
        # Element-wise assignment stores each object as-is; assigning the whole
        # tuple at once would make NumPy try to broadcast the state arrays.
        self.buffer[slot, 0] = current_state
        self.buffer[slot, 1] = action
        self.buffer[slot, 2] = new_state
        self.buffer[slot, 3] = reward
        self.priorities[slot] = max_priority
        self.position = (slot + 1) % self.capacity
        self.count    = min(self.count + 1, self.capacity)
        return

    def sample(self, size, beta=0.4):
        """Draw ``size`` transitions (with replacement) proportionally to ``priority ** alpha``.

        Args:
            size: number of transitions to draw.
            beta: exponent of the importance-sampling correction.

        Returns:
            ``(states, actions, next_states, rewards, indices, weights)`` where
            the first four are lists, ``indices`` are the sampled row numbers
            and ``weights`` are float32 importance weights scaled so that the
            largest one is 1.

        Raises:
            ValueError: if the buffer holds no transition.
        """
        if self.count == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        # float64 keeps the normalised probabilities accurate for large buffers.
        scaled = np.nan_to_num(self.priorities[:self.count].astype(np.float64) ** self.alpha)
        total  = scaled.sum()
        if total > 0 and np.isfinite(total):
            probs = scaled / total
        else:
            # No usable priority information: fall back to uniform sampling.
            probs = np.full(self.count, 1.0 / self.count)
        indices     = np.random.choice(self.count, size, p=probs)
        rows        = self.buffer[indices]
        states      = list(rows[:, 0])
        actions     = list(rows[:, 1])
        next_states = list(rows[:, 2])
        rewards     = list(rows[:, 3])
        # Sampled rows always have probs > 0, so the power below is finite.
        weights     = (self.count * probs[indices]) ** (-beta)
        weights     = (weights / weights.max()).astype(np.float32)
        return states, actions, next_states, rewards, indices, weights

    def update_priorities(self, batch_idx, batch_prios):
        """Overwrite the priorities of the rows in ``batch_idx`` with ``batch_prios``."""
        self.priorities[np.asarray(batch_idx, dtype=np.intp)] = np.asarray(batch_prios, dtype=np.float32)

    def __len__(self):
        """Number of transitions currently stored."""
        return self.count

In [None]:
class NoisyLinear(nn.Module):
    """Linear layer whose weights and biases are perturbed by resampleable noise.

    The effective parameters are ``mu + sigma * epsilon``: ``mu`` and ``sigma``
    are learnable, ``epsilon`` is a buffer refreshed by ``reset_noise``. The
    weight noise is the outer product of two independently drawn noise vectors.
    """

    def __init__(self, in_features, out_features, std_init=0.4):
        """Allocate parameters and noise buffers, then initialise both."""
        super().__init__()
        self.in_features  = in_features
        self.out_features = out_features
        self.std_init     = std_init

        weight_shape = (out_features, in_features)
        self.weight_mu    = nn.Parameter(torch.FloatTensor(*weight_shape))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(*weight_shape))
        self.register_buffer('weight_epsilon', torch.FloatTensor(*weight_shape))

        self.bias_mu      = nn.Parameter(torch.FloatTensor(out_features))
        self.bias_sigma   = nn.Parameter(torch.FloatTensor(out_features))
        self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))

        # The tensors above are uninitialised until these two calls run.
        self.reset_parameters()
        self.reset_noise()

    def forward(self, x):
        """Apply the layer using the currently stored noise."""
        noisy_weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
        noisy_bias   = self.bias_mu + self.bias_sigma * self.bias_epsilon
        return F.linear(x, noisy_weight, noisy_bias)

    def reset_parameters(self):
        """Initialise ``mu`` uniformly in +-1/sqrt(fan_in) and ``sigma`` to a constant."""
        fan_in = self.weight_mu.size(1)
        bound  = 1 / sqrt(fan_in)
        self.weight_mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(self.std_init / sqrt(self.weight_sigma.size(1)))
        self.bias_mu.data.uniform_(-bound, bound)
        self.bias_sigma.data.fill_(self.std_init / sqrt(self.bias_sigma.size(0)))

    def reset_noise(self):
        """Draw fresh noise for the weights and (independently) for the bias."""
        noise_in  = self._scale_noise(self.in_features)
        noise_out = self._scale_noise(self.out_features)
        self.weight_epsilon.copy_(noise_out.ger(noise_in))
        bias_noise = self._scale_noise(self.out_features)
        self.bias_epsilon.copy_(bias_noise)

    def _scale_noise(self, size):
        """Return ``sign(z) * sqrt(|z|)`` for ``size`` standard-normal samples ``z``."""
        z = torch.randn(size)
        return z.sign() * z.abs().sqrt()

In [None]:
class Agent(nn.Module):
    """Dueling convolutional network that outputs ``num_atoms`` values per action.

    A three-layer convolutional trunk feeds two noisy heads: a value head with
    ``num_atoms`` outputs and an advantage head with
    ``num_actions * num_atoms`` outputs. ``forward`` combines them as
    ``value + advantage - mean(advantage over actions)``.

    Args:
        input_shape: shape of one state without the batch dimension; its first
            entry is used as the number of input channels.
        num_atoms:   number of values predicted per action.
        num_actions: number of discrete actions.
    """

    def __init__(self, input_shape, num_atoms, num_actions = 4):
        super(Agent, self).__init__()

        self.input_shape = input_shape
        self.num_actions = num_actions
        self.num_atoms   = num_atoms
        self.features    = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size = 8, stride = 4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size = 4, stride = 2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size = 3, stride = 1),
            nn.ReLU())
        # Probe the trunk once and reuse the result for both heads.
        feature_dim           = self.features_size()
        self.noisy_value1     = NoisyLinear(feature_dim, 512)
        self.noisy_value2     = NoisyLinear(512, self.num_atoms)
        self.noisy_advantage1 = NoisyLinear(feature_dim, 512)
        self.noisy_advantage2 = NoisyLinear(512, self.num_atoms * self.num_actions)

    def features_size(self):
        """Return the flattened size of the trunk output for one ``input_shape`` input."""
        # Shape probing needs no gradients.
        with torch.no_grad():
            return self.features(torch.zeros(1, *self.input_shape)).view(1, -1).size(1)

    def forward(self, x):
        """Map a batch of states to a ``(batch, num_actions, num_atoms)`` tensor."""
        batch_size = x.size(0)
        x          = self.features(x)
        x          = x.view(batch_size, -1)
        value      = F.relu(self.noisy_value1(x))
        value      = self.noisy_value2(value)
        advantage  = F.relu(self.noisy_advantage1(x))
        advantage  = self.noisy_advantage2(advantage)
        value      = value.view(batch_size, 1, self.num_atoms)
        advantage  = advantage.view(batch_size, self.num_actions, self.num_atoms)
        # Broadcasting already yields (batch, num_actions, num_atoms).
        return value + advantage - advantage.mean(1, keepdim=True)

    def reset_noise(self):
        """Resample the noise of all four noisy layers."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily for a single state.

        Args:
            state:   one un-batched state; a batch dimension is added here.
            epsilon: probability of picking a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if np.random.rand() > epsilon:
            # The whole forward pass runs without autograd: action selection
            # never needs gradients, and recording them only costs memory.
            with torch.no_grad():
                state   = torch.FloatTensor(state).unsqueeze(0)
                qvalues = self(state).mean(2)
            action = int(qvalues.max(1)[1].item())
        else:
            action = int(np.random.randint(self.num_actions))
        return action

In [None]:
def update_target(current_model, target_model):
    """Copy every parameter and buffer of ``current_model`` into ``target_model``."""
    online_state = current_model.state_dict()
    target_model.load_state_dict(online_state)

In [None]:
def compute_td_loss(batch_size, beta):
    """Run one optimisation step on a prioritized mini-batch and return the loss.

    Samples ``batch_size`` transitions from the global ``replay_buffer``,
    builds the pairwise quantile Huber loss between the online network's
    values for the taken actions and the target network's values for its own
    greedy next action, applies the importance-sampling weights per
    transition, updates the sampled priorities and steps ``optimizer``.

    Args:
        batch_size: number of transitions to sample.
        beta:       importance-sampling exponent forwarded to ``replay_buffer.sample``.

    Returns:
        The scalar loss tensor that was back-propagated.
    """
    state, action, next_state, reward, indices, weights = replay_buffer.sample(batch_size, beta)
    state      = torch.FloatTensor(np.float32(state))
    action     = torch.LongTensor(action)
    next_state = torch.FloatTensor(np.float32(next_state))
    reward     = torch.FloatTensor(reward)
    weights    = torch.FloatTensor(weights)
    rows       = np.arange(batch_size)

    theta      = current_model(state)[rows, action]              # (batch, atoms)
    Znext      = target_model(next_state).detach()
    Znext_max  = Znext[rows, Znext.mean(2).max(1)[1]]            # (batch, atoms)
    Ttheta     = reward.unsqueeze(1) + gamma * Znext_max
    # diff[j, b, i] = target_j - prediction_i for transition b.
    diff       = Ttheta.t().unsqueeze(-1) - theta
    huber_diff = torch.where(diff.abs() < 1, 0.5 * diff.pow(2), diff.abs() - 0.5)
    pairwise   = huber_diff * (tau - (diff.detach() < 0).float()).abs()
    # Reduce over both atom axes only, keeping one loss value per transition.
    # Reducing to a scalar first would give every transition the same priority.
    per_sample = pairwise.mean(2).mean(0)                        # (batch,)

    # Non-finite weights would poison the gradients; treat them as 1.
    safe_weights = torch.where(torch.isfinite(weights), weights, torch.ones_like(weights))
    loss         = (per_sample * safe_weights).mean()
    # Priorities keep the original ``loss * weights + eps`` form, now evaluated
    # per transition. Non-finite weights pass through unchanged here; the
    # buffer's sample() maps non-finite priorities to zero.
    prios        = per_sample.detach() * weights + 1e-3

    optimizer.zero_grad()
    loss.backward()
    replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
    nn.utils.clip_grad_norm_(current_model.parameters(), 0.5)
    optimizer.step()
    current_model.reset_noise()
    target_model.reset_noise()
    return loss

In [None]:
def plot(frame_idx, rewards, losses):
    """Redraw the progress figure: rewards on the left, losses next to it."""
    panels = (
        (131, f'Frame: {frame_idx}', rewards),
        (132, 'Loss', losses),
    )
    clear_output(True)  # replace the previous figure instead of stacking outputs
    plt.figure(figsize=(20,5))
    for subplot_code, heading, series in panels:
        plt.subplot(subplot_code)
        plt.title(heading)
        plt.plot(series)
    plt.show()

In [None]:
def preprocess_frame(binary_data):
    """Decode an encoded image into a grayscale frame scaled to [0, 1].

    Args:
        binary_data: bytes of an image file readable by PIL.

    Returns:
        A 2-D float array of shape ``(input_shape[1], input_shape[2])``.
    """
    pixels = np.array(Image.open(io.BytesIO(binary_data)))
    # PIL yields colour channels in RGB order, so the RGB conversion code is
    # the one that applies the intended per-channel weights.
    gray   = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    # cv2.resize takes dsize as (width, height), while input_shape is
    # (channels, height, width): pass the last two entries swapped.
    gray   = cv2.resize(gray, (input_shape[2], input_shape[1]))
    return gray / 255.0

In [None]:
def save_model():
    """Serialise both global networks (whole module objects) with ``torch.save``."""
    destinations = (
        (current_model, "E:/Jupyter files/REINFORCEMENT LEARNIING/CarParkingAI/store/current_model"),
        (target_model, "E:/Jupyter files/REINFORCEMENT LEARNIING/CarParkingAI/store/target_model"),
    )
    for network, path in destinations:
        torch.save(network, path)

In [None]:
def load_model():
    """Restore the weights of the global networks from the files written by ``save_model``.

    The weights are copied into the existing ``current_model`` and
    ``target_model`` objects. Rebinding the names inside this function would
    only create locals and leave the globals untouched. Replacing the objects
    would also detach them from the optimizer, which was built from
    ``current_model.parameters()``.
    """
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # written by this notebook.
    # NOTE(review): recent PyTorch releases may refuse fully pickled modules
    # unless ``weights_only=False`` is passed - confirm against the installed version.
    saved_current = torch.load("E:/Jupyter files/REINFORCEMENT LEARNIING/CarParkingAI/store/current_model")
    saved_target  = torch.load("E:/Jupyter files/REINFORCEMENT LEARNIING/CarParkingAI/store/target_model")
    for live_model, saved in ((current_model, saved_current), (target_model, saved_target)):
        # save_model stores whole modules; plain state dicts are accepted too.
        state = saved.state_dict() if isinstance(saved, nn.Module) else saved
        live_model.load_state_dict(state)

In [None]:
# Importance-sampling exponent: grows linearly from beta_start and is capped at 1.0.
beta_start     = 0.4
beta_frames    = 10_000

def update_beta(frame_idx):
    """Return the importance-sampling exponent for ``frame_idx``."""
    annealed = beta_start + frame_idx * (1.0 - beta_start) / beta_frames
    return min(1.0, annealed)


# Exploration rate: decays exponentially from 1.0 towards min_epsilon.
epsilon_decay  = 0.002
min_epsilon    = 0.1

def update_epsilon(frame_idx):
    """Return the exploration rate for ``frame_idx``."""
    decay_factor = np.exp(-epsilon_decay * frame_idx)
    return min_epsilon + (1.0 - min_epsilon) * decay_factor

In [None]:
# --- network ---------------------------------------------------------------
input_shape         = (4, 84, 84)   # shape of one state; first entry = stacked frames
num_actions         = 4
num_atoms           = 51            # values predicted per action

# --- data collection / replay ----------------------------------------------
BATCH_SIZE          = 256           # transitions received per round in main()
MINI_BATCH_SIZE     = 32            # transitions sampled per compute_td_loss call
MIN_REPLAY_SIZE     = 20_000        # buffer size that must be exceeded before training

# --- optimisation schedule (all measured in main()'s round counter) ----------
gamma               = 0.99
computes_loss_after = 4
copy_weights_after  = 100
plot_after          = 10

# Fixed fractions (2i + 1) / (2 * num_atoms), i = 0..num_atoms-1, as a (1, num_atoms) row.
tau                 = torch.Tensor((np.arange(num_atoms) * 2 + 1) / (num_atoms * 2.0)).view(1, -1)

In [None]:
# Online network plus a target copy that starts with identical weights.
current_model = Agent(input_shape, num_atoms, num_actions=num_actions)
target_model  = Agent(input_shape, num_atoms, num_actions=num_actions)
update_target(current_model, target_model)

# Only the online network is optimised.
optimizer     = optim.Adam(current_model.parameters(), lr=1e-4)

replay_buffer = PrioritizedReplayBuffer(capacity=200_000, load=load)

In [None]:
# When resuming, call load_model() to read the saved networks from disk.
if load:
    load_model()

In [None]:
def _recv_exact(conn, num_bytes):
    """Read exactly ``num_bytes`` bytes from ``conn``.

    A single ``recv(n)`` may return fewer than ``n`` bytes, so keep reading
    until the whole payload has arrived.

    Raises:
        ConnectionError: if the peer closes the connection before the payload is complete.
    """
    chunks    = []
    remaining = num_bytes
    while remaining > 0:
        chunk = conn.recv(remaining)
        if not chunk:
            raise ConnectionError(f"connection closed with {remaining} bytes still expected")
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def _recv_state(conn):
    """Receive ``input_shape[0]`` length-prefixed frames and stack them into one state."""
    # float32 halves the memory per stored state; every consumer converts
    # states to float32 anyway.
    state = np.zeros(input_shape, dtype=np.float32)
    for frame_num in range(input_shape[0]):
        # NOTE(review): the length header is still read with one recv(15), as
        # before; presumably the client pads it to a fixed width - confirm.
        frame_len = int(conn.recv(15).decode('utf-8'))
        state[frame_num] = preprocess_frame(_recv_exact(conn, frame_len))
    return state


def _save_progress(counter):
    """Write the round counter and the replay buffer to ``store/`` and call ``save_model``."""
    np.save("store/counter.npy", counter)
    np.save("store/position.npy", replay_buffer.position)
    np.save("store/buffer.npy", replay_buffer.buffer)
    np.save("store/priorities.npy", replay_buffer.priorities)
    save_model()


def main(load):
    """Serve one environment client over TCP and train on the transitions it sends.

    Each round collects ``BATCH_SIZE`` transitions (send "1", then per
    transition: receive a state, send the action, receive the next state and
    the reward; finally send "0"). Once the replay buffer exceeds
    ``MIN_REPLAY_SIZE`` the networks are trained, synced and plotted on their
    respective round intervals.

    The loop ends on any error or on a keyboard interrupt. Progress is then
    saved and both sockets are closed.

    Args:
        load: when true, the round counter is restored from ``store/counter.npy``.
    """
    counter     = int(np.load("store/counter.npy", allow_pickle=True)) if load else 0
    all_rewards = []
    losses      = []

    print("[SERVER] Starting ...")
    server = socket.socket()
    try:
        server.bind(('127.0.0.1', 8010))
        server.listen()
        conn, _ = server.accept()
        try:
            while True:
                counter       = counter + 1
                total_rewards = 0
                # The counter is constant within a round, so one evaluation per
                # round suffices; it also gives the right value when resuming.
                epsilon       = update_epsilon(counter)

                conn.sendall("1".encode('ascii')) # send 1 to start receiving data
                for _ in range(BATCH_SIZE):
                    old_frame_state = _recv_state(conn)
                    action          = current_model.act(old_frame_state, epsilon)
                    conn.sendall(str(action).encode('ascii'))  # send action
                    new_frame_state = _recv_state(conn)
                    reward = float(conn.recv(10).decode('utf-8')) / 20  # receive reward
                    replay_buffer.push(old_frame_state, action, new_frame_state, reward)
                    total_rewards += reward
                conn.sendall("0".encode('ascii')) # send 0 to stop receiving data

                all_rewards.append(total_rewards)

                if len(replay_buffer) > MIN_REPLAY_SIZE:
                    if counter % computes_loss_after == 0:
                        beta = update_beta(counter)
                        loss = compute_td_loss(MINI_BATCH_SIZE, beta)
                        losses.append(loss.item())
                    if counter % copy_weights_after == 0:
                        update_target(current_model, target_model)
                    if counter % plot_after == 0:
                        plot(counter, all_rewards, losses)
                else:
                    print(counter)

        except KeyboardInterrupt:
            # Interrupting the kernel is the normal way to stop training.
            print("[SERVER] Interrupted ...")
        except Exception as e:
            print(e)
        finally:
            try:
                _save_progress(counter)
            finally:
                gc.collect()
                conn.close()
    finally:
        # Release the port so the cell can be run again.
        server.close()

    print("[SERVER] Stoping ...")

In [None]:
# Start the training server when this code runs as the main program.
if __name__ == "__main__":
    main(load)