# [Deep Recurrent Q-Learning for Partially Observable MDPs](https://arxiv.org/pdf/1507.06527.pdf)

In [1]:
import torch
import matplotlib
import matplotlib.pyplot as plt

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
from dataclasses import dataclass

@dataclass
class Config:
    # Torch Parameters
    device: torch.device
    
    # Enviroment Parameters
    n_states: int
    n_actions: int

    # Net Parameters
    hidden_size = 64
    hidden_layers = 1

    # Memory Parameters
    rm_size: int
    
    # Learning Hyperparameters
    batch_size: int
    epochs: int
    lr: float
    gamma: float
    update_rate: int
    epsilon_decay_begin: int
    epsilon_decay_end: int
    epsilon: callable
    save_rate: int
    num_workers: int
    optimize_times: int
    optimize_rate: int


In [3]:
from collections import deque, namedtuple
import random


Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
Run = namedtuple("Run", ('states', 'actions', 'rewards'))


class ReplayMemory():

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Run(*args))

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
import torch.nn as nn


class DRQN(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=128, hidden_layers=1):
        super(DRQN, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, hidden_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
      
    def forward(self, x, i=None):
        if i is not None:
            x, i = self.lstm(x, i)
        else:
            x, i = self.lstm(x)
        return self.out(x), i


In [5]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import random


class DRQN_Agent:
    def __init__(self, net:nn.Module, config:Config, path=None):
        self.config = config

        self.target_net = net(self.config.n_states, self.config.n_actions).to(self.config.device)

        if path:
            self.q_net = torch.load(path).to(self.config.device)
        else:
            self.q_net = net(self.config.n_states, self.config.n_actions).to(self.config.device)

        self.target_net.load_state_dict(self.q_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.config.lr)
        self.memory = ReplayMemory(self.config.rm_size)

        self.criterion = nn.MSELoss()

    def update_target_net(self):
        self.target_net.load_state_dict(self.q_net.state_dict())

    def add_to_memory(self, *args):
        self.memory.push(*args)

    def select_action(self, state, i=None, epsilon=0):
        with torch.no_grad():
            x, i = self.q_net(state.view(-1, 8), i)

        if random.random() < epsilon:
            # Explore: take a random action
            return torch.tensor([random.randrange(self.config.n_actions)], device=self.config.device, dtype=torch.long), i
        else:
            # Exploit: select the highest Q value
            return x.max(1)[1][-1].view(1), i

    def Q(self, state, action):
        state = pad_sequence(state, batch_first=True)
        action = pad_sequence(action, batch_first=True)

        x, _= self.q_net(state)
        return x.gather(2, action.view(self.config.batch_size, -1, 1))

    def target(self, state, reward):
        state = pad_sequence(state, batch_first=True)
        reward = pad_sequence(reward, batch_first=True)

        x, _ = self.target_net(state)
        Q_target = x.max(2)[0].detach()

        
        return (Q_target * self.config.gamma) + reward

    def optimize(self):
        if len(self.memory) < self.config.batch_size:
            return

        batch = Run(*zip(*self.memory.sample(self.config.batch_size)))

        next_state_batch = (i[3:] for i in batch.states)
        state_batch = (i[:-3] for i in batch.states)
        action_batch = batch.actions
        reward_batch = batch.rewards

        self.optimizer.zero_grad()

        # state_batch.shape = (N x L x states)
        # action_batch.shape = (N x L x 1)
        y = self.Q(state_batch, action_batch).view(self.config.batch_size, -1)

        # next_state_batch.shape = (N x L x states)
        # reward_batch.shape = (N x L x 1)
        yl = self.target(next_state_batch, reward_batch).view(self.config.batch_size, -1)

        loss = self.criterion(y, yl)

        loss.backward()
        self.optimizer.step()

        return loss.cpu().data.item()

    def save(self, name):
        torch.save(self.q_net, name)


In [6]:
from itertools import count
import numpy as np
from game import Env
from map_gen import generate_map
import pickle

def get_eps(x):
    return 1-.002*(x)
    
config = Config(
    device = device,
    n_states = 8,
    n_actions = 6,
    rm_size = 100,
    batch_size = 64,
    epochs = 3000,
    update_rate = 100,
    lr = 0.002,
    gamma = 0.8,
    epsilon = get_eps,
    epsilon_decay_begin = 1000,
    epsilon_decay_end = 2000,
    save_rate = 100,
    optimize_rate = 2,
    num_workers = 1,
    optimize_times = 10
)



train_log = {
    'ep': [],
    'reward': [],
    'loss': [],
    'size': [],
    'n_obs': []
}

agent = DRQN_Agent(DRQN, config, path="net_last.pt")

with open('rm_data.pkl', 'rb') as inp:
    agent.memory = pickle.load(inp)

In [7]:
def play(model, epsilon, size, n_obstacles):
    print("runing play")
    game = Env()

    states = []
    rewards =  []
    actions = []

    state, done =  game.start(generate_map(size, n_obstacles))

    i = None
    for _ in count():

        states.append(state)

        # Select and perform an action
        action, i = model(state, i, epsilon)
        actions.append(action)
        actions.append(action)
        actions.append(action)

        next_state, reward, done = game.step(action)

        reward = torch.tensor([reward], device=config.device)
        rewards.append(reward)
        rewards.append(reward)
        rewards.append(reward)

        next_state = torch.tensor(next_state, device=config.device, dtype=torch.float)

        # Move to the next state
        state = next_state

        if done:
            states.append(state)
            del game
            return torch.cat(states), torch.cat(actions), torch.cat(rewards)


In [8]:
import torch.multiprocessing as mp


levels = [
    (3, 0),
    (3, 1),
    (5, 1),
    (5, 2),
    (7, 3),
    (7, 5),
    (10, 7)
]

current_lvl = 0
for epoch in count():
    epsilon = config.epsilon(epoch)
    size = levels[current_lvl][0]
    n_obstacles = random.randint(0, levels[current_lvl][1])

    out = play(agent.select_action, epsilon, size, n_obstacles)


    log = {
        'loss': [],
        'reward': 0
    }

    log['reward'] += torch.sum(out[2]).cpu().data.item()
    agent.add_to_memory(*out)

    for _ in range(config.optimize_times):
        log['loss'].append(agent.optimize())

    log["loss"] = np.asarray(log["loss"])
    log["loss"] = log["loss"][log["loss"] != np.array(None)]
    train_log['ep'].append(epoch)
    train_log['loss'].append(log['loss'])
    train_log['reward'].append(log['reward'])
    train_log['size'].append(levels[current_lvl][0])
    train_log['n_obs'].append(levels[current_lvl][1])
    print("------------------------------------------------")
    print(f"Epoch {epoch} - Epsilon {epsilon} - Level {current_lvl}")
    print(f"Reward: {log['reward']}")
    print(f"Loss: {log['loss'].mean()}")# +/- {log['loss'].std()}")

    # Update the target network, copying all weights and biases in DQN
    if not epoch % config.update_rate:
        agent.update_target_net()

    if not epoch % config.save_rate:
        agent.save("net.pt")

    if log['reward'] >= 100:
        current_lvl += 1


runing play
[]


ValueError: Type must be a sub-type of ndarray type

In [None]:
# SAVE RM
import pickle

with open('rm_data.pkl', 'wb') as outp:
    pickle.dump(agent.memory, outp, pickle.HIGHEST_PROTOCOL)