In [1]:
import torch
import matplotlib
import matplotlib.pyplot as plt

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
from dataclasses import dataclass

@dataclass
class Config:
    # Torch Parameters
    device: torch.device
    
    # Enviroment Parameters
    n_states: int
    n_actions: int

    # Net Parameters
    hidden_size = 64
    hidden_layers = 1

    # Memory Parameters
    rm_size: int
    
    # Learning Hyperparameters
    batch_size: int
    epochs: int
    lr: float
    gamma: float
    update_rate: int
    epsilon_decay_begin: int
    epsilon_decay_end: int
    epsilon: callable
    save_rate: int
    num_workers: int
    optimize_times: int
    optimize_rate: int


In [3]:
from collections import deque, namedtuple
import random


Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
Run = namedtuple("Run", ('states', 'actions', 'rewards'))


class ReplayMemory():

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Run(*args))

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
import torch.nn as nn


class DRQN(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=128, hidden_layers=1):
        super(DRQN, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, hidden_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
      
    def forward(self, x, i=None):
        if i is not None:
            x, i = self.lstm(x, i)
        else:
            x, i = self.lstm(x)
        return self.out(x), i


In [5]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import random


class DRQN_Agent:
    def __init__(self, net, config:Config, path=None):
        self.config = config

        self.target_net = net(self.config.n_states, self.config.n_actions).to(self.config.device)

        if path:
            self.q_net = torch.load(path).to(self.config.device)
        else:
            self.q_net = net(self.config.n_states, self.config.n_actions).to(self.config.device)

        self.target_net.load_state_dict(self.q_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.config.lr)
        self.memory = ReplayMemory(self.config.rm_size)

        self.criterion = nn.MSELoss()

    def update_target_net(self):
        self.target_net.load_state_dict(self.q_net.state_dict())

    def add_to_memory(self, *args):
        self.memory.push(*args)

    def select_action(self, state, i=None, epsilon=0):
        with torch.no_grad():
            x, i = self.q_net(state.view(-1, 8), i)

        if random.random() < epsilon:
            # Explore: take a random action
            return torch.tensor([random.randrange(self.config.n_actions)], device=self.config.device, dtype=torch.long), i
        else:
            # Exploit: select the highest Q value
            return x.max(1)[1][-1].view(1), i

    def Q(self, state, action):
        state = pad_sequence(state, batch_first=True)
        action = pad_sequence(action, batch_first=True)

        x, _= self.q_net(state)
        return x.gather(2, action.view(self.config.batch_size, -1, 1))

    def target(self, state, reward):
        state = pad_sequence(state, batch_first=True)
        reward = pad_sequence(reward, batch_first=True)

        x, _ = self.target_net(state)
        Q_target = x.max(2)[0].detach()

        
        return (Q_target * self.config.gamma) + reward

    def optimize(self):
        if len(self.memory) < self.config.batch_size:
            return

        batch = Run(*zip(*self.memory.sample(self.config.batch_size)))

        next_state_batch = (i[3:] for i in batch.states)
        state_batch = (i[:-3] for i in batch.states)
        action_batch = batch.actions
        reward_batch = batch.rewards

        self.optimizer.zero_grad()

        # state_batch.shape = (N x L x states)
        # action_batch.shape = (N x L x 1)
        y = self.Q(state_batch, action_batch).view(self.config.batch_size, -1)

        # next_state_batch.shape = (N x L x states)
        # reward_batch.shape = (N x L x 1)
        yl = self.target(next_state_batch, reward_batch).view(self.config.batch_size, -1)

        loss = self.criterion(y, yl)

        loss.backward()
        self.optimizer.step()

        return loss.cpu().data.item()

    def save(self, name):
        torch.save(self.q_net, name)


In [6]:
def run(action_selector, epsilon, config):
    print("starting")
    game = Game()
    game.start()
    state, _, done = game.observe()
    total_reward = 0
    
    state = torch.tensor(state, device=config.device, dtype=torch.float)
    i = None
    
    for t in count():
        print(t)
        states.append(state)
        # Select and perform an action
        
        action, i = action_selector(state, i, epsilon)
        print(action)
        actions.append(action)
        actions.append(action)
        actions.append(action)
        
        game.act(action)
        next_state, reward, done = game.observe()
        total_reward += reward
        
        reward = torch.tensor([reward], device=config.device)
        rewards.append(reward)
        rewards.append(reward)
        rewards.append(reward)
        
        next_state = torch.tensor(next_state, device=config.device, dtype=torch.float)

        # Move to the next state
        state = next_state

        if done:
            del(game)
            states.append(state)
            print("end game")
            return [torch.cat(states), torch.cat(actions), torch.cat(rewards), total_reward]
    

In [7]:
from itertools import count
import numpy as np
from game import Game

config = Config(
    device = device,
    n_states = 8,
    n_actions = 6,
    rm_size = 100,
    batch_size = 64,
    epochs = 3000,
    update_rate = 75,
    lr = 0.001,
    gamma = 0.8,
    epsilon = lambda x: 1-.001*(x),
    epsilon_decay_begin = 1000,
    epsilon_decay_end = 2000,
    save_rate = 100,
    optimize_rate = 2,
    num_workers = 5,
    optimize_times = 5
)

train_log = {
    'ep': [],
    'reward': [],
    'loss': []
}

agent = DRQN_Agent(DRQN, config, path="net.pt")

for epoch in range(config.epochs):
    game = Game()

    epsilon = config.epsilon(epoch)

    log = {
        'loss': [],
        'reward': 0
    }
    states = []
    rewards =  []
    actions = []

    game.start()
    state, _, done = game.observe()
    state = torch.tensor(state, device=config.device, dtype=torch.float)
    i = None
    for t in count():
        states.append(state)
        # Select and perform an action
        
        action, i = agent.select_action(state, i, epsilon)
        actions.append(action)
        actions.append(action)
        actions.append(action)
        
        game.act(action)
        next_state, reward, done = game.observe()
        log['reward'] += reward
        
        reward = torch.tensor([reward], device=config.device)
        rewards.append(reward)
        rewards.append(reward)
        rewards.append(reward)
        
        next_state = torch.tensor(next_state, device=config.device, dtype=torch.float)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        if not t % config.optimize_rate:
            log['loss'].append(agent.optimize())

        if done:
            states.append(state)
            agent.add_to_memory(torch.cat(states), torch.cat(actions), torch.cat(rewards))
            break
    del(game)

    log["loss"] = np.asarray(log["loss"])
    log["loss"] = log["loss"][log["loss"] != np.array(None)]
    train_log['ep'].append(epoch)
    train_log['loss'].append(log['loss'])
    train_log['reward'].append(log['reward'])
    print("------------------------------------------------")
    print(f"Epoch {epoch} - Epsilon {epsilon}")
    print(f"Reward: {log['reward']}")
    print(f"Loss: {log['loss'].mean()}")# +/- {log['loss'].std()}")

    # Update the target network, copying all weights and biases in DQN
    if not epoch % config.update_rate:
        agent.update_target_net()

    if not epoch % config.save_rate:
        agent.save("net.pt")


Morreu
------------------------------------------------
Epoch 0 - Epsilon 1.0
Reward: -50
Loss: nan


  print(f"Loss: {log['loss'].mean()}")# +/- {log['loss'].std()}")
  ret = ret / rcount


Morreu
------------------------------------------------
Epoch 1 - Epsilon 0.999
Reward: -50
Loss: nan
Morreu
------------------------------------------------
Epoch 2 - Epsilon 0.998
Reward: -50
Loss: nan
------------------------------------------------
Epoch 3 - Epsilon 0.997
Reward: 0
Loss: nan
Morreu
------------------------------------------------
Epoch 4 - Epsilon 0.996
Reward: -50
Loss: nan
------------------------------------------------
Epoch 5 - Epsilon 0.995
Reward: 0
Loss: nan
Morreu
------------------------------------------------
Epoch 6 - Epsilon 0.994
Reward: -50
Loss: nan
Morreu
------------------------------------------------
Epoch 7 - Epsilon 0.993
Reward: -50
Loss: nan
Morreu
------------------------------------------------
Epoch 8 - Epsilon 0.992
Reward: -50
Loss: nan
Morreu
------------------------------------------------
Epoch 9 - Epsilon 0.991
Reward: -50
Loss: nan
------------------------------------------------
Epoch 10 - Epsilon 0.99
Reward: 0
Loss: nan
-------


KeyboardInterrupt



In [None]:
from itertools import count
import numpy as np
from game import Game
from multiprocessing import Pool

def eps(x):
    return 1-.001*(x)

config = Config(
    device = device,
    n_states = 8,
    n_actions = 6,
    rm_size = 100,
    batch_size = 64,
    epochs = 3000,
    update_rate = 50,
    lr = 0.001,
    gamma = 0.8,
    epsilon = eps,
    epsilon_decay_begin = 1000,
    epsilon_decay_end = 2000,
    save_rate = 100,
    num_workers = 5,
    optimize_times = 5
)

train_log = {
    'ep': [],
    'reward': [],
    'loss': []
}

agent = DRQN_Agent(DRQN, config, path="net.pt")

for epoch in range(config.epochs):
    print("on for")
    epsilon = config.epsilon(epoch)
    print(epsilon)

    log = {
        'loss': [],
        'reward': 0
    }
    
    with Pool(config.num_workers) as pool:
        params = [[agent.select_action, epsilon, config] for i in range(config.num_workers)]
        print("p")
        results = [pool.apply_async(run, p) for p in params]
        print("r")
        
        for r in results:
            print('\t', r.get())

    print(results)
    results = [r.get() for r in results]
    print(results)

    for i in results:
        log['reward'] += i.pop(-1)
        log['reward'] /= config.num_workers
        print(['reward'])
        agent.add_to_memory(*i)

    for i in range(config.optimize_times):
        log['loss'].append(agent.optimize())

    log["loss"] = np.asarray(log["loss"])
    log["loss"] = log["loss"][log["loss"] != np.array(None)]
    train_log['ep'].append(epoch)
    train_log['loss'].append(log['loss'])
    train_log['reward'].append(log['reward'])
    print("------------------------------------------------")
    print(f"Epoch {epoch} - Epsilon {epsilon}")
    print(f"Reward: {log['reward']}")
    print(f"Loss: {log['loss'].mean()}")# +/- {log['loss'].std()}")

    # Update the target network, copying all weights and biases in DQN
    if not epoch % config.update_rate:
        agent.update_target_net()

    if not epoch % config.save_rate:
        agent.save("net.pt")