In [None]:
import torch
import matplotlib
import matplotlib.pyplot as plt

# Notebook front-ends report a matplotlib backend whose name contains "inline";
# IPython's display helper is imported only in that case.
if 'inline' in matplotlib.get_backend():
    is_ipython = True
    from IPython import display
else:
    is_ipython = False

# Switch matplotlib to interactive mode.
plt.ion()

# Compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
from dataclasses import dataclass

@dataclass
class Config:
    """Settings bundle handed to the agent and read by the training loop.

    Every field is required: none declares a default, so all of them must be
    supplied when a Config is constructed.
    """
    # Torch Parameters
    device: torch.device  # device the agent moves its networks to
    
    # Environment Parameters
    n_states: int   # passed to the network constructor as its input size
    n_actions: int  # passed to the network constructor as its output size; also bounds random actions

    # Net Parameters
    # NOTE(review): these four are not read by the visible agent code, which
    # sizes its networks from n_states / n_actions — confirm whether they are
    # meant to replace those.
    input_size: int
    output_size: int
    hidden_size: int
    hidden_layers: int

    # Memory Parameters
    rm_size: int  # capacity handed to ReplayMemory
    
    # Learning Hyperparameters
    batch_size: int
    epochs: int       # number of iterations of the outer training loop
    lr: float         # learning rate given to the Adam optimizer
    gamma: float
    update_rate: int  # the target network is refreshed every `update_rate` epochs
    


In [None]:
from collections import deque, namedtuple
import random


# Record of a single environment step.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])
# Record type stored by ReplayMemory.push; presumably one whole run/episode
# per record (plural field names) — TODO confirm.
Run = namedtuple('Run', ['states', 'actions', 'rewards'])


class ReplayMemory():
    """Fixed-capacity FIFO buffer of `Run` records.

    The buffer is a deque with `maxlen=capacity`, so once it is full each new
    record silently evicts the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most `capacity` records."""
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Store one record; `args` are the `Run` fields in declaration order."""
        self.memory.append(Run(*args))

    def sample(self, batch_size):
        """Return `batch_size` stored records drawn uniformly at random.

        Sampling is WITH replacement, so the same record may appear more than
        once and `batch_size` may exceed `len(self)`.

        Raises:
            IndexError: if the buffer is empty and `batch_size` > 0.
        """
        # `k` has to be passed by keyword: the second positional parameter of
        # random.choices is `weights`, not the sample size. The stored records
        # themselves are returned (a Run has no `sample` attribute).
        return random.choices(self.memory, k=batch_size)

    def __len__(self):
        """Number of records currently stored."""
        return len(self.memory)

In [None]:
import torch.nn as nn


class DRQN(nn.Module):
    """Recurrent Q-network: an LSTM followed by a linear read-out.

    Args:
        input_size: number of features per observation.
        output_size: number of Q-values produced per time step.
        hidden_size: width of the LSTM hidden state.
        hidden_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, output_size, hidden_size=128, hidden_layers=1):
        super(DRQN, self).__init__()
        # batch_first=True: batched inputs are laid out (batch, time, features).
        self.lstm = nn.LSTM(input_size, hidden_size, hidden_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None, c=None):
        """Run the network over a sequence.

        Args:
            x: input of shape (batch, time, input_size), or (time, input_size)
                for a single unbatched sequence.
            h: initial LSTM hidden state, or None for an all-zero state.
            c: initial LSTM cell state, or None for an all-zero state.

        Returns:
            (q_values, h, c): Q-values for every time step plus the final
            hidden and cell states, which can be fed back in to continue the
            same sequence.

        Raises:
            ValueError: if only one of `h` / `c` is given.
        """
        if (h is None) != (c is None):
            raise ValueError("h and c must be given together or both omitted")

        # nn.LSTM takes the recurrent state as ONE (h, c) tuple and returns
        # (output, (h_n, c_n)); passing None makes it start from zeros.
        state = None if h is None else (h, c)
        x, (h, c) = self.lstm(x, state)
        return self.out(x), h, c
    

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import random


class DRQN_Agent:
    """Deep recurrent Q-learning agent with an online and a target network.

    Network contract: `net(n_states, n_actions)` must build a module whose
    forward is `forward(x, h, c) -> (q_values, h, c)`, accepts `None` for `h`
    and `c` (meaning "start from a zero recurrent state"), and takes inputs
    shaped (batch, time, features) or (time, features).

    Replay contract: the memory stores whole episodes as
    `(states, actions, rewards)` records with
        states  : float tensor, (L, n_states)
        actions : int64 tensor, L entries
        rewards : float tensor, L entries
    and every stored episode is assumed to end in a terminal state.
    """

    def __init__(self, net, config):
        """Build both networks, the optimizer, the replay memory and the loss.

        Args:
            net: network class / factory called as `net(n_states, n_actions)`.
            config: settings object providing device, n_states, n_actions,
                lr, rm_size, gamma and batch_size.
        """
        self.config = config
        # Kept for code that still reads `agent.args`; the values now come
        # from `config` instead of notebook-level globals.
        self.args = {
            'lr': config.lr,
            'rm_size': config.rm_size,
            'gamma': config.gamma,
            'batch_size': config.batch_size,
            'device': config.device,
        }

        self.q_net = net(self.config.n_states, self.config.n_actions).to(self.config.device)
        self.target_net = net(self.config.n_states, self.config.n_actions).to(self.config.device)

        # The target network starts as an exact copy and is never trained directly.
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.config.lr)
        self.memory = ReplayMemory(self.config.rm_size)

        self.criterion = nn.MSELoss()

    def update_target_net(self):
        """Copy the online network's weights into the target network."""
        self.target_net.load_state_dict(self.q_net.state_dict())

    def add_to_memory(self, *args):
        """Forward `args` unchanged to the replay memory's `push`."""
        self.memory.push(*args)

    def select_action(self, state, epsilon=0):
        """Epsilon-greedy action selection.

        Args:
            state: observation sequence, (time, n_states) or
                (batch, time, n_states). A single observation is a sequence
                of length 1. Passing the whole observation history of the
                episode lets the recurrent network use it; the action is
                always chosen from the LAST time step.
            epsilon: probability of taking a uniformly random action.

        Returns:
            int64 tensor of shape (batch, 1); (1, 1) for unbatched input and
            for random actions.
        """
        if random.random() < epsilon:
            # Explore: take a random action
            return torch.tensor(
                [[random.randrange(self.config.n_actions)]],
                device=self.config.device,
                dtype=torch.long,
            )

        # Exploit: select the highest Q value
        with torch.no_grad():
            # The network returns (q_values, h, c); only the Q-values are needed.
            q_values, _, _ = self.q_net(state, None, None)
        last_step = q_values[..., -1, :]
        return last_step.argmax(dim=-1).reshape(-1, 1)

    def Q(self, state, action):
        """Q-values the online network assigns to the actions actually taken.

        Args:
            state: (batch, time, n_states) or (time, n_states).
            action: int64 indices shaped like `state` but with a last
                dimension of size 1.

        Returns:
            Tensor shaped like `action`, with gradients attached.
        """
        q_values, _, _ = self.q_net(state, None, None)
        # Actions index the LAST dimension; dim 1 is the time axis for
        # batched sequences.
        return q_values.gather(-1, action)

    def target(self, state, reward):
        """Bootstrapped TD targets r_t + gamma * max_a Q_target(s_{t+1}, a).

        Args:
            state: the SAME state sequences that are passed to `Q`,
                (batch, time, n_states) or (time, n_states). The value of
                step t+1 is read from the target network's output at t+1, so
                its recurrent state is built from the whole prefix.
            reward: rewards with one entry per step of `state`.

        Returns:
            Tensor of shape (..., time, 1) without gradients. The last step
            of every sequence is treated as terminal and gets no bootstrap
            term.
        """
        with torch.no_grad():
            q_values, _, _ = self.target_net(state, None, None)
        best = q_values.max(dim=-1, keepdim=True)[0]

        # Shift by one step along the time axis: the bootstrap value for step
        # t is the best value at t+1; the final step keeps 0.
        # if the default reward for completing is not 0 this line must be changed
        next_best = torch.zeros_like(best)
        next_best[..., :-1, :] = best[..., 1:, :]

        return (next_best * self.config.gamma) + reward.reshape(next_best.shape)

    def optimize(self):
        """Run one gradient step on a batch of sampled episodes.

        Returns:
            The loss as a Python float, or None when the memory holds fewer
            than `batch_size` episodes and no step was taken.
        """
        if len(self.memory) < self.config.batch_size:
            return None

        predicted, expected = [], []
        # Episodes can differ in length, so each one is run through the
        # networks on its own (batch dimension of 1) instead of being stacked.
        for states, actions, rewards in self.memory.sample(self.config.batch_size):
            states = states.unsqueeze(0)           # (1, L, n_states)
            actions = actions.reshape(1, -1, 1)    # (1, L, 1)
            rewards = rewards.reshape(1, -1, 1)    # (1, L, 1)
            predicted.append(self.Q(states, actions))
            expected.append(self.target(states, rewards))

        self.optimizer.zero_grad()

        # One loss over every step of every sampled episode.
        loss = self.criterion(torch.cat(predicted, dim=1), torch.cat(expected, dim=1))

        loss.backward()
        self.optimizer.step()

        return loss.item()

    def save(self, name):
        """Serialize the online network to `name` with torch.save.

        The whole module object is saved (not just its state_dict), so
        loading it back requires the network class to be importable.
        """
        torch.save(self.q_net, name)


In [None]:
from receiver import Receiver

class Player:
    """Owns a `Receiver` created for a given port and started on construction.

    NOTE(review): this class looks unfinished — see `play_one`.
    """

    def __init__(self, port):
        # Receiver is project-local; its API is not visible here. It is stored
        # under the name `env` and started immediately.
        self.env = Receiver(port)
        self.env.start()

    def play_one(self, Q):
        # NOTE(review): `Q` is never used, and `done` is never reassigned
        # inside the loop, so as written this loop does not terminate.
        done = False
        while not done:
            # NOTE(review): bare attribute access whose result is discarded.
            # If `observe` is a method it is never called here — confirm
            # against Receiver.
            self.env.observe

In [None]:
from itertools import count
import numpy as np

config = Config(
    device = device,
    n_states = 5,
    n_actions = 5,
    # Config declares the four network fields without defaults, so they must
    # be supplied. The sizes mirror n_states / n_actions; 128 and 1 are the
    # defaults of DRQN's constructor.
    input_size = 5,
    output_size = 5,
    hidden_size = 128,
    hidden_layers = 1,
    rm_size = 10000,
    batch_size = 100,
    epochs = 3000,
    update_rate = 100,
    lr = 0.001,
    gamma = 0.8
)

# Per-epoch training history: epoch index, total reward, array of step losses.
train_log = {
    'ep': [],
    'reward': [],
    'loss': []
}

agent = DRQN_Agent(DRQN, config)

# NOTE(review): `env` and `get_epsilon` are not defined in any visible cell;
# they must already exist in the kernel for this cell to run.
try:
    for epoch in range(config.epochs):

        epsilon = get_epsilon(epoch)

        log = {
            'loss': [],
            'reward': 0
        }

        # The replay memory stores three-field Run records
        # (states, actions, rewards), so the episode is buffered here and
        # pushed once, after it has finished.
        episode_states = []
        episode_actions = []
        episode_rewards = []

        state = env.reset()
        # Cast to float32: integer or float64 observations do not match the
        # dtype of the network weights.
        state = torch.tensor([state], device=config.device, dtype=torch.float32)
        for t in count():
            # Select and perform an action
            # NOTE(review): only the current observation is passed, so no
            # recurrent state is carried between steps while acting.
            action = agent.select_action(state, epsilon)
            next_state, reward, done, _ = env.step(action.item())
            log['reward'] += reward

            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            # Perform one step of the optimization (on the policy network)
            log['loss'].append(agent.optimize())

            if done:
                break

            # Move to the next state
            state = torch.tensor([next_state], device=config.device, dtype=torch.float32)

        # Store the finished episode: states (L, n_states), actions (L, 1),
        # rewards (L, 1).
        agent.add_to_memory(
            torch.cat(episode_states),
            torch.cat(episode_actions),
            torch.tensor(episode_rewards, device=config.device, dtype=torch.float32).view(-1, 1),
        )

        # optimize() returns None while the memory is still filling up; keep
        # only the real losses so the statistics stay numeric.
        log['loss'] = np.asarray(
            [step_loss for step_loss in log['loss'] if step_loss is not None],
            dtype=float,
        )
        mean_loss = log['loss'].mean() if log['loss'].size else float('nan')

        train_log['ep'].append(epoch)
        train_log['loss'].append(log['loss'])
        train_log['reward'].append(log['reward'])
        print("------------------------------------------------")
        print(f"Epoch {epoch} - Epsilon {epsilon}")
        print(f"Reward: {log['reward']}")
        print(f"Loss: {mean_loss}")

        # Update the target network, copying all weights and biases in DQN
        if not epoch % config.update_rate:
            agent.update_target_net()
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()