In [4]:
# Laurent LEQUIEVRE
# Research Engineer, CNRS (France)
# Institut Pascal UMR6602
# laurent.lequievre@uca.fr

# https://unnatsingh.medium.com/deep-q-network-with-pytorch-d1ca6f40bfda

# https://github.com/markusbuchholz/deep-reinforcement-learning/blob/master/dqn/solution/dqn_agent.py

In [16]:
import numpy as np
import random 
from collections import namedtuple, deque 
import torch
import torch.nn.functional as F
import torch.optim as optim

In [18]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device = {}".format(device))

device = cuda:0


In [12]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device=None):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer; once full, the oldest
                experience is dropped for every new one added
            batch_size (int): size of each training batch
            seed (int): random seed (seeds Python's global `random` module)
            device (torch.device or str, optional): where sampled tensors are
                placed. When omitted, `sample()` uses the module-level
                `device` variable, exactly as before this argument existed.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experiences = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() returns None, so keep the seed value itself on the
        # instance instead of the call's result.
        random.seed(seed)
        self.seed = seed
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(
            self.experiences(state, action, reward, next_state, done)
        )

    def sample(self):
        """Randomly sample a batch of experiences from memory.

        Returns
        =======
            tuple: (states, actions, rewards, next_states, dones) tensors with
            one row per sampled experience. `actions` is int64; the others are
            float32, with `dones` encoded as 0.0 / 1.0.

        Raises
        ======
            ValueError: if fewer than `batch_size` experiences are stored.
        """
        if len(self.memory) < self.batch_size:
            raise ValueError(
                "Cannot sample a batch of {} from a buffer holding {} experiences".format(
                    self.batch_size, len(self.memory)
                )
            )

        # Fall back to the notebook-level `device` only when none was given.
        target = self.device if self.device is not None else device

        picked = random.sample(self.memory, k=self.batch_size)
        batch = [e for e in picked if e is not None]

        def stacked(values):
            # vstack turns each per-experience value into one row.
            return torch.from_numpy(np.vstack(values))

        states = stacked([e.state for e in batch]).float().to(target)
        actions = stacked([e.action for e in batch]).long().to(target)
        rewards = stacked([e.reward for e in batch]).float().to(target)
        next_states = stacked([e.next_state for e in batch]).float().to(target)
        # Booleans go through uint8 so they become 0/1 before the float cast.
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)
        ).float().to(target)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [13]:
# https://www.geeksforgeeks.org/namedtuple-in-python/
# Build the namedtuple class that describes a single transition.
Experience = namedtuple(
    "Experience",
    field_names=["state", "action", "reward", "next_state", "done"],
)

# Create one record, spelling out which value goes into which field.
E = Experience(state=2, action=3, reward=-1, next_state=5, done=False)

# Read every field back by its attribute name.
print("The state of E is {}".format(E.state))
print("The action of E is {}".format(E.action))
print("The reward of E is {}".format(E.reward))
print("The next state of E is {}".format(E.next_state))
print("The done of E is {}".format(E.done))


The state of E is 2
The action of E is 3
The reward of E is -1
The next state of E is 5
The done of E is False


In [14]:
# np.vstack stacks arrays vertically: each 1-D input becomes one row of the result.
# https://scipython.com/book/chapter-6-numpy/examples/vstack-and-hstack/
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
np.vstack([a, b])

array([[1, 2, 3],
       [4, 5, 6]])

In [22]:
# Settings for the demonstration buffer.
SEED = 30
ACTION_SIZE = 4
BUFFER_SIZE = 1000
BATCH_SIZE = 5

buffer = ReplayBuffer(ACTION_SIZE, BUFFER_SIZE, BATCH_SIZE, SEED)

# Twenty dummy transitions: every field of transition k simply holds k (1..20),
# which makes it easy to see which transitions a sampled batch contains.
states = list(range(1, 21))
actions = list(range(1, 21))
rewards = list(range(1, 21))
next_states = list(range(1, 21))
dones = [False] * 20

for i in range(len(states)):
    buffer.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

In [24]:
# Draw one batch and show each returned tensor on its own line(s):
# states, actions, rewards, next states, done flags.
s, a, r, ns, d = buffer.sample()
print(s, a, r, ns, d, sep="\n")

tensor([[ 2.],
        [13.],
        [19.],
        [ 5.],
        [ 3.]], device='cuda:0')
tensor([[ 2],
        [13],
        [19],
        [ 5],
        [ 3]], device='cuda:0')
tensor([[ 2.],
        [13.],
        [19.],
        [ 5.],
        [ 3.]], device='cuda:0')
tensor([[ 2.],
        [13.],
        [19.],
        [ 5.],
        [ 3.]], device='cuda:0')
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]], device='cuda:0')
