# Test Parallel DQN

In [7]:
import torch
import torch.nn as nn
import numpy as np
import threading
from tqdm import tqdm
import time
import gym
from collections import deque

In [None]:
nb_episode = 1000
discount_factor = 0.99
learning_rate = 2e-4
test_frequency = 10
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.02
batch_size = 64
size_replay_buffer = 10000
update_frequency = 1

In [2]:
class TestAgent(threading.Thread) :

    speed_model = 1 # Vitesse
    condition = True

    def __init__(self,
                 nb_test : int,
                 q_network
                ):
        threading.Thread.__init__(self)
        self.nb_test = nb_test
        self.q_network = q_network
        self.list_rewards_mean = list()
        self.list_rewards_std = list()

    def run(self) :
        while(self.condition) :
            time.sleep(TestAgent.speed_model)
            list_rewards = list()
            for i in range(self.nb_test) :
                list_rewards.append(self.test())
            list_rewards = np.array(list_rewards)
            self.list_rewards_mean.append( list_rewards.mean() )
            self.list_rewards_std.append( list_rewards.std() )
            print(f"mean : {list_rewards.mean()}")
            print(f"std : {list_rewards.std()}")
            
            
    def test(self) : 
        timestepmax = 2000
        state = env.reset()
        done = False
        cum_sum = 0
        iteration = 0
        while not done and iteration < timestepmax:
            state_t = torch.as_tensor(state , dtype = torch.float32)
            action = torch.argmax(self.q_network(state_t)).item()
            new_state,reward,done,_ = env.step(action)
            state = new_state
            cum_sum += reward
            iteration += 1
        return cum_sum
                    
    def setCondition(self,condition : bool):
        self.condition = condition
        print("Arret de l'agent de test")

class CollectorAgent(threading.Thread) :

    speed_model = 0 # Vitesse
    condition = True

    def __init__(self,
                replay_buffer : list,
                q_network
                ):
        threading.Thread.__init__(self)
        self.replay_buffer = replay_buffer
        self.q_network = q_network
        
    def run(self) :
        while(self.condition) :
            time.sleep(CollectorAgent.speed_model)
            self.collect_trajectory()

            
            
            
    def collect_trajectory(self) :
        state = env.reset()
        done = False
        while not done :
            state_t = torch.as_tensor(state , dtype = torch.float32)
            if random.random() > epsilon :
                action = torch.argmax(q_network(state_t)).item()
            else :
                action = env.action_space.sample()
                
            new_state,reward,done,_ = env.step(action)
            transition = (state,action,done,reward,new_state)
            replay_buffer.append(transition)
            state = new_state
            
            
    def setCondition(self,condition : bool):
        self.condition = condition
        print("Arret de l'agent de collecte")
        
        
        
class LearnerAgent(threading.Thread) :

    speed_model = 0 # Vitesse
    condition = True

    def __init__(self,
                q_target_network,
                q_network,
                replay_buffer,
                optimizer
                ):
        threading.Thread.__init__(self)
        self.replay_buffer = replay_buffer
        self.q_network = q_network
        self.q_target_network = q_target_network
        self.optimizer = optimizer
        
    def run(self) :
        while(self.condition) :
            time.sleep(CollectorAgent.speed_model)
            self.collect_trajectory() 
            
            
            
    def learn(self) :
        
        if len(self.replay_buffer) >= batch_size :
            
            batch = random.sample(self.replay_buffer,batch_size)

            states = np.asarray([exp[0] for exp in batch],dtype=np.float32)
            actions = np.asarray([exp[1] for exp in batch],dtype=int)
            dones = np.asarray([exp[2] for exp in batch],dtype=int)
            rewards = np.asarray([exp[3] for exp in batch],dtype=np.float32)
            new_states = np.asarray([exp[4] for exp in batch],dtype=np.float32)
            
            states_t = torch.as_tensor(states , dtype=torch.float32)
            dones_t = torch.as_tensor(dones , dtype = torch.int64)
            new_states_t = torch.as_tensor(new_states , dtype=torch.float32)
            actions_t = torch.as_tensor(actions , dtype = torch.int64).unsqueeze(1)
            rewards_t = torch.as_tensor(rewards , dtype=torch.float32)
            
            y_target = rewards_t + discount_factor * (1 - dones_t) * torch.max(self.q_target_network(new_states_t),dim=1)[0].detach()

            mse = nn.MSELoss()

            loss = mse(torch.gather(self.q_network(states_t),dim=1,index=actions_t), y_target.unsqueeze(1))

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            for target_param, local_param in zip(self.q_target_network.parameters(), self.q_network.parameters()):
                target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)  
                
                
    def setCondition(self,condition : bool):
        self.condition = condition
        print("Arret de l'agent de learn")
        

In [5]:
l = list()

test_agent = TestAgent(l)
collector_agent = CollectorAgent(l)
learner_agent = LearnerAgent()

test_agent.start()
collector_agent.start()

time.sleep(20)

test_agent.setCondition(False)
collector_agent.setCondition(False)

[1]
[1, 1]
[1, 1, 1]
[1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
class QNetwork(nn.Module) :
    
    def __init__(self,
              nb_actions,
              nb_observations) : 
        
        super().__init__()
        self.nb_actions = nb_actions
        self.nb_observations = nb_observations
        
        self.net = nn.Sequential(
            nn.Linear(nb_observations, 125),
            nn.ReLU(),
            nn.Linear(125,100),
            nn.ReLU(),
            nn.Linear(100, nb_actions)
        )
        
    def forward(self,x) :
        return self.net(x)

In [None]:


nb_episode = 1000

discount_factor = 0.99
learning_rate = 2e-4
test_frequency = 10
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.02
batch_size = 64
size_replay_buffer = int(1e5)
update_frequency = 1

tau = 1e-3 



replay_buffer = deque(maxlen=size_replay_buffer)
q_network = QNetwork(nb_actions,nb_observations)
q_target_network = QNetwork(nb_actions,nb_observations)
q_target_network.load_state_dict(q_network.state_dict())
optimizer = torch.optim.Adam(q_network.parameters(), lr=learning_rate)

In [None]:
replay_buffer = deque(maxlen=size_replay_buffer)
q_network = QNetwork(nb_actions,nb_observations)

timestep = 0

optimizer = torch.optim.Adam(q_network.parameters(), lr=learning_rate)
list_tests_2 = []

average_list = deque(maxlen=100)
for episode in tqdm(range(nb_episode)) :
    state = env.reset()
    done = False
    cumul = 0
    epsilon = max(epsilon * epsilon_decay,epsilon_min)
    
    while not done : 
        state_t = torch.as_tensor(state , dtype = torch.float32)
        
        if random.random() > epsilon :
            action = torch.argmax(q_network(state_t)).item()
        else :
            action = env.action_space.sample()
            
        new_state,reward,done,_ = env.step(action)
        
        cumul += reward
        
        transition = (state,action,done,reward,new_state)
        replay_buffer.append(transition)
        
        if len(replay_buffer) >= batch_size and timestep % update_frequency == 0 :
        
            
            batch = random.sample(replay_buffer,batch_size)

            states = np.asarray([exp[0] for exp in batch],dtype=np.float32)
            actions = np.asarray([exp[1] for exp in batch],dtype=int)
            dones = np.asarray([exp[2] for exp in batch],dtype=int)
            rewards = np.asarray([exp[3] for exp in batch],dtype=np.float32)
            new_states = np.asarray([exp[4] for exp in batch],dtype=np.float32)

            
            states_t = torch.as_tensor(states , dtype=torch.float32)
            dones_t = torch.as_tensor(dones , dtype = torch.int64)
            new_states_t = torch.as_tensor(new_states , dtype=torch.float32)
            actions_t = torch.as_tensor(actions , dtype = torch.int64).unsqueeze(1)
            rewards_t = torch.as_tensor(rewards , dtype=torch.float32)
            
            y_target = rewards_t + discount_factor * (1 - dones_t) * torch.max(q_network(new_states_t),dim=1)[0]

            mse = nn.MSELoss()

            loss = mse(torch.gather(q_network(states_t),dim=1,index=actions_t), y_target.unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        state = new_state
        timestep += 1
    
    average_list.append(cumul)
    if episode % test_frequency == 0 :
        t =  0
        for _ in range(10) :
            t += test(q_network)
        t /= 10
        avg = sum(average_list) / len(average_list)
        print(f"episode {episode} - test reward : {t} - avg : {avg} - epsilon {epsilon}")
        list_tests_2.append(t)