# IAR : Mini-projet

Dans cette partie, nous allons implémenter le Double Dueling Deep Q Network sur l'environnement discret de LunarLander. 

Importation des librairies

In [3]:
import gym
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
from collections import deque
import time

Définition de l'environement

In [12]:
env = gym.make("LunarLander-v2")
nb_actions = 4
nb_observations = 8

Hyper paramètres

In [13]:
nb_episode = 1000
discount_factor = 0.99
learning_rate = 2e-3
test_frequency = 10
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.02
batch_size = 64
size_replay_buffer = int(1e5)
update_frequency = 1
tau = 1e-3 

In [14]:
def test(q_network) :
    
    state = env.reset()
    done = False
    cum_sum = 0
    while not done :
        state_t = torch.as_tensor(state , dtype = torch.float32).unsqueeze(0)
        action = torch.argmax(q_network(state_t)).item()
        new_state,reward,done,_ = env.step(action)
        state = new_state
        cum_sum += reward
        
    return cum_sum

Dueling network

In [15]:
class DuelingQNetwork(nn.Module) :
    
    def __init__(self,
              nb_actions,
              nb_observations) : 
        
        super().__init__()
        self.nb_actions = nb_actions
        self.nb_observations = nb_observations
        
        self.net = nn.Sequential(
            nn.Linear(nb_observations,64),
            nn.ReLU(),
            nn.Linear(64,64),
            nn.ReLU(),
            nn.Linear(64,32)
        )
        
        self.net_advantage = nn.Sequential(
            nn.ReLU(),
            nn.Linear(32,nb_actions)
        )
        
        self.net_state_value = nn.Sequential(
            nn.ReLU(),
            nn.Linear(32,1)
        )
        
    def advantage(self,x) :
        return self.net_advantage(self.net(x))
    
    def state_value(self,x) :
        return self.net_state_value(self.net(x))
    
    def forward(self,x) :
        return self.state_value(x) + self.advantage(x) - torch.mean(self.advantage(x),dim=1).unsqueeze(1)

Initialisation

In [18]:
replay_buffer = deque(maxlen=size_replay_buffer)
q_network = DuelingQNetwork(nb_actions,nb_observations)
q_target_network = DuelingQNetwork(nb_actions,nb_observations)
q_target_network.load_state_dict(q_network.state_dict())
optimizer = torch.optim.Adam(q_network.parameters(), lr=learning_rate)
list_tests = []
timestep = 0

bestModel = DuelingQNetwork(nb_actions,nb_observations)
bestModel.load_state_dict(q_network.state_dict())
bestvalue = -1e9

average_list = deque(maxlen=100)

Boucle d'apprentissage

In [19]:

for episode in tqdm(range(nb_episode)) :
    state = env.reset()
    done = False
    
    cumul = 0
    epsilon = max(epsilon * epsilon_decay,epsilon_min)
    
    while not done : 
        state_t = torch.as_tensor(state , dtype = torch.float32).unsqueeze(0)
        
        if random.random() > epsilon :
            action = torch.argmax(q_network(state_t)).item()
        else :
            action = env.action_space.sample()
            
        new_state,reward,done,_ = env.step(action)

        cumul += reward
        
        transition = (state,action,done,reward,new_state)
        replay_buffer.append(transition)
        
        if len(replay_buffer) >= batch_size and timestep % update_frequency == 0 :
        
            batch = random.sample(replay_buffer,batch_size)

            states = np.asarray([exp[0] for exp in batch],dtype=np.float32)
            actions = np.asarray([exp[1] for exp in batch],dtype=int)
            dones = np.asarray([exp[2] for exp in batch],dtype=int)
            rewards = np.asarray([exp[3] for exp in batch],dtype=np.float32)
            new_states = np.asarray([exp[4] for exp in batch],dtype=np.float32)
            
            states_t = torch.as_tensor(states , dtype=torch.float32)
            dones_t = torch.as_tensor(dones , dtype = torch.int64).unsqueeze(1)
            new_states_t = torch.as_tensor(new_states , dtype=torch.float32)
            actions_t = torch.as_tensor(actions , dtype = torch.int64).unsqueeze(1)
            rewards_t = torch.as_tensor(rewards , dtype=torch.float32).unsqueeze(1)
            
            
            y_target = rewards_t + discount_factor * (1 - dones_t) * torch.gather(q_target_network(new_states_t),dim=1,index=torch.argmax(q_network(new_states_t),dim=1).unsqueeze(1)).detach()

            mse = nn.MSELoss()

            loss = mse(torch.gather(q_network(states_t),dim=1,index=actions_t), y_target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            for target_param, local_param in zip(q_target_network.parameters(), q_network.parameters()):
                target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)  
                
        timestep += 1
        
        state = new_state
        
    
    average_list.append(cumul)
    
    if episode % test_frequency == 0 :
        t =  0
        for _ in range(10) :
            t += test(q_network)
        t /= 10
        if t > bestvalue :
            bestvalue = t
            bestModel.load_state_dict(q_network.state_dict())
        avg = sum(average_list) / len(average_list)
        print(f"episode {episode} - test reward : {t} - avg : {avg} - epsilon {epsilon}")
        list_tests.append(t)

  0%|                                          | 1/1000 [00:00<09:55,  1.68it/s]

episode 0 - test reward : -604.8086591419683 - avg : -425.0760512886622 - epsilon 0.8183201210226743


  1%|▍                                        | 11/1000 [00:09<21:07,  1.28s/it]

episode 10 - test reward : -313.0537783987375 - avg : -187.18711703714555 - epsilon 0.778312557068642


  2%|▊                                        | 21/1000 [00:22<35:05,  2.15s/it]

episode 20 - test reward : -366.65616540518823 - avg : -192.46175609026946 - epsilon 0.7402609576967045


  3%|█▏                                     | 31/1000 [00:47<1:26:57,  5.38s/it]

episode 30 - test reward : -296.83406273958974 - avg : -177.38085392426993 - epsilon 0.7040696960536299


  4%|█▋                                       | 41/1000 [00:59<30:56,  1.94s/it]

episode 40 - test reward : -155.4424517579527 - avg : -170.66586292585492 - epsilon 0.6696478204705644


  5%|██                                       | 51/1000 [01:14<41:36,  2.63s/it]

episode 50 - test reward : -149.4731197355277 - avg : -165.5022635312931 - epsilon 0.6369088258938781


  6%|██▍                                    | 61/1000 [01:34<1:03:55,  4.08s/it]

episode 60 - test reward : -179.70265045428374 - avg : -151.8385621766675 - epsilon 0.6057704364907278


  7%|██▊                                    | 71/1000 [02:03<1:44:25,  6.74s/it]

episode 70 - test reward : -140.05058016721318 - avg : -142.3065970075292 - epsilon 0.5761543988830038


  8%|███▏                                   | 81/1000 [02:30<1:32:54,  6.07s/it]

episode 80 - test reward : -96.85483654351125 - avg : -132.7211756471315 - epsilon 0.547986285490042


  9%|███▌                                   | 91/1000 [02:55<1:19:38,  5.26s/it]

episode 90 - test reward : -137.93626609422407 - avg : -127.689449734903 - epsilon 0.5211953074858876


 10%|███▊                                  | 101/1000 [03:44<2:20:47,  9.40s/it]

episode 100 - test reward : -154.24495777143144 - avg : -121.40244400437479 - epsilon 0.49571413690105054


 11%|████▏                                 | 111/1000 [04:49<2:37:15, 10.61s/it]

episode 110 - test reward : -168.24886912553535 - avg : -110.38462250371684 - epsilon 0.47147873742168567


 12%|████▌                                 | 121/1000 [05:19<1:33:19,  6.37s/it]

episode 120 - test reward : -135.3231338244661 - avg : -91.80862843324056 - epsilon 0.4484282034609769


 13%|█████                                   | 126/1000 [05:44<39:50,  2.74s/it]


KeyboardInterrupt: 

In [None]:
torch.save(bestModel.state_dict(),"best_model_dq3n_lunarlanderdiscret")

In [None]:
plt.figure()
plt.title("lunarlander - dq3n - rewards")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.plot(np.arange(0,nb_episode,test_frequency),list_tests)
plt.show()