# Laboratorium 5 (4 pkt)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem środowiska z OpenAI - _CartPole_.


Dołączenie standardowych bibliotek


In [13]:
from collections import deque
import gym
import numpy as np
import random

Dołączenie bibliotek do obsługi sieci neuronowych


In [14]:
import torch
import torch.nn as nn
from torch.optim import Adam

## Zadanie 1 - Double Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu Double Deep Q-Network. Wartością oczekiwaną sieci jest:
\begin{equation}
       Q^*(s, a) \approx r + \gamma Q_{\theta'}\big(s', \arg\max_{a'} Q_{\theta}(s', a')\big)
\end{equation}
a wagi pomiędzy sieciami wymieniane są co dziesięć aktualizacji wag sieci sterującej poczynaniami agenta ($Q$).
</p>


In [15]:
class DDQNAgent:
    """Double Deep Q-Network agent with an experience-replay buffer.

    Two networks are kept: ``model`` (the online network that drives the
    agent and is trained) and ``target_model`` (a periodically refreshed
    copy used only to evaluate bootstrap targets).

    Args:
        action_size: Number of discrete actions available to the agent.
        learning_rate: Learning rate passed to the Adam optimizer.
        model: Online Q-network mapping a state to one value per action.
        target_model: Network with the same architecture as ``model``; its
            weights are overwritten with those of ``model`` on construction.
    """

    def __init__(self, action_size, learning_rate, model, target_model):
        self.action_size = action_size
        # Bounded replay buffer: once full, the oldest transitions are dropped.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.5  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.85
        self.model = model
        self.target_model = target_model
        # Start with both networks in agreement.
        self.update_weights()
        # 1-based index of the next training step; drives the target sync.
        self.replay_counter = 1
        self.loss_fn = nn.MSELoss()
        self.optimizer = Adam(self.model.parameters(), lr=learning_rate)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(s, a, r, s', done)`` to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the greedy action from ``get_best_action``.
        """
        # The `random` module is used for every stochastic choice in this
        # class, so a single random.seed() call controls all of them.
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the index of the highest-valued action for ``state``.

        Ties between equally valued actions are broken uniformly at random.
        """
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            q_values = self.model(torch.as_tensor(state, dtype=torch.float32)).tolist()

        best_value = max(q_values)
        best_actions = [action for action, value in enumerate(q_values) if value == best_value]
        return random.choice(best_actions)

    def replay(self, batch_size):
        """Run one training step of the online network on replayed transitions.

        ``batch_size`` transitions are drawn (with replacement) from the
        memory. For each one the Double DQN target is

            y = r + gamma * Q_target(s', argmax_a Q_online(s', a)) * (1 - done)

        i.e. the online network selects the next action and the target
        network evaluates it; for terminal transitions ``y = r``. Only the
        output of the action actually taken is regressed towards ``y``; the
        remaining outputs use the network's own predictions as targets and so
        contribute zero error. After every 10th training step the online
        weights are copied into the target network.

        Does nothing when the memory is empty or ``batch_size`` is not positive.
        """
        if not self.memory or batch_size <= 0:
            return

        batch = random.choices(self.memory, k=batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack through NumPy first: building a tensor directly from a tuple
        # of ndarrays is very slow.
        states_tensor = torch.as_tensor(np.asarray(states, dtype=np.float32))
        next_states_tensor = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        actions_tensor = torch.as_tensor(np.asarray(actions), dtype=torch.long)
        rewards_tensor = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        dones_tensor = torch.as_tensor(np.asarray(dones, dtype=np.float32))
        rows = torch.arange(len(batch))

        self.model.train(True)
        q_values = self.model(states_tensor)

        # Targets are constants for the optimizer, so nothing computed here
        # may carry gradients.
        with torch.no_grad():
            next_actions = self.model(next_states_tensor).argmax(dim=-1)
            next_values = self.target_model(next_states_tensor)[rows, next_actions]
            td_targets = rewards_tensor + self.gamma * next_values * (1.0 - dones_tensor)
            targets = q_values.detach().clone()
            targets[rows, actions_tensor] = td_targets

        self.optimizer.zero_grad()
        loss = self.loss_fn(q_values, targets)
        loss.backward()
        self.optimizer.step()

        # Sync after the step so the target network is refreshed once every
        # 10 completed trainings.
        if self.replay_counter % 10 == 0:
            self.update_weights()
        self.replay_counter += 1

    def update_epsilon_value(self):
        """Decay epsilon by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_weights(self):
        """Copy the online network's parameters and buffers into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [_CartPole_](https://gym.openai.com/envs/CartPole-v0/):


In [16]:
# Optimizer step size used when constructing the agent.
learning_rate = 0.001

# Use the raw CartPole environment: `.env` unwraps the object returned by gym.make.
env = gym.make("CartPole-v0").env

# Network dimensions are read from the environment's spaces.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]


class Model(nn.Module):
    """Two-layer fully connected Q-network.

    Maps a state vector to one output value per action through a single
    ReLU-activated hidden layer.

    Args:
        state_size: Number of input features.
        action_size: Number of outputs (one per action).
        hidden_size: Width of the hidden layer; defaults to 32.
    """

    def __init__(self, state_size, action_size, hidden_size=32):
        super().__init__()
        self.lin1 = nn.Linear(state_size, hidden_size)
        self.out = nn.Linear(hidden_size, action_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw (unactivated) output-layer values for input ``x``."""
        hidden = self.relu(self.lin1(x))
        return self.out(hidden)

Czas nauczyć agenta gry w środowisku _CartPole_:


In [17]:
# Train the agent: each epoch plays 100 episodes, reports the mean episode
# reward and then decays epsilon. Training stops early once the mean exceeds 195.
agent = DDQNAgent(action_size, learning_rate, Model(state_size, action_size), Model(state_size, action_size))

agent.epsilon = 0.4

done = False
batch_size = 64
EPISODES = 1000
counter = 0  # not used in this cell; kept in case other cells read it
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        # reset() returns (observation, info); only the observation is fed
        # to the network.
        state, _ = env.reset()

        for _step in range(500):
            action = agent.get_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            # Store the true terminal flag only: a time-limit truncation is
            # not a terminal state, so its target should still bootstrap.
            agent.remember(state, action, reward, next_state, terminated)
            state = next_state

            # The episode ends on either signal.
            done = terminated or truncated
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))

    if mean_reward > 195:
        print("You Win!")
        break

    agent.update_epsilon_value()


epoch #0	mean reward = 19.980	epsilon = 0.400
epoch #1	mean reward = 12.230	epsilon = 0.340
epoch #2	mean reward = 10.730	epsilon = 0.289
epoch #3	mean reward = 10.710	epsilon = 0.246
epoch #4	mean reward = 10.590	epsilon = 0.209
epoch #5	mean reward = 10.130	epsilon = 0.177
epoch #6	mean reward = 10.150	epsilon = 0.151
epoch #7	mean reward = 10.320	epsilon = 0.128
epoch #8	mean reward = 9.900	epsilon = 0.109
epoch #9	mean reward = 9.870	epsilon = 0.093
epoch #10	mean reward = 9.850	epsilon = 0.079
epoch #11	mean reward = 11.600	epsilon = 0.067
epoch #12	mean reward = 9.730	epsilon = 0.057
epoch #13	mean reward = 9.590	epsilon = 0.048
epoch #14	mean reward = 9.470	epsilon = 0.041
epoch #15	mean reward = 9.440	epsilon = 0.035
epoch #16	mean reward = 9.530	epsilon = 0.030
epoch #17	mean reward = 9.380	epsilon = 0.025
epoch #18	mean reward = 9.440	epsilon = 0.021
epoch #19	mean reward = 9.820	epsilon = 0.018
epoch #20	mean reward = 10.780	epsilon = 0.016
epoch #21	mean reward = 12.710	eps