# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie siÄ oraz zaimplementowanie algorytmĂłw gĹÄbokiego uczenia aktywnego. Zaimplementowane algorytmy bÄdÄ testowane z wykorzystaniem wczeĹniej przygotowanych Ĺrodowisk: _FrozenLake_ i _Pacman_ oraz Ĺrodowiska z OpenAI - _CartPole_.


DoĹÄczenie standardowych bibliotek


In [12]:
from collections import deque
import gym
import numpy as np
import random

DoĹÄczenie bibliotek ze Ĺrodowiskami:


In [13]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLake as frozenLakeExtended

DoĹÄczenie bibliotek do obsĹugi sieci neuronowych


In [14]:
import torch
import torch.nn as nn
from torch.optim import Adam
import random

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem Äwiczenie jest zaimplementowanie algorytmu Deep Q-Network. WartosciÄ oczekiwanÄ sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}
</p>


In [15]:
class DQNAgent:
    def __init__(self, action_size, learning_rate, model: nn.Sequential):
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.90    # discount rate
        self.epsilon = 0.8  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9
        self.learning_rate = learning_rate
        self.model = model
        self.loss_fn = nn.MSELoss()
        self.optimizer = Adam(self.model.parameters(), self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        #Function adds information to the memory about last action and its results
        self.memory.append((state, action, reward, next_state, done)) 

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action.
            otherwise - the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probablity, generate uniform number in [0, 1]
              and compare it with your probability
        """

        r = random.random()
        if r < self.epsilon:
            return np.random.choice(self.action_size)

        return self.get_best_action(state)
    
  
    def get_best_action(self, state):
        """
        Compute the best action to take in a state.
        """

        # self.model.eval()
        # with torch.no_grad():
        logits = self.model(torch.tensor(state, dtype=torch.float32)).tolist()

        best_actions = []
        score = float('-inf')

        for i, logit in enumerate(logits):
            if logit > score:
                score = logit
                best_actions = [i]
            elif logit == score:
                best_actions.append(i)

        return random.choice(best_actions)

    def replay(self, batch_size):
        """
        Function learn network using randomly selected actions from the memory. 
        First calculates Q value for the next state and choose action with the biggest value.
        Target value is calculated according to:
                Q(s,a) := (r + gamma * max_a(Q(s', a)))
        except the situation when the next action is the last action, in such case Q(s, a) := r.
        In order to change only those weights responsible for chosing given action, the rest values should be those
        returned by the network for state state.
        The network should be trained on batch_size samples.
        """

        batch = random.choices(self.memory, k=batch_size)
        self.model.train(True)
        self.optimizer.zero_grad()

        states, actions, rewards, next_states, dones = zip(*batch)

        states_tensor = torch.tensor(states, dtype=torch.float32)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
        next_states_tensor = torch.tensor(next_states, dtype=torch.float32)
        dones_tensor = torch.tensor(dones, dtype=torch.float32)

        logits = self.model(states_tensor)
        targets = torch.clone(logits)
        next_state_outputs = self.model(next_states_tensor)
        targets[list(range(len(batch))), actions] = rewards_tensor + self.gamma*torch.amax(next_state_outputs, dim=-1)*(1-dones_tensor)
        loss = self.loss_fn(logits, targets)

        loss.backward()
        self.optimizer.step()


    def update_epsilon_value(self):
        #Every each epoch epsilon value should be updated according to equation: 
        #self.epsilon *= self.epsilon_decay, but the updated value shouldn't be lower then epsilon_min value
        if self.epsilon*self.epsilon_decay < self.epsilon_min:
            self.epsilon = self.epsilon_min
        else:
            self.epsilon *= self.epsilon_decay


In [16]:
v = torch.rand((1, 3, 4))
torch.amax(v, dim=-1), v

(tensor([[0.6097, 0.9591, 0.9976]]),
 tensor([[[0.5496, 0.1246, 0.4669, 0.6097],
          [0.2861, 0.9591, 0.6412, 0.7501],
          [0.9976, 0.7120, 0.4607, 0.7388]]]))

Czas przygotowaÄ model sieci, ktĂłra bÄdzie siÄ uczyĹa poruszania po Ĺrodowisku _FrozenLake_, warstwa wejĹciowa powinna mieÄ tyle neuronĂłw ile jest moĹźlliwych stanĂłw, warstwa wyjĹciowa tyle neuronĂłw ile jest moĹźliwych akcji do wykonania:


In [17]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 1e-3

torch.random.manual_seed(42)
model = nn.Sequential(
    nn.Linear(state_size, 64),
    nn.ReLU(),
    nn.Linear(64, action_size),
)

Czas nauczyÄ agenta poruszania siÄ po Ĺrodowisku _FrozenLake_, jako stan przyjmij wektor o liczbie elementĂłw rĂłwnej liczbie moĹźliwych stanĂłw, z wartoĹciÄ 1 ustawionÄ w komĂłrce o indeksie rĂłwnym aktualnemu stanowi, pozostaĹe elementy majÄ byÄ wypeĹnione zerami:

- 1 pkt < 35 epok,
- 0.5 pkt < 60 epok,
- 0.25 pkt - w pozostaĹych przypadkach.


In [18]:
from copy import copy

agent = DQNAgent(action_size, learning_rate, model)
agent.epsilon = 0.55
done = False
batch_size = 128
EPISODES = 10000
counter = 0
for e in range(EPISODES):

    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #

        state = [0]*state_size
        state[env_state] = 1.

        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = [0]*state_size
            next_state[next_state_env] = 1.

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = copy(next_state)
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        
        summary.append(total_reward)
    
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break
    
    agent.update_epsilon_value()

epoch #0	mean reward = 0.000	epsilon = 0.550
epoch #1	mean reward = 0.000	epsilon = 0.495
epoch #2	mean reward = 0.000	epsilon = 0.446
epoch #3	mean reward = 0.000	epsilon = 0.401
epoch #4	mean reward = 0.000	epsilon = 0.361
epoch #5	mean reward = 0.000	epsilon = 0.325
epoch #6	mean reward = 0.000	epsilon = 0.292
epoch #7	mean reward = 0.000	epsilon = 0.263
epoch #8	mean reward = 0.000	epsilon = 0.237
epoch #9	mean reward = 0.000	epsilon = 0.213
epoch #10	mean reward = 0.000	epsilon = 0.192
epoch #11	mean reward = 0.010	epsilon = 0.173
epoch #12	mean reward = 0.010	epsilon = 0.155
epoch #13	mean reward = 0.030	epsilon = 0.140
epoch #14	mean reward = 0.250	epsilon = 0.126
epoch #15	mean reward = 0.760	epsilon = 0.113
epoch #16	mean reward = 0.870	epsilon = 0.102
epoch #17	mean reward = 0.900	epsilon = 0.092
epoch #18	mean reward = 0.890	epsilon = 0.083
epoch #19	mean reward = 0.920	epsilon = 0.074
You Win!


Czas przygotowaÄ model sieci, ktĂłra bÄdzie siÄ uczyĹa poruszania po Ĺrodowisku _FrozenLakeExtended_, tym razem stan nie jest okreĹlany poprzez pojedynczÄ liczbÄ, a przez 3 tablice:

- pierwsza zawierajÄca informacje o celu,
- druga zawierajÄca informacje o dziurach,
- trzecia zawierajÄca informacjÄ o poĹoĹźeniu gracza.


In [19]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

class Model(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=16):
        super().__init__()
        self.lin_in1 = nn.Linear(input_dim, hidden_dim)
        self.lin_in2 = nn.Linear(input_dim, hidden_dim)
        self.lin_in3 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()

        self.lin = nn.Linear(hidden_dim*3, hidden_dim)
        self.out_lin = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        if len(x.shape) > 2:
            x1 = self.relu(self.lin_in1(x[:, -3]))
            x2 = self.relu(self.lin_in1(x[:, -2]))
            x3 = self.relu(self.lin_in1(x[:, -1]))
        else:
            x1 = self.relu(self.lin_in1(x[-3]))
            x2 = self.relu(self.lin_in1(x[-2]))
            x3 = self.relu(self.lin_in1(x[-1]))

        x = torch.concat([x1, x2, x3], dim=-1)
        x = self.relu(self.lin(x))
        return self.out_lin(x)
    
model = Model(state_size, action_size)

Czas nauczyÄ agenta poruszania siÄ po Ĺrodowisku _FrozenLakeExtended_, jako stan przyjmij wektor skĹadajÄcy siÄ ze wszystkich trzech tablic (2 pkt.):


In [20]:
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        state = env_state
                
        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #

            next_state = next_state_env

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state.copy()
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        
        summary.append(total_reward)
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()

epoch #0	mean reward = 0.020	epsilon = 0.750
epoch #1	mean reward = 0.020	epsilon = 0.675
epoch #2	mean reward = 0.020	epsilon = 0.608
epoch #3	mean reward = 0.060	epsilon = 0.547
epoch #4	mean reward = 0.120	epsilon = 0.492
epoch #5	mean reward = 0.350	epsilon = 0.443
epoch #6	mean reward = 0.510	epsilon = 0.399
epoch #7	mean reward = 0.590	epsilon = 0.359
epoch #8	mean reward = 0.670	epsilon = 0.323
epoch #9	mean reward = 0.640	epsilon = 0.291
epoch #10	mean reward = 0.720	epsilon = 0.262
epoch #11	mean reward = 0.640	epsilon = 0.235
epoch #12	mean reward = 0.770	epsilon = 0.212
epoch #13	mean reward = 0.750	epsilon = 0.191
epoch #14	mean reward = 0.830	epsilon = 0.172
You Win!


Czas przygotowaÄ model sieci, ktĂłra bÄdzie siÄ uczyĹa dziaĹania w Ĺrodowisku [_CartPool_](https://gym.openai.com/envs/CartPole-v0/):


In [21]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

model = nn.Sequential(
    nn.Linear(state_size, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, action_size)
)

# model = nn.Sequential(
#     nn.Linear(state_size, 16),
#     nn.ReLU(),
#     nn.Linear(16, action_size)
# )

Czas nauczyÄ agenta gry w Ĺrodowisku _CartPool_:

- 1 pkt < 10 epok,
- 0.5 pkt < 20 epok,
- 0.25 pkt - w pozostaĹych przypadkach.


In [22]:
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.4
agent.epsilon_decay = 0.65

done = False
batch_size = 256
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        state = env_state[0]

        for time in range(300):
            action = agent.get_action(state)
            next_state_env, reward, done, _, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = next_state_env
            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state.copy()
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        
        summary.append(total_reward)
    if np.mean(summary) > 195:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()

epoch #0	mean reward = 13.280	epsilon = 0.400
epoch #1	mean reward = 15.250	epsilon = 0.260
epoch #2	mean reward = 24.540	epsilon = 0.169
epoch #3	mean reward = 44.610	epsilon = 0.110
epoch #4	mean reward = 57.110	epsilon = 0.071
epoch #5	mean reward = 151.640	epsilon = 0.046
epoch #6	mean reward = 142.120	epsilon = 0.030
epoch #7	mean reward = 177.610	epsilon = 0.020
You Win!
