# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
import random

Dołączenie bibliotek ze środowiskami:

In [2]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLakeExtended
from env.FrozenLakeMDP import LEFT, RIGHT, UP, DOWN

Dołączenie bibliotek do obsługi sieci neuronowych

In [3]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

In [4]:
# Ask TensorFlow which GPU devices it can see; the recorded output below is
# an empty list.
tf.config.list_physical_devices('GPU')

[]

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu Deep Q-Network. Wartością oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \max_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [5]:
class DQNAgent:
    """Epsilon-greedy Deep Q-Network agent with an experience-replay buffer.

    The agent does not build or compile the network itself: ``model`` must be
    an object exposing Keras-style ``predict(x, ...)`` and ``fit(x, y, ...)``
    methods that maps a batch of states to one Q-value per action.

    Args:
        action_size: Number of discrete actions (valid actions are
            ``0 .. action_size - 1``).
        learning_rate: Stored on the agent for reference only. This class
            never passes it to the model; the optimizer is configured by
            whoever compiles ``model``.
        model: The Q-network (see above).
        gamma: Discount rate applied to the bootstrapped next-state value.
        epsilon: Initial exploration rate (probability of a random action).
        epsilon_min: Lower bound enforced by ``update_epsilon_value``.
        epsilon_decay: Multiplicative decay applied by
            ``update_epsilon_value``.
        memory_size: Maximum number of transitions kept in the replay
            buffer; the oldest ones are dropped first.
    """

    def __init__(self, action_size, learning_rate, model,
                 gamma=0.95, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.95, memory_size=2000):
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma    # discount rate
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.model = model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``self.epsilon`` a uniformly random action is
        returned; otherwise the greedy action from ``get_best_action``.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the action with the highest predicted Q-value for ``state``.

        Ties between equally valued actions are broken uniformly at random,
        so the agent is not biased towards the lowest action index.
        """
        q_values = self.model.predict(np.array([state]), batch_size=1, verbose=0)[0]
        best_actions = np.flatnonzero(q_values == np.max(q_values))
        if best_actions.size == 0:
            # Only reachable when the maximum is NaN (nothing compares equal
            # to it); fall back to argmax instead of failing on an empty choice.
            return int(np.argmax(q_values))
        return int(random.choice(best_actions.tolist()))

    def replay(self, batch_size):
        """Fit the network on ``batch_size`` transitions sampled from memory.

        For every sampled transition the target of the taken action is
            Q(s, a) := r                              if the episode ended,
            Q(s, a) := r + gamma * max_a' Q(s', a')   otherwise.
        The targets of all other actions are the network's own current
        predictions for ``s``, so only the output of the taken action
        contributes to the loss.

        Raises:
            ValueError: From ``random.sample`` when ``batch_size`` exceeds the
                number of stored transitions.
        """
        batch = random.sample(self.memory, k=batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # A single forward pass covers both the current and the next states.
        q_all = np.asarray(self.model.predict(
            np.array(states + next_states), batch_size=2 * batch_size, verbose=0))
        targets = np.array(q_all[:batch_size])  # copy: edited in place below
        next_q = q_all[batch_size:]

        rewards = np.asarray(rewards, dtype=np.float64)
        bootstrapped = rewards + self.gamma * np.max(next_q, axis=1)
        # Terminal transitions must not bootstrap from the next state.
        taken_targets = np.where(np.asarray(dones, dtype=bool), rewards, bootstrapped)
        targets[np.arange(batch_size), np.asarray(actions)] = taken_targets

        self.model.fit(np.array(states), targets, verbose=0)

    def update_epsilon_value(self):
        """Decay epsilon by ``epsilon_decay`` without going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*. Warstwa wejściowa powinna mieć tyle neuronów, ile jest możliwych stanów, a warstwa wyjściowa tyle neuronów, ile jest możliwych akcji do wykonania:

In [None]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

# Q-network: a state vector of length state_size goes in, one Q-value per
# action comes out. The output width follows action_size instead of a
# hard-coded 4 so it always matches what the agent is told.
model = keras.models.Sequential([
    layers.InputLayer((state_size,)),
    layers.Dense(170, activation='relu'),
    layers.Dense(170, activation='relu'),
    layers.Dense(170, activation='relu'),
    layers.Dense(action_size, activation=None),
])

# Pass learning_rate explicitly; with optimizer='adam' the variable above was
# silently ignored.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
model.summary()
        

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 170)               11050     
                                                                 
 dense_5 (Dense)             (None, 170)               29070     
                                                                 
 dense_6 (Dense)             (None, 170)               29070     
                                                                 
 dense_7 (Dense)             (None, 4)                 684       
                                                                 
Total params: 69,874
Trainable params: 69,874
Non-trainable params: 0
_________________________________________________________________


 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
# Train the DQN agent on FrozenLake. One "epoch" is EPISODES_PER_EPOCH
# episodes; training stops as soon as an epoch's mean reward exceeds
# WIN_THRESHOLD.
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

batch_size = 64
EPISODES = 10000          # upper bound on the number of epochs
EPISODES_PER_EPOCH = 100
MAX_STEPS = 1000          # step limit for a single episode
WIN_THRESHOLD = 0.9


def encode_state(index, size):
    """Return a one-hot vector of length ``size`` with a 1 at ``index``."""
    encoded = np.zeros((size,))
    encoded[index] = 1
    return encoded


for e in range(EPISODES):

    summary = []
    for _episode in range(EPISODES_PER_EPOCH):
        total_reward = 0
        env_state = env.reset()

        # Encode the state reported by reset() rather than assuming the
        # episode always starts in state 0.
        # NOTE(review): assumes reset() returns the start-state index, in the
        # same format as the state returned by step() - confirm.
        state = encode_state(env_state, state_size)

        for _step in range(MAX_STEPS):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = encode_state(next_state_env, state_size)

            # add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training pass per finished episode, once the memory holds more
        # samples than a batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > WIN_THRESHOLD:
        print ("You Win!")
        break

    agent.update_epsilon_value()

epoch #0	mean reward = 0.010	epsilon = 0.750
epoch #1	mean reward = 0.010	epsilon = 0.712
epoch #2	mean reward = 0.050	epsilon = 0.677
epoch #3	mean reward = 0.090	epsilon = 0.643
epoch #4	mean reward = 0.050	epsilon = 0.611
epoch #5	mean reward = 0.010	epsilon = 0.580
epoch #6	mean reward = 0.130	epsilon = 0.551
epoch #7	mean reward = 0.050	epsilon = 0.524
epoch #8	mean reward = 0.000	epsilon = 0.498
epoch #9	mean reward = 0.060	epsilon = 0.473
epoch #10	mean reward = 0.100	epsilon = 0.449
epoch #11	mean reward = 0.030	epsilon = 0.427
epoch #12	mean reward = 0.120	epsilon = 0.405
epoch #13	mean reward = 0.010	epsilon = 0.385
epoch #14	mean reward = 0.000	epsilon = 0.366
epoch #15	mean reward = 0.010	epsilon = 0.347
epoch #16	mean reward = 0.170	epsilon = 0.330
epoch #17	mean reward = 0.040	epsilon = 0.314
epoch #18	mean reward = 0.170	epsilon = 0.298
epoch #19	mean reward = 0.320	epsilon = 0.283
epoch #20	mean reward = 0.390	epsilon = 0.269
epoch #21	mean reward = 0.570	epsilon = 0.25

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [None]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

# Q-network for the extended environment. The input is a 4x4 grid with three
# channels and the output is one Q-value per action.
# NOTE(review): the 1x1 convolution has no activation, so it is a linear
# per-cell mix of the three channels - confirm this is intended.
model = keras.models.Sequential([
    layers.InputLayer((4, 4, 3)),
    layers.Conv2D(16, (1, 1), padding='same'),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    # Output width follows action_size instead of a hard-coded 4.
    layers.Dense(action_size, activation=None),
])

# Pass learning_rate explicitly; with optimizer='adam' the variable above was
# silently ignored.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 4, 4, 16)          64        
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 4)                 260       
                                                                 
Total params: 41,476
Trainable params: 41,476
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the DQN agent on FrozenLakeExtended. One "epoch" is
# EPISODES_PER_EPOCH episodes; training stops as soon as an epoch's mean
# reward exceeds WIN_THRESHOLD.
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

batch_size = 64
EPISODES = 2000           # upper bound on the number of epochs
EPISODES_PER_EPOCH = 100
MAX_STEPS = 1000          # step limit for a single episode
WIN_THRESHOLD = 0.9


def encode_extended_state(goal, holes, player, grid_shape=(4, 4)):
    """Stack the three environment arrays into a (rows, cols, 3) tensor.

    Each array is reshaped to ``grid_shape`` and becomes one channel, in the
    order goal, holes, player.
    """
    planes = [np.array(plane).reshape(grid_shape) for plane in (goal, holes, player)]
    return np.stack(planes, axis=-1)


for e in range(EPISODES):
    summary = []
    for _episode in range(EPISODES_PER_EPOCH):
        total_reward = 0
        env_state = env.reset()

        # The initial state is read from the environment's attributes.
        state = encode_extended_state(env.goal, env.holes, env.player)

        for _step in range(MAX_STEPS):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            # step() returns the state as a (goal, holes, player) triple.
            goal, holes, player = next_state_env
            next_state = encode_extended_state(goal, holes, player)

            # add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training pass per finished episode, once the memory holds more
        # samples than a batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    mean_reward = np.mean(summary)
    if mean_reward > WIN_THRESHOLD:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    agent.update_epsilon_value()

epoch #0	mean reward = 0.050	epsilon = 0.750
epoch #1	mean reward = 0.080	epsilon = 0.712
epoch #2	mean reward = 0.090	epsilon = 0.677
epoch #3	mean reward = 0.150	epsilon = 0.643
epoch #4	mean reward = 0.220	epsilon = 0.611
epoch #5	mean reward = 0.290	epsilon = 0.580
epoch #6	mean reward = 0.360	epsilon = 0.551
epoch #7	mean reward = 0.340	epsilon = 0.524
epoch #8	mean reward = 0.420	epsilon = 0.498
epoch #9	mean reward = 0.400	epsilon = 0.473
epoch #10	mean reward = 0.420	epsilon = 0.449
epoch #11	mean reward = 0.440	epsilon = 0.427
epoch #12	mean reward = 0.520	epsilon = 0.405
epoch #13	mean reward = 0.490	epsilon = 0.385
epoch #14	mean reward = 0.610	epsilon = 0.366
epoch #15	mean reward = 0.640	epsilon = 0.347
epoch #16	mean reward = 0.610	epsilon = 0.330
epoch #17	mean reward = 0.640	epsilon = 0.314
epoch #18	mean reward = 0.720	epsilon = 0.298
epoch #19	mean reward = 0.740	epsilon = 0.283
epoch #20	mean reward = 0.750	epsilon = 0.269
epoch #21	mean reward = 0.710	epsilon = 0.25

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [None]:
# NOTE(review): `.env` presumably strips the outer wrapper (e.g. the episode
# time limit) added by gym.make - confirm against the installed gym version.
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

HIDDEN_LAYERS = 5
HIDDEN_UNITS = 32

# Q-network: the observation vector goes in, one Q-value per action comes out.
model = keras.models.Sequential(
    [layers.InputLayer((state_size,))]
    + [layers.Dense(HIDDEN_UNITS, activation='relu') for _ in range(HIDDEN_LAYERS)]
    + [layers.Dense(action_size, activation=None)]
)

# Pass learning_rate explicitly; with optimizer='adam' the variable above was
# silently ignored.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                160       
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                                 
 dense_3 (Dense)             (None, 32)                1056      
                                                                 
 dense_4 (Dense)             (None, 32)                1056      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 4,450
Trainable params: 4,450
Non-trainable params: 0

Czas nauczyć agenta gry w środowisku *CartPole*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
# Train the DQN agent on CartPole. One "epoch" is EPISODES_PER_EPOCH
# episodes; training stops as soon as an epoch's mean reward exceeds
# WIN_THRESHOLD.
# The loop relies on the classic gym API: reset() returns the observation and
# step() returns a 4-tuple (observation, reward, done, info).
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

batch_size = 64
EPISODES = 1000           # upper bound on the number of epochs
EPISODES_PER_EPOCH = 100
MAX_STEPS = 300           # step limit for a single episode
WIN_THRESHOLD = 195

for e in range(EPISODES):
    summary = []
    for _episode in range(EPISODES_PER_EPOCH):
        total_reward = 0
        env_state = env.reset()

        # The observation is already a numeric vector; only convert it to an
        # array.
        state = np.array(env_state)

        for _step in range(MAX_STEPS):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = np.array(next_state_env)

            # add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training pass per finished episode, once the memory holds more
        # samples than a batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    mean_reward = np.mean(summary)
    if mean_reward > WIN_THRESHOLD:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    agent.update_epsilon_value()

epoch #0	mean reward = 17.910	epsilon = 0.750
epoch #1	mean reward = 17.350	epsilon = 0.712
epoch #2	mean reward = 21.810	epsilon = 0.677
epoch #3	mean reward = 40.540	epsilon = 0.643
epoch #4	mean reward = 56.780	epsilon = 0.611
epoch #5	mean reward = 81.980	epsilon = 0.580
epoch #6	mean reward = 85.350	epsilon = 0.551
epoch #7	mean reward = 128.960	epsilon = 0.524
epoch #8	mean reward = 116.170	epsilon = 0.498
epoch #9	mean reward = 158.750	epsilon = 0.473
epoch #10	mean reward = 179.090	epsilon = 0.449
epoch #11	mean reward = 186.300	epsilon = 0.427
You Win!
