# Laboratorium 5 (4 pkt)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
import random

Dołączenie bibliotek do obsługi sieci neuronowych

In [2]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

## Zadanie 1 - Double Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Double Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
       Q^*(s, a) \approx r + \gamma argmax_{a'}Q_\theta'(s', a') 
\end{equation}
a wagi pomiędzy sieciami wymieniane są co dziesięć aktualizacji wag sieci sterującej poczynaniami agenta ($Q$).
</p>

In [8]:
class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.5  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.95
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_weights()
        self.replay_counter = 1

    def remember(self, state, action, reward, next_state, done):
        #Function adds information to the memory about last action and its results
        self.memory.append((state, action, reward, next_state, done)) 

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action.
            otherwise - the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probablity, generate uniform number in [0, 1]
              and compare it with your probability
        """

        #
        # INSERT CODE HERE to get action in a given state (according to epsilon greedy algorithm)
        #        
        if random.random() < self.epsilon:
            return random.randint(0, self.action_size-1)
        else:
            return self.get_best_action(state)

  
    def get_best_action(self, state):
        """
        Compute the best action to take in a state.
        """

        #
        # INSERT CODE HERE to get best possible action in a given state (remember to break ties randomly)
        #

        prediction = self.model.predict(np.array([state]), verbose=0)[0]
        max_value = np.max(prediction)
        max_indexes = [idx for idx, value in enumerate(prediction) if value == max_value]
        return random.choice(max_indexes)

    def replay(self, batch_size):
        """
        Function learn network using randomly selected actions from the memory. 
        First calculates Q value for the next state and choose action with the biggest value.
        Target value is calculated according to:
                Q(s,a) := (r + gamma * max_a(Q(s', a)))
        except the situation when the next action is the last action, in such case Q(s, a) := r.
        In order to change only those weights responsible for chosing given action, the rest values should be those
        returned by the network for state state.
        The network should be trained on batch_size samples.
        After each 10 Q Network trainings parameters should be copied to the target Q Network
        """
        #
        # INSERT CODE HERE to train network
        #
        data = []
        labels = []
        
        entries = random.sample(self.memory, k=batch_size)
        states = [entry[0] for entry in entries]
        next_states = [entry[3] for entry in entries]
    
        merged = self.target_model.predict(np.array(states+next_states), batch_size=2*batch_size, verbose=0)
        outputs_target = merged[:batch_size]
        next_outputs_target = merged[batch_size:]
        next_outputs_model = self.model.predict(np.array(next_states), batch_size=batch_size, verbose=0)
        
        data = []
        labels = []
        
        for (state, action, reward, next_state, done), output_target, next_output_target, next_output_model in zip(entries, outputs_target, next_outputs_target, next_outputs_model):
            if done:
                expected_reward = reward
            else:
                index = np.argmax(next_output_target)
                expected_reward = reward + self.gamma * next_output_model[index]
                
            label = np.array(output_target)
            label[action] = expected_reward
            
            data.append(state)
            labels.append(label)
        
        data = np.array(data)
        labels = np.array(labels)
        
        self.target_model.fit(data, labels, verbose=0)
        
        self.replay_counter += 1
        if (self.replay_counter > 10):
            self.replay_counter = 0
            self.update_weights()

    def update_epsilon_value(self):
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def update_weights(self):
        """copy trained Q Network params to target Q Network"""
        #
        # INSERT CODE HERE to train network
        #
        self.model.set_weights(self.target_model.get_weights()) 
        
    def _build_model(self):
        model = keras.models.Sequential([
            layers.InputLayer((self.state_size,)),
            layers.Dense(32, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(self.action_size , activation=None)
        ])

        model.compile(optimizer='adam', loss='mse')
        
        return model

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

Czas nauczyć agenta gry w środowisku *CartPool*:

In [9]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

In [10]:
agent = DDQNAgent(state_size, action_size)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array(env_state)
        
        for time in range(500):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array(next_state_env)

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        
        summary.append(total_reward)
        
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))    
    
    if np.mean(summary) > 195:
        print ("You Win!")
        break
    
    agent.update_epsilon_value()

epoch #0	mean reward = 16.870	epsilon = 0.750
epoch #1	mean reward = 27.580	epsilon = 0.712
epoch #2	mean reward = 52.290	epsilon = 0.677
epoch #3	mean reward = 72.000	epsilon = 0.643
epoch #4	mean reward = 82.670	epsilon = 0.611
epoch #5	mean reward = 88.560	epsilon = 0.580
epoch #6	mean reward = 109.070	epsilon = 0.551
epoch #7	mean reward = 134.130	epsilon = 0.524
epoch #8	mean reward = 113.210	epsilon = 0.498
epoch #9	mean reward = 144.700	epsilon = 0.473
epoch #10	mean reward = 207.410	epsilon = 0.449
You Win!
