# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [None]:
%pip install gym



In [None]:
from collections import deque
import gym
import numpy as np
import random
import math

Dołączenie bibliotek ze środowiskami:

In [None]:
from FrozenLakeMDP import frozenLake
from FrozenLakeMDPExtended import frozenLake as frozenLakeExtended


Dołączenie bibliotek do obsługi sieci neuronowych

In [None]:
from tensorflow import keras

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu Deep Q-Network. Wartością oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \max_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [None]:
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    Transitions are kept in a bounded replay memory; ``replay`` fits the
    supplied model towards the one-step target ``r + gamma * max_a Q(s', a)``.
    """

    def __init__(self, action_size, learning_rate, model, get_legal_actions):
        """Store hyper-parameters and collaborators.

        Args:
            action_size: Number of discrete actions; used to build the action
                list when ``get_legal_actions`` is None.
            learning_rate: Only stored on the instance; this class never reads
                it (the optimizer is configured when ``model`` is compiled).
            model: Object exposing ``predict(x, ...)`` and ``fit(x, y, ...)``.
            get_legal_actions: Callable mapping a state to a sequence of
                actions, or None to allow every action ``0..action_size-1``.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.95
        self.learning_rate = learning_rate
        self.model = model
        self.get_legal_actions = get_legal_actions

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(s, a, r, s', done)`` to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy rule.

        With probability ``self.epsilon`` a uniformly random legal action is
        returned; otherwise the greedy action from ``get_best_action``.
        """
        if self.get_legal_actions is None:
            # No environment callback: every action index is allowed.
            possible_actions = list(range(self.action_size))
        else:
            possible_actions = self.get_legal_actions(state)

        if np.random.random() <= self.epsilon:
            return random.choice(possible_actions)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the index of the highest predicted Q-value for ``state``.

        Ties between equally good actions are broken uniformly at random.
        ``state`` is passed to ``model.predict`` unchanged and the first row
        of the prediction is used.
        """
        q_values = np.asarray(self.model.predict(state, verbose=0)[0])
        best_actions = np.flatnonzero(q_values == q_values.max())
        if best_actions.size == 0:
            # Only reachable when the comparison matches nothing (NaN values);
            # fall back to plain argmax rather than failing.
            return int(np.argmax(q_values))
        return random.choice(best_actions.tolist())

    def replay(self, batch_size):
        """Train the model on ``batch_size`` transitions sampled from memory.

        The target for the taken action is ``r`` when the transition ended the
        episode and ``r + gamma * max_a Q(s', a)`` otherwise. Targets for all
        other actions are the network's current predictions, so only the
        output of the taken action contributes to the loss. Does nothing when
        fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        # Stored states carry a leading batch axis of size 1; strip it with
        # [0] so the samples stack into one (batch_size, ...) array.
        states = np.array([sample[0][0] for sample in minibatch])
        next_states = np.array([sample[3][0] for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch], dtype=int)
        rewards = np.array([sample[2] for sample in minibatch], dtype=float)
        dones = np.array([bool(sample[4]) for sample in minibatch])

        targets = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(
            next_states, batch_size=batch_size, verbose=0
        )

        bootstrapped = rewards + self.gamma * np.max(next_q_values, axis=1)
        # np.where selects per element, so terminal transitions ignore the
        # bootstrapped estimate entirely.
        targets[np.arange(batch_size), actions] = np.where(
            dones, rewards, bootstrapped
        )

        self.model.fit(states, targets, batch_size=batch_size, verbose=0)

    def update_epsilon_value(self):
        """Decay epsilon by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        


Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*. Warstwa wejściowa powinna mieć tyle neuronów, ile jest możliwych stanów, a warstwa wyjściowa tyle neuronów, ile jest możliwych akcji do wykonania:

In [None]:
# FrozenLake on the "4x4" map.
env = frozenLake("4x4")

# One input neuron per environment state, one output neuron per action.
state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001


# Q-network: two hidden ReLU layers; the output layer has no activation, so
# the predicted Q-values are unbounded.
model = keras.Sequential()
model.add(keras.layers.Dense(64, input_dim=state_size, activation="relu"))
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(action_size))
model.compile(loss="mse",
              optimizer=keras.optimizers.Adam(learning_rate=learning_rate))
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1088      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 4)                 260       
                                                                 
Total params: 5,508
Trainable params: 5,508
Non-trainable params: 0
_________________________________________________________________
None


 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
# Agent whose random moves are drawn from the actions the environment reports.
agent = DQNAgent(
    action_size,
    learning_rate,
    model,
    get_legal_actions=env.get_possible_actions,
)

# Override the agent's initial exploration rate.
agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 100
counter = 0

for e in range(EPISODES):
    # Total reward of every game played during this epoch.
    summary = []

    for game in range(100):
        total_reward = 0
        env_state = env.reset()

        # One-hot encode the state index and wrap it in a batch axis of size 1.
        state = np.array(
            [keras.utils.to_categorical(env_state, num_classes=state_size)]
        )

        # A single game is capped at 1000 moves.
        for step in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            # Same one-hot encoding for the state reached after the move.
            next_state = np.array(
                [keras.utils.to_categorical(next_state_env, num_classes=state_size)]
            )

            # Store the transition in the replay memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Train once per game, as soon as the memory holds more than one batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    agent.update_epsilon_value()

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > 0.9:
        print("You Win!")
        break

epoch #0	mean reward = 0.000	epsilon = 0.712
epoch #1	mean reward = 0.060	epsilon = 0.677
epoch #2	mean reward = 0.070	epsilon = 0.643
epoch #3	mean reward = 0.110	epsilon = 0.611
epoch #4	mean reward = 0.140	epsilon = 0.580
epoch #5	mean reward = 0.150	epsilon = 0.551
epoch #6	mean reward = 0.060	epsilon = 0.524
epoch #7	mean reward = 0.150	epsilon = 0.498
epoch #8	mean reward = 0.190	epsilon = 0.473
epoch #9	mean reward = 0.200	epsilon = 0.449
epoch #10	mean reward = 0.270	epsilon = 0.427
epoch #11	mean reward = 0.240	epsilon = 0.405
epoch #12	mean reward = 0.410	epsilon = 0.385
epoch #13	mean reward = 0.430	epsilon = 0.366
epoch #14	mean reward = 0.220	epsilon = 0.347
epoch #15	mean reward = 0.430	epsilon = 0.330
epoch #16	mean reward = 0.450	epsilon = 0.314
epoch #17	mean reward = 0.510	epsilon = 0.298
epoch #18	mean reward = 0.600	epsilon = 0.283
epoch #19	mean reward = 0.430	epsilon = 0.269
epoch #20	mean reward = 0.440	epsilon = 0.255
epoch #21	mean reward = 0.570	epsilon = 0.24

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [None]:
# Extended FrozenLake on the "4x4" map.
env = frozenLakeExtended("4x4")

# Network input width: three values per environment state.
state_size = env.get_number_of_states()*3
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

model = keras.Sequential()
model.add(keras.layers.Dense(64, input_dim=state_size, activation="relu"))
model.add(keras.layers.Dense(64*2, activation="relu"))
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(action_size))  # output layer: one Q-value per action
# `learning_rate` replaces the deprecated `lr` keyword, which triggers a
# warning in older Keras releases and is rejected by newer ones.
model.compile(loss="mean_squared_error",
              optimizer=keras.optimizers.Adam(learning_rate=learning_rate))

  super(Adam, self).__init__(name, **kwargs)


 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [None]:
# Agent whose random moves are drawn from the actions the environment reports.
agent = DQNAgent(
    action_size,
    learning_rate,
    model,
    get_legal_actions=env.get_possible_actions,
)

# Override the agent's initial exploration rate.
agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 100
counter = 0

for e in range(EPISODES):
    # Total reward of every game played during this epoch.
    summary = []

    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        # Flatten the environment state into a single row vector (batch of 1).
        state = np.array(env_state).reshape(1, -1)

        # A single game is capped at 1000 moves.
        for step in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            # Same flattening for the state reached after the move.
            next_state = np.array(next_state_env).reshape(1, -1)

            # Store the transition in the replay memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Train once per game, as soon as the memory holds more than one batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    agent.update_epsilon_value()

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > 0.9:
        print("You Win!")
        break
    

epoch #0	mean reward = 0.010	epsilon = 0.712
epoch #1	mean reward = 0.040	epsilon = 0.677
epoch #2	mean reward = 0.140	epsilon = 0.643
epoch #3	mean reward = 0.220	epsilon = 0.611
epoch #4	mean reward = 0.080	epsilon = 0.580
epoch #5	mean reward = 0.160	epsilon = 0.551
epoch #6	mean reward = 0.180	epsilon = 0.524
epoch #7	mean reward = 0.230	epsilon = 0.498
epoch #8	mean reward = 0.340	epsilon = 0.473
epoch #9	mean reward = 0.280	epsilon = 0.449
epoch #10	mean reward = 0.280	epsilon = 0.427
epoch #11	mean reward = 0.250	epsilon = 0.405
epoch #12	mean reward = 0.330	epsilon = 0.385
epoch #13	mean reward = 0.400	epsilon = 0.366
epoch #14	mean reward = 0.470	epsilon = 0.347
epoch #15	mean reward = 0.530	epsilon = 0.330
epoch #16	mean reward = 0.570	epsilon = 0.314
epoch #17	mean reward = 0.660	epsilon = 0.298
epoch #18	mean reward = 0.570	epsilon = 0.283
epoch #19	mean reward = 0.670	epsilon = 0.269
epoch #20	mean reward = 0.690	epsilon = 0.255
epoch #21	mean reward = 0.680	epsilon = 0.24

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [None]:
# Take the `.env` attribute of the environment returned by gym.make.
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

# Q-network: three hidden ReLU layers followed by a linear output layer with
# one unit per action.
model = keras.Sequential([
    keras.layers.Dense(64, input_dim=state_size, activation='relu'),
    keras.layers.Dense(64*2, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(action_size, activation='linear'),
])
model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
)

Czas nauczyć agenta gry w środowisku *CartPole*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
# No legal-action callback is supplied for this environment.
agent = DQNAgent(action_size, learning_rate, model, get_legal_actions=None)

# Override the agent's initial exploration rate.
agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 100
counter = 0

for e in range(EPISODES):
    # Total reward of every game played during this epoch.
    summary = []

    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        # Reshape the observation into a single row (batch of 1) for the network.
        state = np.reshape(env_state, [1, state_size])

        # A single game is capped at 300 steps.
        for step in range(300):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            # Same reshaping for the observation received after the step.
            next_state = np.reshape(next_state_env, [1, state_size])

            # Store the transition in the replay memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Train once per game, as soon as the memory holds more than one batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    agent.update_epsilon_value()

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > 195:
        print("You Win!")
        break
    

epoch #0	mean reward = 18.120	epsilon = 0.712
epoch #1	mean reward = 16.790	epsilon = 0.677
epoch #2	mean reward = 22.380	epsilon = 0.643
epoch #3	mean reward = 48.120	epsilon = 0.611
epoch #4	mean reward = 41.970	epsilon = 0.580
epoch #5	mean reward = 51.560	epsilon = 0.551
epoch #6	mean reward = 53.150	epsilon = 0.524
epoch #7	mean reward = 81.570	epsilon = 0.498
epoch #8	mean reward = 101.760	epsilon = 0.473
epoch #9	mean reward = 125.880	epsilon = 0.449
epoch #10	mean reward = 167.010	epsilon = 0.427
epoch #11	mean reward = 131.240	epsilon = 0.405
epoch #12	mean reward = 158.030	epsilon = 0.385
epoch #13	mean reward = 156.640	epsilon = 0.366
epoch #14	mean reward = 229.650	epsilon = 0.347
You Win!
