<a href="https://colab.research.google.com/github/cxbxmxcx/EvolutionaryDeepLearning/blob/main/EDL_11_1_FrozenLake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#@title Imports
import numpy as np
import gym
import random

In [None]:
#@title Create the Environment
# Look the environment up by its registered id and instantiate it.
env_id = "FrozenLake-v0"
env = gym.make(env_id)

In [None]:
#@title Get Action/State sizes
# `.n` is the number of entries in each space; these two numbers fix the
# dimensions of the Q table built in the next cell.
state_size = env.observation_space.n
action_size = env.action_space.n
print(action_size, state_size)

4 16


In [None]:
# Build the Q table: one row per state, one column per action, all zeros.
q_shape = (state_size, action_size)
Q = np.zeros(q_shape)
print(Q)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:
#@title Hyperparameters
# --- Q-learning schedule ---
total_episodes = 20_000   # number of training episodes
max_steps = 99            # cap on the steps taken within a single episode
learning_rate = 0.7       # step size of the Q-value update
gamma = 0.95              # discount applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.01        # floor the exploration probability decays towards
decay_rate = 0.005        # exponential decay rate, applied per episode
epsilon = max_epsilon     # current exploration rate; starts fully exploratory

In [None]:
def choose_action(state):
  """Select an action for `state` using an epsilon-greedy rule.

  A uniform draw from [0, 1] is compared against the module-level `epsilon`:
  when the draw does not exceed `epsilon` a random action is sampled from the
  environment's action space; otherwise the action with the highest value in
  the `Q` row for `state` is returned.
  """
  explore = random.uniform(0, 1) <= epsilon
  if explore:
    return env.action_space.sample()
  # Exploit: index of the largest Q-value in this state's row.
  return np.argmax(Q[state, :])

In [None]:
#@title Learn
def learn(reward, state, action, new_state):
  """Apply one Q-learning update to the module-level table `Q`, in place.

  Implements
      Q(s, a) := Q(s, a) + lr * [R + gamma * max_a' Q(s', a') - Q(s, a)]
  where `lr` is `learning_rate`, `s` is `state`, `a` is `action` and `s'` is
  `new_state`.
  """
  # Best value obtainable from the new state, over all of its actions.
  best_next_value = np.max(Q[new_state, :])
  td_target = reward + gamma * best_next_value
  td_error = td_target - Q[state, action]
  Q[state, action] += learning_rate * td_error

In [None]:
#@title Train the Agent
# Seed every source of randomness used during training so that a fresh
# "Restart & Run All" reproduces the same run: Python's `random` (the
# explore/exploit draw in choose_action), the environment itself, and the
# action space's sampler (the random actions in choose_action).
SEED = 42
random.seed(SEED)
env.seed(SEED)
env.action_space.seed(SEED)

# NOTE: `Q` is not reset here, so re-running this cell continues training from
# the current table; re-run the "Build Q table" cell first to start over.
rewards = []     # total reward collected in each episode
epsilon = 1.0    # start fully exploratory; decayed after every episode

for episode in range(total_episodes):
    # Start a new episode from the environment's initial state.
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        action = choose_action(state)

        # Execute the action and observe the outcome.
        new_state, reward, done, info = env.step(action)

        # Update Q(state, action) from the observed transition.
        learn(reward, state, action, new_state)

        total_rewards += reward
        state = new_state

        # The environment reported the end of the episode.
        if done:
            break

    # Decay epsilon exponentially with the episode index, never dropping
    # below min_epsilon, so exploration is reduced as training progresses.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)


print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(Q)

Score over time: 0.4985
[[1.93032136e-01 8.87165050e-02 3.40922159e-02 8.70720778e-02]
 [1.48896402e-02 2.29885947e-02 7.60054976e-03 8.47165913e-02]
 [5.59352024e-02 1.01959480e-02 7.90658835e-03 6.46472538e-02]
 [5.31031324e-03 1.72618144e-03 4.36539506e-03 5.49684168e-02]
 [2.89124812e-01 8.58418603e-02 2.57275211e-02 1.09284859e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.79260048e-03 3.48226355e-04 3.58044111e-02 7.87560603e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.02886671e-01 2.03142159e-02 3.54334835e-02 3.73245360e-01]
 [1.66879406e-02 5.78982746e-01 2.27941089e-02 2.11243714e-02]
 [8.26637922e-01 8.73569735e-03 6.83932886e-03 1.61863538e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.27066414e-01 1.43113433e-01 8.27926107e-01 9.21551202e-02]
 [1.10208411e-01 9.83905614e-01 2.99359344e-01 2.56528839e-01]
 [0.00000000e+00 0.00000000e+00

In [None]:
#@title Run the agent
# Roll out the greedy policy encoded in `Q` for a few episodes and report how
# each one ended.
goal_state = 15  # index of the goal tile (bottom-right) on the 4x4 map

for episode in range(5):
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Act greedily: take the action with the highest Q-value in this state.
        action = np.argmax(Q[state,:])

        new_state, reward, done, info = env.step(action)

        if done:
            # Only the final state is rendered, to show whether the agent
            # ended on the goal or in a hole.
            env.render()
            if new_state == goal_state:
                print("Goal reached 🏆")
            else:
                print("Aaaah ☠️")

            # `step` is a zero-based loop index, so the number of actions
            # actually taken in this episode is step + 1.
            print("Number of steps", step + 1)

            break
        state = new_state
    else:
        # The step budget ran out without the environment reporting `done`;
        # say so instead of leaving the episode header with no result.
        print("Episode did not finish within", max_steps, "steps")
env.close()

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Goal reached 🏆
Number of steps 24
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Goal reached 🏆
Number of steps 31
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Goal reached 🏆
Number of steps 25
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Goal reached 🏆
Number of steps 92
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Goal reached 🏆
Number of steps 6
