In [4]:
# Q-learning on FrozenLake

import numpy as np
import gymnasium as gym   # RL environments (like FrozenLake)
import random


# FrozenLake with slipping disabled: "is_slippery=False" = no random slips,
# so every move lands where the agent intended.
env = gym.make("FrozenLake-v1", is_slippery=False)

# Seed every source of randomness the training loop uses, so that
# Restart Kernel -> Run All reproduces the same Q-table and reward curve.
SEED = 42
random.seed(SEED)            # drives the epsilon-greedy coin flip (random.uniform)
env.action_space.seed(SEED)  # drives env.action_space.sample() (random exploration moves)

n_states = env.observation_space.n   # number of possible states (tiles in the lake)
n_actions = env.action_space.n       # number of possible actions (left, down, right, up)

# Creating the Q-table: a table filled with 0s.
#   rows    = states (where the agent is)
#   columns = actions (which move the agent can make)
# Each cell will store how good it is to take that action in that state.
Q = np.zeros((n_states, n_actions))

# Learning parameters
alpha = 0.8       # learning rate: how fast we update knowledge
gamma = 0.9       # discount factor: how much we care about the future
epsilon = 0.1     # exploration rate: chance to pick a random action
episodes = 50000  # how many games (episodes) the agent will play

# Total reward collected in each episode, used to track performance.
rewards_per_episode = []

# Tabular Q-learning: play `episodes` games, updating Q after every step.
for ep in range(episodes):
    state, _ = env.reset()  # start new episode at beginning
    total_reward = 0
    done = False

    while not done:  # keep moving until the episode ends (terminal state or cut-off)
        # choose action (epsilon-greedy)
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # explore: random move
        else:
            action = np.argmax(Q[state])        # exploit: best known move (ties -> lowest index)

        # Take the action. env.step returns
        # (next observation, reward, terminated, truncated, info):
        #   terminated - the episode reached a terminal state
        #   truncated  - the episode was cut off (e.g. by a step limit) without
        #                reaching a terminal state
        next_state, reward, terminated, truncated, info = env.step(action)

        # update Q-table
        old_value = Q[state, action]  # what we thought before
        # Best value reachable from the next state. A terminal state has no
        # future, so nothing is bootstrapped from it; a truncated episode is
        # not terminal, so its next state is still bootstrapped from.
        next_max = 0.0 if terminated else np.max(Q[next_state])
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        Q[state, action] = new_value

        # move to the next state
        state = next_state
        total_reward += reward

        # The episode is over on EITHER signal; stepping again after a
        # truncation without calling reset() is not valid.
        done = terminated or truncated

    rewards_per_episode.append(total_reward)


# Report the outcome of training: a banner, then the learned Q-table.
print("Training finished", "Final Q-table:", Q, sep="\n")

# Mean total reward over the last 100 episodes
# (the slice simply returns fewer entries if fewer episodes were played).
recent_rewards = rewards_per_episode[-100:]
print("Average reward:", np.mean(recent_rewards))


Training finished
Final Q-table:
[[0.531441   0.59049    0.4782969  0.531441  ]
 [0.531441   0.         0.43046721 0.4782969 ]
 [0.4782969  0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.59049    0.6561     0.         0.531441  ]
 [0.         0.         0.         0.        ]
 [0.         0.81       0.         0.43023689]
 [0.         0.         0.         0.        ]
 [0.6561     0.         0.729      0.59049   ]
 [0.6561     0.81       0.81       0.        ]
 [0.729      0.9        0.         0.729     ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.81       0.9        0.729     ]
 [0.81       0.9        1.         0.81      ]
 [0.         0.         0.         0.        ]]
Average reward: 0.9
