In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import random
import imageio

In [None]:
# Train function

def q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay):
    """Train a tabular Q-function in place with epsilon-greedy Q-learning.

    Args:
        Q: 2-D array indexed as ``Q[state, action]``; it is updated in place.
        env: environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        n_train_episodes: number of training episodes to run.
        lr: learning rate applied to the temporal-difference error.
        max_steps: maximum number of steps taken in one episode.
        gamma: discount factor applied to the best next-state value.
        min_epsilon: value the exploration rate decays towards.
        max_epsilon: exploration rate used at episode 0.
        decay: exponential decay rate of the exploration rate per episode.

    Returns:
        The ``(Q, env)`` pair that was passed in.
    """
    for episode_idx in range(n_train_episodes):
        # Exploration rate decays exponentially from max_epsilon towards min_epsilon
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode_idx)

        # reset() may return an (observation, info) tuple; keep only the observation
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        for _ in range(max_steps):
            # Exploration-exploitation trade-off: one random draw per step
            explore = not (random.uniform(0, 1) > epsilon)
            if explore:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state, :])

            next_state, reward, terminated, truncated, _info = env.step(action)

            # Q-learning update: move Q[state, action] towards the bootstrapped target
            td_target = reward + gamma * np.max(Q[next_state, :])
            td_error = td_target - Q[state, action]
            Q[state, action] = Q[state, action] + lr * td_error

            # The episode is over once the environment terminates or truncates it
            if truncated or terminated:
                break

            state = next_state

    print("Training completed over", n_train_episodes, "episodes")
    return Q, env

In [None]:
# Evaluation function

def q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay):
    """Roll out the greedy policy of ``Q`` while continuing to update ``Q`` in place.

    Note that, despite its name, this function keeps applying the Q-learning
    update during the rollouts; it only differs from training in that actions
    are always chosen greedily (no exploration).

    Args:
        Q: 2-D array indexed as ``Q[state, action]``; it is updated in place.
        env: environment exposing ``reset()`` and ``step(action)``.
        n_eval_episodes: number of greedy episodes to run.
        lr: learning rate applied to the temporal-difference error.
        max_steps: maximum number of steps taken in one episode.
        gamma: discount factor applied to the best next-state value.
        min_epsilon: unused; accepted so the signature mirrors ``q_learning``.
        max_epsilon: unused; accepted so the signature mirrors ``q_learning``.
        decay: unused; accepted so the signature mirrors ``q_learning``.

    Returns:
        The ``(Q, env)`` pair that was passed in.
    """
    for episode in range(n_eval_episodes):
        state = env.reset()

        # reset() may return an (observation, info) tuple; keep only the observation
        if isinstance(state, tuple):
            state = state[0]

        for step in range(max_steps):
            # Always act greedily with respect to the current Q-table
            action = np.argmax(Q[state, :])

            new_state, reward, terminated, truncated, info = env.step(action)

            # Bellman update: bootstrap from the best action available in new_state.
            # (Indexing Q[new_state, action] here would bootstrap from a single
            # entry instead of the maximum over all actions.)
            best_next_value = np.max(Q[new_state, :])
            Q[state, action] = Q[state, action] + lr * (reward + gamma * best_next_value - Q[state, action])

            state = new_state

            # The episode is over once the environment terminates or truncates it
            if terminated or truncated:
                break

    print("Evaluation completed over", n_eval_episodes, "episodes")
    return Q, env

In [None]:
def create_gif(Q, env, filename, max_steps=1000):
    """Record one greedy episode of ``Q`` in ``env`` and save it as an animated GIF.

    Args:
        Q: 2-D array indexed as ``Q[state, action]``; it is only read.
        env: environment exposing ``reset()``, ``step(action)`` and ``render()``;
            each ``render()`` result is used as one GIF frame.
        filename: path of the GIF file to write.
        max_steps: upper bound on the number of recorded steps. It guards
            against a greedy policy that never reaches a terminal state in an
            environment that does not truncate episodes on its own, which
            would otherwise loop forever and accumulate frames without limit.
    """
    frames = []
    state = env.reset()

    # reset() may return an (observation, info) tuple; keep only the observation
    if isinstance(state, tuple):
        state = state[0]

    for _ in range(max_steps):
        action = np.argmax(Q[state, :])
        state, _reward, terminated, truncated, _info = env.step(action)
        # A frame is captured after every step, including the final one
        frames.append(env.render())
        if terminated or truncated:
            break

    imageio.mimsave(filename, frames, fps = 6)
    print(filename, "GIF saved\n")

In [None]:
# FrozenLake 4x4

def frozen_lake_4x4():
    """Train, evaluate and record a Q-learning agent on the non-slippery 4x4 FrozenLake map.

    The recorded episode is written to "FrozenLake-v1-4x4.gif".
    """
    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
    try:
        env.reset()

        # Setting the parameters
        n_train_episodes = 10000
        lr = 0.7    # learning rate
        n_eval_episodes = 100
        max_steps = 100     # per-episode step cap
        gamma = 0.95        # discount factor
        min_epsilon = 0.05  # exploration rate the schedule decays towards
        max_epsilon = 1.0   # exploration rate at the first episode
        decay = 0.0005      # exponential decay rate of the exploration rate

        # Initialize the Q-table: one row per state, one column per action
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        Q, env = q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)
        Q, env = q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)

        create_gif(Q, env, "FrozenLake-v1-4x4.gif")
    finally:
        # Release the environment even if training, evaluation or recording fails
        env.close()

In [None]:
# FrozenLake 4x4 Slippery

def frozen_lake_4x4_slippery():
    """Train, evaluate and record a Q-learning agent on the slippery 4x4 FrozenLake map.

    The recorded episode is written to "FrozenLake-v1-4x4-slippery.gif".
    """
    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode="rgb_array")
    try:
        env.reset()

        # Setting the parameters
        n_train_episodes = 10000
        lr = 0.7    # learning rate
        n_eval_episodes = 100
        max_steps = 100     # per-episode step cap
        gamma = 0.95        # discount factor
        min_epsilon = 0.05  # exploration rate the schedule decays towards
        max_epsilon = 1.0   # exploration rate at the first episode
        decay = 0.0005      # exponential decay rate of the exploration rate

        # Initialize the Q-table: one row per state, one column per action
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        Q, env = q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)
        Q, env = q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)

        create_gif(Q, env, "FrozenLake-v1-4x4-slippery.gif")
    finally:
        # Release the environment even if training, evaluation or recording fails
        env.close()

In [None]:
# FrozenLake 8x8

def frozen_lake_8x8():
    """Train, evaluate and record a Q-learning agent on the non-slippery 8x8 FrozenLake map.

    The recorded episode is written to "FrozenLake-v1-8x8.gif".
    """
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=False, render_mode="rgb_array")
    try:
        env.reset()

        # Setting the parameters
        n_train_episodes = 250000
        lr = 0.8    # learning rate
        n_eval_episodes = 1000
        max_steps = 400      # per-episode step cap
        gamma = 0.9          # discount factor
        min_epsilon = 0.001  # exploration rate the schedule decays towards
        max_epsilon = 1.0    # exploration rate at the first episode
        decay = 0.00005      # exponential decay rate of the exploration rate

        # Initialize the Q-table: one row per state, one column per action
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        Q, env = q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)
        Q, env = q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)

        create_gif(Q, env, "FrozenLake-v1-8x8.gif")
    finally:
        # Release the environment even if training, evaluation or recording fails
        env.close()

In [None]:
# FrozenLake 8x8 Slippery

def frozen_lake_8x8_slippery():
    """Train, evaluate and record a Q-learning agent on the slippery 8x8 FrozenLake map.

    The recorded episode is written to "FrozenLake-v1-8x8-slippery.gif".
    """
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=True, render_mode="rgb_array")
    try:
        env.reset()

        # Setting the parameters
        n_train_episodes = 250000
        lr = 0.8    # learning rate
        n_eval_episodes = 1000
        max_steps = 400      # per-episode step cap
        gamma = 0.9          # discount factor
        min_epsilon = 0.001  # exploration rate the schedule decays towards
        max_epsilon = 1.0    # exploration rate at the first episode
        decay = 0.00005      # exponential decay rate of the exploration rate

        # Initialize the Q-table: one row per state, one column per action
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        Q, env = q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)
        Q, env = q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)

        create_gif(Q, env, "FrozenLake-v1-8x8-slippery.gif")
    finally:
        # Release the environment even if training, evaluation or recording fails
        env.close()

In [None]:
# Taxi

def taxi():
    """Train, evaluate and record a Q-learning agent on the Taxi-v3 environment.

    The recorded episode is written to "Taxi-v3.gif".
    """
    env = gym.make("Taxi-v3", render_mode="rgb_array")
    try:
        env.reset()

        # Setting the parameters
        n_train_episodes = 10000
        lr = 0.7    # learning rate
        n_eval_episodes = 100
        max_steps = 100     # per-episode step cap
        gamma = 0.95        # discount factor
        min_epsilon = 0.05  # exploration rate the schedule decays towards
        max_epsilon = 1.0   # exploration rate at the first episode
        decay = 0.0005      # exponential decay rate of the exploration rate

        # Initialize the Q-table
        Q = np.zeros((env.observation_space.n, env.action_space.n)) # 500 states, 6 actions
        Q, env = q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)
        Q, env = q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)

        create_gif(Q, env, "Taxi-v3.gif")
    finally:
        # Release the environment even if training, evaluation or recording fails
        env.close()

In [None]:
# CliffWalking

def cliff_walking():
    """Train, evaluate and record a Q-learning agent on the CliffWalking-v0 environment.

    The recorded episode is written to "CliffWalking-v0.gif".
    """
    env = gym.make("CliffWalking-v0", render_mode="rgb_array")
    try:
        env.reset()

        # Setting the parameters
        n_train_episodes = 10000
        lr = 0.7    # learning rate
        n_eval_episodes = 100
        max_steps = 100     # per-episode step cap
        gamma = 0.95        # discount factor
        min_epsilon = 0.05  # exploration rate the schedule decays towards
        max_epsilon = 1.0   # exploration rate at the first episode
        decay = 0.0005      # exponential decay rate of the exploration rate

        # Initialize the Q-table: one row per state, one column per action
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        Q, env = q_learning(Q, env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)
        Q, env = q_evaluation(Q, env, n_eval_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay)

        create_gif(Q, env, "CliffWalking-v0.gif")
    finally:
        # Release the environment even if training, evaluation or recording fails
        env.close()

In [None]:
# Run every experiment in turn; each one trains, evaluates and saves its own GIF.
for run_experiment in (
    frozen_lake_4x4,
    frozen_lake_4x4_slippery,
    cliff_walking,
    taxi,
    frozen_lake_8x8,
    frozen_lake_8x8_slippery,
):
    run_experiment()