# Frozen Lake 8X8 from M. Yildiz

In [None]:
#pip install gym

In [None]:
#pip install pygame

In [None]:
#pip install gym[toy_text]

In [None]:
#pip install --upgrade gym

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Simple Version

def run(episodes, is_training=True, render=False):
    """Train or evaluate a tabular Q-learning agent on slippery 8x8 FrozenLake.

    Args:
        episodes: Number of episodes to play.
        is_training: When True, start from an all-zero Q-table, explore with
            an epsilon-greedy policy, update the table after every step and
            save it to ``frozen_lake8x8.pkl`` at the end. When False, load the
            table from ``frozen_lake8x8.pkl`` and always act greedily.
        render: When True, create the environment with ``render_mode='human'``.

    Side effects:
        Writes ``frozen_lake8x8.png`` (rolling count of successful episodes)
        and, when training, ``frozen_lake8x8.pkl`` (the Q-table).
    """
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=True, render_mode='human' if render else None)

    # try/finally guarantees the environment is closed even if loading the
    # Q-table fails or the run is interrupted part-way through.
    try:
        if is_training:
            q = np.zeros((env.observation_space.n, env.action_space.n))  # init a 64 x 4 array
        else:
            # NOTE: pickle can execute arbitrary code while loading; only load
            # a Q-table file that this script produced itself.
            with open('frozen_lake8x8.pkl', 'rb') as f:
                q = pickle.load(f)

        learning_rate_a = 0.9       # alpha or learning rate
        discount_factor_g = 0.9     # gamma: near 0 favours immediate reward, near 1 favours future reward
        epsilon = 1                 # 1 = 100% random actions
        epsilon_decay_rate = 0.0001  # linear decay: epsilon hits 0 after about 1/0.0001 = 10,000 episodes
        rng = np.random.default_rng()   # random number generator

        rewards_per_episode = np.zeros(episodes)

        for i in range(episodes):
            state = env.reset()[0]  # states: 0 to 63, 0=top left corner, 63=bottom right corner
            terminated = False      # True when fall in hole or reached goal
            truncated = False       # True when the environment's step limit is hit

            while not terminated and not truncated:
                if is_training and rng.random() < epsilon:
                    action = env.action_space.sample()  # explore; actions: 0=left,1=down,2=right,3=up
                else:
                    action = np.argmax(q[state, :])     # exploit the current estimate

                new_state, reward, terminated, truncated, _ = env.step(action)

                if is_training:
                    # Standard Q-learning update toward the bootstrapped target.
                    q[state, action] = q[state, action] + learning_rate_a * (
                        reward + discount_factor_g * np.max(q[new_state, :]) - q[state, action]
                    )

                state = new_state

            epsilon = max(epsilon - epsilon_decay_rate, 0)

            # Once exploration has fully decayed, shrink the learning rate.
            if epsilon == 0:
                learning_rate_a = 0.0001

            # `reward` still holds the last step's reward; 1 marks a success.
            if reward == 1:
                rewards_per_episode[i] = 1
    finally:
        env.close()

    # Rolling success count: the current episode plus up to 100 preceding ones.
    sum_rewards = np.zeros(episodes)
    for t in range(episodes):
        sum_rewards[t] = np.sum(rewards_per_episode[max(0, t-100):(t+1)])
    plt.plot(sum_rewards)
    plt.savefig('frozen_lake8x8.png')

    if is_training:
        with open("frozen_lake8x8.pkl", "wb") as f:
            pickle.dump(q, f)

if __name__ == '__main__':
    # Train for 20,000 episodes with the on-screen renderer switched on.
    run(episodes=20000, is_training=True, render=True)

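Reward shaping used by the advanced version below:

* +1 reward for moving closer to the goal.
* +100 reward for reaching the goal.
* -10 penalty for falling into a hole.
* -1 penalty for moving farther away from the goal.
* 0 reward when a step leaves the agent in the same state (for example, when it bumps into the edge of the grid).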

In [None]:
# Advanced Version
import gym
import numpy as np
import pickle
import time

def run(episodes, is_training=True, render=False):
    """Q-learning on slippery 8x8 FrozenLake with distance-based reward shaping.

    The environment's reward is replaced per step as follows:
      * +100 when the episode terminates on the goal state,
      * -10 when the episode terminates with an environment reward of 0
        (treated as falling into a hole),
      * +1 / -1 when the state index moves closer to / farther from the goal
        index,
      * otherwise the environment's own reward is kept unchanged.

    Args:
        episodes: Number of episodes to play.
        is_training: When True, start from an all-zero Q-table, explore with
            an epsilon-greedy policy, update the table after every step and
            save it to ``frozen_lake8x8.pkl`` at the end. When False, load the
            table from ``frozen_lake8x8.pkl`` and always act greedily.
        render: When True, create the environment with ``render_mode='human'``,
            print the running reward after every step and pause 0.1 s.

    Side effects:
        Prints the total and average shaped reward over all episodes and,
        when training, writes the Q-table to ``frozen_lake8x8.pkl``.
    """
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=True, render_mode='human' if render else None)

    # try/finally guarantees the environment is closed even if loading the
    # Q-table fails or the run is interrupted part-way through.
    try:
        if is_training:
            q = np.zeros((env.observation_space.n, env.action_space.n))  # init a 64 x 4 array
        else:
            # NOTE: pickle can execute arbitrary code while loading; only load
            # a Q-table file that this script produced itself.
            with open('frozen_lake8x8.pkl', 'rb') as f:
                q = pickle.load(f)

        learning_rate_a = 0.9  # alpha or learning rate
        discount_factor_g = 0.9  # gamma or discount rate
        epsilon = 1.0  # Start with 100% random actions
        epsilon_decay_rate = 0.00001  # Linear decay: epsilon hits 0 after about 100,000 episodes
        rng = np.random.default_rng()  # Random number generator

        goal_position = 63  # The goal is the bottom-right corner in the 8x8 grid (index 63)

        total_rewards = []  # Shaped reward accumulated in each episode

        for i in range(episodes):
            state = env.reset()[0]  # Get initial state
            terminated = False  # True when fall in hole or reached goal
            truncated = False  # True when the environment's step limit is hit
            episode_reward = 0  # Reward for the current episode

            while not terminated and not truncated:
                if is_training and rng.random() < epsilon:
                    action = env.action_space.sample()  # Explore: Random action
                else:
                    action = np.argmax(q[state, :])  # Exploit: Best action

                new_state, reward, terminated, truncated, _ = env.step(action)

                # Distance is measured on the flat state index. Because the
                # goal is the largest index, any index increase counts as
                # "closer" and any decrease as "farther".
                current_distance = abs(goal_position - state)
                new_distance = abs(goal_position - new_state)

                # Rewards and penalties based on the agent's movement
                if terminated and new_state == goal_position:
                    reward = 100  # Reached the goal, +100 points
                elif terminated and reward == 0:
                    reward = -10  # Fell into a hole, -10 penalty
                elif new_distance < current_distance:
                    reward = 1  # Moving closer to the goal, +1 point
                elif new_distance > current_distance:
                    reward = -1  # Moving farther from the goal, -1 penalty

                if is_training:
                    # Standard Q-learning update, using the shaped reward.
                    q[state, action] = q[state, action] + learning_rate_a * (
                        reward + discount_factor_g * np.max(q[new_state, :]) - q[state, action]
                    )

                state = new_state
                episode_reward += reward  # Accumulate reward for this episode

                # Render the environment and display the current reward if render=True
                if render:
                    env.render()
                    print(f"Step Reward: {reward} | Total Reward for Episode {i+1}: {episode_reward}")
                    time.sleep(0.1)  # Adding a small delay to see the rendering

            # Update epsilon and learning rate
            epsilon = max(epsilon - epsilon_decay_rate, 0)

            # Once exploration has fully decayed, shrink the learning rate.
            if epsilon == 0:
                learning_rate_a = 0.0001

            total_rewards.append(episode_reward)
    finally:
        env.close()

    # Print the total rewards at the end of the run
    print(f"Total rewards over {episodes} episodes: {np.sum(total_rewards)}")
    print(f"Average reward per episode: {np.mean(total_rewards)}")

    # Persist the learned table so a later run with is_training=False can
    # load the Q-values produced by this training run.
    if is_training:
        with open('frozen_lake8x8.pkl', 'wb') as f:
            pickle.dump(q, f)

if __name__ == '__main__':
    # Run 10,000 training episodes with rendering on, so that each step's
    # reward is printed as it happens.
    run(episodes=10000, is_training=True, render=True)


  if not isinstance(terminated, (bool, np.bool8)):


Step Reward: 1 | Total Reward for Episode 1: 1
Step Reward: 1 | Total Reward for Episode 1: 2
Step Reward: 1 | Total Reward for Episode 1: 3
Step Reward: 1 | Total Reward for Episode 1: 4
Step Reward: -10 | Total Reward for Episode 1: -6
Step Reward: 0.0 | Total Reward for Episode 2: 0.0
Step Reward: 0.0 | Total Reward for Episode 2: 0.0
Step Reward: 0.0 | Total Reward for Episode 2: 0.0
Step Reward: 0.0 | Total Reward for Episode 2: 0.0
Step Reward: 1 | Total Reward for Episode 2: 1.0
Step Reward: 0.0 | Total Reward for Episode 2: 1.0
Step Reward: 1 | Total Reward for Episode 2: 2.0
Step Reward: 1 | Total Reward for Episode 2: 3.0
Step Reward: -1 | Total Reward for Episode 2: 2.0
Step Reward: 1 | Total Reward for Episode 2: 3.0
Step Reward: 1 | Total Reward for Episode 2: 4.0
Step Reward: -1 | Total Reward for Episode 2: 3.0
Step Reward: -1 | Total Reward for Episode 2: 2.0
Step Reward: 1 | Total Reward for Episode 2: 3.0
Step Reward: -1 | Total Reward for Episode 2: 2.0
Step Reward: 

Step Reward: -1 | Total Reward for Episode 9: 3.0
Step Reward: 1 | Total Reward for Episode 9: 4.0
Step Reward: 1 | Total Reward for Episode 9: 5.0
Step Reward: 1 | Total Reward for Episode 9: 6.0
Step Reward: 1 | Total Reward for Episode 9: 7.0
Step Reward: 1 | Total Reward for Episode 9: 8.0
Step Reward: 1 | Total Reward for Episode 9: 9.0
Step Reward: -1 | Total Reward for Episode 9: 8.0
Step Reward: 1 | Total Reward for Episode 9: 9.0
Step Reward: -1 | Total Reward for Episode 9: 8.0
Step Reward: -10 | Total Reward for Episode 9: -2.0
Step Reward: 0.0 | Total Reward for Episode 10: 0.0
Step Reward: 0.0 | Total Reward for Episode 10: 0.0
Step Reward: 0.0 | Total Reward for Episode 10: 0.0
Step Reward: 0.0 | Total Reward for Episode 10: 0.0
Step Reward: 1 | Total Reward for Episode 10: 1.0
Step Reward: 0.0 | Total Reward for Episode 10: 1.0
Step Reward: 1 | Total Reward for Episode 10: 2.0
Step Reward: 1 | Total Reward for Episode 10: 3.0
Step Reward: 0.0 | Total Reward for Episode 1