<a href="https://colab.research.google.com/github/janbanot/msc-cs-code/blob/main/sem3/DL/DL_2025_Lab7-a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Optional: install gymnasium (skip this cell if it is already available)

!uv pip install gymnasium
!uv pip install "gymnasium[toy-text]"

In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

def enable_video(env, interval=200):
    """
    Monkey-patch `env` so that rendered frames can be replayed as a video.

    After the call:
      - env.frames accumulates every frame returned by env.render()
      - env.get_video() returns an IPython.display.HTML object with the
        animation, or an explanatory string when there is nothing to show.

    Calling this function again on the same `env` is safe: the frame buffer
    is cleared and `interval` is updated, but env.render is always wrapped
    around the *original* renderer, so a frame is never captured twice.

    Args:
        env: environment object exposing `render_mode` and `render()`.
        interval: delay between animation frames, forwarded to
            matplotlib's FuncAnimation (milliseconds).

    Returns:
        The same `env` object, patched in place.
    """
    if env.render_mode != 'rgb_array':
        env.get_video = lambda : "Render mode should be 'rgb_array' to get a video"
        return env  # No recording

    env.frames = []

    # Remember the unpatched renderer on the env itself. On a repeated call we
    # reuse it instead of wrapping the previous wrapper, which would append
    # each frame to env.frames more than once.
    orig_render = getattr(env, "_video_orig_render", None)
    if orig_render is None:
        orig_render = env.render
        env._video_orig_render = orig_render

    def _render_and_capture(*args, **kwargs):
        frame = orig_render(*args, **kwargs)  # get the RGB array
        # A renderer that has nothing to draw may return None; storing it
        # would make get_video() fail later inside imshow().
        if frame is not None:
            env.frames.append(frame)
        return frame

    env.render = _render_and_capture

    def get_video():
        if not env.frames:
            return "No frames in env.frames; make sure you called env.render() at least once."
        fig, ax = plt.subplots(figsize=(3, 3), tight_layout=True)
        ax.axis("off")
        img = ax.imshow(env.frames[0], interpolation='none', animated=True)

        def _update(i):
            # Swap the pixel data of the existing image instead of redrawing.
            img.set_array(env.frames[i])
            return (img,)

        ani = FuncAnimation(fig, _update,
                            frames=len(env.frames), interval=interval,
                            blit=True, repeat=False)
        # Close the figure so the notebook does not also show a static plot.
        plt.close(fig)
        return HTML(ani.to_html5_video())

    env.get_video = get_video
    return env

In [None]:
# Deterministic 4x4 FrozenLake whose render() returns RGB frames.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode='rgb_array',
)
enable_video(env)

state, info = env.reset()
terminated, truncated = False, False

# Play a single episode, sampling every action at random.
while not terminated and not truncated:
    action = env.action_space.sample()
    # Move to the next state.
    state, reward, terminated, truncated, info = env.step(action)
    print(f'{state = }')

    if env.render_mode:
        env.render()   # captures the next animation frame

env.close()

env.get_video()

# Q-learning

In [None]:
import numpy as np

def train(episodes, seed=None):
    """Train a tabular Q-learning agent on the non-slippery 4x4 FrozenLake.

    Exploration is epsilon-greedy: epsilon starts at 1 and is reduced by
    1/episodes after every episode (never below 0). The learning rate starts
    at 0.9 and is reduced by the same step, with a floor of 1e-4.
    After training, a plot of the reward collected over the most recent
    100 episodes is shown.

    Args:
        episodes: number of training episodes; must be positive.
        seed: optional integer seed. When given, the NumPy generator, the
            action-space sampler and the first environment reset are seeded
            so that a run can be reproduced. None keeps the run unseeded.

    Returns:
        The learned Q-table as an array of shape (n_states, n_actions).

    Raises:
        ValueError: if `episodes` is not positive.
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")

    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False,
                   render_mode=None)

    # try/finally guarantees the environment is released even if a step fails.
    try:
        q = np.zeros((env.observation_space.n, env.action_space.n))
        print(f'{q.shape = }')

        learning_rate = 0.9
        discount_factor = 0.9

        epsilon = 1
        epsilon_decay_rate = 1 / episodes
        rng = np.random.default_rng(seed)
        rewards = np.zeros(episodes)

        if seed is not None:
            env.action_space.seed(seed)

        for i in range(episodes):
            # Seed only the first reset; later resets continue the same
            # random stream instead of restarting it.
            state = env.reset(seed=seed if i == 0 else None)[0]
            terminated = False
            truncated = False

            while not (terminated or truncated):
                if rng.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(q[state, :])

                new_state, reward, terminated, truncated, _ = env.step(action)

                # Update the state-action value function (Q-table).
                q[state, action] = q[state, action] + learning_rate * (
                    reward + discount_factor * np.max(q[new_state, :]) - q[state, action]
                )
                rewards[i] += reward

                state = new_state

            epsilon = max(epsilon - epsilon_decay_rate, 0)
            learning_rate = max(learning_rate - 1 / episodes, 1e-4)
    finally:
        env.close()

    # Reward collected over the most recent `window` episodes (including the
    # current one). Prefix sums give every window total in a single pass.
    window = 100
    cumulative = np.concatenate(([0.0], np.cumsum(rewards)))
    ends = np.arange(1, episodes + 1)
    starts = np.maximum(ends - window, 0)
    sum_rewards = cumulative[ends] - cumulative[starts]

    plt.plot(sum_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Reward (past 100 episodes)')
    plt.show()
    return q

# Train for 15,000 episodes and keep the resulting Q-table in `q_table`.
q_table = train(15_000)

In [None]:
# Replay the learned policy on a fresh environment and record it as a video.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode='rgb_array',
)
enable_video(env, interval=300)

state, info = env.reset()
terminated, truncated = False, False
while not terminated and not truncated:
    # Greedy choice: the action with the largest Q-value in the current state.
    action = np.argmax(q_table[state])
    state, reward, terminated, truncated, info = env.step(action)
    env.render()

env.close()

env.get_video()

# Zad. 1.

Sprawdź działanie algorytmu Q-learning dla problemu FrozenLake-v1 i mapy "8x8".
* Rozważ wariant `is_slippery = False` oraz `is_slippery = True`
* Uruchom algorytm kilka razy. Czy za każdym razem otrzymujemy zbieżność?

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

def train_flexible(episodes, map_name="4x4", is_slippery=False, run_id=1, seed=None):
    """Train a tabular Q-learning agent on a configurable FrozenLake.

    Exploration is epsilon-greedy: epsilon starts at 1.0 and is reduced by
    1/episodes after every episode (never below 0). The learning rate starts
    at 0.9 and is reduced by the same step, with a floor of 1e-4.

    The running average of the episode reward is drawn on the *current*
    matplotlib figure with the label ``Run {run_id}``; the caller is
    responsible for creating the figure, the legend and for calling
    ``plt.show()``.

    Args:
        episodes: number of training episodes; must be positive.
        map_name: FrozenLake map name passed to ``gym.make``.
        is_slippery: whether the environment is created as slippery.
        run_id: identifier used only for the plot label.
        seed: optional integer seed. When given, the NumPy generator, the
            action-space sampler and the first environment reset are seeded
            so that a run can be reproduced. None keeps the run unseeded.

    Returns:
        A tuple ``(q, final_score)`` where ``q`` is the learned Q-table of
        shape (n_states, n_actions) and ``final_score`` is the mean reward
        over the last 1000 episodes (or over all of them if fewer were run).

    Raises:
        ValueError: if `episodes` is not positive.
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")

    # Create the environment with the requested parameters.
    env = gym.make("FrozenLake-v1", map_name=map_name, is_slippery=is_slippery, render_mode=None)

    # try/finally guarantees the environment is released even if a step fails.
    try:
        q = np.zeros((env.observation_space.n, env.action_space.n))

        # Hyperparameters
        learning_rate = 0.9
        discount_factor = 0.9

        epsilon = 1.0
        epsilon_decay_rate = 1 / episodes

        rng = np.random.default_rng(seed)
        rewards = np.zeros(episodes)

        if seed is not None:
            env.action_space.seed(seed)

        for i in range(episodes):
            # Seed only the first reset; later resets continue the same
            # random stream instead of restarting it.
            state = env.reset(seed=seed if i == 0 else None)[0]
            terminated = False
            truncated = False

            while not (terminated or truncated):
                if rng.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    # Pick the best action from the Q-table. Tiny random noise
                    # breaks ties so that argmax does not always fall back to
                    # action 0 while all values are still equal.
                    action = np.argmax(q[state, :] + rng.random(env.action_space.n) * 1e-9)

                new_state, reward, terminated, truncated, _ = env.step(action)

                # Q-learning update
                q[state, action] = q[state, action] + learning_rate * (
                    reward + discount_factor * np.max(q[new_state, :]) - q[state, action]
                )
                rewards[i] += reward
                state = new_state

            epsilon = max(epsilon - epsilon_decay_rate, 0)
            learning_rate = max(learning_rate - 1 / episodes, 1e-4)
    finally:
        env.close()

    # Running mean of the reward over the most recent `window` episodes
    # (including the current one). The divisor is the real number of episodes
    # in each window, so the average of 0/1 rewards can never exceed 1.
    window = 500  # averaging window
    cumulative = np.concatenate(([0.0], np.cumsum(rewards)))
    ends = np.arange(1, episodes + 1)
    starts = np.maximum(ends - window, 0)
    avg_rewards = (cumulative[ends] - cumulative[starts]) / (ends - starts)

    plt.plot(avg_rewards, label=f'Run {run_id}')
    return q, np.mean(rewards[-1000:])  # Q-table and mean reward of the final episodes

# --- EXPERIMENTS ---

def run_experiment(header, title, episodes, is_slippery, map_name="8x8", runs=3):
    """Train several independent agents and plot their learning curves together.

    Args:
        header: line printed before the runs start.
        title: title of the figure.
        episodes: number of training episodes per run.
        is_slippery: whether the environment is created as slippery.
        map_name: FrozenLake map name passed on to `train_flexible`.
        runs: number of independent training runs.

    Returns:
        List with the final average reward reported by each run.
    """
    print(header)
    plt.figure(figsize=(10, 5))
    plt.title(title)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward (running window)")

    final_scores = []
    for run_id in range(1, runs + 1):
        # train_flexible draws its curve on the figure created above.
        _, final_score = train_flexible(episodes=episodes, map_name=map_name,
                                        is_slippery=is_slippery, run_id=run_id)
        print(f"Run {run_id}: Final Avg Reward = {final_score:.2f}")
        final_scores.append(final_score)

    plt.legend()
    plt.show()
    return final_scores


# 1. 8x8 map, is_slippery=False (deterministic environment).
#    10,000 episodes are enough because the environment is predictable.
non_slippery_scores = run_experiment(
    header="--- Eksperyment: 8x8, is_slippery=False ---",
    title="Learning Curve: 8x8, Non-Slippery",
    episodes=10_000,
    is_slippery=False,
)

# 2. 8x8 map, is_slippery=True (stochastic environment).
#    Many more episodes are needed here (e.g. 30,000) because the reward is
#    sparse and movement is random.
slippery_scores = run_experiment(
    header="\n--- Eksperyment: 8x8, is_slippery=True ---",
    title="Learning Curve: 8x8, Slippery",
    episodes=30_000,
    is_slippery=True,
)