In [3]:
# FrozenLake Random Agent
# Goal: understand how an MDP works (states, actions, rewards).
# trying this before applying Q-learning.

import gymnasium as gym
import numpy as np

# Create the environment.
# FrozenLake-v1 is a 4x4 grid with:
#   S = Start, G = Goal, H = Hole (danger), F = Frozen (safe)
# The agent must find a safe path from S to G.
# is_slippery=False removes randomness from the transitions, so the only
# randomness left in this notebook is the agent's own action sampling.
env = gym.make("FrozenLake-v1", is_slippery=False)

# Fixed seed so that Restart Kernel -> Run All reproduces the same episode.
SEED = 42

# Reset the environment. Passing `seed` fixes the environment's RNG; the
# action space keeps a separate RNG, which must be seeded as well for
# env.action_space.sample() to return the same actions on every run.
state, info = env.reset(seed=SEED)
env.action_space.seed(SEED)
print("Starting state:", state)  # Always 0 (top-left corner)

# Run one episode with uniformly random actions and report what happens.
MAX_STEPS = 10  # cap the demo at 10 moves
total_reward = 0
print(" Running a random episode")

try:
    for step in range(MAX_STEPS):
        action = env.action_space.sample()  # choose a random action (0–3)
        next_state, reward, terminated, truncated, info = env.step(action)

        print(f"Step {step}: action={action}, next_state={next_state}, reward={reward}")

        total_reward += reward
        state = next_state

        # Stop as soon as the environment reports the episode is over,
        # either by termination or by truncation.
        if terminated or truncated:
            print("Episode ended.")
            break
finally:
    # Release the environment's resources even if a step raises.
    env.close()

print("Total reward collected:", total_reward)

# The reward is 1 only when the agent reaches the Goal (G).
# Otherwise the reward is 0 (whether it walks on ice or falls into a hole).
# This exercise shows how MDPs work: each action leads to a new state,
# and the reward depends on the whole sequence of actions, not on a single choice as in bandits.


Starting state: 0
 Running a random episode
Step 0: action=0, next_state=0, reward=0.0
Step 1: action=0, next_state=0, reward=0.0
Step 2: action=3, next_state=0, reward=0.0
Step 3: action=1, next_state=4, reward=0.0
Step 4: action=2, next_state=5, reward=0.0
Episode ended.
Total reward collected: 0.0


In [None]:
# To make the agent learn over time, we would need to store a Q-table (next step).