# Q-Learning (5x5 Gridworld)

This notebook implements **Q-learning** for a 5x5 gridworld.

**Environment**
- Grid: 5x5
- Actions: Up, Down, Left, Right
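- Reward: +10 for the move that enters the goal, -1 for every other move (including moves blocked by the grid edge, which leave the agent in place)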
- Terminal: goal state (4, 4), the bottom-right cell; every episode starts at (0, 0)

**Hyperparameters (from the question)**
- Learning rate: α = 0.5
- Discount: γ = 0.9

**Output**
- Learned Q-table after training
- Greedy policy derived from the learned Q-table, printed as a grid of arrows

In [None]:
#imports
import numpy as np
import random
from typing import Tuple

In [1]:
# Setup
GRID_SIZE = 5
GOAL_STATE = (4, 4)  # bottom-right cell; reaching it ends the episode

# Required by the question
ALPHA = 0.5  # learning rate
GAMMA = 0.9  # discount factor

# Training settings (fixed episodes as required)
EPISODES = 5000
MAX_STEPS = 200  # per-episode step cap, so an episode always ends

# Exploration parameter (epsilon-greedy)
EPSILON = 0.1

# Reproducibility: exploration draws from the stdlib `random` module, so it is
# seeded here. Without a fixed seed the learned Q-table differs on every run.
SEED = 42
random.seed(SEED)

# Actions and their (row, col) displacement. An action's position in ACTIONS
# is also its index along the last axis of the Q-table.
ACTIONS = ["U", "D", "L", "R"]
ACTION_TO_DELTA = {"U": (-1, 0), "D": (1, 0), "L": (0, -1), "R": (0, 1)}

In [3]:
# Environment Step 
def in_bounds(r: int, c: int) -> bool:
    """Return True when (r, c) lies inside the GRID_SIZE x GRID_SIZE grid."""
    # Reject a bad row first; only then does the column decide the answer.
    if not 0 <= r < GRID_SIZE:
        return False
    return 0 <= c < GRID_SIZE

def step(state: Tuple[int,int], action: str):
    """Apply `action` to `state` and return (next_state, reward, done).

    Args:
        state: Current (row, col) cell.
        action: A key of ACTION_TO_DELTA ("U", "D", "L" or "R").

    Returns:
        A tuple (next_state, reward, done):
        - next_state: the cell after the move; unchanged when the move would
          leave the grid.
        - reward: 10.0 when the move lands on GOAL_STATE, otherwise -1.0.
        - done: True when next_state is GOAL_STATE.
        Calling this from GOAL_STATE returns (state, 0.0, True) without moving.

    Raises:
        KeyError: if `action` is not a key of ACTION_TO_DELTA.
    """
    # Already at the goal: no movement, no reward, episode is over.
    if state == GOAL_STATE:
        return state, 0.0, True

    d_row, d_col = ACTION_TO_DELTA[action]
    candidate = (state[0] + d_row, state[1] + d_col)

    # A move that would leave the grid is cancelled; the agent stays put.
    if in_bounds(*candidate):
        next_state = candidate
    else:
        row, col = state
        next_state = (row, col)

    reached_goal = (next_state == GOAL_STATE)
    reward = 10.0 if reached_goal else -1.0
    return next_state, reward, reached_goal



NameError: name 'Tuple' is not defined

In [None]:
# Epsilon-Greedy + Training 
def epsilon_greedy(Q: np.ndarray, state: Tuple[int,int], epsilon: float) -> int:
    """Choose an action index for `state` with an epsilon-greedy rule.

    Args:
        Q: Q-table indexed as Q[row, col, action].
        state: (row, col) cell to act from.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one.

    Returns:
        The chosen action's index along the last axis of `Q`.
    """
    # Explore with probability epsilon. The draw is sized from Q itself rather
    # than the global ACTIONS list, so the index is always valid for Q.
    if random.random() < epsilon:
        return random.randrange(Q.shape[-1])

    # Exploit otherwise. np.argmax returns the first maximum, so ties go to
    # the lowest action index.
    return int(np.argmax(Q[state[0], state[1], :]))

def train_q_learning():
    """Run tabular Q-learning on the gridworld and return the learned Q-table.

    Trains for EPISODES episodes of at most MAX_STEPS steps each. Every
    episode starts at (0, 0), picks actions with epsilon_greedy (EPSILON),
    and ends early when step() reports the episode as done.

    Returns:
        Array of shape (GRID_SIZE, GRID_SIZE, len(ACTIONS)) holding the
        learned value for each (row, col, action index).
    """
    # Zero-initialised table, indexed as q_table[row, col, action_index].
    q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)), dtype=float)

    for _ in range(EPISODES):
        state = (0, 0)  # fixed start cell for every episode

        for _ in range(MAX_STEPS):
            action_index = epsilon_greedy(q_table, state, EPSILON)
            next_state, reward, done = step(state, ACTIONS[action_index])

            # Bootstrap from the best action in the successor state; a
            # terminal successor contributes no future value.
            if done:
                future_value = 0.0
            else:
                future_value = np.max(q_table[next_state[0], next_state[1], :])

            # One-step target minus the current estimate gives the TD error;
            # the estimate moves a fraction ALPHA of the way towards the target.
            row, col = state
            td_error = (reward + GAMMA * future_value) - q_table[row, col, action_index]
            q_table[row, col, action_index] += ALPHA * td_error

            state = next_state
            if done:
                break

    return q_table


In [None]:
# Print Learned Q-Values
# Train once, then list the learned action values cell by cell.
Q = train_q_learning()

print("Learned Q-values per state [U, D, L, R]:\n")
for row in range(GRID_SIZE):
    for col in range(GRID_SIZE):
        rounded = np.round(Q[row, col, :], 3)
        print(f"State ({row},{col}): {rounded}")
    # Blank line between grid rows keeps the listing readable.
    print()


In [None]:
# Print Greedy Policy from Q 
def print_greedy_policy(Q: np.ndarray):
    """Print the greedy action of every cell as a grid of arrows.

    Each grid row is printed on one line, symbols separated by spaces. The
    goal cell is shown as "G"; every other cell shows the arrow of the action
    with the highest value in Q[row, col, :].

    Args:
        Q: Q-table indexed as Q[row, col, action index], with action indices
            following the order of ACTIONS.
    """
    arrow = {
        "U": "↑",
        "D": "↓",
        "L": "←",
        "R": "→",
    }

    def symbol_for(r: int, c: int) -> str:
        # The goal has no action to show; elsewhere use the arg-max action.
        if (r, c) == GOAL_STATE:
            return "G"
        best_action = ACTIONS[int(np.argmax(Q[r, c, :]))]
        return arrow[best_action]

    print("Greedy Policy (from learned Q-table):")
    for r in range(GRID_SIZE):
        print(" ".join([symbol_for(r, c) for c in range(GRID_SIZE)]))

# Show the greedy policy implied by the Q-table trained in the previous cell.
print_greedy_policy(Q)
