In [30]:
import random
import numpy as np
from collections import defaultdict

class GridWorld:
    """Deterministic rectangular grid world with one terminal goal cell.

    States are zero-based ``(row, col)`` tuples. Moving off the grid (or
    passing an unknown action) leaves the agent where it is. Entering the
    goal from any other cell yields ``goal_reward``; every other transition
    yields 0.

    Args:
        rows: Number of grid rows (default 2).
        cols: Number of grid columns (default 3).
        goal: ``(row, col)`` of the terminal cell (default ``(1, 2)``).
        goal_reward: Reward for the transition that enters the goal.

    Raises:
        ValueError: If the grid is empty, the goal lies outside the grid,
            or the goal is the only cell (``reset`` could never finish).
    """

    def __init__(self, rows=2, cols=3, goal=(1, 2), goal_reward=100):
        if rows < 1 or cols < 1:
            raise ValueError("rows and cols must both be at least 1")
        goal = tuple(goal)
        if not (0 <= goal[0] < rows and 0 <= goal[1] < cols):
            raise ValueError(f"goal {goal} is outside a {rows}x{cols} grid")
        if rows * cols < 2:
            raise ValueError("grid needs at least one non-goal cell")

        self.rows = rows
        self.cols = cols
        self.goal = goal
        self.goal_reward = goal_reward
        self.actions = ["up", "down", "left", "right"]
        # Placeholder start cell; callers normally call reset() first.
        self.state = (0, 0)

    def reset(self):
        """Move the agent to a uniformly random non-goal cell and return it."""
        while True:
            state = (
                random.randint(0, self.rows - 1),
                random.randint(0, self.cols - 1),
            )
            if state != self.goal:
                self.state = state
                return state

    def get_valid_actions(self, state):
        """Return the actions that keep the agent on the grid from ``state``.

        The result is ordered up, down, left, right.
        """
        row, col = state
        valid = []
        if row > 0:
            valid.append("up")
        if row < self.rows - 1:
            valid.append("down")
        if col > 0:
            valid.append("left")
        if col < self.cols - 1:
            valid.append("right")
        return valid

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        ``done`` is True whenever the resulting cell is the goal.
        """
        row, col = self.state
        if action == "up" and row > 0:
            row -= 1
        elif action == "down" and row < self.rows - 1:
            row += 1
        elif action == "left" and col > 0:
            col -= 1
        elif action == "right" and col < self.cols - 1:
            col += 1

        next_state = (row, col)
        # Reward only the transition that *enters* the goal; a blocked move
        # made while already standing on the goal earns nothing.
        entered_goal = next_state == self.goal and self.state != self.goal
        reward = self.goal_reward if entered_goal else 0
        done = next_state == self.goal
        self.state = next_state
        return next_state, reward, done

def q_learning(env, episodes=10, gamma=0.9, alpha=0.1, epsilon=0.1):
    """Run tabular epsilon-greedy Q-learning on ``env`` and return the table.

    Each episode starts from ``env.reset()`` and runs until ``env.step``
    reports ``done``. Progress is printed: every update that changes a
    Q-value by more than 1e-5, and the whole Q-table after each episode.

    Args:
        env: Environment exposing ``reset()``, ``get_valid_actions(state)``
            and ``step(action) -> (next_state, reward, done)``.
        episodes: Number of episodes to run.
        gamma: Discount factor.
        alpha: Learning rate.
        epsilon: Probability of taking a uniformly random valid action.

    Returns:
        A ``defaultdict`` mapping ``(state, action)`` to the learned value;
        unseen pairs read as 0.0.
    """
    Q = defaultdict(float)

    for ep in range(episodes):
        state = env.reset()
        print(f"\n--- Episode {ep + 1} ---")
        step = 0
        done = False
        while not done:
            valid_actions = env.get_valid_actions(state)
            if random.random() < epsilon:
                action = random.choice(valid_actions)
            else:
                # Break ties at random. Always taking the first maximal
                # action makes an all-zero table bounce between two cells
                # until an exploratory move happens to break the cycle.
                best_q = max(Q[(state, a)] for a in valid_actions)
                action = random.choice(
                    [a for a in valid_actions if Q[(state, a)] == best_q]
                )

            next_state, reward, done = env.step(action)
            if done:
                # No future return after a terminal transition.
                max_q_next = 0.0
            else:
                max_q_next = max(
                    (Q[(next_state, a)]
                     for a in env.get_valid_actions(next_state)),
                    default=0.0,
                )

            old_q = Q[(state, action)]
            new_q = (1 - alpha) * old_q + alpha * (reward + gamma * max_q_next)
            Q[(state, action)] = new_q

            if abs(new_q - old_q) > 1e-5:  # Only print if Q-value changes
                print(f"Step {step + 1}: S{state[0]+1}{state[1]+1} --{action}--> S{next_state[0]+1}{next_state[1]+1}, R={reward}")
                print(f"    Q updated: {old_q:.2f} -> {new_q:.2f}")

            state = next_state
            step += 1

        print(f"Reached goal: S{state[0]+1}{state[1]+1}")
        print_q_table(Q)  # Q-table after each episode
    return Q



def print_q_table(Q):
    """Print the Q-value of every valid action for each cell of the grid.

    Cells are labelled ``S<row><col>`` with one-based indices. ``Q`` is
    indexed by ``(state, action)``; the valid actions for each cell come
    from a default ``GridWorld``.
    """
    grid = GridWorld()  # used only to look up the valid actions per cell
    print("\n=== Q-Table ===")
    for cell in ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)):
        moves = grid.get_valid_actions(cell)
        print(f"S{cell[0]+1}{cell[1]+1}:")
        for move in moves:
            print(f"  {move}: {Q[(cell, move)]:.2f}")

def demonstrate_policy(env, Q, max_steps=100):
    """Follow the greedy policy from S11 and print the visited cells.

    Starting at ``(0, 0)``, repeatedly takes the valid action with the
    highest Q-value (first one on ties) until the goal is reached. The walk
    also stops if a cell is visited a second time or after ``max_steps``
    moves, so a Q-table that has not yet learned a route to the goal cannot
    hang the program.

    Args:
        env: Environment exposing ``goal``, ``state``,
            ``get_valid_actions(state)`` and ``step(action)``.
        Q: Mapping from ``(state, action)`` to a value.
        max_steps: Upper bound on the number of moves taken.

    Returns:
        The list of visited cell labels, e.g. ``["S11", "S12", ...]``.
    """
    print("\n=== Demonstrating Policy from S11 ===")
    state = (0, 0)
    env.state = state
    path = [f"S{state[0]+1}{state[1]+1}"]
    visited = {state}
    stop_reason = None

    while state != env.goal:
        if len(path) > max_steps:
            stop_reason = f"no goal after {max_steps} steps"
            break
        actions = env.get_valid_actions(state)
        action = max(actions, key=lambda a: Q[(state, a)])
        next_state, _reward, _done = env.step(action)
        path.append(f"S{next_state[0]+1}{next_state[1]+1}")
        state = next_state
        if state in visited:
            # The greedy choice depends only on the state and Q is not
            # changed here, so coming back to a cell means the walk is
            # going round in circles.
            stop_reason = f"policy loops back to S{state[0]+1}{state[1]+1}"
            break
        visited.add(state)

    print(" -> ".join(path))
    if stop_reason is not None:
        print(f"Stopped: {stop_reason} (Q-table has not learned a path to the goal)")
    return path

def print_value_function(Q):
    """Print the greedy value and action for every non-goal cell.

    For each cell the value is the largest Q-value among its valid actions
    and the reported action is the first one attaining it. ``Q`` is indexed
    by ``(state, action)``.
    """
    grid = GridWorld()  # used only to look up the valid actions per cell
    print("\n=== Estimated V*(s) and π*(s) ===")
    for cell in ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1)):
        moves = grid.get_valid_actions(cell)
        if len(moves) == 0:
            continue
        q_of = {}
        for move in moves:
            q_of[move] = Q[(cell, move)]
        # max() returns the first maximal key, so ties go to the earliest
        # action in the valid-action order.
        greedy = max(q_of, key=lambda move: q_of[move])
        print(f"S{cell[0]+1}{cell[1]+1}: V* = {q_of[greedy]:.2f}, π* = {greedy}")

if __name__ == "__main__":
    # Train on the default grid; q_learning prints its progress per episode.
    env = GridWorld()
    Q = q_learning(env, episodes=10)

    # Final reports: the full table, then the greedy value/action per cell.
    for report in (print_q_table, print_value_function):
        report(Q)

    # Walk the greedy policy from the top-left cell.
    demonstrate_policy(env, Q)



--- Episode 1 ---
Step 3: S13 --down--> S23, R=100
    Q updated: 0.00 -> 10.00
Reached goal: S23

=== Q-Table ===
S11:
  down: 0.00
  right: 0.00
S12:
  down: 0.00
  left: 0.00
  right: 0.00
S13:
  down: 10.00
  left: 0.00
S21:
  up: 0.00
  right: 0.00
S22:
  up: 0.00
  left: 0.00
  right: 0.00
S23:
  up: 0.00
  left: 0.00

--- Episode 2 ---
Step 5: S22 --right--> S23, R=100
    Q updated: 0.00 -> 10.00
Reached goal: S23

=== Q-Table ===
S11:
  down: 0.00
  right: 0.00
S12:
  down: 0.00
  left: 0.00
  right: 0.00
S13:
  down: 10.00
  left: 0.00
S21:
  up: 0.00
  right: 0.00
S22:
  up: 0.00
  left: 0.00
  right: 10.00
S23:
  up: 0.00
  left: 0.00

--- Episode 3 ---
Step 1: S13 --down--> S23, R=100
    Q updated: 10.00 -> 19.00
Reached goal: S23

=== Q-Table ===
S11:
  down: 0.00
  right: 0.00
S12:
  down: 0.00
  left: 0.00
  right: 0.00
S13:
  down: 19.00
  left: 0.00
S21:
  up: 0.00
  right: 0.00
S22:
  up: 0.00
  left: 0.00
  right: 10.00
S23:
  up: 0.00
  left: 0.00

--- Episode 4 