<a href="https://colab.research.google.com/github/hariniiy/RL_1796/blob/main/ASGN_3_1796.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

# ---------------------------
# TD(0) Policy Evaluation
# ---------------------------
def td0_policy_evaluation(policy, env, num_episodes=5000, alpha=0.1, gamma=0.99):
    """
    Evaluate a given policy using tabular TD(0).

    Args:
        policy: Callable that maps a state to the action to take in it.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
            States are used as dictionary keys, so they must be hashable.
        num_episodes: Number of episodes to run.
        alpha: Step size applied to each TD error.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        A ``defaultdict(float)`` mapping states to their estimated value.
        States that were never updated or bootstrapped from read as ``0.0``.
    """
    V = defaultdict(float)

    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # A terminated transition has no future return, so the target is
            # the reward alone. Relying on V[next_state] being 0 breaks when
            # the final observation is also a non-terminal state that already
            # has a value. A truncated episode did not really end, so it still
            # bootstraps from the next state.
            bootstrap = 0.0 if terminated else V[next_state]
            V[state] += alpha * (reward + gamma * bootstrap - V[state])

            state = next_state
            done = terminated or truncated

    return V

# ---------------------------
# SARSA Control (On-policy TD)
# ---------------------------
def sarsa_control(env, num_episodes=5000, alpha=0.1, gamma=0.99, epsilon=0.1):
    """
    SARSA algorithm for learning an epsilon-greedy policy.

    Args:
        env: Environment exposing ``action_space.n``, a ``reset()`` that
            returns ``(state, info)`` and a ``step(action)`` that returns
            ``(next_state, reward, terminated, truncated, info)``.
            States are used as dictionary keys, so they must be hashable.
        num_episodes: Number of training episodes.
        alpha: Step size applied to each TD error.
        gamma: Discount factor applied to the bootstrapped next action value.
        epsilon: Probability of taking a random action instead of the
            greedy one.

    Returns:
        A ``(policy, Q)`` tuple. ``Q`` is a ``defaultdict`` mapping each state
        to an array of ``action_space.n`` action values, and ``policy`` maps
        every state present in ``Q`` to its greedy action. States that were
        only ever reached as the result of a terminated transition are not
        added to either mapping, since no action is chosen in them.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))

    def epsilon_greedy(state):
        # Explore with probability epsilon, otherwise act greedily on Q.
        if np.random.rand() < epsilon:
            return np.random.choice(n_actions)
        return np.argmax(Q[state])

    for _ in range(num_episodes):
        state, _ = env.reset()
        action = epsilon_greedy(state)
        done = False

        while not done:
            next_state, reward, terminated, truncated, _ = env.step(action)

            if terminated:
                # No future return after termination: do not bootstrap and do
                # not pick an action for a state in which none is taken. This
                # also stays correct when the final observation is a state
                # that already has learned action values.
                next_action = None
                target = reward
            else:
                # A truncated episode did not really end, so it still
                # bootstraps from the next state-action pair.
                next_action = epsilon_greedy(next_state)
                target = reward + gamma * Q[next_state][next_action]

            # TD update
            Q[state][action] += alpha * (target - Q[state][action])

            state, action = next_state, next_action
            done = terminated or truncated

    # Derive the final greedy policy from the learned action values
    policy = {state: np.argmax(q_values) for state, q_values in Q.items()}

    return policy, Q

# ---------------------------
# Run Example with FrozenLake
# ---------------------------
# Slippery FrozenLake instance shared by the policy and experiments below
env = gym.make(
    "FrozenLake-v1",
    is_slippery=True,
)

# Define a random policy for evaluation
def random_policy(state):
    """Return a random action index for ``env``; ``state`` is ignored."""
    num_actions = env.action_space.n
    return np.random.choice(num_actions)

# Estimate state values under the random policy with TD(0) and show a sample
V = td0_policy_evaluation(random_policy, env, num_episodes=10000)
print("TD(0) Value Function (sample states):")
print({s: round(V[s], 3) for s in list(V)[:10]})

# Learn a policy with SARSA and show its action for a sample of states
optimal_policy, Q = sarsa_control(env, num_episodes=50000, epsilon=0.1)
print("\nSARSA Learned Policy (sample states):")
for k in list(optimal_policy)[:10]:
    v = optimal_policy[k]
    print(f"State: {k}, Best Action: {v}")


TD(0) Value Function (sample states):
{0: 0.009, 1: 0.006, 5: 0.0, 4: 0.012, 8: 0.023, 12: 0.0, 2: 0.012, 3: 0.002, 7: 0.0, 6: 0.021}

SARSA Learned Policy (sample states):
State: 0, Best Action: 0
State: 4, Best Action: 0
State: 8, Best Action: 3
State: 12, Best Action: 0
State: 1, Best Action: 3
State: 5, Best Action: 0
State: 9, Best Action: 1
State: 2, Best Action: 0
State: 6, Best Action: 0
State: 10, Best Action: 0
