<a href="https://colab.research.google.com/github/hariniiy/RL_1796/blob/main/ASGN_2_1796.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

# ---------------------------
# Monte Carlo Policy Evaluation
# ---------------------------
def mc_policy_evaluation(policy, env, num_episodes=500000, gamma=1.0):
    """
    Evaluate a given policy using first-visit Monte Carlo.

    Args:
        policy: Callable that maps a state to the action to take.
        env: Environment following the Gymnasium API, i.e. ``reset()``
            returns ``(state, info)`` and ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
            States are used as dictionary keys, so they must be hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``defaultdict(float)`` mapping every state that was visited to the
        average of the returns observed from its first visit in each episode.
    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    V = defaultdict(float)

    for _ in range(num_episodes):
        # Roll out one complete episode under the supplied policy.
        episode = []
        state, _info = env.reset()
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Earliest time step at which each state occurs in this episode.
        # Tracking "already seen" while walking backwards would instead pick
        # the LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (visited, _action, _reward) in enumerate(episode):
            first_visit.setdefault(visited, t)

        # Accumulate the discounted return backwards through the episode and
        # record it only at the first visit of each state.
        G = 0
        for t in reversed(range(len(episode))):
            visited, _action, reward = episode[t]
            G = gamma * G + reward
            if first_visit[visited] == t:
                returns_sum[visited] += G
                returns_count[visited] += 1
                V[visited] = returns_sum[visited] / returns_count[visited]

    return V

# ---------------------------
# Monte Carlo Control (ε-greedy)
# ---------------------------
def mc_control_epsilon_greedy(env, num_episodes=500000, gamma=1.0, epsilon=0.1):
    """
    Monte Carlo control using epsilon-greedy policy improvement.

    Episodes are generated with an epsilon-greedy policy over the current
    action-value estimates, and each estimate is the average of the returns
    that followed the first visit of its (state, action) pair per episode.

    Args:
        env: Environment following the Gymnasium API, i.e. ``reset()``
            returns ``(state, info)`` and ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``. It must
            expose ``action_space.n``; states must be hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of picking a random action instead of the
            greedy one.

    Returns:
        A tuple ``(policy, Q)`` where ``policy`` is a dict mapping each state
        present in ``Q`` to its greedy action, and ``Q`` is a defaultdict
        mapping a state to a NumPy array of per-action value estimates.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    def policy_fn(state):
        # ε-greedy action selection
        if np.random.rand() < epsilon:
            return np.random.choice(n_actions)
        return np.argmax(Q[state])

    for _ in range(num_episodes):
        # Roll out one complete episode under the current ε-greedy policy.
        episode = []
        state, _info = env.reset()
        done = False

        while not done:
            action = policy_fn(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Earliest time step of each (state, action) pair in this episode.
        # Tracking "already seen" while walking backwards would instead pick
        # the LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (visited, taken, _reward) in enumerate(episode):
            first_visit.setdefault((visited, taken), t)

        # Accumulate the discounted return backwards and update Q only at the
        # first visit of each pair.
        G = 0
        for t in reversed(range(len(episode))):
            visited, taken, reward = episode[t]
            G = gamma * G + reward
            pair = (visited, taken)
            if first_visit[pair] == t:
                returns_sum[pair] += G
                returns_count[pair] += 1
                Q[visited][taken] = returns_sum[pair] / returns_count[pair]

    # Derive final greedy policy
    policy = {state: np.argmax(values) for state, values in Q.items()}

    return policy, Q

# ---------------------------
# Run Example with Blackjack
# ---------------------------
env = gym.make("Blackjack-v1")


# Baseline policy for evaluation: the observed state is ignored entirely.
def random_policy(state):
    """Return a random action index drawn from ``env``'s action space."""
    n_actions = env.action_space.n
    return np.random.choice(n_actions)

# Estimate state values under the random policy and show a few of them.
V = mc_policy_evaluation(random_policy, env, num_episodes=100000)
print("Monte Carlo Policy Evaluation (state values for random policy):")
print({s: V[s] for s in list(V)[:5]})  # first five entries only

# Learn an ε-greedy policy, then show the greedy action for a few states.
optimal_policy, Q = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)
print("\nMonte Carlo Control (optimal policy sample):")
for k in list(optimal_policy)[:5]:
    v = optimal_policy[k]
    print(f"State: {k}, Best Action: {v}")


Monte Carlo Policy Evaluation (state values for random policy):
{(18, 2, 0): -0.33640552995391704, (9, 2, 0): -0.3010033444816054, (15, 4, 1): -0.2727272727272727, (20, 10, 0): -0.2345999527967902, (10, 5, 0): -0.13910761154855644}

Monte Carlo Control (optimal policy sample):
State: (20, 9, 0), Best Action: 0
State: (16, 10, 0), Best Action: 1
State: (13, 10, 1), Best Action: 1
State: (20, 6, 0), Best Action: 0
State: (15, 10, 0), Best Action: 1
