# Please Win Blackjack

**Episodes: 1,000,000**
Results:
- Win Rate: 
- Draw Rate: 
- Loss Rate:

In [6]:
# Required Libraries
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict
import matplotlib.pyplot as plt

# Function to Initialize V(s) and Returns storage
def initialize_v():
    V = defaultdict(float)  # Value function initialized arbitrarily
    returns = defaultdict(list)  # Returns storage for every state
    return V, returns

V, returns = initialize_v()

# Generate an episode using the behavior policy
def generate_episode(env, behavior_policy):
    """
    Generates an episode following the behavior policy.
    
    Args:
    - env: The RL environment.
    - behavior_policy: A function that selects an action given a state.

    Returns:
    - episode: A list of (state, action, reward) tuples.
    """
    episode = []
    state, _ = env.reset()

    while True:
        action = behavior_policy(state)  # Select action using behavior policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))

        if terminated or truncated:
            break

        state = next_state  # Move to the next state

    return episode

# Off-Policy Monte Carlo Update
def update_value_function(episode, V, returns, target_policy, behavior_policy):
    """
    Updates the value function using off-policy every-visit Monte Carlo with importance sampling.

    Args:
    - episode: A list of (state, action, reward) tuples.
    - V: The value function to update.
    - returns: A dictionary storing returns for each state.
    - target_policy: The policy to evaluate.
    - behavior_policy: The policy that generated the data.

    Returns:
    - Updated V.
    """
    G = 0  # Initialize return
    W = 1  # Initialize importance sampling weight
    visited_states = set()

    for state, action, reward in reversed(episode):
        G = G + reward  # Compute return

        if state not in visited_states:  # Every-visit MC
            visited_states.add(state)
            returns[state].append(G)
            V[state] = np.mean(returns[state])  # Update V(s)

            # Importance sampling ratio
            W *= target_policy(state) / behavior_policy(state)  # pi(A|S) / b(A|S)

            if W == 0:
                break  # Stop updates if importance sampling weight is 0

    return V

# Training Function
def train_off_policy_mc(env, behavior_policy, target_policy, num_episodes=100000):
    """
    Trains an off-policy Monte Carlo agent.

    Args:
    - env: The RL environment.
    - behavior_policy: The policy used to generate episodes.
    - target_policy: The policy being evaluated.
    - num_episodes: Number of episodes for training.

    Returns:
    - V: Estimated value function.
    """
    V, returns = initialize_v()

    for episode_num in range(num_episodes):
        episode = generate_episode(env, behavior_policy)  # Use explicitly passed behavior policy
        V = update_value_function(episode, V, returns, target_policy, behavior_policy)  # Pass both policies

        # Print progress occasionally
        if episode_num % 10000 == 0:
            print(f"Episode {episode_num}/{num_episodes} completed.")

    return V

# Define the behavior policy (b)
def behavior_policy(state):
    """
    Behavior policy that generates episodes.
    Returns:
    - Action: Random choice between Stick (0) and Hit (1).
    """
    return random.choice([0, 1])  # Random action selection

# Define the target policy (pi)
def target_policy(state):
    """
    Target policy that we want to evaluate.
    Returns:
    - Action: 0 (Stick) if state >= 20, else 1 (Hit).
    """
    player_sum, dealer_card, usable_ace = state
    return 0 if player_sum >= 20 else 1  # Stick at 20 or higher

# Create Blackjack environment
env = gym.make("Blackjack-v1", natural=True, sab=True)

# Train the agent using off-policy MC prediction
V_estimates = train_off_policy_mc(env, behavior_policy, target_policy)

print("Training complete!")

# Evaluation Function
def evaluate_value_function(V):
    """
    Prints the estimated value function.

    Args:
    - V: The estimated value function.
    """
    print("\nEstimated Value Function:")
    for state in sorted(V.keys()):
        print(f"V({state}) = {V[state]:.3f}")

# Evaluate the trained agent
evaluate_value_function(V_estimates)

TypeError: '>=' not supported between instances of 'tuple' and 'int'