# Please Win Blackjack

**Training episodes: 1,000,000**

Results (learned policy evaluated greedily over 10,000 games):
- Win Rate: 41.78% :)
- Draw Rate: 8.44%
- Loss Rate: 49.78%

In [1]:
# Required Libraries
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict
import matplotlib.pyplot as plt

# Function to Initialize the Q-table, policy, and returns
def initialize_mc_soft():
    """Build the empty tables used by Monte Carlo control.

    Returns:
        A ``(Q, returns, policy)`` tuple of ``defaultdict`` objects:
        - Q: maps a state to a fresh length-2 array of zeros (one value per action).
        - returns: maps a key to a fresh empty list.
        - policy: maps a state to a fresh ``{0: 0.5, 1: 0.5}`` probability dict.
    """
    def _zero_action_values():
        # A new array per state so entries never share storage.
        return np.zeros(2)

    def _uniform_action_probs():
        # Both actions start out equally likely.
        return {0: 0.5, 1: 0.5}

    policy = defaultdict(_uniform_action_probs)
    returns = defaultdict(list)
    Q = defaultdict(_zero_action_values)

    return Q, returns, policy

# NOTE(review): these module-level tables are not used by training —
# train_mc_soft() builds its own set via initialize_mc_soft(), and the names
# Q and policy are rebound from its return value later in this cell.
Q, returns, policy = initialize_mc_soft()

# Generate Episode with Epsilon-Soft Policy
def generate_episode(env, Q, epsilon=0.1):
    """Play one episode in ``env`` using an epsilon-greedy rule over ``Q``.

    Args:
        env: Environment exposing ``reset()`` -> ``(state, info)`` and
            ``step(action)`` -> ``(next_state, reward, terminated, truncated, info)``.
        Q: Mapping from state to a two-element array of action values.
        epsilon: Probability of picking a uniformly random action in a state
            that already has a non-zero value estimate.

    Returns:
        List of ``(state, action, reward)`` tuples in the order they occurred.
    """
    trajectory = []
    state, _ = env.reset()
    done = False

    while not done:
        # A state only counts as "known" if it is in Q with some non-zero value.
        has_estimate = state in Q and np.any(Q[state])

        # Short-circuit keeps the random draw confined to known states.
        explore = (not has_estimate) or np.random.rand() < epsilon

        if explore:
            action = random.choice([0, 1])
        else:
            action = np.argmax(Q[state])

        next_state, reward, terminated, truncated, _ = env.step(action)
        trajectory.append((state, action, reward))

        done = terminated or truncated
        state = next_state

    return trajectory

# Monte Carlo Update with First-Visit Control and Policy Fixes
def update_q_policy(episode, Q, returns, policy, epsilon=0.1):
    """First-visit Monte Carlo update of ``Q`` and the epsilon-soft ``policy``.

    The undiscounted return is accumulated while walking the episode backwards.
    Each (state, action) pair is updated only at its earliest occurrence in the
    episode, so the recorded return is the one that follows the first visit.

    Args:
        episode: Sequence of ``(state, action, reward)`` tuples in time order.
        Q: Mapping from state to a two-element array of action values; mutated in place.
        returns: Mapping from ``(state, action)`` to the list of observed returns;
            mutated in place.
        policy: Mapping from state to ``{action: probability}``; mutated in place.
        epsilon: Exploration mass spread evenly over the two actions.

    Returns:
        The same ``Q`` and ``policy`` objects that were passed in.
    """
    # Earliest time step at which each (state, action) pair occurs. Checking a
    # "seen" set while iterating backwards would select the LAST visit instead.
    first_visit = {}
    for step, (state, action, _) in enumerate(episode):
        first_visit.setdefault((state, action), step)

    G = 0
    for step in range(len(episode) - 1, -1, -1):
        state, action, reward = episode[step]
        G = reward + G  # Return following this time step

        if first_visit[(state, action)] != step:
            continue  # A later visit of a pair already seen earlier in the episode

        returns[(state, action)].append(G)  # Store return
        Q[state][action] = np.mean(returns[(state, action)])  # Average return

        # Greedy action under the refreshed estimates
        A_star = np.argmax(Q[state])

        # Ensure policy dictionary structure exists
        if state not in policy:
            policy[state] = {0: 0.5, 1: 0.5}

        # Epsilon-soft improvement: Stick (0) or Hit (1)
        for a in [0, 1]:
            if a == A_star:
                policy[state][a] = 1 - epsilon + (epsilon / 2)  # Greedy action keeps most of the mass
            else:
                policy[state][a] = epsilon / 2  # Other action keeps a small exploration share

    return Q, policy

# Training Function with Epsilon Decay
def train_mc_soft(env, num_episodes=1000000, epsilon_start=0.1, epsilon_decay=0.99, epsilon_min=0.01):
    """Run Monte Carlo control with an exponentially decaying epsilon.

    Args:
        env: Environment handed to ``generate_episode``.
        num_episodes: Number of training episodes to play.
        epsilon_start: Exploration rate for episode 0.
        epsilon_decay: Multiplicative decay applied once per episode index.
        epsilon_min: Floor below which epsilon never drops.

    Returns:
        ``(Q, policy)`` as produced by the final call to ``update_q_policy``.
    """
    Q, returns, policy = initialize_mc_soft()

    for episode_num in range(num_episodes):
        # Exponential schedule, clipped from below at epsilon_min.
        decayed = epsilon_start * (epsilon_decay ** episode_num)
        epsilon = decayed if decayed > epsilon_min else epsilon_min

        trajectory = generate_episode(env, Q, epsilon)
        Q, policy = update_q_policy(trajectory, Q, returns, policy, epsilon)

        # Report on episode 0 and every 50,000th episode after it.
        if not episode_num % 50000:
            print(f"Episode {episode_num}/{num_episodes} completed. Current epsilon: {epsilon:.4f}")

    return Q, policy

# Create Blackjack environment
# NOTE(review): per the Gymnasium Blackjack docs, `natural` is ignored when
# `sab=True`, so natural=True presumably has no effect here — confirm against
# the installed gymnasium version.
# NOTE(review): no RNG seed is set in the visible cells, so results will vary
# from run to run.
env = gym.make("Blackjack-v1", natural=True, sab=True)

# Train the agent with fixed Monte Carlo Control using epsilon-soft policy
# Uses train_mc_soft's default hyperparameters; the returned tables replace the
# module-level Q and policy.
Q, policy = train_mc_soft(env)

print("Training complete!")

Episode 0/1000000 completed. Current epsilon: 0.1000
Episode 50000/1000000 completed. Current epsilon: 0.0100
Episode 100000/1000000 completed. Current epsilon: 0.0100
Episode 150000/1000000 completed. Current epsilon: 0.0100
Episode 200000/1000000 completed. Current epsilon: 0.0100
Episode 250000/1000000 completed. Current epsilon: 0.0100
Episode 300000/1000000 completed. Current epsilon: 0.0100
Episode 350000/1000000 completed. Current epsilon: 0.0100
Episode 400000/1000000 completed. Current epsilon: 0.0100
Episode 450000/1000000 completed. Current epsilon: 0.0100
Episode 500000/1000000 completed. Current epsilon: 0.0100
Episode 550000/1000000 completed. Current epsilon: 0.0100
Episode 600000/1000000 completed. Current epsilon: 0.0100
Episode 650000/1000000 completed. Current epsilon: 0.0100
Episode 700000/1000000 completed. Current epsilon: 0.0100
Episode 750000/1000000 completed. Current epsilon: 0.0100
Episode 800000/1000000 completed. Current epsilon: 0.0100
Episode 850000/10000

In [2]:
def evaluate_policy(env, policy, num_episodes=10000):
    """
    Evaluates the learned policy by playing multiple episodes greedily.

    In every state present in ``policy`` the action with the highest
    probability is played; in any other state an action is drawn uniformly
    from {0, 1}. An episode's outcome is classified by the sign of its final
    reward, so payouts other than exactly +1 / -1 (for example a 1.5 reward
    for a natural blackjack) are still counted correctly.

    Args:
    - env: Environment exposing ``reset()`` -> ``(state, info)`` and
      ``step(action)`` -> ``(next_state, reward, terminated, truncated, info)``.
    - policy: Mapping from state to ``{action: probability}``.
    - num_episodes: Number of episodes to test the policy.

    Returns:
    - win_rate: Fraction of games won (final reward > 0).
    - draw_rate: Fraction of games drawn (final reward == 0).
    - loss_rate: Fraction of games lost (final reward < 0).
    All three are 0.0 when ``num_episodes`` is not positive.
    """
    # No games played: report zeros rather than dividing by zero.
    if num_episodes <= 0:
        return 0.0, 0.0, 0.0

    wins, draws, losses = 0, 0, 0

    for _episode in range(num_episodes):
        state, _info = env.reset()

        while True:
            if state in policy:
                # Select the action KEY with the highest probability; taking the
                # argmax over .values() would return a position that depends on
                # the dict's insertion order instead of the action itself.
                action_probs = policy[state]
                action = max(action_probs, key=action_probs.get)
            else:
                action = random.choice([0, 1])  # Random action if state is unseen

            next_state, reward, terminated, truncated, _info = env.step(action)

            if terminated or truncated:
                # Classify by sign so non-unit rewards are not mistaken for losses.
                if reward > 0:
                    wins += 1
                elif reward < 0:
                    losses += 1
                else:
                    draws += 1
                break

            state = next_state  # Move to next state

    # Compute win, draw, and loss rates
    win_rate = wins / num_episodes
    draw_rate = draws / num_episodes
    loss_rate = losses / num_episodes

    return win_rate, draw_rate, loss_rate

In [11]:
# Evaluate the trained agent
# Plays 10,000 evaluation games with the policy returned by training, then
# prints each rate as a percentage with two decimal places.
win_rate, draw_rate, loss_rate = evaluate_policy(env, policy, 10000)

print(f"Win Rate: {win_rate:.2%}")
print(f"Draw Rate: {draw_rate:.2%}")
print(f"Loss Rate: {loss_rate:.2%}")

Win Rate: 41.78%
Draw Rate: 8.44%
Loss Rate: 49.78%
