# Please Win Blackjack

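**Training episodes: 1,000,000** (Monte Carlo control with an ε-soft policy, ε = 0.2)

Results (greedy evaluation over 10,000 episodes, taken from the last recorded run below):
- Win Rate: 28.85%
- Draw Rate: 4.33%
- Loss Rate: 66.82%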

In [1]:
# Required Libraries
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
# Build the empty Q-table, returns table, and policy used by Monte Carlo control
def initialize_mc_soft():
    """Create the empty containers used by epsilon-soft Monte Carlo control.

    Returns:
        tuple: ``(Q, returns, policy)`` where

        - ``Q`` maps a state to a length-2 array of action values
          (index 0 = stick, index 1 = hit); unseen states default to zeros.
        - ``returns`` maps a ``(state, action)`` pair to the list of returns
          recorded for that pair; unseen pairs default to an empty list.
        - ``policy`` maps a state to a ``{action: probability}`` dict; unseen
          states default to an empty dict.
    """
    Q = defaultdict(lambda: np.zeros(2))
    returns = defaultdict(list)
    # A plain ``{}`` makes ``policy[state][a] = ...`` in update_q_policy raise
    # KeyError for every state that has not been stored yet. Defaulting to an
    # empty per-action dict keeps that write valid, while ``state in policy``
    # still reports False for states that were never touched.
    policy = defaultdict(dict)

    return Q, returns, policy


Q, returns, policy = initialize_mc_soft()

In [3]:
def generate_episode(env, Q, epsilon=0.1):
    """Play one complete episode in ``env`` and record each transition.

    Action selection:
      * If the current state has an entry in ``Q`` with at least one non-zero
        value, pick a uniformly random action with probability ``epsilon`` and
        the highest-valued action otherwise.
      * For any other state, pick a uniformly random action.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        Q: Mapping from state to an array of action values.
        epsilon: Exploration probability used for states with known values.

    Returns:
        list: ``(state, action, reward)`` tuples in the order they occurred.
    """
    trajectory = []
    current, _ = env.reset()
    finished = False

    while not finished:
        # A state only counts as "known" once one of its Q-values is non-zero.
        known = current in Q and np.any(Q[current])

        # Unknown states always explore; known ones explore with prob. epsilon.
        # The short-circuit keeps np.random.rand() from being drawn for unknown states.
        if not known or np.random.rand() < epsilon:
            action = random.choice([0, 1])
        else:
            action = np.argmax(Q[current])

        successor, reward, terminated, truncated, _ = env.step(action)
        trajectory.append((current, action, reward))

        finished = terminated or truncated
        current = successor

    return trajectory

In [4]:
# Monte Carlo Update
def update_q_policy(episode, Q, returns, policy, epsilon=0.1, gamma=1.0):
    """Update action values and the epsilon-soft policy from one episode.

    The episode is walked backwards so the return ``G`` can be accumulated
    incrementally. Every occurrence of a ``(state, action)`` pair contributes
    its return (every-visit averaging); ``Q[state][action]`` is set to the mean
    of all returns recorded so far for that pair.

    Args:
        episode: Sequence of ``(state, action, reward)`` tuples in play order.
        Q: Mapping from state to a mutable array of action values. It must
            supply (or already contain) an entry for every state in ``episode``.
        returns: Mapping from ``(state, action)`` to a list of observed
            returns; it must supply an empty list for unseen pairs.
        policy: Mapping from state to a ``{action: probability}`` container.
            States without an entry get a fresh dict.
        epsilon: Total probability mass spread uniformly over all actions.
        gamma: Discount factor applied to future rewards. The default of 1.0
            reproduces the undiscounted return.

    Returns:
        tuple: The same ``Q`` and ``policy`` objects, updated in place.
    """
    G = 0
    for state, action, reward in reversed(episode):
        G = reward + gamma * G
        returns[(state, action)].append(G)
        Q[state][action] = np.mean(returns[(state, action)])

        n_actions = len(Q[state])
        greedy_action = int(np.argmax(Q[state]))
        explore_prob = epsilon / n_actions

        # ``policy[state][a] = ...`` raises KeyError when ``policy`` is a plain
        # dict and the state has not been stored yet, so create the per-state
        # container on demand (a defaultdict still uses its own factory).
        try:
            action_probs = policy[state]
        except KeyError:
            action_probs = policy[state] = {}

        # Epsilon-soft: every action gets epsilon / |A|; the greedy action
        # additionally receives the remaining 1 - epsilon.
        for a in range(n_actions):  # Actions: 0 = Stick, 1 = Hit
            if a == greedy_action:
                action_probs[a] = 1 - epsilon + explore_prob
            else:
                action_probs[a] = explore_prob

    return Q, policy

In [5]:
def train_mc_soft(env, num_episodes=500000, epsilon=0.1):
    """Train an agent with epsilon-soft Monte Carlo control.

    Starts from freshly initialised tables, then repeatedly plays one episode
    and feeds it to the Monte Carlo update. A progress line is printed for
    episode index 0 and for every further multiple of 100000.

    Args:
        env: Environment passed through to ``generate_episode``.
        num_episodes: Number of episodes to play.
        epsilon: Exploration rate used for both acting and the policy update.

    Returns:
        tuple: ``(Q, policy)`` as left by the final update.
    """
    Q, returns, policy = initialize_mc_soft()

    for idx in range(num_episodes):
        trajectory = generate_episode(env, Q, epsilon)
        Q, policy = update_q_policy(trajectory, Q, returns, policy, epsilon)

        # Report progress only on multiples of 100000 (including index 0).
        if idx % 100000:
            continue
        print(f"Episode {idx}/{num_episodes} completed.")

    return Q, policy

In [6]:
# Training settings for this run
TRAIN_EPISODES = 1000000
EXPLORATION_RATE = 0.2

# Create Blackjack environment
# NOTE(review): the Gymnasium Blackjack docs appear to say `natural` is ignored
# when `sab=True`; confirm which rule set is actually intended here.
env = gym.make("Blackjack-v1", sab=True, natural=True)

# Train the agent
Q, policy = train_mc_soft(env, epsilon=EXPLORATION_RATE, num_episodes=TRAIN_EPISODES)

print("Training complete!")

Episode 0/1000000 completed.
Episode 100000/1000000 completed.
Episode 200000/1000000 completed.
Episode 300000/1000000 completed.
Episode 400000/1000000 completed.
Episode 500000/1000000 completed.
Episode 600000/1000000 completed.
Episode 700000/1000000 completed.
Episode 800000/1000000 completed.
Episode 900000/1000000 completed.
Training complete!


In [7]:
def evaluate_policy(env, policy, num_episodes=10000):
    """
    Evaluates the learned policy by playing multiple episodes greedily.

    For each state the action with the highest stored probability is played.
    States with no (or an empty) policy entry fall back to a uniformly random
    action. Each finished episode is classified by the sign of its final
    reward, so any positive payout (for example a bonus payout such as 1.5)
    counts as a win rather than being mistaken for a loss.

    Args:
    - env: Environment whose ``reset()`` returns ``(state, info)`` and whose
      ``step(action)`` returns ``(next_state, reward, terminated, truncated, info)``.
    - policy: Mapping from state to a ``{action: probability}`` dict.
    - num_episodes: Number of episodes to test the policy.

    Returns:
    - win_rate: Fraction of games won (final reward > 0).
    - draw_rate: Fraction of games drawn (final reward == 0).
    - loss_rate: Fraction of games lost (final reward < 0).

    All three rates are 0.0 when ``num_episodes`` is not positive.
    """
    # Nothing to play: avoid dividing by zero below.
    if num_episodes <= 0:
        return 0.0, 0.0, 0.0

    wins, draws, losses = 0, 0, 0

    for _ in range(num_episodes):
        state, _info = env.reset()

        while True:
            # ``.get`` never inserts a key, even when ``policy`` is a defaultdict.
            action_probs = policy.get(state)
            if action_probs:
                # Pick the action *key* with the highest probability, so the
                # result does not depend on the dict's insertion order.
                action = max(action_probs, key=action_probs.get)
            else:
                action = random.choice([0, 1])  # Random action if state is unseen

            state, reward, terminated, truncated, _info = env.step(action)

            if terminated or truncated:
                if reward > 0:
                    wins += 1
                elif reward == 0:
                    draws += 1
                else:
                    losses += 1
                break

    # Compute win, draw, and loss rates
    win_rate = wins / num_episodes
    draw_rate = draws / num_episodes
    loss_rate = losses / num_episodes

    return win_rate, draw_rate, loss_rate

In [8]:
# Evaluate the trained agent over a fixed number of greedy games
EVAL_EPISODES = 10000
evaluation = evaluate_policy(env, policy, EVAL_EPISODES)
win_rate, draw_rate, loss_rate = evaluation

# One print call, one line per rate
print(
    f"Win Rate: {win_rate:.2%}",
    f"Draw Rate: {draw_rate:.2%}",
    f"Loss Rate: {loss_rate:.2%}",
    sep="\n",
)

Win Rate: 28.85%
Draw Rate: 4.33%
Loss Rate: 66.82%
