# Please Win Blackjack

**Episodes:** 1,000,000 \
Results:
- Win Rate: 38.42% :(
- Draw Rate: 4.88% :|
- Loss Rate: 56.70% ;(

In [1]:
# Required Libraries
import gymnasium as gym
import numpy as np
import random
from collections import defaultdict
import matplotlib.pyplot as plt

In [5]:
# Function to Initialize the Q-table, policy, and returns
def initialize_mc_es():
    Q = defaultdict(lambda: np.zeros(2)) # Action-value function (2 actions: hit or stick)
    policy = {} # Stores the best action for each state
    returns = defaultdict(list)
    return Q, policy, returns

Q, policy, returns = initialize_mc_es()

In [7]:
# Function to generate a single episode
def generate_episode(env, Q):
    episode = []

    # Exploring Starts: Choose a random (state, action) pair to begin
    state, _ = env.reset() # Resets the environment and get a random initial state
    action = random.choice([0, 1]) # 0 = Stick, 1 = Hit (random first action)

    # Continue untile episode ends
    while True: 
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))

        if terminated or truncated:
            break

        state = next_state
        if state in Q and np.any(Q[state]): # If we have Q-values, use greedy action
            action = np.argmax(Q[state])
        else:
            action = random.choice([0, 1])  # Random action if state is unseen

    return episode

In [6]:
# Monte Carlo ES Update
def update_q_policy(episode, Q, returns, policy):
    G = 0 # Initialize the return

    # Process episode in reverse (backward updates)
    for state, action, reward in reversed(episode):
        G = reward + G # Accumulate Rewards

        # Store the return for this (state, action) pair
        returns[(state, action)].append(G)

        # Update Q-value using the average of all observed returns
        Q[state, action] = np.mean(returns[state, action])

        # Improve policy: Choose action with highest Q-value
        policy[state] = np.argmax(Q[state])

    return Q, policy

In [15]:
def train_mc_es(env, num_episodes=10000):
    Q, policy, returns = initialize_mc_es()

    for episode_num in range(num_episodes):
        episode = generate_episode(env, Q)

        Q, policy = update_q_policy(episode, Q, returns, policy)

         # Print progress occasionally
        if episode_num % 100000 == 0:
            print(f"Episode {episode_num}/{num_episodes} completed.")

    return Q, policy

In [16]:
env = gym.make("Blackjack-v1", natural=True, sab=True)

num_episodes = 1000000

# Train the agent
Q, policy = train_mc_es(env, num_episodes)

print("Training complete!")

Episode 0/1000000 completed.
Episode 100000/1000000 completed.
Episode 200000/1000000 completed.
Episode 300000/1000000 completed.
Episode 400000/1000000 completed.
Episode 500000/1000000 completed.
Episode 600000/1000000 completed.
Episode 700000/1000000 completed.
Episode 800000/1000000 completed.
Episode 900000/1000000 completed.
Training complete!


In [17]:
def evaluate_policy(env, policy, num_episodes=10000):
    wins = 0
    draws = 0
    losses = 0

    for _ in range(num_episodes):
        state, _ = env.reset()

        while True:
            action = policy[state] if state in policy else random.choice([0, 1])

            next_state, reward, terminated, truncated, _ = env.step(action)

            if terminated or truncated:
                if reward == 1:
                    wins += 1
                elif reward == 0:
                    draws += 1
                else:
                    losses += 1
                break

            state = next_state

    win_rate = wins / num_episodes
    draw_rate = draws / num_episodes
    loss_rate = losses / num_episodes

    return win_rate, draw_rate, loss_rate

In [18]:
# Evaluate the trained agent
win_rate, draw_rate, loss_rate = evaluate_policy(env, policy, num_episodes)

print(f"Win Rate: {win_rate:.2%}")
print(f"Draw Rate: {draw_rate:.2%}")
print(f"Loss Rate: {loss_rate:.2%}")

Win Rate: 38.42%
Draw Rate: 4.88%
Loss Rate: 56.70%
