In [1]:
# Reinforcement Learning Code Examples


# Print the overview of the three examples in a single call, one entry per line.
print(
    'This Python file contains three key reinforcement learning examples highlighting on-policy vs off-policy learning and experience replay usage:',
    # Example 1: off-policy tabular Q-learning
    '1) Q-learning (Off-policy):',
    '   - Behavior policy is epsilon-greedy (explores)',
    '   - Update uses greedy max Q (target policy)',
    '   - Shows the separation of behavior and target policy in Q-learning',
    # Example 2: on-policy SARSA
    '2) SARSA (On-policy):',
    '   - Behavior and target policy both epsilon-greedy',
    '   - Update uses Q-value of the next action taken (following behavior policy)',
    '   - Demonstrates on-policy learning where sample and update policy are the same',
    # Example 3: Q-learning driven by a replay buffer
    '3) Q-learning with Experience Replay Buffer:',
    '   - Stores exploratory transitions in a replay buffer',
    '   - Samples batches randomly for training updates',
    '   - Illustrates off-policy training with replay and improved sample efficiency',
    'Run each example independently (uncomment the corresponding block) to observe behavior and Q-table evolution.',
    sep='\n',
)

This Python file contains three key reinforcement learning examples highlighting on-policy vs off-policy learning and experience replay usage:
1) Q-learning (Off-policy):
   - Behavior policy is epsilon-greedy (explores)
   - Update uses greedy max Q (target policy)
   - Shows the separation of behavior and target policy in Q-learning
2) SARSA (On-policy):
   - Behavior and target policy both epsilon-greedy
   - Update uses Q-value of the next action taken (following behavior policy)
   - Demonstrates on-policy learning where sample and update policy are the same
3) Q-learning with Experience Replay Buffer:
   - Stores exploratory transitions in a replay buffer
   - Samples batches randomly for training updates
   - Illustrates off-policy training with replay and improved sample efficiency
Run each example independently (uncomment the corresponding block) to observe behavior and Q-table evolution.


In [4]:
# === Example: Q Learning Simple ===
# Simple Q-learning example demonstrating the off-policy nature: behavior policy is epsilon-greedy, update uses greedy max Q.

import numpy as np

# Toy problem: two states and two actions, each identified by 0 or 1
states = list(range(2))
actions = list(range(2))

# Tabular action values indexed as Q[state, action]; every entry starts at zero
Q = np.zeros(shape=(len(states), len(actions)))

# Hyperparameters
#   alpha   - learning rate (step size of each Q update)
#   gamma   - discount factor applied to the bootstrapped next-state value
#   epsilon - exploration rate of the epsilon-greedy behavior policy
alpha, gamma, epsilon = 0.1, 0.9, 0.3

# Behavior policy: epsilon-greedy over the module-level Q table
def epsilon_greedy_policy(state):
    """Pick an action for `state` using the epsilon-greedy rule.

    One uniform draw decides the branch: if it falls below the module-level
    `epsilon`, a random entry of `actions` is returned; otherwise the index of
    the largest value in `Q[state]` is returned.
    """
    exploring = np.random.rand() < epsilon
    if not exploring:
        # Exploit: index of the highest current estimate for this state
        return np.argmax(Q[state])
    # Explore: uniformly random action
    return np.random.choice(actions)

# Target policy: purely greedy with respect to the current Q table
def greedy_policy(state):
    """Return the index of the largest entry in `Q[state]`."""
    state_values = Q[state]
    return state_values.argmax()

# Environment dynamics: deterministic transition and reward
def env_step(state, action):
    """Advance the toy environment by one step.

    Returns a `(next_state, reward)` pair: the state is mirrored (0 <-> 1) on
    every step, and the reward is 1 when `action` equals 1, otherwise 0.
    """
    reward = int(action == 1)  # only action 1 pays out
    return 1 - state, reward

np.random.seed(42)  # fixed seed so the run can be repeated
state = 0

for step_num in range(1, 11):
    # Acting: the epsilon-greedy behavior policy chooses what to do.
    action = epsilon_greedy_policy(state)
    next_state, reward = env_step(state, action)

    # Learning: bootstrap from the action the greedy target policy would pick
    # in next_state (its Q-value is the row maximum), independent of what the
    # behavior policy actually does next.
    greedy_next_action = greedy_policy(next_state)
    td_error = reward + gamma * Q[next_state, greedy_next_action] - Q[state, action]
    Q[state, action] += alpha * td_error

    print(f"Step {step_num}:")
    print(f" Behavior policy action (epsilon-greedy): {action}")
    print(f" Next state: {next_state}, Reward: {reward}")
    print(f" Updated Q[{state},{action}]: {Q[state, action]:.3f}")
    print(f" Current Q table:\n{Q}")

    state = next_state

print("Final Q-table:")
print(Q)
print("Behavior policy actions are epsilon-greedy while Q updates bootstrapped from greedy target policy (max Q).")

Step 1:
 Behavior policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 2:
 Behavior policy action (epsilon-greedy): 0
 Next state: 0, Reward: 0
 Updated Q[1,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 3:
 Behavior policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 4:
 Behavior policy action (epsilon-greedy): 0
 Next state: 0, Reward: 0
 Updated Q[1,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 5:
 Behavior policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 6:
 Behavior policy action (epsilon-greedy): 0
 Next state: 0, Reward: 0
 Updated Q[1,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 7:
 Behavior policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 8:
 Behavior policy action (e

In [8]:
# === Example: Sarsa Simple ===
# Simple SARSA example demonstrating on-policy nature: behavior and target policy are the same (epsilon-greedy).

import numpy as np

# Two-state, two-action toy environment
states, actions = [0, 1], [0, 1]

# Action-value table: one row per state, one column per action, all zeros
Q = np.zeros((len(states), len(actions)), dtype=float)

# Hyperparameters: learning rate, discount factor, exploration rate
alpha = 0.1
gamma = 0.9
epsilon = 0.3

def epsilon_greedy_policy(state):
    """Epsilon-greedy action choice for `state`.

    A uniform draw below `epsilon` yields a random member of `actions`;
    any other draw yields the argmax of `Q[state]`.
    """
    return (
        np.random.choice(actions)          # Explore
        if np.random.rand() < epsilon
        else np.argmax(Q[state])           # Exploit
    )

# Toy dynamics: the state toggles every step; only action 1 is rewarded
def env_step(state, action):
    """Return `(next_state, reward)` for taking `action` in `state`."""
    if action == 1:
        reward = 1
    else:
        reward = 0
    flipped_state = 1 - state
    return flipped_state, reward

np.random.seed(42)  # fixed seed so the run can be repeated
state = 0
# SARSA needs the first action before the loop starts, because each update
# uses the (state, action, reward, next_state, next_action) quintuple.
action = epsilon_greedy_policy(state)

for step_num in range(1, 11):
    next_state, reward = env_step(state, action)
    # The same epsilon-greedy policy picks the follow-up action ...
    next_action = epsilon_greedy_policy(next_state)

    # ... and that very action's Q-value is the bootstrap target (on-policy).
    td_error = reward + gamma * Q[next_state, next_action] - Q[state, action]
    Q[state, action] += alpha * td_error

    print(f"Step {step_num}:")
    print(f" Behavior (and target) policy action (epsilon-greedy): {action}")
    print(f" Next state: {next_state}, Reward: {reward}")
    print(f" Next action chosen: {next_action}")
    print(f" Updated Q[{state},{action}]: {Q[state, action]:.3f}")
    print(f" Current Q table:\n{Q}")

    # The action just sampled is the one executed on the next iteration.
    state, action = next_state, next_action

print("Final Q-table:")
print(Q)
print("In SARSA, the behavior policy and target policy are the same (epsilon-greedy), demonstrating on-policy learning.")

Step 1:
 Behavior (and target) policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Next action chosen: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 2:
 Behavior (and target) policy action (epsilon-greedy): 0
 Next state: 0, Reward: 0
 Next action chosen: 0
 Updated Q[1,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 3:
 Behavior (and target) policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Next action chosen: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 4:
 Behavior (and target) policy action (epsilon-greedy): 0
 Next state: 0, Reward: 0
 Next action chosen: 0
 Updated Q[1,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 5:
 Behavior (and target) policy action (epsilon-greedy): 0
 Next state: 1, Reward: 0
 Next action chosen: 0
 Updated Q[0,0]: 0.000
 Current Q table:
[[0. 0.]
 [0. 0.]]
Step 6:
 Behavior (and target) policy action (epsilon-greedy): 0
 Next state: 0, Reward: 0
 Next action chosen: 0
 Updated Q[1,0]:

In [11]:
# === Example: Q Learning With Replay ===
# Q-learning example with experience replay buffer showing how stored exploratory experience supports off-policy updates and improves sample efficiency.

import numpy as np
import random

# Toy environment: states and actions are both labelled 0 and 1
states = [0, 1]
actions = [0, 1]

# Zero-initialised action-value table, indexed Q[state, action]
Q = np.zeros(shape=(len(states), len(actions)))

# Hyperparameters: learning rate, discount factor, exploration rate
alpha, gamma, epsilon = 0.1, 0.9, 0.3

# Replay buffer: a plain list of (state, action, reward, next_state) tuples,
# capped at buffer_capacity entries and sampled batch_size at a time
replay_buffer = list()
buffer_capacity, batch_size = 100, 4

def epsilon_greedy_policy(state):
    """Choose an action for `state` with the epsilon-greedy rule.

    Draws one uniform number; below `epsilon` a random element of `actions`
    is returned, otherwise the argmax over `Q[state]`.
    """
    roll = np.random.rand()
    if roll < epsilon:
        chosen = np.random.choice(actions)  # Explore
    else:
        chosen = np.argmax(Q[state])        # Exploit
    return chosen

def env_step(state, action):
    """Deterministic toy dynamics.

    The state flips between 0 and 1 on every step; the reward is 1 for
    action 1 and 0 for anything else. Returns `(next_state, reward)`.
    """
    return 1 - state, (1 if action == 1 else 0)

# Seed BOTH random sources used by this example so a rerun reproduces the same
# Q-tables: NumPy's global RNG drives epsilon_greedy_policy, while the
# standard-library `random` module drives random.sample in replay_update.
# Seeding only NumPy leaves the minibatch sampling different on every run.
np.random.seed(42)
random.seed(42)
state = 0  # the episode starts in state 0

def replay_update():
    """Apply Q-learning updates on a random minibatch from `replay_buffer`.

    Does nothing until the buffer holds at least `batch_size` transitions.
    Otherwise `batch_size` stored `(state, action, reward, next_state)` tuples
    are drawn with `random.sample` and applied one after another, so a later
    transition in the batch sees the Q-values already changed by earlier ones.
    """
    if len(replay_buffer) >= batch_size:
        for transition in random.sample(replay_buffer, batch_size):
            s, a, r, s_next = transition
            # Off-policy target: bootstrap from the best next-state value
            td_error = r + gamma * np.max(Q[s_next]) - Q[s, a]
            Q[s, a] += alpha * td_error

for step_num in range(1, 31):
    # 1. Act with the epsilon-greedy behavior policy
    action = epsilon_greedy_policy(state)
    next_state, reward = env_step(state, action)

    # 2. Record the transition; once the buffer is at capacity the oldest
    #    entry (front of the list) is evicted first
    if len(replay_buffer) == buffer_capacity:
        del replay_buffer[0]
    replay_buffer.append((state, action, reward, next_state))

    # 3. Learn from a random minibatch of stored transitions
    replay_update()

    # Report the table every fifth step
    if step_num % 5 == 0:
        print(f"Step {step_num}, Q-table:\n{Q}")

    state = next_state

print("Final Q-table after training with replay buffer:")
print(Q)

Step 5, Q-table:
[[0. 0.]
 [0. 0.]]
Step 10, Q-table:
[[0. 0.]
 [0. 0.]]
Step 15, Q-table:
[[0.    0.1  ]
 [0.009 0.109]]
Step 20, Q-table:
[[0.06965733 0.289639  ]
 [0.15582401 0.22416751]]
Step 25, Q-table:
[[0.24651764 0.56210171]
 [0.29521963 0.55165219]]
Step 30, Q-table:
[[0.50691331 0.97076309]
 [0.40317376 1.01129603]]
Final Q-table after training with replay buffer:
[[0.50691331 0.97076309]
 [0.40317376 1.01129603]]
