SGD

In [3]:
import numpy as np

# Define the states in a simple grid world
states = np.array([0, 1, 2, 3, 4])  # 5 states

# Define feature vectors for each state (for simplicity, we'll use the state index as the feature)
def feature_vector(state):
    return np.array([state])

# Initialize weights randomly
initial_weights = np.random.randn(1)

# Set the true value of each state (for demonstration purposes)
true_values = np.array([0, 1, 2, 3, 4])  # True value is simply the state index

# Define the learning rate
alpha = 0.01

# Function to compute value estimate
def value_estimate(state, weights):
    return np.dot(weights, feature_vector(state))


### Batch Gradient Descent ###
print("\nBatch Gradient Descent\n" + "-"*25)

weights = initial_weights.copy()
print(f"Initial Weights: {weights}")

# Perform Gradient Descent update for multiple iterations
n_iterations = 10  # Number of iterations
for iteration in range(n_iterations):
    gradient_sum = np.zeros_like(weights)
    for state in states:
        estimate = value_estimate(state, weights)
        true_value = true_values[state]
        error = true_value - estimate
        gradient = feature_vector(state)
        gradient_sum += error * gradient

    # Update weights
    weights += alpha * gradient_sum

    print(f"Weights after iteration {iteration + 1}: {weights}")


### Stochastic Gradient Descent (SGD) ###
print("\nStochastic Gradient Descent (SGD)\n" + "-"*40)

weights = initial_weights.copy()
print(f"Initial Weights: {weights}")

# Perform Gradient Descent updates for each state individually (SGD) for multiple iterations
n_iterations = 10  # Number of iterations
for iteration in range(n_iterations):
    for state in states:
        estimate = value_estimate(state, weights)
        true_value = true_values[state]
        error = true_value - estimate
        gradient = feature_vector(state)

        # Update weights
        weights += alpha * error * gradient

    print(f"Weights after iteration {iteration + 1}: {weights}")


### Mini-Batch Gradient Descent ###
print("\nMini-Batch Gradient Descent\n" + "-"*35)

weights = initial_weights.copy()
print(f"Initial Weights: {weights}")

batch_size = 2  # Define the size of the mini-batch

# Perform Gradient Descent updates for mini-batches for multiple iterations
n_iterations = 10  # Number of iterations
for iteration in range(n_iterations):
    for i in range(0, len(states), batch_size):
        gradient_sum = np.zeros_like(weights)
        mini_batch_states = states[i:i + batch_size]

        for state in mini_batch_states:
            estimate = value_estimate(state, weights)
            true_value = true_values[state]
            error = true_value - estimate
            gradient = feature_vector(state)
            gradient_sum += error * gradient

        # Update weights
        weights += alpha * gradient_sum

    print(f"Weights after iteration {iteration + 1}: {weights}")



Batch Gradient Descent
-------------------------
Initial Weights: [1.75781712]
Weights after iteration 1: [1.53047199]
Weights after iteration 2: [1.37133039]
Weights after iteration 3: [1.25993127]
Weights after iteration 4: [1.18195189]
Weights after iteration 5: [1.12736632]
Weights after iteration 6: [1.08915643]
Weights after iteration 7: [1.0624095]
Weights after iteration 8: [1.04368665]
Weights after iteration 9: [1.03058065]
Weights after iteration 10: [1.02140646]

Stochastic Gradient Descent (SGD)
----------------------------------------
Initial Weights: [1.75781712]
Weights after iteration 1: [1.55054335]
Weights after iteration 2: [1.3999619]
Weights after iteration 3: [1.29056663]
Weights after iteration 4: [1.21109252]
Weights after iteration 5: [1.15335571]
Weights after iteration 6: [1.11141074]
Weights after iteration 7: [1.08093831]
Weights after iteration 8: [1.05880053]
Weights after iteration 9: [1.04271775]
Weights after iteration 10: [1.03103384]

Mini-Batch Gr

In [None]:
import numpy as np

# Assume we have an environment simulator and a policy π
# For simplicity, let's define a simple environment and policy
def simple_policy(state):
    # A dummy policy that always takes the same action
    return 0  # Always take action 0

def environment_step(state, action):
    # A dummy environment transition, returns next_state, reward, done
    next_state = state + 1
    reward = 1.0  # Constant reward
    done = next_state == 10  # End the episode at state 10
    return next_state, reward, done

# Parameters
alpha = 0.01  # Learning rate
num_episodes = 100  # Number of episodes to run
state_dim = 1  # Dimension of state feature vector

# Initialize value function weights (w) as appropriate
w = np.zeros(state_dim)  # Start with w = 0

# Feature function for state (simple linear feature in this case)
def feature_function(state):
    return np.array([state])

# Main loop: Repeat for each episode
for episode in range(num_episodes):
    # Generate an episode S_0, A_0, R_1, S_1, A_1, ..., R_T, S_T using π
    state = 0  # Start at state 0
    episode_data = []
    while True:
        action = simple_policy(state)
        next_state, reward, done = environment_step(state, action)
        episode_data.append((state, reward))
        if done:
            break
        state = next_state

    # Calculate returns G_t for the episode
    returns = []
    G = 0
    for state, reward in reversed(episode_data):
        G = reward + G  # Monte Carlo return is cumulative reward
        returns.insert(0, G)  # Prepend the return

    # For t = 0, 1, ..., T - 1:
    for t, (state, G_t) in enumerate(zip([s for s, _ in episode_data], returns)):
        # Compute the estimated value v_hat(S_t, w)
        x_t = feature_function(state)
        v_hat = np.dot(w, x_t)

        # Compute the gradient of v_hat with respect to w
        gradient = x_t  # For linear approximation, ∇v_hat is simply the feature vector

        # Update the weights w using the gradient descent update rule
        w += alpha * (G_t - v_hat) * gradient

    # Optionally, print weights after each episode
    print(f"Episode {episode + 1}, Weights: {w}")

# Final value function weights
print("Final weights:", w)


Episode 1, Weights: [0.1588437]
Episode 2, Weights: [0.16077595]
Episode 3, Weights: [0.16079946]
Episode 4, Weights: [0.16079974]
Episode 5, Weights: [0.16079975]
Episode 6, Weights: [0.16079975]
Episode 7, Weights: [0.16079975]
Episode 8, Weights: [0.16079975]
Episode 9, Weights: [0.16079975]
Episode 10, Weights: [0.16079975]
Episode 11, Weights: [0.16079975]
Episode 12, Weights: [0.16079975]
Episode 13, Weights: [0.16079975]
Episode 14, Weights: [0.16079975]
Episode 15, Weights: [0.16079975]
Episode 16, Weights: [0.16079975]
Episode 17, Weights: [0.16079975]
Episode 18, Weights: [0.16079975]
Episode 19, Weights: [0.16079975]
Episode 20, Weights: [0.16079975]
Episode 21, Weights: [0.16079975]
Episode 22, Weights: [0.16079975]
Episode 23, Weights: [0.16079975]
Episode 24, Weights: [0.16079975]
Episode 25, Weights: [0.16079975]
Episode 26, Weights: [0.16079975]
Episode 27, Weights: [0.16079975]
Episode 28, Weights: [0.16079975]
Episode 29, Weights: [0.16079975]
Episode 30, Weights: [0.

In [None]:
import numpy as np

# Define states
states = ['S1', 'S2', 'S3']

# Define rewards for state transitions
rewards = {'S1': 1, 'S2': 2}

# Define transition probabilities (for simplicity, deterministic transitions)
transitions = {
    'S1': 'S2',
    'S2': 'S3'
}

# Initialize value function
V = {state: 0 for state in states}

# Parameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate

# Function to simulate n-step TD
def n_step_td(n, episodes=10):
    for episode in range(episodes):
        state = 'S1'
        t = 0
        states_visited = [state]
        rewards_received = []

        while state != 'S3':
            next_state = transitions[state]
            reward = rewards[state]
            rewards_received.append(reward)
            states_visited.append(next_state)
            state = next_state
            t += 1

            if len(rewards_received) >= n:
                G = sum([gamma**i * rewards_received[i] for i in range(n)])
                if state != 'S3':
                    G += gamma**n * V[state]

                update_state = states_visited[t-n]
                V[update_state] += alpha * (G - V[update_state])

                rewards_received = rewards_received[1:]
                states_visited = states_visited[1:]

        # Final update if episode ends before n steps
        for tau in range(len(rewards_received)):
            G = sum([gamma**i * rewards_received[i] for i in range(tau, len(rewards_received))])
            update_state = states_visited[tau]
            V[update_state] += alpha * (G - V[update_state])

# Run n-step TD for n = 1, 2, 3
print("Initial Value Function:", V)
for n in [1, 2, 3]:
    print(f"\nRunning {n}-step TD...")
    n_step_td(n, episodes=10)
    print(f"Value Function after {n}-step TD:", V)

# Final value function
print("\nFinal Value Function:", V)


Initial Value Function: {'S1': 0, 'S2': 0, 'S3': 0}

Running 1-step TD...
Value Function after 1-step TD: {'S1': 0.6513215599000001, 'S2': 0, 'S3': 1.3026431198000001}

Running 2-step TD...
Value Function after 2-step TD: {'S1': 2.0508021532294305, 'S2': 1.3026431198000001, 'S3': 1.3026431198000001}

Running 3-step TD...
Value Function after 3-step TD: {'S1': 2.5387708634617594, 'S2': 1.6265823788388616, 'S3': 1.3026431198000001}

Final Value Function: {'S1': 2.5387708634617594, 'S2': 1.6265823788388616, 'S3': 1.3026431198000001}
