In [38]:
import numpy as np
import gym
import numpy as np
import matplotlib.pyplot as plt

In [39]:
class LSPI:
    def __init__(self, basis_function, discount, state_size=4):
        self.basis_function = basis_function
        self.discount = discount
        # check if the basis function is linear or not
        if isinstance(basis_function, LinearBasisFunction):
            self.w = np.zeros(basis_function.size)  # Initialize the weight vector
        else:
            self.w = np.zeros(basis_function.size * state_size+1)  # Initialize the weight vector

    def Qvalue(self, state, action):
        phi = self.basis_function.evaluate(state, action)
        return np.dot(self.w, phi)

    def policy(self, state):
        # In the CartPole environment, there are only two possible actions: 0 and 1
        q_values = [self.Qvalue(state, a) for a in [0, 1]]
        return np.argmax(q_values)

    def update(self, state, action, reward, next_state):
        # This is a simplified version of the update rule in the provided code
        phi = self.basis_function.evaluate(state, action)
        if next_state is not None:
            q_next = max([self.Qvalue(next_state, a) for a in [0, 1]])
        else:
            q_next = 0
        target = reward + self.discount * q_next
        error = target - self.Qvalue(state, action)
        self.w += 0.01 * error * phi  # Assume a constant learning rate of 0.01


In [40]:
class LinearBasisFunction:
    def __init__(self):
        self.size = 5  # 4 dimensions for the state and 1 for the action

    def evaluate(self, state, action):
        return np.concatenate([state, [action]])

In [41]:
class RadialBasisFunction:
    def __init__(self, num_centers):
        self.size = num_centers
        self.centers = np.linspace(-1, 1, num_centers)

    def evaluate(self, state, action):
        return np.concatenate([np.exp(-np.square(state - c)) for c in self.centers] + [np.array([action])])

In [49]:
# Create the environment
env = gym.make('CartPole-v1', render_mode="rgb_array")

# Create the basis function
basis_function = LinearBasisFunction()
basis_function = RadialBasisFunction(4)

# Initialize LSPI with the basis function and a discount factor
lspi = LSPI(basis_function, discount=0.99)

# Main training loop
num_episodes = 100000
for i_episode in range(num_episodes):
    # Reset the environment and state
    state = env.reset()[0]
    done = False
    reward_sum = 0
    while not done:
        # Select an action
        action = lspi.policy(state)

        # Execute the action
        next_state, reward, done, _, _ = env.step(action)
        next_state = None if done else next_state

        # Update the policy
        lspi.update(state, action, reward, next_state)

        # Update the state
        state = next_state
        reward_sum += reward

    if i_episode % 10000 == 0:
        # render a game
        state = env.reset()[0]
        done = False
        count = 0
        while not done:
            if count > 500:
                break
            env.render()
            action = lspi.policy(state)
            next_state, reward, done, _, _ = env.step(action)
            next_state = None if done else next_state
            state = next_state
        print("count: ", count)
        print("Episode: {}, reward: {}".format(i_episode, reward_sum))
    env.close()

count:  0
Episode: 0, reward: 8.0
count:  0
Episode: 10000, reward: 10.0
count:  0
Episode: 20000, reward: 9.0
count:  0
Episode: 30000, reward: 9.0
count:  0
Episode: 40000, reward: 8.0
count:  0
Episode: 50000, reward: 10.0
count:  0
Episode: 60000, reward: 9.0


KeyboardInterrupt: 