In [None]:
import gym
import numpy as np


In [None]:
# Build the environment once; every later cell reuses this `env` object.
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)

# Start the first episode and show the observation returned by reset().
state = env.reset()
print("Initial State:", state)


In [None]:
# Summarise the action space and the observation space (with its declared
# upper and lower limits), one labelled line per item.
observation_space = env.observation_space
for label, value in (
    ("Action Space:", env.action_space),
    ("Observation Space:", observation_space),
    ("Observation Space High:", observation_space.high),
    ("Observation Space Low:", observation_space.low),
):
    print(label, value)


In [None]:
# Take ten uniformly random actions, logging the outcome of each step.
for _ in range(10):
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    print(f"Action: {action}, Reward: {reward}, Done: {done}")

    if not done:
        continue
    # The episode ended: start a fresh one before taking the next step.
    state = env.reset()


In [None]:
# Baseline policy used to sanity-check the environment loop
def simple_policy(observation):
    """Return action 1 ("move right") no matter what ``observation`` is.

    The observation argument is accepted so the function has the shape of a
    policy, but it is never inspected.
    """
    move_right = 1
    return move_right

# Roll out the fixed policy for a bounded number of steps, drawing each frame,
# and stop early as soon as the environment reports the episode is over.
max_steps = 50
state = env.reset()
for _ in range(max_steps):
    action = simple_policy(state)
    state, reward, done, info = env.step(action)
    env.render()
    if done:
        break

# Release the rendering resources opened by env.render().
env.close()


In [None]:
# Q-learning hyperparameters
alpha = 0.1           # step size applied to each temporal-difference update
gamma = 0.99          # discount applied to the bootstrapped future value
epsilon = 1.0         # initial probability of taking a random action
epsilon_decay = 0.99  # epsilon is multiplied by this after every episode
num_episodes = 500    # number of training episodes

# Q-table: one axis per discretized observation dimension, plus a final axis
# holding one value per action. All entries start at zero.
num_states = (10, 10, 10, 10)
q_table = np.zeros((*num_states, env.action_space.n))

# Discretize state
def discretize_state(state, bins=(10, 10, 10, 10), low=None, high=None, unbounded_limit=3.5):
    """Map a continuous observation to a tuple of integer bin indices.

    Each dimension ``i`` is split into ``bins[i]`` equal-width buckets between
    its lower and upper bound, and the returned index is always within
    ``0 .. bins[i] - 1`` so it can index a table with ``bins[i]`` entries on
    that axis. Values outside the bounds are clamped into the first or last
    bucket.

    Parameters
    ----------
    state : sequence of float
        The continuous observation, one value per dimension.
    bins : sequence of int
        Number of buckets per dimension.
    low, high : array-like, optional
        Per-dimension bounds. When omitted they are read from the global
        ``env.observation_space``.
    unbounded_limit : float
        Finite magnitude substituted for bounds that are infinite, NaN, or
        absurdly large. Gym's CartPole declares its two velocity dimensions
        with bounds around +/-3.4e38; spreading a handful of bins over that
        range puts every realistic velocity into the same bucket.

    Returns
    -------
    tuple of int
        One bin index per element of ``state``.
    """
    if low is None:
        low = env.observation_space.low
    if high is None:
        high = env.observation_space.high

    # np.array (not asarray) so the caller's / environment's arrays are never
    # mutated; float64 avoids overflow when working near float32 limits.
    low = np.array(low, dtype=np.float64)
    high = np.array(high, dtype=np.float64)

    # Treat non-finite or enormous bounds as "unbounded" and replace them with
    # a usable finite range.
    low[~np.isfinite(low) | (low < -1e6)] = -unbounded_limit
    high[~np.isfinite(high) | (high > 1e6)] = unbounded_limit

    indices = []
    for i, value in enumerate(state):
        # bins[i] buckets need bins[i] - 1 interior edges; dropping the two
        # outer edges makes np.digitize return 0 .. bins[i] - 1.
        edges = np.linspace(low[i], high[i], bins[i] + 1)[1:-1]
        indices.append(int(np.digitize(value, edges)))
    return tuple(indices)

# Train Q-learning agent (epsilon-greedy behaviour policy, tabular updates)
for episode in range(num_episodes):
    state = discretize_state(env.reset())
    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy action selection.
        if np.random.random() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = int(np.argmax(q_table[state]))  # Exploit

        next_observation, reward, done, info = env.step(action)
        next_state = discretize_state(next_observation)
        total_reward += reward

        # Once the episode has truly terminated there is no future return, so
        # the target must not bootstrap from next_state. Gym's TimeLimit
        # wrapper flags episodes cut off by the step limit with
        # info["TimeLimit.truncated"]; those are not failures, so we keep
        # bootstrapping for them.
        terminal = done and not info.get("TimeLimit.truncated", False)
        future_value = 0.0 if terminal else np.max(q_table[next_state])
        td_target = reward + gamma * future_value

        # Update Q-table with a single full index instead of chained indexing.
        q_index = state + (action,)
        q_table[q_index] += alpha * (td_target - q_table[q_index])
        state = next_state

    epsilon *= epsilon_decay  # Decay exploration rate
    print(f"Episode {episode + 1}, Total Reward: {total_reward}")


In [None]:
# Play one episode acting greedily with respect to the learned Q-table,
# rendering every step and accumulating the reward.
total_reward = 0
done = False
state = discretize_state(env.reset())

while not done:
    action = np.argmax(q_table[state])  # greedy action, no exploration
    observation, reward, done, _ = env.step(action)
    state = discretize_state(observation)
    total_reward += reward
    env.render()

print(f"Total Reward: {total_reward}")
env.close()
