# In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)

# Create the CartPole environment
# NOTE(review): render_mode='human' draws every step, which makes a
# 2000-episode run slow; consider render_mode=None while training - confirm
# whether live rendering is actually wanted here.
env = gym.make('CartPole-v1', render_mode='human')

# np.random.seed() only covers NumPy's global generator. The environment and
# its action space keep their own generators, so seed them explicitly;
# otherwise initial states and exploratory actions differ from run to run.
env.action_space.seed(SEED)
env.reset(seed=SEED)

# Hyperparameters
alpha = 0.1   # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate (initial value, decayed once per episode)
epsilon_min = 0.01  # Threshold below which epsilon is no longer decayed
epsilon_decay = 0.995  # Multiplicative decay applied to epsilon per episode
num_episodes = 2000
# Number of bin edges per state dimension, in the order
# (cart position, cart velocity, pole angle, pole angular velocity)
num_bins = (16, 12, 16, 12)

# Function to discretize the continuous state space
def discretize_state(state, num_bins,
                     bounds=((-2.4, 2.4), (-3.0, 3.0), (-0.2, 0.2), (-3.0, 3.0))):
    """Map a continuous observation to a tuple of non-negative bin indices.

    Args:
        state: The observation as an array-like of floats, or an
            ``(observation, info)`` pair whose second item is a dict.
        num_bins: Number of evenly spaced bin edges per state dimension.
        bounds: ``(low, high)`` range spanned by the edges of each dimension.
            The defaults are the ranges for cart position, cart velocity,
            pole angle and pole angular velocity.

    Returns:
        A tuple of Python ints, one per dimension. Index ``0`` means the value
        lies below the lowest edge and index ``num_bins[i]`` means it lies at
        or above the highest edge, so dimension ``i`` can take
        ``num_bins[i] + 1`` distinct values.
    """
    # Unwrap an (observation, info) pair; anything else is taken to be the
    # observation itself.
    if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
        state = state[0]
    observation = np.asarray(state, dtype=float)

    indices = []
    for value, edge_count, (low, high) in zip(observation, num_bins, bounds):
        edges = np.linspace(low, high, edge_count)
        # Use the raw digitize result (0..edge_count). Subtracting 1 would
        # give -1 for values below the lowest edge, and a table lookup would
        # then silently depend on negative-index wrap-around.
        indices.append(int(np.digitize(value, edges)))
    return tuple(indices)

# Initialize Q-table (value function)
# Digitizing against n edges yields n + 1 possible intervals, so every state
# axis gets one more slot than it has edges; the last axis indexes the action.
num_actions = env.action_space.n
num_states = tuple(edge_count + 1 for edge_count in num_bins) + (num_actions,)
# Entries start as uniform random values in [-1, 1) (an all-zero table is the
# usual alternative).
Q = np.random.uniform(low=-1, high=1, size=num_states)

# Function to choose action based on epsilon-greedy policy
def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` an action is sampled from the environment's
    action space; otherwise the action with the largest entry in ``Q[state]``
    is returned.
    """
    should_explore = np.random.rand() < epsilon
    if should_explore:
        return env.action_space.sample()
    # Greedy choice over the action values stored for this state.
    return np.argmax(Q[state])

# Function to update Q-table based on TD(0)
def update_Q(state, action, reward, next_state, alpha, gamma):
    """Apply one Q-learning update to the global table ``Q`` in place.

    The entry for ``(state, action)`` is moved a fraction ``alpha`` of the way
    towards ``reward + gamma * max(Q[next_state])``.
    """
    # Discounted value of the best action available from the successor state;
    # computed before the write so a self-transition reads the old values.
    discounted_future = gamma * np.max(Q[next_state])
    action_values = Q[state]
    action_values[action] += alpha * (
        reward + discounted_future - action_values[action]
    )

# Total (undiscounted) reward collected in each episode, in episode order
total_rewards = []

# Training loop
# The try/finally guarantees the environment is closed even when training is
# interrupted or an update raises.
try:
    for episode in range(num_episodes):
        # discretize_state unwraps an (observation, info) pair itself, so the
        # value returned by reset() can be passed straight through.
        state = discretize_state(env.reset(), num_bins)

        done = False
        episode_reward = 0

        while not done:
            action = choose_action(state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = discretize_state(next_state, num_bins)
            episode_reward += reward

            # A terminated transition has no future return, so a discount of
            # 0 removes the bootstrap term instead of reading the arbitrary
            # table values of the terminal state. A truncated (time-limit)
            # transition still bootstraps normally.
            update_Q(state, action, reward, next_state, alpha,
                     0.0 if terminated else gamma)
            state = next_state

            # The episode is over on failure or when the time limit cuts it short.
            done = terminated or truncated

        total_rewards.append(episode_reward)

        # Decay epsilon for exploration-exploitation trade-off, clamped so it
        # never drops below epsilon_min.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
finally:
    env.close()

# Plot total rewards per episode
fig, ax = plt.subplots()
ax.plot(total_rewards)
ax.set(xlabel='Episode', ylabel='Total Reward', title='Total Rewards per Episode')
plt.show()

# Check if the agent has learned to balance the pole: one episode reaching a
# total reward of 200 is enough to count as success.
if max(total_rewards) < 200:
    print("The agent has not yet learned to balance the pole.")
else:
    print("The agent has learned to balance the pole.")