In [None]:
# First, install required packages
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gymnasium pyvirtualdisplay > /dev/null 2>&1

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
import time
import base64
from pyvirtualdisplay import Display

# Set up a virtual display for rendering.
# The handle is named `virtual_display` (not `display`) so it does not shadow
# the `display` module imported from IPython, which the animation helper uses.
try:
    virtual_display = Display(visible=0, size=(800, 600))
    virtual_display.start()
except FileNotFoundError as exc:
    # Raised when the Xvfb binary is not installed. The environment below is
    # created with render_mode='rgb_array', which returns frames as arrays;
    # NOTE(review): this presumably does not need an X server - confirm on the
    # target platform.
    virtual_display = None
    print(f"Warning: virtual display unavailable ({exc}); continuing without it.")

# Create environment with 'rgb_array' render mode so env.render() returns frames
env = gym.make('CartPole-v1', render_mode='rgb_array')

# Hyperparameters
LEARNING_RATE = 0.1   # step size of each Q-value update
DISCOUNT = 0.95       # weight given to the estimated future return
EPISODES = 100        # number of training episodes
NUM_BINS = 20         # bins per observation dimension

# Discretization settings: one bin count per observation dimension, and the
# width of a single bin along each dimension.
# NOTE(review): for dimensions whose declared bounds are unbounded or extremely
# large, this width is not a usable finite number - confirm how consumers of
# `discrete_os_win_size` deal with those dimensions.
discrete_os_size = [NUM_BINS] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / discrete_os_size

# Initialize Q-table: one axis per discretized observation dimension plus a
# final axis indexed by action.
q_table = np.zeros(discrete_os_size + [env.action_space.n])

# Track the total reward obtained in each episode
episode_rewards = []

# Observation components whose declared bound exceeds this magnitude are
# treated as unbounded and binned over [-_FALLBACK_LIMIT, _FALLBACK_LIMIT].
# The fallback range is a heuristic working range, not a property of the env.
_UNBOUNDED_THRESHOLD = 1e6
_FALLBACK_LIMIT = 4.0


def get_discrete_state(state):
    """Convert a continuous observation into a tuple of Q-table bin indices.

    Args:
        state: Array-like observation with one entry per observation
            dimension of ``env``.

    Returns:
        A tuple of ints, one per dimension, each in ``[0, bins - 1]`` where
        ``bins`` is the matching entry of ``discrete_os_size``.
    """
    low = np.asarray(env.observation_space.low, dtype=np.float64)
    high = np.asarray(env.observation_space.high, dtype=np.float64)

    # Replace infinite / huge bounds with a finite range; otherwise the bin
    # width is inf or NaN and every value lands in one bin (or an invalid one).
    low = np.where(np.abs(low) > _UNBOUNDED_THRESHOLD, -_FALLBACK_LIMIT, low)
    high = np.where(np.abs(high) > _UNBOUNDED_THRESHOLD, _FALLBACK_LIMIT, high)

    bins = np.asarray(discrete_os_size)

    # Clip first so out-of-range observations map to the edge bins instead of
    # producing negative or too-large indices.
    clipped = np.clip(np.asarray(state, dtype=np.float64), low, high)
    indices = ((clipped - low) / (high - low) * bins).astype(np.int32)

    # A value exactly on the upper bound would otherwise index one past the end.
    indices = np.minimum(indices, bins - 1)
    return tuple(int(i) for i in indices)

def show_animation(frames):
    """Play back a sequence of rendered frames inline in the notebook.

    Args:
        frames: Sequence of images accepted by ``imshow``, shown in order with
            a short pause between them. An empty sequence is a no-op.
    """
    # Imported locally under a distinct alias so that a module-level rebinding
    # of the name `display` (e.g. to a virtual-display handle) cannot break
    # the clear_output/display calls below.
    from IPython import display as ipy_display

    if not frames:
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.axis('off')
        # Create the image artist once and swap its pixel data per frame,
        # rather than stacking a new image on the axes for every frame.
        image = ax.imshow(frames[0])
        for frame in frames:
            image.set_data(frame)
            ipy_display.clear_output(wait=True)
            ipy_display.display(fig)
            time.sleep(0.02)
    finally:
        # Always release the figure, even if display is interrupted.
        plt.close(fig)

# Exploration schedule for epsilon-greedy action selection. Without any
# exploration, a zero-initialised Q-table with positive rewards makes argmax
# return action 0 in every state, so the other action is never tried.
EPSILON_START = 1.0    # probability of a random action in the first episode
EPSILON_MIN = 0.05     # floor on the exploration probability
EPSILON_DECAY = 0.97   # multiplicative decay applied after each episode

# Training loop
epsilon = EPSILON_START
for episode in range(EPISODES):
    state, _ = env.reset()
    discrete_state = get_discrete_state(state)
    done = False
    total_reward = 0
    frames = []  # To store animation frames

    # Render every 10 episodes
    render = episode % 10 == 0

    while not done:
        # Epsilon-greedy: explore with probability epsilon, else act greedily.
        if np.random.random() < epsilon:
            action = int(env.action_space.sample())
        else:
            action = int(np.argmax(q_table[discrete_state]))

        new_state, reward, terminated, truncated, _ = env.step(action)
        new_discrete_state = get_discrete_state(new_state)
        total_reward += reward
        done = terminated or truncated

        if render:
            frames.append(env.render())

        # Q-learning update. A terminated episode has no future return, so the
        # target is just the reward; a truncated (time-limit) episode still
        # bootstraps from the next state's best estimate.
        if terminated:
            max_future_q = 0.0
        else:
            max_future_q = np.max(q_table[new_discrete_state])
        q_index = discrete_state + (action,)
        current_q = q_table[q_index]
        q_table[q_index] = current_q + LEARNING_RATE * (
            reward + DISCOUNT * max_future_q - current_q
        )

        discrete_state = new_discrete_state

    # Explore less as the value estimates improve.
    epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)

    episode_rewards.append(total_reward)
    print(f"Episode: {episode} Reward: {total_reward}")

    # Show animation for this episode if rendered
    if render and frames:
        show_animation(frames)

env.close()

# Chart the total reward collected in each training episode
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(episode_rewards)
ax.set_title('CartPole Q-Learning Performance')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.grid()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'Xvfb'