In [1]:
!pip install pygame pyvirtualdisplay

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl.metadata (943 bytes)
Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0


In [2]:
!apt install swig && pip install gymnasium box2d box2d-kengz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 3s (442 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 126319 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubunt

In [3]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display, HTML
import base64
import os

In [4]:
# Create the CartPole-v1 environment with frames rendered as RGB arrays, then
# print its observation and action spaces for reference.
env = gym.make("CartPole-v1", render_mode="rgb_array")
print(env.observation_space)
print(env.action_space)

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Discrete(2)


In [5]:
# Shape of a single observation vector.
env.observation_space.shape

(4,)

In [6]:
# Pair each observation dimension's lower and upper limit as one [low, high] row.
# A single column_stack replaces the manual index loop and leaves no scratch
# variables behind in the kernel namespace.
obs_space_low_ar = env.observation_space.low
obs_space_high_ar = env.observation_space.high
obs_bounds = np.column_stack((obs_space_low_ar, obs_space_high_ar))
print(obs_bounds)


[[-4.8         4.8       ]
 [       -inf         inf]
 [-0.41887903  0.41887903]
 [       -inf         inf]]


In [7]:
# Finite [low, high] bounds per state dimension, used for clipping and tile coding.
# The two dimensions the environment reports as (-inf, inf) (indices 1 and 3) are
# replaced with a hand-picked [-10, 10]; the other two keep the environment's limits.
practical_state_bounds = np.array([[-4.8, 4.8], [-10, 10], [-0.41887903, 0.41887903], [-10, 10]])

In [8]:
import numpy as np

class TileCoder:
    """Tile coding of a bounded continuous state into a sparse binary feature vector.

    Each of the `num_tilings` tilings is a regular grid with `tiles_per_dim` tiles
    along every state dimension. The grids are shifted against each other by a
    fraction of one tile width, and every tiling contributes exactly one active
    feature per state.

    Args:
        num_tilings: Number of overlapping tilings (must be >= 1).
        tiles_per_dim: Number of tiles along each dimension of a tiling (must be >= 2).
        state_bounds: Array-like of shape (D, 2) with a finite [low, high] pair per
            state dimension, where high > low.

    Raises:
        ValueError: If an argument is out of range or the bounds are malformed.
    """

    def __init__(self, num_tilings, tiles_per_dim, state_bounds):
        bounds = np.asarray(state_bounds, dtype=float)
        if num_tilings < 1:
            raise ValueError("num_tilings must be at least 1")
        if tiles_per_dim < 2:
            # tile_width divides by (tiles_per_dim - 1).
            raise ValueError("tiles_per_dim must be at least 2")
        if bounds.ndim != 2 or bounds.shape[1] != 2:
            raise ValueError("state_bounds must have shape (D, 2)")
        if not np.all(np.isfinite(bounds)) or np.any(bounds[:, 1] <= bounds[:, 0]):
            raise ValueError("state_bounds must be finite with high > low in every dimension")

        self.num_tilings = num_tilings
        self.tiles_per_dim = tiles_per_dim
        self.state_bounds = bounds  # shape: (D, 2), where D = state dimensions

        self.state_dim = len(bounds)
        # The bounded range spans (tiles_per_dim - 1) tile widths; the remaining tile
        # is slack that absorbs the per-tiling shift.
        self.tile_width = (bounds[:, 1] - bounds[:, 0]) / (tiles_per_dim - 1)
        # endpoint=False keeps every shift strictly below one tile width. A shift of a
        # full tile would reproduce the boundaries of the unshifted tiling and push a
        # state at the upper bound one tile past the end of the grid.
        self.offsets = [
            np.linspace(0, self.tile_width[d], num_tilings, endpoint=False)
            for d in range(self.state_dim)
        ]

        self.total_tiles = (tiles_per_dim ** self.state_dim) * num_tilings

    def get_features(self, state):
        """
        Return binary feature vector for given state only (not action)

        Args:
            state: Sequence with one value per state dimension. Values outside the
                configured bounds are mapped to the nearest edge tile.

        Returns:
            1-D float array of length `total_tiles` with exactly `num_tilings` ones.
        """
        features = np.zeros(self.total_tiles)
        top_tile = self.tiles_per_dim - 1
        for tiling in range(self.num_tilings):
            coords = []
            for i in range(self.state_dim):
                shifted = state[i] - self.state_bounds[i, 0] + self.offsets[i][tiling]
                # Floor and clamp so that out-of-range values cannot produce a negative
                # or too-large coordinate, which would alias into an unrelated tile.
                coord = np.clip(np.floor(shifted / self.tile_width[i]), 0, top_tile)
                coords.append(int(coord))

            features[self._tile_index(tiling, coords)] = 1

        return features

    def _tile_index(self, tiling, coords):
        """
        Compute flattened index in the feature vector (no action)

        The coordinates are combined in row-major order within a tiling, and each
        tiling occupies its own contiguous slice of `tiles_per_dim ** state_dim` entries.
        """
        index = 0
        for c in coords:
            index = index * self.tiles_per_dim + c
        index += tiling * (self.tiles_per_dim ** self.state_dim)
        return index
# Shared tile coder: 8 tilings with 8 tiles per dimension over the finite practical bounds.
tile_coder = TileCoder(num_tilings=8, tiles_per_dim=8, state_bounds=practical_state_bounds)

The action space is discrete, with actions taking the value 0 or 1. The observation space is continuous in $\mathbb{R}^4$. The second and fourth dimensions (cart velocity and pole angular velocity) are unbounded, so I clip them to $[-10, 10]$ before tile coding. This notebook is an implementation of the REINFORCE with Baseline algorithm. The baseline is a linear function approximation of the state value over tile-coded features. The parameterized policy is a neural network that takes the same tile-coded features as input.

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- Training hyperparameters ---
num_episodes = 15000
gamma = 0.99                   # discount factor on future rewards
alpha_sv_estimates = 0.02      # step size for the linear state-value baseline
alpha_policy_updates = 0.015   # learning rate handed to the policy optimizer

# --- Problem dimensions ---
dim = tile_coder.total_tiles   # length of the tile-coded feature vector
num_actions = env.action_space.n

# --- Baseline weights and compute device ---
wts = np.random.normal(size=dim)  # randomly initialised weights of the linear baseline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



class PolicyNetwork(nn.Module):
    """Two-layer perceptron that maps a feature vector to action probabilities.

    Args:
        state_dim: Length of the input feature vector.
        action_dim: Number of discrete actions, i.e. the size of the output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # One hidden layer of 128 units followed by the action head.
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, action_dim)

    def forward(self, state):
        """Return softmax probabilities taken over the last dimension of the output."""
        hidden = torch.relu(self.fc1(state))
        return torch.softmax(self.fc2(hidden), dim=-1)

def get_features(s):
    """Tile-code a raw observation after clamping it into the practical bounds.

    Args:
        s: Observation with one entry per state dimension.

    Returns:
        The feature vector produced by `tile_coder.get_features` for the clamped state.
    """
    lower, upper = practical_state_bounds[:, 0], practical_state_bounds[:, 1]
    bounded_state = np.clip(s, lower, upper)
    return tile_coder.get_features(bounded_state)

def gradient_descent_sv_weights(w, G, s):
    """Take one gradient step of the linear state-value estimate toward the return `G`.

    Args:
        w: Weight vector of the linear baseline; a NumPy array is updated in place.
        G: Target return observed from state `s`.
        s: Raw observation whose value estimate is being corrected.

    Returns:
        `w` after the update.
    """
    phi = get_features(s)
    prediction_error = G - np.dot(w, phi)
    w += (alpha_sv_estimates * prediction_error) * phi
    return w

def select_action(policy_net, state):
    """Sample an action from the policy for a raw observation.

    Args:
        policy_net: Module mapping a feature tensor to action probabilities.
        state: Raw observation; it is tile-coded before being fed to the network.

    Returns:
        Tuple `(action, log_prob)`: the sampled action as a Python int and the
        log-probability tensor of that action under the current policy.
    """
    feature_vec = get_features(state)
    net_input = torch.tensor(feature_vec, dtype=torch.float32).to(device)
    action_dist = torch.distributions.Categorical(policy_net(net_input))
    sampled = action_dist.sample()
    return sampled.item(), action_dist.log_prob(sampled)

def get_log_prob(s, a):
    """Log-probability of action `a` in state `s` under the global `policy_network`.

    Args:
        s: Raw observation; it is tile-coded before being fed to the network.
        a: Integer action index.

    Returns:
        Scalar tensor holding log pi(a | s), attached to the autograd graph.
    """
    features = get_features(s)
    # Both tensors must live on the same device as the network; leaving them on the
    # CPU raises a device-mismatch error as soon as the policy runs on CUDA.
    net_input = torch.tensor(features, dtype=torch.float32, device=device)
    probs = policy_network(net_input)
    action_dist = torch.distributions.Categorical(probs)
    action = torch.tensor(a, dtype=torch.long, device=device)
    return action_dist.log_prob(action)

# Policy over tile-coded features, placed on the selected device, together with the
# Adam optimizer that updates its parameters.
policy_network = PolicyNetwork(dim, num_actions).to(device)
optimizer = torch.optim.Adam(policy_network.parameters(), lr=alpha_policy_updates)


In [18]:
# Earlier version of the training loop, disabled by wrapping it in a string literal.
# It takes one optimizer step per visited state and recomputes every discounted
# return with an inner loop over the remaining rewards.
'''for episode_idx in range(num_episodes):
  rewards = []
  actions = []
  states = []
  obs, _ = env.reset() #reset the env to get initial state
  undiscounted_reward_sum = 0
  done = False
  while not done:
      states.append(obs)
      action, log_prob = select_action(policy_network, obs)
      obs, reward, terminated, truncated, _ = env.step(action)
      undiscounted_reward_sum += reward
      rewards.append(reward)
      actions.append(action)
      done = terminated or truncated
  if episode_idx % 100 == 0:
    print(undiscounted_reward_sum)
  for i in range(len(states)):
    #loop thru each state
    cur_state = states[i]
    cur_reward = 0
    for j in range(i, len(rewards)):
      cur_reward += gamma**(j-i) * rewards[j]

    optimizer.zero_grad()
    loss = -1 * get_log_prob(cur_state, actions[i]) * (cur_reward - np.dot(wts, get_features(cur_state)))
    loss.backward()
    optimizer.step()
    wts = gradient_descent_sv_weights(wts, cur_reward, cur_state)


env.close()'''


20.0
9.0


KeyboardInterrupt: 

In [None]:
# REINFORCE with a learned linear baseline: one Monte Carlo policy update per episode.
for episode_idx in range(num_episodes):
    rewards = []
    actions = []
    states = []
    log_probs = []

    obs, _ = env.reset()
    undiscounted_reward_sum = 0
    done = False

    # Roll out one full episode with the current stochastic policy.
    while not done:
        states.append(obs)
        action, log_prob = select_action(policy_network, obs)
        obs, reward, terminated, truncated, _ = env.step(action)

        undiscounted_reward_sum += reward
        rewards.append(reward)
        actions.append(action)
        log_probs.append(log_prob)

        done = terminated or truncated

    if episode_idx % 100 == 0:
        print(undiscounted_reward_sum)

    # Discounted returns G_t = r_t + gamma * G_{t+1}, accumulated back to front.
    # Appending and reversing once is linear in the episode length, whereas
    # inserting at index 0 on every step is quadratic.
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()

    # Compute advantages and update baseline weights. The advantage for a state is
    # taken before the baseline is updated with that state's return.
    advantages = []
    for cur_state, cur_return in zip(states, returns):
        feat_np = get_features(cur_state)
        baseline = np.dot(wts, feat_np)
        adv = cur_return - baseline
        advantages.append(adv)

        # Update linear baseline (on CPU)
        wts = gradient_descent_sv_weights(wts, cur_return, cur_state)

    # Batch the episode on the training device. Separate names keep the Python
    # lists above from being rebound to tensors.
    log_prob_batch = torch.stack(log_probs).to(device)
    advantage_batch = torch.tensor(advantages, dtype=torch.float32, device=device)

    # Normalize advantages for stability. The standard deviation of a single
    # sample is NaN, which would poison the loss, so one-step episodes skip it.
    if advantage_batch.numel() > 1:
        advantage_batch = (advantage_batch - advantage_batch.mean()) / (advantage_batch.std() + 1e-8)

    # Compute batched policy loss and backprop
    loss = -(log_prob_batch * advantage_batch).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


18.0
10.0
10.0
11.0
10.0
9.0
10.0
9.0
8.0
10.0
10.0
9.0
10.0
9.0
8.0
11.0
9.0
10.0
10.0
10.0
10.0
9.0
8.0
8.0
8.0
11.0
8.0
10.0
10.0
9.0
9.0
9.0
9.0
9.0
10.0
10.0
10.0
10.0
11.0
9.0
10.0
8.0
9.0
9.0
11.0
9.0
9.0
10.0
10.0
8.0
9.0
10.0
9.0
9.0
10.0
10.0
10.0
10.0
10.0
8.0
9.0
10.0
10.0
10.0
9.0
8.0
10.0
10.0
10.0
9.0
17.0
18.0
41.0
24.0
22.0
28.0
29.0
26.0
28.0
28.0
41.0
53.0
39.0
22.0
23.0
54.0
49.0
34.0
18.0
25.0
22.0
17.0
49.0
51.0
44.0
61.0
41.0
39.0
32.0
94.0
105.0
56.0
67.0
52.0
92.0
96.0
89.0
104.0
34.0
100.0
111.0
101.0
93.0
96.0
167.0
171.0
92.0
110.0
98.0
96.0
102.0
98.0
118.0
102.0
106.0
104.0
102.0
88.0
101.0
110.0
106.0
110.0
89.0
88.0
87.0
98.0
114.0
104.0
108.0
122.0
112.0


In [13]:
# Roll out 5 evaluation episodes, sampling actions from the learned policy.
# Nothing is trained here, so gradient tracking is switched off to avoid building
# an autograd graph on every step.
rewards = []
with torch.no_grad():
    for episode_idx in range(5):
        obs, _ = env.reset()
        reward_sum = 0
        done = False

        while not done:
            action, log_prob = select_action(policy_network, obs)
            obs, reward, terminated, truncated, _ = env.step(action)

            reward_sum += reward

            done = terminated or truncated

        rewards.append(reward_sum)

print(rewards)


[114.0, 106.0, 118.0, 132.0, 110.0]


In [15]:
def select_greedy_action(policy_net, state):
    """Return the most probable action for a raw observation.

    Args:
        policy_net: Module mapping a feature tensor to action probabilities.
        state: Raw observation; it is tile-coded before being fed to the network.

    Returns:
        Index of the highest-probability action as a Python int.
    """
    features = get_features(state)
    net_input = torch.tensor(features, dtype=torch.float32, device=device)
    # Only the argmax is needed, so skip gradient tracking and take it straight from
    # the network output instead of wrapping it in a distribution object.
    with torch.no_grad():
        probs = policy_net(net_input)
    return probs.argmax().item()

# Roll out 25 evaluation episodes, always taking the most probable action.
# Nothing is trained here, so gradient tracking is switched off for the rollouts.
rewards = []
with torch.no_grad():
    for episode_idx in range(25):
        obs, _ = env.reset()
        reward_sum = 0
        done = False

        while not done:
            action = select_greedy_action(policy_network, obs)
            obs, reward, terminated, truncated, _ = env.step(action)

            reward_sum += reward

            done = terminated or truncated

        rewards.append(reward_sum)

print(rewards)

[86.0, 104.0, 98.0, 93.0, 102.0, 110.0, 90.0, 120.0, 92.0, 114.0, 112.0, 102.0, 96.0, 106.0, 106.0, 100.0, 110.0, 93.0, 104.0, 106.0, 100.0, 86.0, 114.0, 116.0, 100.0]
