In [2]:
%pip install tmrl

Collecting tmrl
  Downloading tmrl-0.7.1-py3-none-any.whl.metadata (2.7 kB)
Collecting rtgym>=0.13 (from tmrl)
  Downloading rtgym-0.16-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyautogui (from tmrl)
  Downloading PyAutoGUI-0.9.54.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pyinstrument (from tmrl)
  Downloading pyinstrument-5.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (26 kB)
Collecting tlspyo>=0.2.5 (from tmrl)
  Downloading tlspyo-0.3.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
INFO: pip is looking at multiple ve

In [3]:
import tmrl
import time
import matplotlib.pyplot as plts
import numpy as np
import torch
import torch.nn as nn

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(device)

'Tesla T4'

In [5]:
env = tmrl.get_environment()
print('Observation Space:\t', env.observation_space)
print('Action Space:\t\t', env.action_space)
observation_space = np.sum([np.prod(value.shape) for value in env.observation_space])
action_space = env.action_space.shape[0]
print('observation_space logits:', observation_space)
print('action_space logits:\t', action_space)

Observation Space:	 Tuple(Box(0.0, 1000.0, (1,), float32), Box(0.0, 6.0, (1,), float32), Box(0.0, inf, (1,), float32), Box(0.0, 255.0, (4, 64, 64), float32), Box(-1.0, 1.0, (3,), float32), Box(-1.0, 1.0, (3,), float32))
Action Space:		 Box(-1.0, 1.0, (3,), float32)
observation_space logits: 16393
action_space logits:	 3


In [6]:
hyper_params = {'policy_lr': 1e-5,
                'critic_lr': 1e-5,
                'gamma': 0.996,
                'clip_coef': 0.2,
                'critic_coef': 0.1,
                'entropy_coef': 0.1,
                'batch_size': 256,
                'num_updates': 10000,
                'epochs_per_update': 100,
                'hidden_dim':512,
                'max_episode_steps': 2400,
                'norm_advantages': True,
                'grad_clip_val': 0.1,
                'initial_std': 1,
                'avg_ray': 400}

In [9]:
class Policy (nn.Module):
    def __init__(self):
        super().__init__()
        self.action_mean = nn.Sequential(
            #nn.LayerNorm(observation_space),
            #nn.BatchNorm1d(observation_space),
            nn.Linear(observation_space, hyper_params['hidden_dim']),
            nn.ReLU(),
            nn.Linear(hyper_params['hidden_dim'],hyper_params['hidden_dim']),
            nn.ReLU(),
            nn.Linear(hyper_params['hidden_dim'], action_space),
            nn.Tanh()
        )

        self.actor_logvar = nn.Sequential(
            #nn.LayerNorm(observation_space),
            #nn.BatchNorm1d(observation_space),
            nn.Linear(observation_space, hyper_params['hidden_dim']),
            nn.ReLU(),
            nn.Linear(hyper_params['hidden_dim'],hyper_params['hidden_dim']),
            nn.ReLU(),
            nn.Linear(hyper_params['hidden_dim'], 1)
        )

    def sample_action_with_logprobs(self, observation):
        dist = self(observation)
        sample_action = dist.sample()
        return sample_action, dist.log_prob(sample_action)
    
    def mean_only(self, observation):
        with torch.no_grad():
            return self.action_mean(observation)
    
    def get_action_log_prob(self, observation, action):
        dist = self(observation)
        return dist.log_prob(action)
    
    def forward(self, observation):
        observation /= hyper_params['avg_ray']
        means = self.action_mean(observation)
        vars = torch.zeros(observation.shape[0], action_space).to(device)
        vars[:,:] = self.actor_logvar(observation).exp().view(-1,1)
        covar_mat = torch.zeros(observation.shape[0], action_space, action_space).to(device)
        covar_mat[:,np.arange(action_space), np.arange(action_space)] = vars

        dist = torch.distributions.MultivariateNormal(means, covar_mat)
        return dist
        
class Critic (nn.Module):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            #nn.LayerNorm(observation_space),
            #nn.BatchNorm1d(observation_space),
            nn.Linear(observation_space, hyper_params['hidden_dim']),
            nn.ReLU(),
            nn.Linear(hyper_params['hidden_dim'],hyper_params['hidden_dim']),
            nn.ReLU(),
            nn.Linear(hyper_params['hidden_dim'], 1)
        )
    def forward(self, observation):
        observation /= hyper_params['avg_ray']
        return self.network(observation)
    
class Agent(nn.Module):
    def __init__(self):
        super().__init__()
        self.policy = Policy()
        self.critic = Critic()
    def forward(self, x):
        raise SyntaxError('Propagate through Agent.policy \
                          and Agent.critic individually')

In [10]:
def env_obs_to_tensor(observations):
    tensors = [torch.tensor(observation).view(-1) for observation in observations]
    return torch.cat(tuple(tensors), dim=-1)

def env_act_to_tensor(action):
    return torch.tensor(action)

In [11]:
agent = Agent().to(device)
policy_optim = torch.optim.Adam(agent.policy.parameters(), lr=hyper_params['policy_lr'])
critic_optim = torch.optim.Adam(agent.critic.parameters(), lr=hyper_params['critic_lr'])
#agent.load_state_dict(torch.load('130.33999633789062RewardRacer56Update.pt'))

In [None]:
def train_PPO():
    """
    Main PPO training loop that collects experience, computes advantages, and optimizes policy.
    
    PPO (Proximal Policy Optimization) is an on-policy algorithm that:
    1. Collects trajectories using the current policy
    2. Computes advantages (how much better actions were than expected)
    3. Updates policy using clipped surrogate objective to prevent large updates
    4. Updates value function to better estimate returns
    """
    # Lists to track training progress over all updates
    cum_rewards = []
    actor_losses = []
    critic_losses = []
    total_losses = []

    cum_reward = 0
    
    # Main training loop - each update is one complete episode
    for update in range(hyper_params['num_updates']):        
        
        # ==================== PHASE 1: DATA COLLECTION (ROLLOUT) ====================
        # Buffers to store experience from one complete episode
        obs = torch.zeros(hyper_params['max_episode_steps'], observation_space)
        actions = torch.zeros(hyper_params['max_episode_steps'], action_space)
        logprobs = torch.zeros(hyper_params['max_episode_steps'])  # Log probability of each action
        rewards = torch.zeros(hyper_params['max_episode_steps'])
        state_values = torch.zeros(hyper_params['max_episode_steps'])  # Critic's value estimates
        returns = torch.zeros(hyper_params['max_episode_steps'])  # Will store actual cumulative rewards
        
        # TODO: Add learning rate scheduler if needed
        
        # Reset the environment before each new episode
        next_obs = env_obs_to_tensor(env.reset()[0])  # Extract observation from (obs, info) tuple
        
        # TMRL-specific: Manually click game window to ensure focus when training
        # Uncomment if needed for first update
        #if update == 0:
            #time.sleep(1.0)
        
        max_idx = 0  # Track actual episode length (may be less than max_episode_steps)
        was_terminated = False  # Track if episode ended naturally vs. truncated
        
        # Set agent to evaluation mode (no gradient tracking during rollout)
        agent.eval()
        
        # Collect experience for one complete episode
        for step in range(hyper_params['max_episode_steps']):
            obs[step] = next_obs

            # Sample action from current policy without gradient tracking
            with torch.no_grad():
                # Get action and its log probability from stochastic policy
                action, logprob = agent.policy.sample_action_with_logprobs(next_obs.to(device).unsqueeze(0))
                # Get critic's estimate of state value (for advantage calculation later)
                state_value = agent.critic(next_obs.to(device).unsqueeze(0))
                
            # Store action, log probability, and value estimate
            actions[step] = action[0]
            logprobs[step] = logprob[0]
            state_values[step] = state_value[0]

            # Clip actions to valid range since Gaussian sampling can produce out-of-bounds values
            clamped_action = np.clip(np.array(action.cpu()), -1, 1)

            # Execute action in environment
            next_obs, reward, terminated, truncated, info = env.step(clamped_action[0])

            # Custom termination condition: End episode if car gets stuck on rail
            # (detected by low LIDAR readings)
            if next_obs[2][next_obs[2] <= 40].sum() > 0:
                terminated = True
            
            # Store reward and convert next observation to tensor
            rewards[step] = torch.tensor(reward)
            next_obs = env_obs_to_tensor(next_obs)
            
            # Break if episode ended
            if terminated or truncated:
                was_terminated = True
                max_idx = step
                break
                
        # Pause environment (TMRL-specific requirement)
        env.unwrapped.wait() # type: ignore
        
        max_idx = step  # Store final step index
        
        # ==================== PHASE 2: COMPUTE RETURNS & ADVANTAGES ====================
        # Calculate discounted cumulative rewards (returns) working backwards from end
        with torch.no_grad():
            for t in range(max_idx + 1)[::-1]:  # Iterate backwards from last step
                if t == (max_idx):
                    if not was_terminated:
                        # If episode was truncated (not naturally ended), bootstrap with critic
                        # Currently disabled - only using immediate reward
                        returns[t] = rewards[t]  # + (hyper_params['gamma']*agent.critic(next_obs.to(device)))
                    else:
                        # Terminal state has no future rewards
                        returns[t] = rewards[t]
                else:
                    # Standard discounted return: R_t = r_t + gamma * R_{t+1}
                    returns[t] = rewards[t] + (hyper_params['gamma'] * returns[t+1])
                    
            # Compute advantages: A = actual_return - predicted_value
            # Positive advantage means action was better than expected
            advantages = returns - state_values
            cum_reward = rewards.sum().item()
            
        # Save model checkpoint if performance is good (reward > 200)
        if cum_reward > 200:
            torch.save(agent.state_dict(), f'Y{cum_reward:.2f}RewardRacer{update}Update_2.pt')
        
        # ==================== PHASE 3: POLICY OPTIMIZATION ====================
        # Randomly shuffle indices for mini-batch training
        rand_idxs = np.random.permutation(np.arange(max_idx + 1))
        
        # Track losses for each epoch
        epochs_values_loss = []
        epochs_ppo_loss = []
        epochs_total_loss = []
        
        # Set agent to training mode (enable gradient tracking)
        agent.train()
        
        # Train on collected data for multiple epochs
        for epoch in range(hyper_params['epochs_per_update']):
            # Process data in mini-batches
            for batch_start_idx in range(0, max_idx, hyper_params['batch_size']):
                batch_end_idx = batch_start_idx + hyper_params['batch_size']
                batch_idxs = rand_idxs[batch_start_idx:batch_end_idx]

                # Extract batch data
                batch_obs = obs[batch_idxs]
                batch_actions = actions[batch_idxs]

                # ========== PPO CLIPPED SURROGATE OBJECTIVE ==========
                # Get new log probabilities under current (updated) policy
                batch_new_log_probs = agent.policy.get_action_log_prob(batch_obs.to(device), batch_actions.to(device))
                # Get old log probabilities from when actions were sampled
                batch_old_log_probs = logprobs[batch_idxs].to(device)

                # Compute importance sampling ratio: pi_new(a|s) / pi_old(a|s)
                log_ratio = batch_new_log_probs - batch_old_log_probs
                ratio = log_ratio.exp()

                # Normalize advantages to reduce variance (optional but recommended)
                batch_advantages = advantages[batch_idxs].to(device)
                if hyper_params['norm_advantages']:
                    batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-8)

                # PPO clipped objective:
                # - Unclipped: L = ratio * advantage
                # - Clipped: L = clip(ratio, 1-ε, 1+ε) * advantage
                # Take the minimum (most conservative) to prevent large policy updates
                unclipped_obj = -ratio * batch_advantages
                clipped_obj = -torch.clip(ratio, 1 - hyper_params['clip_coef'], 1 + hyper_params['clip_coef']) * batch_advantages
                ppo_loss = torch.max(unclipped_obj, clipped_obj).sum() / hyper_params['batch_size']
                epochs_ppo_loss.append(ppo_loss.item())

                # ========== VALUE FUNCTION LOSS ==========
                # Train critic to predict returns using MSE loss
                new_state_values = agent.critic(batch_obs.to(device))
                v_loss = ((new_state_values.view(-1) - returns[batch_idxs].to(device))**2).sum() / hyper_params['batch_size']
                epochs_values_loss.append(v_loss.item())
            
                # Combined loss: policy loss + value loss (weighted)
                total_loss = ppo_loss + hyper_params['critic_coef'] * v_loss
                epochs_total_loss.append(total_loss.item())
                
                # ========== GRADIENT DESCENT ==========
                # Zero gradients
                policy_optim.zero_grad()
                critic_optim.zero_grad()
                
                # Backpropagate
                total_loss.backward()

                # Clip policy gradients to prevent exploding gradients
                nn.utils.clip_grad.clip_grad_value_(agent.policy.parameters(), clip_value=hyper_params['grad_clip_val'])
                # Handle NaN gradients by setting them to zero
                for param in agent.policy.parameters():
                    mask = torch.isnan(param.grad)
                    param.grad[mask] = 0.0
                    if mask.sum == param.numel():
                        print('code is broken, yup')  # All gradients are NaN - major issue
                # Update policy parameters
                policy_optim.step()

                # Clip critic gradients and update
                nn.utils.clip_grad.clip_grad_value_(agent.critic.parameters(), clip_value=hyper_params['grad_clip_val'])
                critic_optim.step()
                
        # ==================== LOGGING ====================
        # Print progress for this update
        print('Update', update + 1)
        print('actor loss', np.mean(epochs_ppo_loss))
        print('critic loss', np.mean(epochs_values_loss))
        print('total loss', np.mean(epochs_total_loss))
        print('total reward', cum_reward)
        
        # Track metrics for plotting
        cum_rewards.append(cum_reward)
        actor_losses.append(np.mean(epochs_ppo_loss))
        critic_losses.append(np.mean(epochs_values_loss))
        total_losses.append(np.mean(epochs_total_loss))
        
    return cum_rewards, actor_losses, critic_losses, total_losses

In [None]:
time.sleep(1)
env.reset()

Exception in thread Thread-12 (__background_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/dist-packages/rtgym/envs/real_time_env.py", line 763, in __background_thread
    self.__reset_result = self.__interface.reset(seed=seed, options=options)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/tmrl/custom/tm/tm_gym_interfaces.py", line 147, in reset
    self.reset_common()
  File "/usr/local/lib/python3.12/dist-packages/tmrl/custom/tm/tm_gym_interfaces.py", line 137, in reset_common
    self.initialize()
  File "/usr/local/lib/python3.12/dist-packages/tmrl/custom/tm/tm_gym_interfaces.py", line 89, in initialize
    self.initialize_common()
  File "/usr/local/lib/python3.12/dist-packages/tm

KeyboardInterrupt: 

In [None]:
cum_rewards, actor_losses, critic_losses, total_losses = train_PPO()

  clamped_action = np.clip(np.array(action.cpu()),-1,1)


Update 1
actor loss nan
critic loss nan
total loss nan
total reward 0.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [8]:
torch.distributions.MultivariateNormal(torch.zeros(3), torch.randn(9).reshape(3,3).exp())

ValueError: Expected parameter covariance_matrix (Tensor of shape (3, 3)) of distribution MultivariateNormal(loc: torch.Size([3]), covariance_matrix: torch.Size([3, 3])) to satisfy the constraint PositiveDefinite(), but found invalid values:
tensor([[ 0.4632, 10.1557,  0.2630],
        [ 2.6142,  1.1253,  3.7585],
        [ 0.4767,  0.2665,  0.3334]])

In [9]:
def evaluate_model(model, env):
    done = False
    time.sleep(1.0)
    next_state = env_obs_to_tensor(env.reset()[0]).to(device)
    model.to(device)
    model.eval()
    while not done:
        action, logprob = model.policy.sample_action_with_logprobs(next_state.to(device).unsqueeze(0))
        #action = model.policy.mean_only(next_state.to(device).unsqueeze(0))
        clamped_action = np.clip((action.detach().cpu().numpy())[0], -1,1)
        next_state, reward, done, truncated, info = env.step(clamped_action)
        next_state = env_obs_to_tensor(next_state)

In [11]:
agent = Agent()
agent.load_state_dict(torch.load('Y231.38RewardRacer164Update_2.pt'))

RuntimeError: Error(s) in loading state_dict for Agent:
	size mismatch for policy.action_mean.0.weight: copying a param with shape torch.Size([512, 84]) from checkpoint, the shape in current model is torch.Size([512, 602121]).
	size mismatch for policy.actor_logvar.0.weight: copying a param with shape torch.Size([512, 84]) from checkpoint, the shape in current model is torch.Size([512, 602121]).
	size mismatch for critic.network.0.weight: copying a param with shape torch.Size([512, 84]) from checkpoint, the shape in current model is torch.Size([512, 602121]).

In [14]:
env.reset()
pass



In [15]:
evaluate_model(agent, env)



KeyboardInterrupt: 