In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import pandas as pd
import random
import graphviz
import seaborn as sns
import time
import math
from torch.nn.utils import clip_grad_norm_
from collections import deque
import torch.nn.functional as F

from google.colab import drive
drive.mount('/content/drive')
from google.colab import files

Mounted at /content/drive


In [None]:
# Locations of the three CSV shards that hold the offline tic-tac-toe trajectories.
path_1 = "/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/nn_training_data_first_half.csv"
path_2 = "/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/nn_training_data_second_half.csv"
path_3 = "/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/nn_training_data_third_half.csv"


def _csv_to_array(csv_path):
  """Read one CSV shard with pandas and return its contents as a NumPy array."""
  frame = pd.read_csv(csv_path)
  return frame.to_numpy()


offline_TTT_trajectories_first_half = _csv_to_array(path_1)
offline_TTT_trajectories_second_half = _csv_to_array(path_2)
offline_TTT_trajectories_third_half = _csv_to_array(path_3)

# Stack the shards row-wise into a single array (the author notes ~1.75 million games).
offline_TTT_trajectories = np.concatenate(
    [
        offline_TTT_trajectories_first_half,
        offline_TTT_trajectories_second_half,
        offline_TTT_trajectories_third_half,
    ],
    axis=0,
)
print(offline_TTT_trajectories.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/nn_training_data_first_half.csv'

In [None]:
# 1. Define your environment
class TicTacToe:
  """3x3 tic-tac-toe board that also keeps the two preceding board positions.

  Cells hold 0 for empty or the number of the player that occupies them.
  """

  def __init__(self):
    self.board = np.zeros((3, 3))
    self.one_move_back = np.zeros((3, 3))
    self.two_moves_back = np.zeros((3, 3))

  def reset(self):
    """Clear the board and its history; return [two_moves_back, one_move_back, board]."""
    self.two_moves_back, self.one_move_back, self.board = (np.zeros((3, 3)) for _ in range(3))
    return [self.two_moves_back, self.one_move_back, self.board]

  def check_win(self, player):
    """Return `player` if they own a full row, column or diagonal, -1 on a full board, else 0."""
    lines = [self.board[r, :] for r in range(3)]
    lines += [self.board[:, c] for c in range(3)]
    lines.append(np.diag(self.board))
    lines.append(np.diag(np.fliplr(self.board)))

    if any(np.all(line == player) for line in lines):
      return player

    # No winning line: a board without empty cells is a tie.
    if np.all(self.board != 0):
      return -1

    return 0

  def possibilities(self):
    """Return the (row, col) coordinates of every empty cell, in row-major order."""
    size = len(self.board)
    return [(r, c) for r in range(size) for c in range(size) if self.board[r][c] == 0]

  def _shift_history(self):
    """Age the stored positions by one move before the board is modified."""
    self.two_moves_back = self.one_move_back.copy()
    self.one_move_back = self.board.copy()

  def _stacked_state(self):
    """Stack the three positions (oldest first) along axis 0 into a (9, 3) array."""
    return np.concatenate((self.two_moves_back, self.one_move_back, self.board), axis=0)

  def random_step(self, player):
    """Place `player` on a uniformly random empty cell.

    Returns (state, reward, done); the reward is -1.5 when check_win reports 2, else 0.
    """
    current_loc = random.choice(self.possibilities())
    self._shift_history()
    self.board[current_loc] = player
    done = self.check_win(player)
    reward = -1.5 if done == 2 else 0
    return self._stacked_state(), reward, done

  def network_step(self, action, player):
    """Place `player` at the (row, col) given by `action`.

    Returns (state, reward, done); the reward is 1 when check_win reports 1,
    0.1 when it reports a tie (-1), and 0 otherwise.
    """
    self._shift_history()
    row, col = action
    self.board[row, col] = player
    done = self.check_win(player)
    if done == 1:
      reward = 1
    elif done == -1:
      reward = 0.1
    else:
      reward = 0
    return self._stacked_state(), reward, done

def numpy_array_to_string(array):
  """Concatenate str() of every element of a 2-d array, row by row, into one string."""
  return "".join(str(element) for row in array for element in row)
# Map a flat action index 0..8 to its (row, col) cell on the 3x3 board.
tensor_to_tuple = {flat_index: divmod(flat_index, 3) for flat_index in range(9)}
print(tensor_to_tuple)

{0: (0, 0), 1: (0, 1), 2: (0, 2), 3: (1, 0), 4: (1, 1), 5: (1, 2), 6: (2, 0), 7: (2, 1), 8: (2, 2)}


In [None]:
#Original Code
# Define the Transformer model for the policy and advantage functions
class TransformerModel(nn.Module):
    """Encoder-decoder transformer that maps an input sequence to one output vector.

    Args:
        input_dim: size of each input feature vector.
        output_dim: size of the returned vector per sample.
        hidden_dim: transformer model width (must be divisible by n_heads).
        n_layers: number of encoder layers and of decoder layers.
        n_heads: number of attention heads.
        dropout: dropout probability inside the transformer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, n_layers, n_heads, dropout=0.1):
        super().__init__()

        self.embedding = nn.Linear(input_dim, hidden_dim)  # Linear embedding layer
        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=n_heads,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dropout=dropout,
            batch_first=True,  # inputs are (batch, seq, feature)
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return a (batch, output_dim) tensor.

        Accepts (batch, seq, input_dim), or flat (batch, input_dim) which is
        treated as a batch of length-1 sequences.
        """
        # A2CAgent passes flat (batch, input_dim) states. Without a sequence axis
        # nn.Transformer would read the batch as one unbatched sequence and the
        # 3-index slice below would raise IndexError.
        if x.dim() == 2:
            x = x.unsqueeze(1)

        embedded = self.embedding(x)
        # The decoder is driven by an all-zero placeholder target of the same shape.
        tgt = torch.zeros_like(embedded)

        output = self.transformer(embedded, tgt)
        # Keep only the last sequence position as the summary of the sequence.
        return self.fc(output[:, -1, :])


# Define the Advantage Actor-Critic (A2C) agent
class A2CAgent:
    """Advantage actor-critic agent with separate transformer actor and critic.

    Args:
        input_dim: length of the flat state vector.
        action_dim: number of discrete actions.
        hidden_dim, n_layers, n_heads, dropout: forwarded to TransformerModel.
        lr: Adam learning rate used for both networks.
        gamma: discount factor for the one-step TD target.
    """

    def __init__(self, input_dim, action_dim, hidden_dim=256, n_layers=6, n_heads=8, dropout=0.1, lr=1e-5, gamma=0.99):
        self.actor = TransformerModel(input_dim, action_dim, hidden_dim, n_layers, n_heads, dropout)
        self.critic = TransformerModel(input_dim, 1, hidden_dim, n_layers, n_heads, dropout) # Critic outputs a single value
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        self.gamma = gamma

    def act(self, state):
        """Sample a legal action index for `state`.

        `state` is flattened; entries 18..26 are read as the current board and
        only cells equal to 0 may be chosen.

        Raises:
            ValueError: if the current board has no empty cell.
        """
        state = state.flatten()
        legal_moves = torch.as_tensor(state[18:27] == 0)
        if not bool(legal_moves.any()):
            raise ValueError("act() called on a board with no empty cells")

        state = torch.tensor(state, dtype=torch.float).unsqueeze(0) # Add batch dimension
        with torch.no_grad():
            logits = self.actor(state).reshape(-1)

        # Mask before the softmax so occupied cells get exactly zero probability.
        # The mask used to be built but never applied.
        logits = logits.masked_fill(~legal_moves, float("-inf"))
        action_probs = F.softmax(logits, dim=-1)
        return torch.multinomial(action_probs, 1).item()

    def train(self, states, actions, rewards, next_states, dones):
        """Run one actor update and one critic update on a batch of transitions.

        Args:
            states, next_states: sequences of flat state vectors.
            actions: chosen action indices.
            rewards: per-transition rewards.
            dones: per-transition termination markers; any non-zero value
                (win, loss or tie code) counts as terminal.

        Returns:
            (actor_loss, critic_loss) as Python floats, measured before the updates.
        """
        states = torch.tensor(np.array(states), dtype=torch.float)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float)
        # The environment reports outcome codes (1, 2, -1), not 0/1 flags, so
        # normalise to a 0/1 terminal mask before it is used in (1 - terminal).
        terminal = (torch.tensor(dones, dtype=torch.float) != 0).float()

        # One-step TD target; the bootstrap value is treated as a constant.
        values = self.critic(states).squeeze(1)
        with torch.no_grad():
            next_values = self.critic(next_states).squeeze(1)
        td_target = rewards + (1 - terminal) * self.gamma * next_values
        advantage = td_target - values

        # Actor loss. The advantage is detached so this backward pass does not
        # touch (and free) the critic graph that critic_loss still needs.
        log_probs = F.log_softmax(self.actor(states), dim=-1)
        chosen_log_probs = log_probs[torch.arange(states.size(0)), actions]
        actor_loss = -(chosen_log_probs * advantage.detach()).mean()

        # Critic loss (MSE of the TD error)
        critic_loss = advantage.pow(2).mean()

        # Update networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return actor_loss.item(), critic_loss.item()

In [None]:
#Modified transformer for tic tac toe
# Define the Transformer model for the policy and advantage functions
class TransformerModel(nn.Module):
    """Encoder-decoder transformer applied per sample to flat tic-tac-toe states.

    Args:
        input_dim: size of each input feature vector.
        output_dim: size of the output vector per position.
        hidden_dim: transformer model width (must be divisible by n_heads).
        n_layers: number of encoder layers and of decoder layers.
        n_heads: number of attention heads.
        dropout: dropout probability inside the transformer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, n_layers, n_heads, dropout=0.1):
        super().__init__()

        self.embedding = nn.Linear(input_dim, hidden_dim)  # Linear embedding layer
        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=n_heads,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dropout=dropout,
            batch_first=True,  # inputs are (batch, seq, feature)
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return (batch, output_dim) for (batch, input_dim) input.

        A 3-D (batch, seq, input_dim) input is passed through unchanged and
        yields (batch, seq, output_dim).
        """
        # nn.Transformer interprets a 2-D tensor as ONE unbatched sequence, so
        # the rows of a (batch, input_dim) tensor would attend to each other and,
        # with the all-zero target below, every row would receive the same
        # output. Give each sample its own length-1 sequence instead, so batched
        # training sees the same computation as single-state inference.
        flat_input = x.dim() == 2
        if flat_input:
            x = x.unsqueeze(1)

        embedded = self.embedding(x)
        # The decoder is driven by an all-zero placeholder target of the same shape.
        tgt = torch.zeros_like(embedded)
        output = self.transformer(embedded, tgt)

        if flat_input:
            output = output.squeeze(1)
        # Project every position (no "last timestep" selection happens here).
        return self.fc(output)


# Define the Advantage Actor-Critic (A2C) agent
class A2CAgent:
    """Advantage actor-critic agent with separate transformer actor and critic.

    Args:
        input_dim: length of the flat state vector.
        action_dim: number of discrete actions.
        hidden_dim, n_layers, n_heads, dropout: forwarded to TransformerModel.
        lr: Adam learning rate used for both networks.
        gamma: discount factor for the one-step TD target.
    """

    def __init__(self, input_dim, action_dim, hidden_dim=128, n_layers=6, n_heads=8, dropout=0.1, lr=1e-5, gamma=0.99):
        self.actor = TransformerModel(input_dim, action_dim, hidden_dim, n_layers, n_heads, dropout)
        self.critic = TransformerModel(input_dim, 1, hidden_dim, n_layers, n_heads, dropout) # Critic outputs a single value
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        self.gamma = gamma

    def act(self, state):
        """Sample a legal action index for `state`.

        `state` is flattened; entries 18..26 are read as the current board and
        only cells equal to 0 may be chosen.

        Raises:
            ValueError: if the current board has no empty cell.
        """
        state = state.flatten()
        legal_moves = torch.as_tensor(state[18:27] == 0)
        if not bool(legal_moves.any()):
            raise ValueError("act() called on a board with no empty cells")

        state = torch.tensor(state, dtype=torch.float).unsqueeze(0) # Add batch dimension
        with torch.no_grad():
            logits = self.actor(state).reshape(-1)

        # Masking the logits gives the same distribution as renormalising the
        # masked probabilities, but cannot divide by zero when the legal moves
        # carry vanishing probability mass.
        logits = logits.masked_fill(~legal_moves, float("-inf"))
        masked_probs = F.softmax(logits, dim=-1)
        return torch.multinomial(masked_probs, 1).item()

    def train_actor_critic(self, states, actions, rewards, next_states, dones, update_actor=True, verbose=False):
        """Run one critic update (and, by default, one actor update) on a batch.

        Args:
            states, next_states: sequences of flat state vectors.
            actions: chosen action indices.
            rewards: per-transition rewards.
            dones: per-transition termination markers; any non-zero value
                (win, loss or tie code) counts as terminal.
            update_actor: set to False to train only the critic, e.g. while
                debugging the value estimates.
            verbose: print the intermediate tensors of the TD computation.

        Returns:
            (actor_loss, critic_loss) as Python floats, measured before the updates.
        """
        if verbose:
            print("rewards: ", rewards)
        states = torch.tensor(np.array(states), dtype=torch.float)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float)
        # The environment reports outcome codes (1, 2, -1), not 0/1 flags. Used
        # raw, (1 - dones) becomes -1 or 2 on terminal steps and leaks the
        # bootstrap value into the target, so normalise to a 0/1 mask first.
        terminal = (torch.tensor(dones, dtype=torch.float) != 0).float()

        # One-step TD target; the bootstrap value is treated as a constant.
        values = self.critic(states).squeeze(1) # Get value predictions
        with torch.no_grad():
            next_values = self.critic(next_states).squeeze(1)
        td_target = rewards + (1 - terminal) * self.gamma * next_values
        advantage = td_target - values
        if verbose:
            print("values: ", values)
            print("next values: ", next_values)
            print("td target: ", td_target)
            print("advantage: ", advantage)

        # Actor loss: a single forward pass, weighted by the detached advantage
        # so the policy gradient does not flow into the critic.
        log_probs = F.log_softmax(self.actor(states), dim=-1)
        chosen_log_probs = log_probs[torch.arange(states.size(0)), actions]
        actor_loss = -(chosen_log_probs * advantage.detach()).mean()

        # Critic loss (MSE of the TD error)
        critic_loss = advantage.pow(2).mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # The actor step had been commented out, which left the policy at its
        # random initialisation for the whole run.
        if update_actor:
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

        return actor_loss.item(), critic_loss.item()

In [None]:
# Example training loop
def train(env, agent, num_episodes=500000, verbose=False):
  """Play the agent (player 1) against a random opponent (player 2) and update it after every game.

  Args:
    env: environment exposing reset(), network_step(action, player) and
      random_step(player), as TicTacToe does.
    agent: object exposing act(state) and
      train_actor_critic(states, actions, rewards, next_states, dones).
    num_episodes: number of games to play.
    verbose: print the board after every move (very noisy on long runs).

  Side effects:
    Appends each game's final reward to the module-level `episode_rewards`
    list, looks actions up in the module-level `tensor_to_tuple`, prints one
    summary line per episode, and every 50000 episodes (including episode 0)
    saves the actor and critic state dicts to Google Drive.
  """

  def show_board(flat_state, moves_played):
    # Entries 18..26 of the flat state are the current board.
    if verbose:
      print("board after " + str(moves_played) + " moves")
      print(flat_state[18:27].copy().reshape(3, 3))

  # Running total, so the log line does not re-sum the whole history each episode.
  total_reward = sum(episode_rewards)

  for episode in range(num_episodes):
    planes = [entry.flatten() for entry in env.reset()]
    state = np.concatenate((planes[0], planes[1], planes[2]), axis=None)

    done = 0
    states = []
    next_states = []
    actions = []
    rewards = []
    dones = []
    move_counter = 0
    show_board(state, move_counter)

    #Intra-trajectory loop
    while done == 0:
      # The agent masks illegal moves itself, so no mask is built here.
      action = agent.act(state)

      states.append(state)
      actions.append(action)

      state, reward, done = env.network_step(tensor_to_tuple[action], player = 1)
      state = state.flatten()
      move_counter += 1
      show_board(state, move_counter)

      #Player two move if nonterminal state
      if done == 0:
        state, reward, done = env.random_step(player = 2)
        state = state.flatten()
        move_counter += 1
        show_board(state, move_counter)

      # Record the successor only after the opponent has replied: reward and
      # done are measured at this point, and this is the position the agent
      # acts on next. Recording it before the reply made the critic bootstrap
      # from positions it is never trained on.
      next_states.append(state)
      rewards.append(reward)
      dones.append(done)

    episode_rewards.append(reward)
    total_reward += reward

    actor_loss, critic_loss = agent.train_actor_critic(states, actions, rewards, next_states, dones)
    print(f"Episode: {episode}, Total Reward: {total_reward}, Actor Loss: {actor_loss}, Critic Loss: {critic_loss}")
    if episode % 50000 == 0:
      print("saved model: ", episode)
      # Save the trained policy network
      torch.save(agent.actor.state_dict(), '/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/actor critic networks/actor_network_' + str(episode/50000)+'.pth')
      torch.save(agent.critic.state_dict(), '/content/drive/MyDrive/Monarch/RL/Tic Tac Toe/Transformers/actor critic networks/critic_network_' + str(episode/50000)+'.pth')

In [None]:
# Problem dimensions: three stacked 3x3 boards in, one of nine cells out.
input_dim = 27  # State dimension
action_dim = 9  # Number of actions

# Module-level accumulators; train() appends each game's final reward to episode_rewards.
rewards = []
episode_rewards = []

env = TicTacToe()
agent = A2CAgent(input_dim=input_dim, action_dim=action_dim)
train(env=env, agent=agent)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[[1. 1. 0.]
 [2. 0. 2.]
 [1. 0. 0.]]
board after 6 moves
[[1. 1. 0.]
 [2. 2. 2.]
 [1. 0. 0.]]
rewards:  [0, 0, -1]
values:  tensor([-0.0144, -0.0060, -0.1079], grad_fn=<SqueezeBackward1>)
next values:  tensor([-0.2323, -0.1147, -0.0778], grad_fn=<SqueezeBackward1>)
td target:  tensor([-0.2300, -0.1135, -0.9230], grad_fn=<AddBackward0>)
advantage:  tensor([-0.2156, -0.1075, -0.8151], grad_fn=<SubBackward0>)
Episode: 259729, Total Reward: 77874.80000000258, Actor Loss: -0.688973605632782, Critic Loss: 0.24079012870788574
board after 0 moves
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
board after 1 moves
[[0. 0. 1.]
 [0. 0. 0.]
 [0. 0. 0.]]
board after 2 moves
[[0. 0. 1.]
 [0. 0. 0.]
 [0. 0. 2.]]
board after 3 moves
[[0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 2.]]
board after 4 moves
[[0. 2. 1.]
 [0. 1. 0.]
 [0. 0. 2.]]
board after 5 moves
[[0. 2. 1.]
 [1. 1. 0.]
 [0. 0. 2.]]
board after 6 moves
[[0. 2. 1.]
 [1. 1. 2.]
 [0. 0. 2.]]
board after

In [None]:
# Running total of the per-episode rewards collected during training.
cumulative_episode_rewards = []
for reward_value in episode_rewards:
  if cumulative_episode_rewards:
    cumulative_episode_rewards.append(cumulative_episode_rewards[-1] + reward_value)
  else:
    cumulative_episode_rewards.append(reward_value)

fig, ax = plt.subplots()
ax.plot(cumulative_episode_rewards)
ax.set_title('Training Reward Over Episodes')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
plt.show()

In [None]:
# Example usage (assuming you have an environment)
# env = TicTacToe()


# criterion = nn.MSELoss()
# model = TransformerModel(state_dim = 9, action_dim = 9, hidden_dim=64, nhead=4, num_layers=2)
# optimizer = optim.Adam(agent.parameters(), lr=1e-6)

# train(env, model, optimizer, criterion)

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

class ReplayBufferDataset(Dataset):
    """Torch dataset over (s, a, r, s') transitions with standardised states.

    A StandardScaler is fitted once on the `s` entries of `data` and applied to
    both `s` and `s'` whenever an item is fetched.
    """

    def __init__(self, data):  # data is a list of (s, a, r, s') tuples
        self.data = data
        self.state_scaler = StandardScaler()  # For state normalization

        # Fit on the first element (the state) of every transition.
        observed_states = np.array([transition[0] for transition in data])
        self.state_scaler.fit(observed_states)

    def __len__(self):
        """Number of stored transitions."""
        return len(self.data)

    def _scale(self, raw_state):
        """Standardise one state: the scaler wants a 2-D row, the caller a flat vector."""
        as_row = np.array(raw_state).reshape(1, -1)
        return self.state_scaler.transform(as_row).flatten()

    def __getitem__(self, idx):
        """Return (s, a, r, s') for transition `idx` as float32 tensors, states scaled."""
        state, action, reward, next_state = self.data[idx]

        scaled_state = self._scale(state)
        scaled_next_state = self._scale(next_state)

        return (
            torch.tensor(scaled_state, dtype=torch.float32),
            torch.tensor(action, dtype=torch.float32),
            torch.tensor(reward, dtype=torch.float32),
            torch.tensor(scaled_next_state, dtype=torch.float32),
        )

# Example usage. `experiences` must already be a list of (s, a, r, s') tuples.
# NOTE(review): no cell in the visible notebook defines `experiences` - confirm where it comes from.
dataset = ReplayBufferDataset(data=experiences)
# Shuffled mini-batches of 32 transitions for training.
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [None]:
# class TransformerModel(nn.Module):
#     def __init__(self, state_dim, action_dim, embedding_dim, nhead, num_layers, dropout):
#         super().__init__()
#         self.state_embedding = nn.Linear(state_dim, embedding_dim)
#         self.action_embedding = nn.Linear(action_dim, embedding_dim)

#         self.transformer = nn.Transformer(
#             d_model=embedding_dim,
#             nhead=nhead,
#             num_encoder_layers=num_layers,
#             num_decoder_layers=num_layers,
#             dropout=dropout,
#         )

#         self.output_layer = nn.Linear(embedding_dim, state_dim)  # Predict next state

#     def forward(self, states, actions):
#         state_embeds = self.state_embedding(states)
#         action_embeds = self.action_embedding(actions)

#         # Combine state and action embeddings (e.g., concatenation or addition)
#         #  Important: Add positional encodings if using a standard transformer
#         seq = torch.cat([state_embeds.unsqueeze(0), action_embeds.unsqueeze(0)], dim=0) # sequence of [S,A]

#         # If you want to predict a sequence of the next state,
#         # you should provide a target sequence to the transformer's decoder
#         # (see the PyTorch Transformer documentation).

#         output = self.transformer(seq)

#         next_state_pred = self.output_layer(output[-1]) # Get last element for next state prediction

#         return next_state_pred


In [None]:
# #Code that I don't currently understand
# def compute_gae(rewards, values, next_value, gamma=0.99, lam=0.95):
#     """
#     Calculates Generalized Advantage Estimation (GAE).
#     """
#     returns = torch.zeros_like(rewards)
#     advantages = torch.zeros_like(rewards)

#     running_return = next_value
#     for t in reversed(range(len(rewards))):
#         running_return = rewards[t] + gamma * running_return
#         returns[t] = running_return

#     running_advantage = 0
#     for t in reversed(range(len(rewards))):
#         delta = rewards[t] + gamma * values[t+1] - values[t]
#         running_advantage = delta + gamma * lam * running_advantage
#         advantages[t] = running_advantage

#     advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
#     # advantages = advantages
#     # returns = advantages + values

#     return returns, advantages

# def ppo_update(agent, optimizer, states, actions, log_probs, returns, advantages, clip_ratio=0.2):
#     """
#     Performs a single PPO update step.
#     """
#     logits, values = agent(states)
#     dist = Categorical(logits=logits)
#     new_log_probs = dist.log_prob(actions)

#     ratio = torch.exp(new_log_probs - log_probs)
#     clipped_ratio = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio)
#     actor_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()

#     value_loss = 0.5 * (returns - values).pow(2).mean()

#     loss = actor_loss + 0.5 * value_loss

#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()




In [None]:
# def train_transformer(model, dataloader, optimizer, criterion, num_epochs = 1000):

#   for epoch in range(num_epochs):
#       for batch_states, batch_actions, batch_rewards, batch_next_states in dataloader:
#           optimizer.zero_grad()

#           next_state_pred = model(batch_states, batch_actions)

#           loss = criterion(next_state_pred, batch_next_states)
#           loss.backward()
#           optimizer.step()

#       print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

In [None]:
def compute_discounted_rewards(rewards, gamma=0.99):
    """Return the discounted return-to-go for every step of one trajectory.

    Element t of the result is rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...
    The values are returned as-is (no normalisation is applied).

    Args:
        rewards: per-step rewards in chronological order.
        gamma: discount factor.

    Returns:
        A torch tensor with one entry per reward (empty for empty input).
    """
    discounted_rewards = []
    discounted_reward = 0
    # Accumulate from the end, appending in O(1); inserting at the front of
    # the list on every step made the whole function quadratic.
    for reward in reversed(rewards):
        discounted_reward = reward + gamma * discounted_reward
        discounted_rewards.append(discounted_reward)
    discounted_rewards.reverse()

    return torch.tensor(discounted_rewards)

def train(env, agent, optimizer, replay_buffer_size=10000, batch_size=32, num_episodes=2000, gamma=0.99, lam=0.95, clip_ratio=0.2):
    """Collect episodes, store their transitions in a replay buffer and run one PPO update per episode.

    NOTE(review): this definition reuses the name `train` and therefore replaces
    the A2C training loop defined earlier in the notebook once this cell runs.
    It also calls `compute_gae` and `ppo_update`, which exist in this notebook
    only as commented-out code; they must be defined before this is called.

    Args:
        env: environment whose reset() returns a state and whose step(action)
            returns (next_state, reward, done, info).
        agent: callable mapping a batched state tensor to (logits, value).
        optimizer: optimizer handed to ppo_update.
        replay_buffer_size: maximum number of stored transitions.
        batch_size: number of transitions sampled for each update.
        num_episodes: number of episodes to run.
        gamma, lam: discount and GAE parameters handed to compute_gae.
        clip_ratio: PPO clipping parameter handed to ppo_update.
    """
    replay_buffer = deque(maxlen=replay_buffer_size)
    for episode in range(num_episodes):
        state = env.reset()
        next_state = state
        done = False
        episode_reward = 0
        states_list, actions_list, rewards_list, log_probs_list, values_list = [], [], [], [], []
        # These were read below without ever being collected (UnboundLocalError).
        next_states_list, dones_list = [], []

        for _ in range(500):  # Maximum episode length
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                logits, value = agent(state_tensor)
                probs = torch.softmax(logits, dim=-1)
                m = Categorical(probs=probs)
                action = m.sample()
                log_prob = m.log_prob(action)

            next_state, reward, done, _ = env.step(action.item())
            episode_reward += reward

            states_list.append(state)
            actions_list.append(action.item())
            rewards_list.append(reward)
            next_states_list.append(next_state)
            dones_list.append(done)
            log_probs_list.append(log_prob)
            values_list.append(value.item())

            if done:
                break
            state = next_state

        # Calculate returns and advantages; bootstrap only if the episode was cut off.
        if done:
            next_value = 0.0
        else:
            with torch.no_grad():
                next_value = agent(torch.tensor(next_state, dtype=torch.float32).unsqueeze(0))[1].item()
        returns, advantages = compute_gae(rewards_list, values_list, next_value, gamma, lam)

        # Store transition in replay buffer
        for i in range(len(states_list)):
            replay_buffer.append((states_list[i], actions_list[i], rewards_list[i], next_states_list[i], dones_list[i],
                                 log_probs_list[i], returns[i], advantages[i]))

        # random.sample raises ValueError when asked for more items than are
        # stored, so skip the update until the buffer holds a full batch.
        if len(replay_buffer) >= batch_size:
            batch = random.sample(list(replay_buffer), batch_size)
            states, actions, rewards, next_states, dones, log_probs, returns, advantages = zip(*batch)
            states = torch.tensor(np.array(states), dtype=torch.float32)
            actions = torch.tensor(actions, dtype=torch.long)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.bool)
            # Flatten to (batch,) so the stored log-probs line up element-wise
            # with the per-sample log-probs recomputed during the update.
            log_probs = torch.stack(log_probs).reshape(-1)
            returns = torch.tensor(returns, dtype=torch.float32)
            advantages = torch.tensor(advantages, dtype=torch.float32)

            # PPO update
            ppo_update(agent, optimizer, states, actions, log_probs, returns, advantages, clip_ratio)

        print(f"Episode: {episode}, Reward: {episode_reward}")