In [1]:
from simulation import ArmEnv
from wrapper import GymWrapper
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
import numpy as np

In [2]:
class ArmPolicy(nn.Module):
    """Fully connected network that maps an observation to action probabilities.

    The network has two hidden layers of 64 ReLU units followed by a linear
    output layer and a softmax over the last dimension, so ``forward`` returns
    non-negative values that sum to one along that dimension.

    Args:
        input_size: Number of features in one observation.
        output_size: Number of outputs (one probability per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_size),
            nn.Softmax(dim=-1),
        ]
        # The attribute name and layer order determine the state_dict keys
        # ("fc.0.weight", ...), so both are kept stable.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax output of the network for input ``x``."""
        probabilities = self.fc(x)
        return probabilities


class ArmDataset(IterableDataset):
    """Endless stream of ``(observation, action, reward)`` transitions.

    Actions are drawn from ``env.action_space.sample()``; the environment is
    reset whenever ``env.step`` reports that the episode is done. Iteration
    never stops on its own, so consumers must break out of the loop.
    """

    def __init__(self, env):
        super().__init__()
        self.env = env

    def _episode(self):
        """Yield the transitions of one episode, from reset until done."""
        current_obs = self.env.reset()
        finished = False
        while not finished:
            chosen_action = self.env.action_space.sample()
            following_obs, reward, finished, _info = self.env.step(chosen_action)
            # The yielded observation is the one the action was taken in.
            yield current_obs, chosen_action, reward
            current_obs = following_obs

    def __iter__(self):
        while True:
            yield from self._episode()


def ppo_update(policy, optimizer, states, actions, rewards, clip_param=0.2, epochs=10, batch_size=64):
    """Run several epochs of clipped-surrogate policy updates on collected transitions.

    The log-probabilities of the taken actions under the policy *before* any
    update are computed once and held fixed, so the probability ratio can move
    away from 1 during the epochs and the clipping actually takes effect.

    Args:
        policy: Module mapping a float32 batch of states to action
            probabilities (its output is passed to ``Categorical(probs=...)``).
        optimizer: Optimizer over the parameters of ``policy``.
        states: Sequence of observations, one per transition.
        actions: Sequence of discrete action indices, aligned with ``states``.
        rewards: Sequence of per-step rewards, aligned with ``states``. They
            are used directly as the advantage estimate; no discounting or
            baseline is applied here.
        clip_param: Half-width of the clipping interval around a ratio of 1.
        epochs: Number of passes over the transitions.
        batch_size: Mini-batch size for each optimizer step.

    Returns:
        None. ``policy`` is updated in place through ``optimizer``.
    """
    states, actions, rewards = list(states), list(actions), list(rewards)
    if not states:
        # Nothing was collected; skip rather than build an empty DataLoader.
        return

    # Explicit float32 conversion: default collation would turn float64 numpy
    # observations into double tensors, which float32 layers reject.
    state_tensor = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    action_tensor = torch.as_tensor(np.asarray(actions))
    reward_tensor = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)

    # Snapshot of the behaviour of the pre-update policy, fixed for all epochs.
    with torch.no_grad():
        old_log_probs = Categorical(probs=policy(state_tensor)).log_prob(action_tensor)

    dataset = list(zip(state_tensor, action_tensor, reward_tensor, old_log_probs))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for _ in range(epochs):
        for batch_states, batch_actions, batch_rewards, batch_old_log_probs in dataloader:
            # The policy output is already normalised, so it is passed as
            # `probs`; passing it as `logits` would apply a second softmax.
            dist = Categorical(probs=policy(batch_states))
            log_probs = dist.log_prob(batch_actions)

            ratio = torch.exp(log_probs - batch_old_log_probs)
            surr1 = ratio * batch_rewards
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * batch_rewards
            actor_loss = -torch.min(surr1, surr2).mean()

            optimizer.zero_grad()
            actor_loss.backward()
            optimizer.step()


def train(env, num_episodes=1000, save_path="trained_model.pth", steps_per_episode=1000, update_every=64):
    """Train an ``ArmPolicy`` on ``env`` and save its weights.

    Each training episode draws up to ``steps_per_episode`` transitions from
    an ``ArmDataset`` (which samples actions from ``env.action_space``) and
    calls ``ppo_update`` every ``update_every`` transitions. Transitions left
    over at the end of an episode are used for one final update.

    Args:
        env: Environment exposing ``observation_space``, ``action_space``,
            ``reset``, ``step`` and ``close``. The observation space must be
            one-dimensional.
        num_episodes: Number of training episodes.
        save_path: File the policy ``state_dict`` is written to.
        steps_per_episode: Maximum number of transitions per training episode.
        update_every: Number of transitions collected between policy updates.

    Raises:
        ValueError: If the observation space is not one-dimensional, or if
            ``steps_per_episode`` or ``update_every`` is smaller than 1.
    """
    if steps_per_episode < 1 or update_every < 1:
        raise ValueError("steps_per_episode and update_every must be at least 1.")

    if len(env.observation_space.shape) == 1:
        input_size = env.observation_space.shape[0]
    else:
        raise ValueError("Unsupported observation space shape.")

    # Discrete action spaces carry the number of actions in `n` and have an
    # empty shape, so `shape[0]` is only used as a fallback.
    action_space = env.action_space
    if hasattr(action_space, "n"):
        output_size = int(action_space.n)
    else:
        output_size = action_space.shape[0]

    policy = ArmPolicy(input_size, output_size)
    optimizer = optim.Adam(policy.parameters(), lr=0.001)

    dataset = ArmDataset(env)

    for episode in range(num_episodes):
        states, actions, rewards = [], [], []

        # The dataset never ends by itself; the step counter bounds the
        # episode. (Counting buffered transitions cannot work, because the
        # buffer is emptied after every update.)
        for step, (observation, action, reward) in enumerate(dataset, start=1):
            states.append(observation)
            actions.append(action)
            rewards.append(reward)

            if len(states) >= update_every:
                ppo_update(policy, optimizer, states, actions, rewards)
                states, actions, rewards = [], [], []

            if step >= steps_per_episode:
                break

        if states:
            # Use the transitions that did not fill a complete update batch.
            ppo_update(policy, optimizer, states, actions, rewards)

        if episode % 10 == 0:
            total_reward = evaluate(policy, env)
            print(f"Episode {episode}, Total Reward: {total_reward}")

        # Real-time training progress
        if episode % 100 == 0:
            print(f"Episode: {episode}/{num_episodes}")

    # Save the trained model
    torch.save(policy.state_dict(), save_path)
    env.close()


def evaluate(policy, env, num_episodes=10):
    """Return the average undiscounted episode reward of ``policy`` on ``env``.

    Actions are sampled from the categorical distribution defined by the
    policy output, which is interpreted as action probabilities.

    Args:
        policy: Callable mapping a float32 observation tensor to action
            probabilities.
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        num_episodes: Number of episodes to average over.

    Returns:
        The mean of the per-episode reward sums.

    Raises:
        ZeroDivisionError: If ``num_episodes`` is 0.
    """
    total_rewards = []

    for _ in range(num_episodes):
        observation = env.reset()
        done = False
        total_reward = 0

        while not done:
            with torch.no_grad():
                probs = policy(torch.tensor(observation, dtype=torch.float32))
                # The policy output is already normalised; passing it as
                # `logits` would apply a second softmax and flatten it.
                action = Categorical(probs=probs).sample().numpy()

            next_observation, reward, done, _ = env.step(action)
            total_reward += reward
            observation = next_observation

        total_rewards.append(total_reward)

    avg_reward = sum(total_rewards) / len(total_rewards)
    return avg_reward



In [4]:

# Create the raw simulation environment and the wrapped version of it.
# NOTE(review): the recorded output of this cell is an "Only one local
# in-process GUI/GUI_SERVER connection allowed" error, which suggests that
# ArmEnv() opens a GUI connection and that the cell was executed more than once
# in the same kernel -- confirm; restarting the kernel before re-running
# should avoid it.
env = ArmEnv()
wrapped_env = GymWrapper(env)

# Seed torch and numpy so that runs are reproducible.
torch.manual_seed(42)
np.random.seed(42)

# Disabled: training with the hand-written train() function.
# NOTE(review): the call below passes the raw `env`, not `wrapped_env`;
# train() reads `observation_space` and `action_space` from its argument --
# verify that ArmEnv provides them before re-enabling.
# '''
# # Train the agent and save the model
# train(env, save_path="trained_model.pth")
# print("Training completed and model saved.")
# '''

error: Only one local in-process GUI/GUI_SERVER connection allowed. Use DIRECT connection mode or start a separate GUI physics server (ExampleBrowser, App_SharedMemoryPhysics_GUI, App_SharedMemoryPhysics_VR) and connect over SHARED_MEMORY, UDP or TCP instead.

In [9]:
from stable_baselines3 import PPO


# Train, save, reload and evaluate a stable_baselines3 PPO agent.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'stable_baselines3'", so the package
# has to be installed in the kernel's environment before this cell can run.

# Define the PPO agent on the wrapped environment
model = PPO("MlpPolicy", wrapped_env, verbose=1)

# Train the agent
model.learn(total_timesteps=100000)

# Save the trained agent
model.save("ppo_arm")

# Load the trained agent back from the file written above
loaded_model = PPO.load("ppo_arm")

# Evaluate the trained agent for a single episode, summing the rewards.
# NOTE(review): the loop resets and steps the raw `env`, while the model was
# trained on `wrapped_env` -- confirm that both produce observations and
# accept actions in the same format, or evaluate on `wrapped_env` instead.
total_reward = 0
obs = env.reset()
done = False
while not done:
    # The second return value of predict() is not used here.
    action, _ = loaded_model.predict(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward

print("Total reward:", total_reward)

ModuleNotFoundError: No module named 'stable_baselines3'