In [1]:
from simulationworkcopy import ArmEnv
from wrapper import GymWrapper
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
import numpy as np
from simulationworkcopy import ObjectDetector



In [2]:
class ArmPolicy(nn.Module):
    def __init__(self, input_size, output_size):
        super(ArmPolicy, self).__init__()

        input_size = 307209  # or input_size = (307204,) 

        self.fc = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
            nn.Softmax(dim=-1)
        )
        self.object_detector = ObjectDetector()

    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32)
        x = self.fc(x)
        return x

    def detect_object(self, x):
        return self.object_detector.detect(x)  # Call object detection component

In [3]:
class ArmDataset(IterableDataset):
    def __init__(self, env):
        super(ArmDataset, self).__init__()
        self.env = env

    def __iter__(self):
        while True:
            observation = self.env.reset()
            done = False
            while not done:
                action = self.env.action_space.sample()
                next_observation, reward, done, _ = self.env.step(action)
                yield observation, action, reward
                observation = next_observation

In [4]:
def ppo_update(policy, optimizer, states, actions, rewards, clip_param=0.2, epochs=10, batch_size=64):
    dataset = list(zip(states, actions, rewards))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for _ in range(epochs):
        for batch in dataloader:
            batch_states, batch_actions, batch_rewards = batch

            logits = policy(batch_states)
            dist = Categorical(logits=logits)

            log_probs = []
            for action, logit in zip(batch_actions, logits):
                action_dist = Categorical(logits=logit)
                action = action.long()  # Convert action to integer
                log_prob = action_dist.log_prob(action)
                log_probs.append(log_prob)

            log_probs = torch.stack(log_probs)

            old_logits = policy(batch_states).detach()
            old_dist = Categorical(logits=old_logits)

            old_log_probs = []
            for action, old_logit in zip(batch_actions, old_logits):
                old_action_dist = Categorical(logits=old_logit)
                action = action.long()  # Convert action to integer
                old_log_prob = old_action_dist.log_prob(action)
                old_log_probs.append(old_log_prob)

            old_log_probs = torch.stack(old_log_probs)

            ratio = torch.exp(log_probs - old_log_probs)
            ratio = ratio.view(-1, 1)  # Reshape ratio to match batch_rewards shape
            surr1 = ratio * batch_rewards
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * batch_rewards
            surr1 = surr1.to(surr2.dtype)
            actor_loss = -torch.min(surr1, surr2).mean()

            optimizer.zero_grad()
            actor_loss.backward()
            optimizer.step()

    return


In [5]:
def evaluate(policy, env, num_eval_episodes=10):
    total_rewards = []

    for _ in range(num_eval_episodes):
        observation = env.reset()
        done = False

        total_reward = 0

        while not done:
            with torch.no_grad():
                logits = policy(torch.tensor(observation, dtype=torch.float32))
            action = Categorical(logits=logits).sample().numpy()

            # Choose action using the policy
            #action = policy(torch.tensor(observation).float())

            # Take the action in the environment
            next_observation, reward, done, _ = env.step(action)

            # Accumulate the total reward
            total_reward += reward

            observation = next_observation

        total_rewards.append(total_reward)

    avg_reward = sum(total_rewards) / len(total_rewards)
    #avg_reward = total_reward / num_eval_episodes
    return avg_reward

In [6]:
import os
save_path = "C:/Users/User/Documents/drone detection/robot"

# Create the environment
env = ArmEnv()
wrapped_env = GymWrapper(env)

def train(env, num_episodes=5, save_path = "C:/Users/User/Documents/drone detection/robot/modelx.pth"):
    if len(env.observation_space.shape) == 1:
        input_size = env.observation_space.shape[0]
    else:
        raise ValueError("Unsupported observation space shape.")

    output_size = env.action_space.shape[0]
    
    policy = ArmPolicy(input_size, output_size)

    optimizer = optim.SGD(policy.parameters(), lr=0.01, momentum=0.9)
    optimizer = optim.Adam(policy.parameters(), lr=0.001)

    dataset = ArmDataset(env)

    episode_rewards = []  # Store episode rewards
    object_detection_results = []  # Store object detection results

    start_episode = 0
    checkpoint = torch.load(save_path) if os.path.exists(save_path) else None
    if checkpoint is not None:
        start_episode = checkpoint['episode'] + 1
        policy.load_state_dict(checkpoint['policy_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        episode_rewards = checkpoint['episode_rewards']
        object_detection_results = checkpoint['object_detection_results']

    for episode in range(start_episode, num_episodes):
        states, actions, rewards = [], [], []
                
        for observation, action, reward in dataset:
            states.append(observation)
            actions.append(action)
            rewards.append(reward)

            if len(states) >= 5:
                ppo_update(policy, optimizer, states, actions, rewards)
                states, actions, rewards = [], [], []

            if len(states) >= 5:
                break

        if episode % 10 == 0:
            total_reward = evaluate(policy, env)
            print(f"Episode {episode}, Total Reward: {total_reward}")

        # Real-time training progress
        if episode % 5 == 0:
            print(f"Episode: {episode}/{num_episodes}")

        # Object detection and alert
        detected_objects = policy.detect_object(states)
        if "Complex structure" in detected_objects:
            print("Alert: Complex structure detected!")
            object_detection_results.append(detected_objects)

        # Save the model and training data after each episode
        save_checkpoint(policy, optimizer, states, actions, rewards, episode, episode_rewards, object_detection_results, save_path)

    env.close()

    # Return the stored tensors
    return episode_rewards, object_detection_results

def save_checkpoint(policy, optimizer, states, actions, rewards, episode, episode_rewards, object_detection_results, save_path):
    checkpoint = {
        'episode': episode,
        'policy_state_dict': policy.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'states': states,
        'actions': actions,
        'rewards': rewards,
        'episode_rewards': episode_rewards,
        'object_detection_results': object_detection_results
    }
    torch.save(checkpoint, save_path)
    print(f"Checkpoint saved for episode {episode}.")

# Set a random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Train the agent and save the model
train(env=wrapped_env, save_path=r"C:\Users\User\Documents\drone detection\robot")
print("Training completed and model saved.")

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\User\\Documents\\drone detection\\robot'

In [None]:
# def train(env, num_episodes=100, save_path="modelx.pth"):
#     if len(env.observation_space.shape) == 1:
#         input_size = env.observation_space.shape[0]
        
#     else:
#         raise ValueError("Unsupported observation space shape.")

#     output_size = env.action_space.shape[0]
    
#     policy = ArmPolicy(input_size, output_size)

#     optimizer = optim.SGD(policy.parameters(), lr=0.01, momentum=0.9)
#     optimizer = optim.Adam(policy.parameters(), lr=0.001)

#     dataset = ArmDataset(env)

#     episode_rewards = []  # Store episode rewards
#     object_detection_results = []  # Store object detection results

#     for episode in range(num_episodes):
#         states, actions, rewards = [], [], []
                
#         for observation, action, reward in dataset:
#             states.append(observation)
#             actions.append(action)
#             rewards.append(reward)

#             if len(states) >= 64:
#                 ppo_update(policy, optimizer, states, actions, rewards)
#                 states, actions, rewards = [], [], []

#             if len(states) >= 100:
#                 break

#         if episode % 10 == 0:
#             total_reward = evaluate(policy, env)
#             print(f"Episode {episode}, Total Reward: {total_reward}")

#         # Real-time training progress
#         if episode % 100 == 0:
#             print(f"Episode: {episode}/{num_episodes}")

#         # Object detection and alert
#         detected_objects = policy.detect_object(states)
#         if "Complex structure" in detected_objects:
#             print("Alert: Complex structure detected!")
#             object_detection_results.append(detected_objects)

#     # Save the trained model
#     torch.save(policy.state_dict(), save_path)
#     env.close()

#      # Return the stored tensors
#     return episode_rewards, object_detection_results

# def evaluate(policy, env, num_eval_episodes=10):
#     total_rewards = []

#     for _ in range(num_eval_episodes):
#         observation = env.reset()
#         done = False

#         total_reward = 0

#         while not done:
#             with torch.no_grad():
#                 logits = policy(torch.tensor(observation, dtype=torch.float32))
#             action = Categorical(logits=logits).sample().numpy()

#             # Choose action using the policy
#             #action = policy(torch.tensor(observation).float())

#             # Take the action in the environment
#             next_observation, reward, done, _ = env.step(action)

#             # Accumulate the total reward
#             total_reward += reward

#             observation = next_observation

#         total_rewards.append(total_reward)

#     avg_reward = sum(total_rewards) / len(total_rewards)
#     #avg_reward = total_reward / num_eval_episodes
#     return avg_reward

In [None]:
# # Create the environment
# env = ArmEnv()
# wrapped_env = GymWrapper(env)

# # Set a random seed for reproducibility
# torch.manual_seed(42)
# np.random.seed(42)

In [None]:
# Train the agent and save the modeltrained_model.pth
# train(env=wrapped_env, save_path='modelx.pth')
# print("Training completed and model saved.")



In [None]:
# Save the trained agent
#model.save("ppo_arm")

# Load the trained agent
# loaded_model = torch.load("modelx.pth")

# # Evaluate the trained agent
# total_reward = 0
# obs = env.reset()
# done = False


# while not done:
#     action, _ = loaded_model.predict(obs)
#     obs, reward, done, _ = env.step(action)
#     total_reward += reward


# print("Total reward:", total_reward)