In [1]:
import numpy as np
import tqdm
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from collections import deque
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
state_size = 12*4
action_size = 2

load_model = False
train_mode = True

batch_size = 128
mem_maxlen = 10000
discount_factor = 0.9
actor_lr = 1e-4
critic_lr = 5e-4
tau = 1e-3

mu = 0
theta = 1e-3
sigma = 2e-3

train_step = 300000 if train_mode else 0
save_step = 10000
test_step = 10000
train_start_step = 2000

print_interval = 10
save_interval = 100

In [3]:
game = "Kart1.exe"
file_path = f"./{game}"
save_path = f"./"
load_path = f"./"

engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=file_path,
                       side_channels=[engine_configuration_channel],
                       worker_id=1)
env.reset()
behavior_name = list(env.behavior_specs.keys())[0]
spec = env.behavior_specs[behavior_name]

# test

In [None]:
np.random.seed(seed=10)
np.array([np.random.rand(2)*2-1])

array([[ 0.54264129, -0.9584961 ]])

In [None]:
np.array([[np.random.rand()*2-1,1]])

array([[0.26729647, 1.        ]])

# DDPG

In [None]:
'''
Actor Network
    Input: state
    Output: action
    Update: actor_optimizer
Critic Network
    Input: state, action
    Output: Q-value
    Update: critic_optimizer    
'''
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_dim)

    def forward(self, x):
        x = F.gelu(self.fc1(x))
        x = F.gelu(self.fc2(x))
        action = F.softmax(self.fc3(x))
        return action

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, state, action):
        x = F.gelu(self.fc1(torch.cat([state, action], dim=1)))
        x = F.gelu(self.fc2(x))
        value = self.fc3(x)
        return value

class OrnsteinUhlenbeckNoise:
    def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
        
class DDPG_1:
    def __init__(self, state_dim, action_dim, lr_actor=1e-3, lr_critic=1e-3, gamma=0.99):
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)
        self.gamma = gamma

        self.noise = OrnsteinUhlenbeckNoise(action_dim)

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action = self.actor(state).squeeze().numpy()
        noise = self.noise.sample()
        return np.clip(action + noise, -1, 1)  # assuming action space is -1 to 1

    def update(self, state, action, reward, next_state, done):
        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        reward = torch.FloatTensor([reward])
        next_state = torch.FloatTensor(next_state)
        done = torch.FloatTensor([done])

        # Compute the Q-value target
        target_value = reward + (1.0 - done) * self.gamma * self.critic(next_state, self.actor(next_state))

        # Update the critic
        current_value = self.critic(state, action)
        critic_loss = nn.MSELoss()(current_value, target_value)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update the actor
        actor_loss = -self.critic(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

# DQN - YT

In [6]:
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        state_list, action_list, reward_list, s_prime_list, done_mask_list = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            state_list.append(s)
            action_list.append([a])
            reward_list.append([r])
            s_prime_list.append(s_prime)
            done_mask_list.append([done_mask])

        return torch.tensor(state_list, dtype=torch.float), torch.tensor(action_list), \
               torch.tensor(reward_list), torch.tensor(s_prime_list, dtype=torch.float), \
               torch.tensor(done_mask_list)
    
    def size(self):
        return len(self.buffer)


In [None]:
class Actor(nn.Module):
    def __init__(self, STATE_DIM, ACTION_DIM):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(STATE_DIM, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, ACTION_DIM)

    def forward(self, x):
        x = F.gelu(self.fc1(x))
        x = F.gelu(self.fc2(x))
        action = F.softmax(self.fc3(x))
        return action
    
    def sample_action(self, state, epsilon):
        if epsilon > random.random():
            return np.array([[np.random.rand()*2-1,1]])
            
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action = self.forward(state).sqeeze().numpy()
        return action

class Critic(nn.Module):
    def __init__(self, STATE_DIM, ACTION_DIM):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(STATE_DIM + ACTION_DIM, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, state, action):
        x = F.gelu(self.fc1(torch.cat([state, action], dim=1)))
        x = F.gelu(self.fc2(x))
        value = self.fc3(x)
        return value

In [None]:
class DDPG():
    def __init__(self, STATE_DIM, ACTION_DIM, lr_actor=1e-3, lr_critic=1e-3, gamma=0.99):
        self.actor = Actor(STATE_DIM, ACTION_DIM)
        self.critic = Critic(STATE_DIM, ACTION_DIM)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)
        self.gamma = gamma

    def update(self, state, action, reward, next_state, done):
        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        reward = torch.FloatTensor([reward])
        next_state = torch.FloatTensor(next_state)
        done = torch.FloatTensor([done])

        # Compute the Q-value target
        target_value = reward + (1.0 - done) * self.gamma * self.critic(next_state, self.actor(next_state))

        # Update the critic
        current_value = self.critic(state, action)
        critic_loss = F.mse_loss(current_value, target_value)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update the actor
        actor_loss = -self.critic(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()


In [None]:
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)

        q_out = q(s)
        q_a = q_out.gather(1,a)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
model = DDPG(STATE_DIM, ACTION_DIM,lr_actor=actor_lr, lr_critic=critic_lr, gamma=0.99)
model.actor.load_state_dict(q.state_dict())
memory = ReplayBuffer()

print_interval = 20
score = 0.0  
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

for n_epi in range(10000):
    epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
    s, _ = env.reset()
    done = False

    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
        s_prime, r, done, truncated, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s,a,r/100.0,s_prime, done_mask))
        s = s_prime

        score += r
        if done:
            break

    if memory.size()>2000:
        train(q, q_target, memory, optimizer)

    if n_epi%print_interval==0 and n_epi!=0:
        q_target.load_state_dict(q.state_dict())
        print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                                                        n_epi, score/print_interval, memory.size(), epsilon*100))
        score = 0.0
env.close()

# PPO - ref

In [90]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20

class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.data = []
        
        self.fc1   = nn.Linear(4,256)
        self.fc_pi = nn.Linear(256,2)
        self.fc_v  = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob
    
    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v
      
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        state_list, action_list, reward_list, s_prime_list, prob_action_list, done_list = [], [], [], [], [], []
        for transition in self.data:
            state, action, reward, s_prime, prob_a, done = transition
            
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            s_prime_list.append(s_prime)
            prob_action_list.append([prob_a])
            done_mask = 0 if done else 1
            done_list.append([done_mask])
            
        s,a,r,s_prime,done_mask, prob_a =   torch.tensor(state_list, dtype=torch.float), \
                                            torch.tensor(action_list), \
                                            torch.tensor(reward_list), \
                                            torch.tensor(s_prime_list, dtype=torch.float), \
                                            torch.tensor(done_list, dtype=torch.float), \
                                            torch.tensor(prob_action_list)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a
        
    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for i in range(K_epoch):
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = td_target - self.v(s)
            delta = delta.detach().numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a)-log(b))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [None]:
# main -for me
env = gym.make('CartPole-v1')
model = PPO()
score = 0.0
print_interval = 20
for n_epi in range(10000):
    state = 받아와야함
    done = False
    while not done:
        for t in range(T_horizon):
            prob = model.pi(torch.from_numpy(state).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, truncated, info = env.step(state)

            model.put_data((state, a, r/100.0, s_prime, prob[a].item(), done))
            state = s_prime

            score += r
            if done:
                break

        model.train_net()

    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
        score = 0.0

In [None]:
# main
env = gym.make('CartPole-v1')
model = PPO()
score = 0.0
print_interval = 20
for n_epi in range(10000):
    s, _ = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):
            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, truncated, info = env.step(a)

            model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done))
            s = s_prime

            score += r
            if done:
                break

        model.train_net()

    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
        score = 0.0

# train

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

In [5]:
# Define the policy network
class PolicyNetwork(nn.Module):
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(48, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )
        
    def forward(self, state):
        return self.fc(state)

# Define the value network
class ValueNetwork(nn.Module):
    def __init__(self):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(48, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, state):
        return self.fc(state)

# Define the PPO algorithm
class PPO:
    def __init__(self, policy, value, policy_optimizer, value_optimizer, clip_epsilon=0.2, epochs=10):
        self.policy = policy
        self.value = value
        self.policy_optimizer = policy_optimizer
        self.value_optimizer = value_optimizer
        self.clip_epsilon = clip_epsilon
        self.epochs = epochs

    def update(self, old_log_probs, states, actions, returns, advantages):
        for _ in range(self.epochs):
            # Policy update
            mu = self.policy(states)
            std = torch.tensor([0.1, 0.1])  # You might want to learn this as well
            dist = Normal(mu, std)
            log_prob = dist.log_prob(actions)
            ratio = (log_prob - old_log_probs).exp()
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # Value update
            value_loss = ((returns - self.value(states)) ** 2).mean()

            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

policy = PolicyNetwork()
value = ValueNetwork()
policy_optimizer = optim.Adam(policy.parameters(), lr=1e-3)
value_optimizer = optim.Adam(value.parameters(), lr=1e-3)
ppo = PPO(policy, value, policy_optimizer, value_optimizer)

# main part

In [95]:
episode_num = 3
episode_list = []

for ep in range(episode_num):
    epsilon = max(1e-2, (8e-2)-(1e-2)*(n_epi/200))
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    tracked_agent = -1
    done = False
    ep_rewards = 0
    
    # init action probability
    # action_prob = np.array([np.random.rand(2)*2-1])

    # init episode record
    action_list = []
    state_list = []

    while not done:
        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0]
        
        """action"""
        # action = np.array([[np.random.rand()*2-1,1]])
        # action = sample_action(torch.from_numpy(s).float(), epsilon)

        
        action_tuple = ActionTuple()
        print(f"{ep=}",end=' \ ')
        action_tuple.add_continuous(action)
        print(f"{action=}")
        env.set_actions(behavior_name, action_tuple)
        env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        ####
        current_state = decision_steps.obs[0]
        terminal_state = terminal_steps.obs[0]
        print(f"{current_state=}\n{teminal_state=}")

        action_list.append(action)
        state_list.append(current_state)
        if terminal_state.size > 0:
            episode_list.append([action_list,state_list])
            done=True
        
        ####
        ep_rewards += sum(terminal_steps) # 내가 원하는 합으로 수정해야함
        print(f"{ep_rewards=}")


NameError: name 'n_epi' is not defined

In [86]:
episode_list[0][0]

[array([[-0.27888732,  1.        ]]),
 array([[0.07925139, 1.        ]]),
 array([[0.84834243, 1.        ]]),
 array([[0.13605731, 1.        ]]),
 array([[0.4503492, 1.       ]]),
 array([[0.64136846, 1.        ]]),
 array([[-0.95854065,  1.        ]]),
 array([[-0.87348785,  1.        ]]),
 array([[0.66416878, 1.        ]]),
 array([[0.02361138, 1.        ]]),
 array([[-0.49751712,  1.        ]]),
 array([[-0.6713075,  1.       ]]),
 array([[0.88616192, 1.        ]]),
 array([[0.33738406, 1.        ]]),
 array([[-0.91918758,  1.        ]]),
 array([[-0.3419065,  1.       ]]),
 array([[0.5567389, 1.       ]]),
 array([[-0.65967951,  1.        ]]),
 array([[0.27326759, 1.        ]]),
 array([[-0.27440299,  1.        ]]),
 array([[0.15318237, 1.        ]]),
 array([[0.03314612, 1.        ]]),
 array([[0.2753034, 1.       ]]),
 array([[-0.45096082,  1.        ]]),
 array([[0.41525523, 1.        ]]),
 array([[-0.31716341,  1.        ]]),
 array([[-0.84728946,  1.        ]]),
 array([[-0.60