In [21]:
# Close the Unity environment held in `env` (created by the setup cell).
# NOTE(review): presumably needed before re-launching a build with the same worker_id — confirm.
env.close()

In [8]:
# imports and initialise env

import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
import numpy as np
from mlagents_envs.environment import ActionTuple
import os

# Machine-specific path to the Unity build that is launched below.
# Alternative build kept for reference:
# file_name = "C:\\Users\\Palaash.HPZ\\Desktop\\RL-concept-learning_large_build_envs\\build_envs\\windows\\S2 180723\\build"
file_name = r"C:\Users\Palaash.HPZ\Desktop\RL-concept-learning_large_build_envs\build_envs\windows\S0_200723\build"

# Start the Unity player (graphics enabled) and bring it to its initial state.
env = UE(
    file_name=file_name,
    seed=1,
    side_channels=[],
    worker_id=1,
    no_graphics=False,
)
env.reset()

# Snapshot the registered behaviours and count the agents that currently
# request a decision under the first behaviour.
behavior_names = list(env.behavior_specs.keys())
behavior_value = list(env.behavior_specs.values())
DecisionSteps, TerminalSteps = env.get_steps(behavior_names[0])
agentsNum = len(DecisionSteps.agent_id)

In [19]:
# model classes and ppo memory class

#
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Model hyper-parameters shared by the module classes below.
vision_output_dim = 3136  # flattened CNN output: 64 channels * 7 * 7 for a 128x128 input
num_words = 35  # Number of unique words in the vocabulary
language_output_dim = 128  # width of the language vector fed into the mixing layer
embedding_dim = 128  # output width of LanguageModule; must equal language_output_dim
mixing_dim = 256  # output width of MixingModule / input width of the LSTM cell
lstm_hidden_dim = 256  # LSTM hidden size; also the input width of the actor and critic heads
num_actions = 4  # number of discrete actions produced by the actor head
LR=3.5e-5  # learning rate constant (only referenced by the commented-out Agent draft in this cell)

# (3,128,128) --> (64,7,7) = 3136 (3-layer CNN)
class VisualModule(nn.Module):
    """Three-layer convolutional encoder for the visual observation.

    With a (N, 3, 128, 128) input the convolution stack produces
    (N, 64, 7, 7) feature maps, i.e. 3136 values per frame.
    """

    def __init__(self):
        super().__init__()
        stages = [
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=3, padding=0),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, vt):
        """Encode `vt` and flatten each frame's feature maps.

        The trailing `squeeze()` drops every size-1 dimension, so a batch of
        one frame comes back as a 1-D vector rather than a (1, features) matrix.
        """
        batch = vt.size(0)
        feature_maps = self.conv(vt)
        flat = feature_maps.view(batch, -1)
        return flat.squeeze()
 
# one-hot encoding [0 0 1 0 0] --> 128 dimensional embedding (FF)
# S1:5 S2:5 S3:11 S4:9 --> 30 + 5 (noun) = 35 in total
class LanguageModule(nn.Module):
    """Single linear layer that maps a `num_words`-wide instruction vector
    (e.g. a one-hot word encoding) to an `embedding_dim`-wide embedding."""

    def __init__(self, num_words, embedding_dim):
        super().__init__()
        self.embedding = nn.Linear(num_words, embedding_dim)

    def forward(self, lt):
        """Return the linear embedding of the language input `lt`."""
        return self.embedding(lt)

# 3136(vision) + 128 (language) --> 256 dimensional embedding (FF)
class MixingModule(nn.Module):
    """Fuses the vision and language vectors with one linear layer."""

    def __init__(self, vision_output_dim, language_output_dim, mixing_dim):
        super().__init__()
        fused_width = vision_output_dim + language_output_dim
        self.linear = nn.Linear(fused_width, mixing_dim)

    def forward(self, vision_output, language_output):
        """Concatenate both inputs along dim 0 and project them to `mixing_dim`.

        Concatenating on dim 0 means both inputs are expected to be 1-D
        (unbatched) vectors.
        """
        fused = torch.cat([vision_output, language_output], dim=0)
        return self.linear(fused)

class LSTMModule(nn.Module):
    """Thin wrapper around a single `nn.LSTMCell` step."""

    def __init__(self, mixing_dim, lstm_hidden_dim):
        super().__init__()
        self.lstm = nn.LSTMCell(mixing_dim, lstm_hidden_dim)

    def forward(self, mixed_output, lstm_hidden_state):
        """Advance the cell by one step.

        Returns the new `(hidden_state, cell_state)` tuple; callers take
        element 0 when they only need the hidden state.
        """
        next_state = self.lstm(mixed_output, lstm_hidden_state)
        return next_state

class ActorNetwork(nn.Module):
    """Policy head: a three-layer MLP that maps features to action logits.

    The final Softmax is intentionally left out, so the output is unnormalised
    logits; callers apply `softmax` / `Categorical` themselves.
    """

    def __init__(self,input_dims,n_actions):
        super(ActorNetwork,self).__init__()
        self.actor=nn.Sequential(
            nn.Linear(input_dims,256),
            nn.ReLU(),
            nn.Linear(256,64),
            nn.ReLU(),
            nn.Linear(64,n_actions),
        )

    def forward(self,input_dims,n_actions=None):
        """Return the action logits for the feature tensor `input_dims`.

        Args:
            input_dims: feature tensor whose last dimension matches the width
                given to the constructor.
            n_actions: ignored. Kept (now optional) only so that existing
                two-argument calls keep working. The previous implementation
                forwarded it to `nn.Sequential`, which accepts a single input
                and therefore raised `TypeError`; it also made the
                one-argument call used by `PPOModel.forward` fail.
        """
        return self.actor(input_dims)
    
class CriticNetwork(nn.Module):
    """Value head: a three-layer MLP that maps features to one scalar value.

    The sub-module is stored under the attribute name `actor`; the name is
    kept as-is because it is part of the state_dict keys of saved models.
    """

    def __init__(self, input_dims):
        super().__init__()
        hidden_layers = [
            nn.Linear(input_dims, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.actor = nn.Sequential(*hidden_layers)

    def forward(self, input_dims):
        """Return the value estimate (last dimension of size 1) for the features."""
        return self.actor(input_dims)
        
class PPOModel(nn.Module):
    """Full agent network: vision + language -> mixing -> LSTM -> actor/critic heads."""

    def __init__(self, num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions):
        super(PPOModel, self).__init__()
        self.language_module = LanguageModule(num_words, embedding_dim)
        self.visual_module = VisualModule()
        self.mixing_module = MixingModule(vision_output_dim, language_output_dim, mixing_dim)
        self.lstm_module = LSTMModule(mixing_dim, lstm_hidden_dim)
        self.action_predictor = ActorNetwork(lstm_hidden_dim,num_actions)
        self.value_estimator = CriticNetwork(lstm_hidden_dim)

    def forward(self, vt, lt, lstm_hidden_state):
        """Run one time step.

        Args:
            vt: visual observation passed to the CNN encoder.
            lt: language vector passed to the language embedding.
            lstm_hidden_state: `(hidden, cell)` tuple from the previous step.

        Returns:
            `(action_probs, value_estimate, lstm_output)` where `action_probs`
            is whatever the actor head returns, `value_estimate` is the critic
            output and `lstm_output` is the new `(hidden, cell)` tuple to feed
            back in on the next step.
        """
        vision_output = self.visual_module(vt)
        language_output = self.language_module(lt)
        # The mixing layer returns an unbatched vector; add the batch dimension
        # that nn.LSTMCell expects together with the (1, hidden) state tensors.
        mixed_output = self.mixing_module(vision_output, language_output).unsqueeze(0)
        lstm_output = self.lstm_module(mixed_output,lstm_hidden_state)
        action_probs = self.action_predictor(lstm_output[0])
        value_estimate = self.value_estimator(lstm_output[0])
        return action_probs,value_estimate,lstm_output

    def save(self, episode, ALG_NAME, ENV_ID):
        """Write the state_dict to `model/<ALG_NAME>_<ENV_ID>/agent_<episode>.pt`."""
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(path, f'agent_{episode}.pt'))

    def load(self, episode, ALG_NAME, ENV_ID):
        """Load the state_dict written by `save` for the given episode.

        The checkpoint is first mapped to CPU so that a file saved on a CUDA
        machine can be restored on a CPU-only one; `load_state_dict` then
        copies the values into the parameters on whatever device they live.
        """
        path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
        checkpoint = torch.load(os.path.join(path, f'agent_{episode}.pt'), map_location='cpu')
        self.load_state_dict(checkpoint)
        
class PPOMemory:
    """Rollout buffer: parallel lists of transitions plus a mini-batch sampler."""

    def __init__(self, batch_size):
        self.batch_size = batch_size
        # Start with six empty, independent buffers.
        self.clear_memory()

    def generate_batches(self):
        """Return the stored data as arrays together with shuffled index batches.

        Returns:
            `(states, actions, probs, vals, rewards, dones, batches)` where the
            first six entries are `np.array` copies of the buffers and
            `batches` is a list of index arrays of at most `batch_size`
            entries that together cover every stored transition once.
        """
        n_states = len(self.states)
        starts = np.arange(0, n_states, self.batch_size)
        order = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(order)
        batches = []
        for start in starts:
            batches.append(order[start:start + self.batch_size])

        arrays = (
            np.array(self.states),
            np.array(self.actions),
            np.array(self.probs),
            np.array(self.vals),
            np.array(self.rewards),
            np.array(self.dones),
        )
        return arrays + (batches,)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one transition to the buffers."""
        columns = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        record = (state, action, probs, vals, reward, done)
        for column, item in zip(columns, record):
            column.append(item)

    def clear_memory(self):
        """Drop every stored transition (fresh list objects are created)."""
        self.states, self.probs, self.actions = [], [], []
        self.rewards, self.dones, self.vals = [], [], []
                
# class Agent:
#     def __init__(self, n_actions, input_dims, LR, gamma=0.99, gae_lambda=0.95,
#             policy_clip=0.2, batch_size=64, n_epochs=10):
#         self.gamma = gamma
#         self.policy_clip = policy_clip
#         self.n_epochs = n_epochs
#         self.gae_lambda = gae_lambda

#         self.PPOModel = PPOModel(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions,LR)
#         self.memory = PPOMemory(batch_size)
       
#     def remember(self, state, action, probs, vals, reward, done):
#         self.memory.store_memory(state, action, probs, vals, reward, done)

#     def save_models(self,episode, ALG_NAME, ENV_ID):
#         print('... saving models ...')
#         self.PPOModel.save(episode, ALG_NAME, ENV_ID)

#     def load_models(self,episode, ALG_NAME, ENV_ID):
#         print('... loading models ...')
#         self.PPOModel.load(episode, ALG_NAME, ENV_ID)


In [20]:
# Instantiate the model once and echo it so the layer layout can be inspected.
a = PPOModel(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
a
# output of PPOModel(vt,lt,lstm_hidden_state) is action_probs,value_estimate,lstm_output
# NOTE: despite its name, action_probs is NOT a probability distribution: the Softmax /
# Categorical lines in ActorNetwork are commented out, so the actor head returns raw logits
# and the caller has to apply softmax / Categorical itself.

PPOModel(
  (language_module): LanguageModule(
    (embedding): Linear(in_features=35, out_features=128, bias=True)
  )
  (visual_module): VisualModule(
    (conv): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(3, 3), padding=(1, 1))
      (3): ReLU()
      (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(3, 3))
      (5): ReLU()
    )
  )
  (mixing_module): MixingModule(
    (linear): Linear(in_features=3264, out_features=256, bias=True)
  )
  (lstm_module): LSTMModule(
    (lstm): LSTMCell(256, 256)
  )
  (action_predictor): ActorNetwork(
    (actor): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
  (value_estimator): CriticNetwork(
    (actor): Sequential(
      (0): Linear(in

In [None]:
import argparse
import time
import json
import matplotlib.pyplot as plt
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# ---- Run configuration for the training cell ----
entropy_term = 0
train = True  # when True the main block below runs the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device) 

ALG_NAME = 'S0_ppo'  # used together with ENV_ID to build the checkpoint folder and result file names
ENV_ID = '1'
TRAIN_EPISODES = 15000  # number of overall episodes for training
MAX_STEPS = 500  # maximum time step in one episode
LAM = 0.95  # discount factor applied to future rewards when the returns are accumulated
lr = 3.5e-5  #0.00005  (optimizer learning rate)
speed = 3  # multiplier applied to the one-hot action before it is sent to Unity
num_steps = 250 # the step for updating the network
batch_size=5  # batch size handed to PPOMemory

# test_episode = 0
if __name__ == '__main__':
    # Advantage actor-critic training against the Unity environment `env`.
    #
    # Fixes relative to the previous version of this cell:
    #   * `memory.remember()` does not exist on PPOMemory (AttributeError on the
    #     first step); transitions are now stored with `store_memory` once the
    #     reward is known, and the buffer is cleared after every update.
    #   * `reward` is initialised every step, so it can no longer be unbound /
    #     stale when the tracked agent is in neither step set.
    #   * The "entropy" was a detached cross-entropy accumulated across rollouts
    #     and episodes; it is now the policy entropy of the current rollout and
    #     contributes a gradient.
    #   * The advantage is detached in the actor loss so the policy gradient
    #     does not leak into the critic.
    #   * Rollouts cut at `num_steps` bootstrap from the critic instead of
    #     assuming a return of zero after the cut.
    #   * The `result` directory is created before the metrics are written.
    agent = PPOModel(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim, num_actions)
    agent.to(device)
    optimizer = optim.RMSprop(agent.parameters(), lr=lr)
    best_score = 0
    memory = PPOMemory(batch_size)
    if train:
        tracked_agent = -1
        all_episode_reward = []
        all_average_reward = []
        all_steps = []
        all_actor_loss = []
        all_critic_loss = []
        all_entropy_loss = []
        all_total_loss = []

        for episode in range(TRAIN_EPISODES):
            episode_reward = 0
            behavior_name = list(env.behavior_specs)[0]
            STEPS = 0

            decision_steps, terminal_steps = env.get_steps(behavior_name)
            # state -- vt (vision), lt (language), lstm hidden state
            vt = torch.tensor(decision_steps.obs[0]).reshape(1, 3, 128, 128).to(device)
            lt = torch.zeros(num_words).to(device)
            lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim).to(device),
                                 torch.zeros(1, lstm_hidden_dim).to(device))

            done = False
            while True:
                # Per-rollout buffers used to build the loss.
                log_probs = []
                entropies = []
                values = []
                rewards = []

                for steps in range(num_steps):
                    # Truncate back-propagation through time at every step.
                    lstm_hidden_state = tuple(tensor.detach() for tensor in lstm_hidden_state)
                    STEPS += 1
                    policy_dist, value, lstm_hidden_state = agent(vt, lt, lstm_hidden_state)

                    # The actor head returns logits; Categorical normalises them.
                    action_dist = Categorical(logits=policy_dist)
                    action = action_dist.sample()  # shape (1,)
                    log_prob = action_dist.log_prob(action)
                    entropy = action_dist.entropy()

                    action_onehot = F.one_hot(action, num_actions).cpu().numpy()
                    discrete_actions = action_onehot.reshape(1, num_actions) * speed
                    action_tuple = ActionTuple()
                    action_tuple.add_discrete(discrete_actions)
                    env.set_actions(behavior_name, action_tuple)
                    env.step()
                    decision_steps, terminal_steps = env.get_steps(behavior_name)

                    if tracked_agent == -1 and len(decision_steps) >= 1:
                        tracked_agent = decision_steps.agent_id[0]

                    # Default when the tracked agent shows up in neither step set.
                    reward = 0.0
                    if tracked_agent in terminal_steps:  # roll over or hit the target
                        print('Agent in terminal steps')
                        done = True
                        reward = terminal_steps[tracked_agent].reward
                        if not reward > 0:
                            reward = -1  # roll over or other unseen conditions
                        print(f'Terminal Step reward: {reward}')
                    elif tracked_agent in decision_steps:  # the agent which requires action
                        reward = decision_steps[tracked_agent].reward
                        if reward < 0:
                            print(f'Decision Step reward: {reward}')
                    if STEPS >= MAX_STEPS:
                        reward = -10
                        print(f'Max Step Reward: {reward}')
                        env.reset()
                        done = True
                    if STEPS % num_steps == 0:
                        print(f'Step: {STEPS}')

                    episode_reward = episode_reward + reward

                    # Record the transition (state that produced the action).
                    memory.store_memory(vt.detach().cpu(), action.item(), log_prob.item(),
                                        value.item(), float(reward), done)

                    rewards.append(reward)
                    values.append(value)
                    log_probs.append(log_prob)
                    entropies.append(entropy)

                    if not done:
                        # After a terminal step the next observation is read at
                        # the start of the next episode, so only refresh it here
                        # while the episode is still running.
                        vt = torch.tensor(decision_steps.obs[0]).reshape(1, 3, 128, 128).to(device)

                    if done or steps == num_steps - 1:
                        break

                # Discounted returns; bootstrap from the critic when the rollout
                # was cut by `num_steps` rather than by the end of the episode.
                if done:
                    cumulative = 0.0
                else:
                    with torch.no_grad():
                        _, next_value, _ = agent(vt, lt, lstm_hidden_state)
                    cumulative = next_value.item()
                discounted_rewards = np.zeros(len(rewards), dtype=np.float32)
                for t in reversed(range(len(rewards))):
                    cumulative = rewards[t] + LAM * cumulative
                    discounted_rewards[t] = cumulative

                # update actor critic
                discounted_rewards = torch.from_numpy(discounted_rewards).to(device)
                values = torch.cat(values, dim=0).squeeze(-1)  # (T, 1) -> (T,)
                log_probs = torch.cat(log_probs)               # (T,)
                advantage = discounted_rewards - values
                actor_loss = (-log_probs * advantage.detach()).mean()
                critic_loss = 0.5 * torch.square(advantage).mean()
                entropy_loss = -0.1 * torch.cat(entropies).mean()
                ac_loss = actor_loss + critic_loss + entropy_loss
                optimizer.zero_grad()
                ac_loss.backward()
                optimizer.step()
                # The on-policy data is consumed; keep the buffer bounded.
                memory.clear_memory()

                if done:
                    break

            all_episode_reward.append(float(episode_reward))
            all_steps.append(STEPS)
            all_actor_loss.append(float(actor_loss))
            all_critic_loss.append(float(critic_loss))
            all_entropy_loss.append(float(entropy_loss))
            all_total_loss.append(float(ac_loss))
            if episode >= 100:
                avg_score = np.mean(all_episode_reward[-100:])
                all_average_reward.append(avg_score)
                if avg_score > best_score:
                    best_score = avg_score
                    agent.save(episode, ALG_NAME, ENV_ID)
                    print(f'-----The best score for averaging previous 100 episode reward is {best_score}. Model has been saved-----')
                print('Training  | Episode: {}/{}  | Episode Reward: {:.0f}  | Average Reward {:.2f}  | Actor loss: {:.2f} | Critic loss: {:.2f} | Entropy loss: {:.4f}  | Total Loss: {:.2f} | Total Steps: {}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, avg_score, float(actor_loss), float(critic_loss), float(entropy_loss), float(ac_loss), STEPS))
            else:
                print('Training  | Episode: {}/{}  | Episode Reward: {:.0f}  | Actor loss: {:.2f} | Critic loss: {:.2f} | Entropy loss: {:.4f}  | Total Loss: {:.2f} | Total Steps: {}' \
                    .format(episode + 1, TRAIN_EPISODES, episode_reward, float(actor_loss), float(critic_loss), float(entropy_loss), float(ac_loss), STEPS))
            if episode % 500 == 0:
                agent.save(episode, ALG_NAME, ENV_ID)
                print("Model has been saved")
            # TODO: periodic evaluation (a `test(...)` helper every 100 episodes) is not wired in yet.

        print(all_average_reward)
        agent.save(episode, ALG_NAME, ENV_ID)
        print("Model has been saved")

        data = {
            'all_average_reward': all_average_reward,
            'all_episode_reward': all_episode_reward,
            'all_actor_loss': all_actor_loss,
            'all_critic_loss': all_critic_loss,
            'all_entropy_loss': all_entropy_loss,
            'all_total_loss': all_total_loss,
            'all_steps': all_steps,
        }
        # Create the output folder first: without it the dump fails only after
        # the whole training run has finished.
        os.makedirs('result', exist_ok=True)
        file_path = f'result/{ALG_NAME}_{ENV_ID}_train.txt'
        with open(file_path, 'w') as file:
            json.dump(data, file)
        
#         test_data = {
#                     'all_average_reward': test_average_reward,
#                     'all_episode_reward': test_episode_reward,
#                     'all_actor_loss': test_actor_loss,
#                     'all_critic_loss': test_critic_loss,
#                     'all_entropy_loss': test_entropy_loss,
#                     'all_total_loss': test_total_loss,
#                     'all_steps': test_steps,
#                 } 
#         file_path = f'result/{ALG_NAME}_{ENV_ID}_test.txt'
#         with open(file_path, 'w') as file:
#             json.dump(test_data, file)



# notes

THEORY
https://www.youtube.com/watch?v=HrapVFNBN64&t=651s&ab_channel=EdanMeyer

CODE
https://www.youtube.com/watch?v=hlv79rcHws0&ab_channel=MachineLearningwithPhil

https://stackoverflow.com/questions/46422845/what-is-the-way-to-understand-proximal-policy-optimization-algorithm-in-rl

PPO is preferred over TRPO, since TRPO is inefficient and complicated

The issue TRPO tackles is that a change in policy might be too large, causing instability.
As such, it uses the KL divergence and limits the change by a small bound, which ensures that the change isn't so large that it causes instability.

However, TRPO requires a lot of computation. PPO simplifies this by adding the clip function.

KL divergence is measure of how different 2 distributions are
delta is the limit to change the new policy as compared to the old policy

notes

- actor critic methods are sensitive to perturbations
- ppo addresses this by limit updates to the policy network
- ppo bases the updates on the ratio of new policy to old
[constraint that update to a specific range to ensure we are not taking very large steps - cause instability to the network]
- there is a need to account for quality/goodness measure of the state (advantage) [we want the network to select profitable states over time] 
- clip loss function and take lower bound using min() [taking advantage may cause loss function to grow very large]
- keeps track of a fixed length of trajectory instead of keeping track of all experiences and generating a subset trajectory at random by sampling
- uses multiple network updates per data sample using mini-batch stochastic gradient ascent

- 2 distinct networks
- critic network evaluates states (not s-a pairs) [e.g. DQN evaluates s-a pairs]
- actor network decides which action to take based on a current state
    - actor network outputs prob_dist (softmax) 
    - exploration due to nature of distribution
    
- memory is fixed to length T (must be much smaller than max steps of episode, maybe 10x smaller)
- track states, actions, rewards, dones, values, log probs
- shuffle memories and sample batches
- perform N epochs of updates on each batch

- objective function L
- $L^{CPI}$ (the unclipped surrogate objective) uses the advantage, but the advantage can get large, so PPO implements $L^{CLIP}$ to limit the value of $L^{CPI}$
- ADVANTAGE: tells us the benefit of the new state over the old state [discount factor gamma applied to V(next_state)]
- refer to PPO imgs notes 
- won't implement the entropy term S since we are using 2 different networks here

https://arxiv.org/pdf/1707.06347.pdf

# algo
-copied from
https://github.com/philtabor/Youtube-Code-Repository/tree/master/ReinforcementLearning/PolicyGradient/PPO/torch

In [4]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
import numpy as np
from mlagents_envs.environment import ActionTuple
import os

In [5]:
# Machine-specific path to the Unity build that is launched below.
# Alternative build kept for reference:
# file_name = "C:\\Users\\Palaash.HPZ\\Desktop\\RL-concept-learning_large_build_envs\\build_envs\\windows\\S2 180723\\build"
file_name = r"C:\Users\Palaash.HPZ\Desktop\RL-concept-learning_large_build_envs\build_envs\windows\S0_200723\build"

# Start the Unity player (graphics enabled) and bring it to its initial state.
env = UE(
    file_name=file_name,
    seed=1,
    side_channels=[],
    worker_id=4,
    no_graphics=False,
)
env.reset()

# Snapshot the registered behaviours and count the agents that currently
# request a decision under the first behaviour.
behavior_names = list(env.behavior_specs.keys())
behavior_value = list(env.behavior_specs.values())
DecisionSteps, TerminalSteps = env.get_steps(behavior_names[0])
agentsNum = len(DecisionSteps.agent_id)

In [6]:
# utils.py

import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, figure_file, window=100):
    """Plot the trailing running average of `scores` and save the figure.

    Args:
        x: x-axis values, one per score.
        scores: sequence of per-episode scores.
        figure_file: path of the image to write; its parent directory is
            created when missing.
        window: number of most recent scores (including the current one) that
            go into each average. Defaults to 100.

    Raises:
        ValueError: if `window` is smaller than 1.

    Note:
        The previous slice `scores[max(0, i-100):(i+1)]` averaged 101 values
        although the title says 100; the window is now exactly `window` long.
    """
    import os  # local import: this cell only imports numpy and matplotlib

    if window < 1:
        raise ValueError("window must be at least 1")

    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])

    # savefig does not create missing folders, so make sure the target exists.
    out_dir = os.path.dirname(figure_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.plot(x, running_avg)
    plt.title(f'Running average of previous {window} scores')
    plt.savefig(figure_file)

In [7]:
# ppo_torch.py

import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

class PPOMemory:
    """Rollout buffer: parallel lists of transitions plus a mini-batch sampler."""

    def __init__(self, batch_size):
        self.batch_size = batch_size
        # Start with six empty, independent buffers.
        self.clear_memory()

    def generate_batches(self):
        """Return the stored data as arrays together with shuffled index batches.

        Returns:
            `(states, actions, probs, vals, rewards, dones, batches)` where the
            first six entries are `np.array` copies of the buffers and
            `batches` is a list of index arrays of at most `batch_size`
            entries that together cover every stored transition once.
        """
        n_states = len(self.states)
        starts = np.arange(0, n_states, self.batch_size)
        order = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(order)
        batches = []
        for start in starts:
            batches.append(order[start:start + self.batch_size])

        arrays = (
            np.array(self.states),
            np.array(self.actions),
            np.array(self.probs),
            np.array(self.vals),
            np.array(self.rewards),
            np.array(self.dones),
        )
        return arrays + (batches,)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one transition to the buffers."""
        columns = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        record = (state, action, probs, vals, reward, done)
        for column, item in zip(columns, record):
            column.append(item)

    def clear_memory(self):
        """Drop every stored transition (fresh list objects are created)."""
        self.states, self.probs, self.actions = [], [], []
        self.rewards, self.dones, self.vals = [], [], []

class ActorNetwork(nn.Module):
    """Policy network: MLP with a softmax output wrapped in a `Categorical`.

    Args:
        n_actions: number of discrete actions.
        input_dims: observation shape as a sequence (it is unpacked into
            `nn.Linear`, so a 1-element sequence such as `(4,)` is expected).
        alpha: Adam learning rate.
        fc1_dims, fc2_dims: hidden layer widths.
        chkpt_dir: directory that holds the `actor_torch_ppo` checkpoint.
    """

    def __init__(self, n_actions, input_dims, alpha,
            fc1_dims=256, fc2_dims=256, chkpt_dir='ppo_results'):
        super(ActorNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'actor_torch_ppo')
        self.actor = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, n_actions),
                nn.Softmax(dim=-1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the action distribution (`Categorical`) for `state`."""
        action_probs = self.actor(state)
        return Categorical(action_probs)

    def save_checkpoint(self):
        """Write the state_dict to `checkpoint_file`, creating its folder if needed."""
        # T.save does not create missing directories; previously the first
        # save failed when `chkpt_dir` did not exist yet.
        chkpt_dir = os.path.dirname(self.checkpoint_file)
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Restore the state_dict from `checkpoint_file` onto this network's device."""
        # map_location lets a checkpoint saved on CUDA load on a CPU-only host.
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class CriticNetwork(nn.Module):
    """State-value network: MLP that maps an observation to one scalar.

    Args:
        input_dims: observation shape as a sequence (it is unpacked into
            `nn.Linear`, so a 1-element sequence such as `(4,)` is expected).
        alpha: Adam learning rate.
        fc1_dims, fc2_dims: hidden layer widths.
        chkpt_dir: directory that holds the `critic_torch_ppo` checkpoint.
    """

    def __init__(self, input_dims, alpha, fc1_dims=256, fc2_dims=256,
            chkpt_dir='ppo_results'):
        super(CriticNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'critic_torch_ppo')
        self.critic = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, 1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension of size 1)."""
        return self.critic(state)

    def save_checkpoint(self):
        """Write the state_dict to `checkpoint_file`, creating its folder if needed."""
        # T.save does not create missing directories; previously the first
        # save failed when `chkpt_dir` did not exist yet.
        chkpt_dir = os.path.dirname(self.checkpoint_file)
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Restore the state_dict from `checkpoint_file` onto this network's device."""
        # map_location lets a checkpoint saved on CUDA load on a CPU-only host.
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class Agent:
    """PPO agent with separate actor and critic networks and a rollout buffer.

    Args:
        n_actions: number of discrete actions.
        input_dims: observation shape forwarded to both networks.
        gamma: reward discount factor.
        alpha: learning rate for both optimizers.
        gae_lambda: GAE smoothing factor.
        policy_clip: PPO clipping range epsilon.
        batch_size: mini-batch size used by the buffer.
        n_epochs: number of passes over the buffer per `learn()` call.
    """

    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003, gae_lambda=0.95,
            policy_clip=0.2, batch_size=64, n_epochs=10):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)

    def remember(self, state, action, probs, vals, reward, done):
        """Store one transition in the rollout buffer."""
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        """Checkpoint both networks."""
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        """Restore both networks from their checkpoints."""
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Returns:
            `(action, log_prob, value)` as plain Python numbers: the sampled
            action, its log-probability under the current policy and the
            critic's value estimate.
        """
        # Convert through one ndarray instead of a Python list of arrays
        # (T.tensor([ndarray]) is slow), then add the batch dimension.
        state = T.as_tensor(np.asarray(observation), dtype=T.float).unsqueeze(0)
        state = state.to(self.actor.device)

        # Only scalars leave this method, so no autograd graph is needed.
        with T.no_grad():
            dist = self.actor(state)
            value = self.critic(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        probs = T.squeeze(log_prob).item()
        action = T.squeeze(action).item()
        value = T.squeeze(value).item()

        return action, probs, value

    @staticmethod
    def _gae_advantages(rewards, values, dones, gamma, gae_lambda):
        """Generalised advantage estimates for one stored trajectory.

        Uses the backward recursion
        `A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}` with
        `delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t`.
        A `done` flag therefore stops both the bootstrap and the accumulated
        sum, so advantages no longer leak across episode boundaries. The last
        entry stays 0 because no successor value is stored for it.

        Returns:
            float32 numpy array with one advantage per stored step.
        """
        n = len(rewards)
        advantage = np.zeros(n, dtype=np.float32)
        running = 0.0
        for t in reversed(range(n - 1)):
            nonterminal = 1.0 - float(dones[t])
            delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
            running = delta + gamma * gae_lambda * nonterminal * running
            advantage[t] = running
        return advantage

    def learn(self):
        """Run `n_epochs` of clipped-surrogate PPO updates, then clear the buffer."""
        device = self.actor.device
        advantage = None
        values = None

        for _ in range(self.n_epochs):
            # Called every epoch so that each epoch gets a fresh shuffle.
            state_arr, action_arr, old_prob_arr, vals_arr,\
            reward_arr, dones_arr, batches = \
                    self.memory.generate_batches()

            if advantage is None:
                # The stored trajectory does not change between epochs, so the
                # advantages are computed once (O(n)) instead of every epoch.
                advantage = T.tensor(self._gae_advantages(
                    reward_arr, vals_arr, dones_arr, self.gamma, self.gae_lambda)).to(device)
                values = T.tensor(vals_arr, dtype=T.float).to(device)

            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(device)
                old_probs = T.tensor(old_prob_arr[batch], dtype=T.float).to(device)
                actions = T.tensor(action_arr[batch]).to(device)

                dist = self.actor(states)
                # Drop only the trailing size-1 dimension so a batch of one
                # keeps its batch dimension.
                critic_value = self.critic(states).squeeze(-1)

                new_probs = dist.log_prob(actions)
                # Ratio in log space: same value as exp(new) / exp(old) but
                # without overflow/underflow of the individual exponentials.
                prob_ratio = (new_probs - old_probs).exp()
                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantage[batch]
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                returns = advantage[batch] + values[batch]
                critic_loss = ((returns - critic_value) ** 2).mean()

                total_loss = actor_loss + 0.5*critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

        self.memory.clear_memory()

In [3]:
# main.py

# import gym
import numpy as np
# from ppo_torch import Agent
# from utils import plot_learning_curve

if __name__ == '__main__':
    # CartPole smoke test for the PPO Agent defined above.
    #
    # Fixes relative to the previous version of this cell:
    #   * `gym` was used while its import was commented out (NameError).
    #   * The gym environment was bound to `env`, overwriting the Unity
    #     environment handle used by the other cells; it now has its own name.
    #   * Both gym step/reset conventions are handled (4-tuple / 5-tuple step,
    #     reset returning either obs or (obs, info)).
    #   * The output folder for the plot (and, as its parent, the default
    #     checkpoint folder) is created up front.
    import os

    try:
        import gym
    except ImportError as exc:
        raise ImportError(
            "This cell needs the 'gym' package for CartPole; install it in the "
            "kernel's environment before running it.") from exc

    cartpole_env = gym.make('CartPole-v0')
    N = 20
    batch_size = 5
    n_epochs = 4
    alpha = 0.0003
    agent = Agent(n_actions=cartpole_env.action_space.n, batch_size=batch_size,
                    alpha=alpha, n_epochs=n_epochs,
                    input_dims=cartpole_env.observation_space.shape)
    n_games = 300

    figure_file = 'ppo_results/plots/cartpole.png'
    os.makedirs(os.path.dirname(figure_file), exist_ok=True)

    # `reward_range` is not available on every gym version; fall back to -inf.
    best_score = getattr(cartpole_env, 'reward_range', (float('-inf'), float('inf')))[0]
    score_history = []

    learn_iters = 0
    avg_score = 0
    n_steps = 0

    for i in range(n_games):
        reset_out = cartpole_env.reset()
        # Newer gym returns (observation, info); older gym returns the observation.
        observation = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        done = False
        score = 0
        while not done:
            action, prob, val = agent.choose_action(observation)
            step_out = cartpole_env.step(action)
            if len(step_out) == 5:
                observation_, reward, terminated, truncated, info = step_out
                done = terminated or truncated
            else:
                observation_, reward, done, info = step_out
            n_steps += 1
            score += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            agent.save_models()

        if (i+1)%15==0:
            print('episode', i+1, 'score %.1f' % score, 'avg score %.1f' % avg_score,
                'time_steps', n_steps, 'learning_steps', learn_iters)

    cartpole_env.close()
    x = [i+1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history, figure_file)

ModuleNotFoundError: No module named 'gym'