In [1]:
import math
import random
import numpy as np
import gym
from parkour_env import parkour_env

import os
# Order CUDA devices by PCI bus id and expose only device "0" to this process.
# These are set before `torch` is imported further down in the notebook.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"



In [2]:
# Run-wide switches.
DEBUG = False           # forwarded to parkour_env(debug=...); checkpoints are only written when this is False
FAST = True             # forwarded to parkour_env(fast=...); also enables the in-notebook frame preview during training
MPS = True              # allow the 'mps' device when CUDA is not available
RESOLUTION = (64, 64)   # forwarded to parkour_env(resolution=...); also sizes the frame-stack buffer
# NOTE(review): the meaning of `map` and `action_set` is defined by the project-local parkour_env — confirm there.
env = parkour_env(resolution=RESOLUTION, map="bridge_hybrid2", debug=DEBUG, fast=FAST, action_set=4)

{'x1': '0', 'y1': '1', 'z1': '0', 'x2': '0', 'y2': '1', 'z2': '3', 'type': 'obsidian'}
{'x1': '0', 'y1': '1', 'z1': '5', 'x2': '0', 'y2': '1', 'z2': '6', 'type': 'obsidian'}
{'x1': '0', 'y1': '1', 'z1': '6', 'x2': '5', 'y2': '1', 'z2': '6', 'type': 'obsidian'}
{'x1': '5', 'y1': '1', 'z1': '6', 'x2': '5', 'y2': '1', 'z2': '7', 'type': 'obsidian'}
{'x1': '5', 'y1': '1', 'z1': '9', 'x2': '5', 'y2': '1', 'z2': '10', 'type': 'obsidian'}
{'x1': '5', 'y1': '2', 'z1': '10', 'x2': '5', 'y2': '2', 'z2': '11', 'type': 'obsidian'}
{'x1': '5', 'y1': '3', 'z1': '11', 'x2': '5', 'y2': '3', 'z2': '13', 'type': 'obsidian'}
{'x1': '5', 'y1': '3', 'z1': '13', 'x2': '2', 'y2': '3', 'z2': '13', 'type': 'obsidian'}
{'x1': '2', 'y1': '3', 'z1': '13', 'x2': '2', 'y2': '3', 'z2': '10', 'type': 'obsidian'}
{'x1': '2', 'y1': '3', 'z1': '10', 'x2': '1', 'y2': '3', 'z2': '10', 'type': 'obsidian'}
{'x1': '-1', 'y1': '3', 'z1': '10', 'x2': '-1', 'y2': '3', 'z2': '13', 'type': 'obsidian'}
{'x1': '-2', 'y1': '3', 'z1'



In [3]:
# Draw one random observation/action to inspect the spaces.
sample_state = env.observation_space.sample()
sample_action = env.action_space.sample()
# Shape of the 'pov' image entry of an observation (printed in the next cell).
image_shape = sample_state['pov'].shape

In [4]:
# Show the sampled action and the shape of the 'pov' image.
print("Sample action:", sample_action)
print("Sample pov space", image_shape)

Sample action: 4
Sample pov space (64, 64, 3)


In [5]:
from collections import namedtuple, deque
from itertools import count
import matplotlib
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import torch.distributions as Categorical

# Pick the compute device: CUDA first, then Apple MPS (only if the MPS flag
# allows it and the backend is actually usable at runtime), else CPU.
# `torch.backends.mps.is_available()` replaces the deprecated `torch.has_mps`,
# which only reported that the build supports MPS, not that it can run.
_mps_backend = getattr(torch.backends, 'mps', None)  # missing on older torch builds
_mps_usable = bool(MPS) and _mps_backend is not None and _mps_backend.is_available()
device = 'cuda' if torch.cuda.is_available() else 'mps' if _mps_usable else 'cpu'

# Inline-backend detection; the IPython helpers are only imported in that case.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
    import ipywidgets as widgets
plt.ion()
print(device)

cuda


In [6]:
class RolloutBuffer:
    """Plain container of parallel lists describing collected transitions.

    Attributes (all lists, appended to by the caller):
        actions, states, logprobs, rewards, is_terminals
    """

    # Names of the per-transition lists, in the order they are created/cleared.
    _FIELDS = ("actions", "states", "logprobs", "rewards", "is_terminals")

    def __init__(self):
        for field_name in self._FIELDS:
            setattr(self, field_name, [])

    def clear(self):
        """Empty every list in place (existing list objects are kept)."""
        for field_name in self._FIELDS:
            getattr(self, field_name).clear()

class ActorCritic(nn.Module):
    """Independent convolutional actor and critic over stacked image frames.

    Both networks share the same architecture (three Conv2d/Tanh/MaxPool2d
    stages, Flatten, then a 512-unit hidden layer) but no weights. The actor
    ends in a softmax over ``n_actions``; the critic ends in a single value.

    Args:
        image_shape: ``(height, width, channels)`` of ONE frame, e.g.
            ``(64, 64, 3)``. ``None`` falls back to ``(64, 64, 3)``, the size
            the layer widths were originally hard-coded for.
        n_actions: number of discrete actions (size of the actor output).

    The size of the first fully-connected layer is derived from
    ``image_shape`` instead of being fixed at 576, so resolutions other than
    64x64 work. Layer order and indices inside ``self.actor``/``self.critic``
    are unchanged, so state_dict keys and 64x64 checkpoints stay compatible.
    """

    # Frames concatenated along the channel axis before entering the network.
    N_STACKED_FRAMES = 4

    def __init__(self, image_shape, n_actions):
        super(ActorCritic, self).__init__()

        if image_shape is None:
            image_shape = (64, 64, 3)
        height, width, channels = image_shape
        in_channels = channels * self.N_STACKED_FRAMES

        # actor: probabilities over actions
        self.actor = self._build_network(in_channels, height, width, n_actions, softmax=True)

        # critic: scalar state value
        self.critic = self._build_network(in_channels, height, width, 1, softmax=False)

    @staticmethod
    def _build_network(in_channels, height, width, n_outputs, softmax):
        """Build one conv trunk + MLP head as a flat ``nn.Sequential``."""
        trunk = [
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=2),
            nn.Tanh(),
            nn.MaxPool2d(3, 1),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),
            nn.Tanh(),
            nn.MaxPool2d(3, 1),
            nn.Conv2d(64, 64, kernel_size=3, stride=2),
            nn.Tanh(),
            nn.MaxPool2d(3, 1),
            nn.Flatten(),
        ]
        # Probe the trunk with a dummy batch to learn the flattened feature
        # size (576 for 64x64 input). No gradients and no RNG are involved.
        with torch.no_grad():
            probe = torch.zeros(1, in_channels, height, width)
            n_flat = nn.Sequential(*trunk)(probe).shape[1]

        head = [
            nn.Linear(n_flat, 512),
            nn.Tanh(),
            nn.Linear(512, n_outputs),
        ]
        if softmax:
            head.append(nn.Softmax(dim=-1))
        return nn.Sequential(*trunk, *head)

    def forward(self):
        """Not used; call ``act`` or ``evaluate`` instead."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for ``state`` from the actor's distribution.

        Args:
            state: batched, channel-first float tensor accepted by the actor.

        Returns:
            ``(action, action_logprob)``, both detached from the graph.
        """
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)

        return action.detach(), action_logprob.detach()

    def evaluate(self, state, action):
        """Score previously taken actions under the current parameters.

        Args:
            state: batch of states fed to both actor and critic.
            action: batch of action indices to evaluate.

        Returns:
            ``(action_logprobs, state_values, dist_entropy)`` where
            ``state_values`` is the raw critic output (trailing dim of 1).
        """
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy

class PPO:
    """PPO agent using the clipped surrogate objective.

    Two ``ActorCritic`` instances are kept: ``policy`` is the one being
    optimised in ``update``; ``policy_old`` is the snapshot used to sample
    actions and is re-synchronised with ``policy`` at the end of each update.
    Transitions are accumulated in ``self.buffer``: ``select_action`` stores
    states/actions/log-probs, while the caller is expected to append the
    matching ``rewards`` and ``is_terminals`` entries.

    Args:
        image_shape: forwarded to ``ActorCritic``.
        n_actions: forwarded to ``ActorCritic``.
        lr_actor: learning rate for the actor parameters.
        lr_critic: learning rate for the critic parameters.
        gamma: discount factor for the Monte Carlo returns.
        K_epochs: optimisation passes over the buffer per ``update`` call.
        eps_clip: clip range for the probability ratio.
    """

    def __init__(self, image_shape, n_actions, lr_actor, lr_critic, gamma, K_epochs, eps_clip):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(image_shape, n_actions).to(device)
        # One optimizer, separate learning rates for actor and critic.
        self.optimizer = torch.optim.AdamW([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(image_shape, n_actions).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record the transition.

        Args:
            state: tensor passed as-is to ``policy_old.act`` (no conversion or
                device transfer happens here).

        Returns:
            The sampled action as a Python number (``action.item()``).
        """
        with torch.no_grad():
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        return action.item()

    def update(self):
        """Run ``K_epochs`` of clipped-PPO optimisation on the buffer, then clear it.

        Does nothing when no rewards have been collected (previously this
        raised inside ``torch.stack`` on the empty lists).
        """
        if not self.buffer.rewards:
            return

        # Monte Carlo estimate of returns, accumulated backwards; a terminal
        # flag resets the running return so episodes do not leak into each other.
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        # NOTE(review): squeeze() drops every size-1 dim; this assumes more
        # than one buffered transition and per-step tensors with a leading
        # batch dim of 1 — confirm against the caller.
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO:
            # policy term + 0.5 * value MSE - 0.01 * entropy bonus
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, model_path):
        """Write the ``policy_old`` weights to ``model_path``."""
        torch.save(self.policy_old.state_dict(), model_path)

    def load(self, model_path):
        """Load weights from ``model_path`` into both policies (best effort).

        The checkpoint is read once and applied to ``policy_old`` and
        ``policy``. A failure is reported on stdout instead of being silently
        discarded, and does not raise.

        Returns:
            True if the weights were loaded, False otherwise.
        """
        try:
            state_dict = torch.load(model_path, map_location='cpu')
            self.policy_old.load_state_dict(state_dict)
            self.policy.load_state_dict(state_dict)
        except Exception as err:
            print(f'Failed Loading... {model_path}: {err}')
            return False
        return True

In [7]:
def extract_state(obs):
    """Turn an observation's 'pov' image into a channel-first float tensor.

    Args:
        obs: mapping whose 'pov' entry is an (H, W, C) array.

    Returns:
        float32 tensor of shape (C, H, W) with values divided by 255.
        No batch dimension is added; the frame queue adds it later.
    """
    pov_copy = obs['pov'].copy()
    frame = torch.tensor(pov_copy, dtype=torch.float32)
    channel_first = frame.permute(2, 0, 1)
    return channel_first / 255
class queue():
    """Fixed-length FIFO of frames used to build a strided frame stack.

    The buffer keeps ``(n_stacked_img - 1) * stride + 1`` frames so that
    taking every ``stride``-th entry yields exactly ``n_stacked_img`` frames,
    the last of which is the most recently pushed one.
    """

    def __init__(self, n_stacked_img, dim, stride=1):
        self.stride = stride
        self.size = (n_stacked_img - 1) * stride + 1
        # Start with all-zero frames of shape `dim`.
        self.data = [torch.zeros(dim) for _ in range(self.size)]

    def push(self, img):
        """Drop the oldest frame and append ``img`` as the newest."""
        self.data.pop(0)
        self.data.append(img)

    def get(self):
        """Return the strided frames concatenated on dim 0, batched, on ``device``."""
        picked = self.data[::self.stride]
        stacked = torch.cat(picked)
        return stacked.unsqueeze(0).to(device)

    def fill(self, img):
        """Overwrite every slot with the same ``img`` object."""
        self.data[:] = [img] * self.size

In [8]:
max_ep_len = 400                    # max timesteps in one episode
# NOTE(review): not referenced by the visible training loop, which is bounded by num_episodes instead.
max_training_timesteps = int(1e5)   # break training loop if timesteps > max_training_timesteps

save_model_freq = int(100)          # save model frequency (in num timesteps)
model_path = 'ppo_adamw2.pt'        # checkpoint file written by ppo_agent.save

update_timestep = max_ep_len * 4    # update policy every n timesteps
K_epochs = 40                       # update policy for K epochs
eps_clip = 0.2                      # clip parameter for PPO
gamma = 0.99                        # discount factor

lr_actor = 0.0003                   # learning rate for actor network
lr_critic = 0.001                   # learning rate for critic network

random_seed = 0                     # set random seed if required (0 = no random seed)

# Apply the seed (previously declared but never used). It must be set before
# the agent is built so that weight initialisation is reproducible too.
# NOTE(review): the environment is not seeded here — check whether parkour_env exposes a seed.
if random_seed:
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

n_actions = env.n_actions

# initialize a PPO agent
ppo_agent = PPO(image_shape, n_actions, lr_actor, lr_critic, gamma, K_epochs, eps_clip)
#ppo_agent.load(model_path)
success_video = []                  # per-episode (state, action, reward) histories of successful episodes
reward_list = []                    # total reward of each finished episode
total_epi = 0                       # episodes run so far

In [9]:
import time

# Output widgets: `renderer` shows the live frame preview, `plotter` the
# per-episode reward scatter.
# NOTE(review): `widgets` and `display` are only imported when `is_ipython`
# is true, but are used unconditionally here — this cell raises NameError
# outside an inline IPython backend.
renderer = widgets.Output()
display.display(renderer)
plotter = widgets.Output()
display.display(plotter)

num_episodes = 5000
total_time = 0   # accumulated wall-clock seconds over all episodes
time_step = 0    # global step counter across episodes (drives update/save cadence)
# 4 frames sampled every 4th step -> buffer of 13 frames; que.get() concatenates
# the selected frames along dim 0 and adds a batch dimension.
que = queue(n_stacked_img=4, dim=(3, RESOLUTION[0], RESOLUTION[1]), stride=4)
for i_episode in range(num_episodes):
    start = time.time()
    obs = env.reset()

    best_reward = -100   # highest single-step reward this episode (not read elsewhere in this cell)
    total_reward = 0
    reward = 0
    # Seed the whole frame stack with the first observation of the episode.
    tmp = extract_state(obs)
    que.fill(tmp)
    history = []         # (state, action, reward) per step, kept for replay of successful episodes
    for t in range(max_ep_len):
        env.render()
        # Live preview of the current frame every 10th step.
        if t % 10 == 0 and FAST and is_ipython:
            with renderer:
                display.clear_output(wait=True)
                plt.imshow( obs['pov'] )
                plt.title(f'{total_epi} step: {t} tot_reward: {total_reward:.2f} time: {total_time/60:.2f} min')
                plt.show()
        
        # select action with policy
        state = extract_state(obs)
        que.push(state)
        action = ppo_agent.select_action(que.get())
        # This env's step() returns five values, the last being a success flag.
        obs, reward, done, info, success = env.step(action)
        total_reward += reward
        history.append((state, action, reward))
        best_reward = max(reward, best_reward)

        # saving reward and is_terminals
        # NOTE(review): only `done` is stored. If an episode ends by reaching
        # max_ep_len without `done`, no terminal flag is recorded and
        # PPO.update will carry discounted return across the following
        # env.reset() — confirm whether the env sets `done` on its own timeout.
        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(done)

        time_step += 1

        # update PPO agent
        if time_step % update_timestep == 0:
            ppo_agent.update()

        # save model weights
        if time_step % save_model_freq == 0 and not DEBUG:
            ppo_agent.save(model_path)

        if done:
            # Episode totals are only recorded when the env reports `done`;
            # episodes that run out the step budget are not added to reward_list.
            reward_list.append(total_reward)
            if is_ipython:
                with plotter:
                    display.clear_output(wait=True)
                    plt.scatter(range(len(reward_list)), reward_list)
                    plt.show()
            break
    total_epi += 1
    end = time.time()
    end = end - start   # `end` is reused to hold the episode duration in seconds
    total_time += end
    # `success` is the flag returned by the last env.step of this episode.
    if success:
        print(f"{i_episode} success")
        success_video.append(history)

Output()

Output()

Resetting environment.




Resetting environment.
Resetting environment.
Resetting environment.


timeout: 

In [None]:
# Scatter of total reward per recorded episode (x = index in reward_list).
plt.scatter(range(len(reward_list)), reward_list)
plt.show()

In [None]:
# Render one recorded successful episode to an MP4 file.
import matplotlib.animation as animation
from IPython import display
# NOTE(review): bare expression in the middle of the cell — it is not displayed.
len(success_video)
# Index into success_video of the episode to render.
# NOTE(review): -7 raises IndexError when fewer than 7 successes were recorded.
play = -7
# Create a plot
fig, ax = plt.subplots(1,1)
t = 0  # not used in this cell
# Labels shown in the title, indexed by the recorded action.
# NOTE(review): presumably matches the env's action indices for action_set=4 — verify against parkour_env.
A = ['forward', 'forward_jump', 'forward_sprint', 'camera_left', 'camera_right']
ims = []  # not used in this cell
def update(i):
    """Draw frame ``i`` of the selected episode and return the image artist in a tuple."""
    state, action, reward = success_video[play][i]
    title = ax.set_title(f'step: {i+1} reward: {float(reward):.2f} action: {A[action]}')
    # Undo extract_state: back to (H, W, C) and the 0-255 range.
    img = torch.permute(state, (1,2,0))*255
    img = np.array(img.numpy(), dtype=int)
    # NOTE(review): the axes are never cleared, so each call adds another image on top of the previous ones.
    im = ax.imshow(img)
    return im,
anim = animation.FuncAnimation(fig, update, frames=len(success_video[play]), interval=50, repeat=True)
anim.save("PPO_hybrid2-5.mp4", dpi=80)
# plt.show()

In [None]:
# Write a final checkpoint (skipped in DEBUG mode) and report how many
# successful episodes were recorded.
if not DEBUG:
    ppo_agent.save(model_path)
print(len(success_video))

In [None]:
# NOTE(review): this cell re-declares the `queue` class already defined in an
# earlier cell with the same behaviour; running it rebinds the name. Consider
# removing one of the two definitions.
class queue():
    """Fixed-length FIFO of frames used to build a strided frame stack.

    The buffer keeps ``(n_stacked_img - 1) * stride + 1`` frames so that
    taking every ``stride``-th entry yields exactly ``n_stacked_img`` frames,
    the last of which is the most recently pushed one.
    """

    def __init__(self, n_stacked_img, dim, stride=1):
        self.stride = stride
        self.size = (n_stacked_img - 1) * stride + 1
        # Start with all-zero frames of shape `dim`.
        self.data = [torch.zeros(dim) for _ in range(self.size)]

    def push(self, img):
        """Drop the oldest frame and append ``img`` as the newest."""
        self.data.pop(0)
        self.data.append(img)

    def get(self):
        """Return the strided frames concatenated on dim 0, batched, on ``device``."""
        picked = self.data[::self.stride]
        stacked = torch.cat(picked)
        return stacked.unsqueeze(0).to(device)

    def fill(self, img):
        """Overwrite every slot with the same ``img`` object."""
        self.data[:] = [img] * self.size