# Stretch Reinforcement Learning with DM_Control and PPO

References:
- Google DeepMind [dm_control tutorial Colab](https://colab.research.google.com/github/google-deepmind/dm_control/blob/main/tutorial.ipynb#scrollTo=JHSvxHiaopDb)


<!-- Internal installation instructions. -->

## Getting Started

### Install the dependencies

- Run `uv pip install -e ".[rlearning]"`

In [None]:
# Notebook-wide switch for hardware acceleration. When False, the device
# selection cell forces the torch device to "cpu" and skips the EGL setup.
use_gpu = True

In [None]:
try:
  import google.colab
  RUNNING_IN_COLAB = True
except:
  RUNNING_IN_COLAB = False

if RUNNING_IN_COLAB:
  !pip install -q dm_control

  import distutils.util
  import os
  import subprocess
  if subprocess.run('nvidia-smi').returncode:
    raise RuntimeError(
        'Cannot communicate with GPU. '
        'Make sure you are using a GPU Colab runtime. '
        'Go to the Runtime menu and select Choose runtime type.')

  # Add an ICD config so that glvnd can pick up the Nvidia EGL driver.
  # This is usually installed as part of an Nvidia driver package, but the Colab
  # kernel doesn't install its driver via APT, and as a result the ICD is missing.
  # (https://github.com/NVIDIA/libglvnd/blob/master/src/EGL/icd_enumeration.md)
  NVIDIA_ICD_CONFIG_PATH = '/usr/share/glvnd/egl_vendor.d/10_nvidia.json'
  if not os.path.exists(NVIDIA_ICD_CONFIG_PATH):
    with open(NVIDIA_ICD_CONFIG_PATH, 'w') as f:
      f.write("""{
      "file_format_version" : "1.0.0",
      "ICD" : {
          "library_path" : "libEGL_nvidia.so.0"
      }
  }
  """)

  print('Installing dm_control...')

  # Configure dm_control to use the EGL rendering backend (requires GPU)
  %env MUJOCO_GL=egl

  print('Checking that the dm_control installation succeeded...')
  try:
    from dm_control import suite
    env = suite.load('cartpole', 'swingup')
    pixels = env.physics.render()
  except Exception as e:
    raise e from RuntimeError(
        'Something went wrong during installation. Check the shell output above '
        'for more information.\n'
        'If using a hosted Colab runtime, make sure you enable GPU acceleration '
        'by going to the Runtime menu and selecting "Choose runtime type".')
  else:
    del pixels, suite

  !echo Installed dm_control $(pip show dm_control | grep -Po "(?<=Version: ).+")

In [None]:
#@title All `dm_control` imports required for this tutorial

# The basic mujoco wrapper.
from dm_control import mujoco

# Access to enums and MuJoCo library functions.
from dm_control.mujoco.wrapper.mjbindings import enums
from dm_control.mujoco.wrapper.mjbindings import mjlib

import torch

In [None]:
# Init a device with cuda or mps so that it can train faster
import platform
from typing import Literal


device: Literal['cuda'] | Literal['mps'] | Literal['cpu'] = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

device = device if use_gpu else "cpu"

if use_gpu and platform.system() != "Darwin":
  # Configure dm_control to use the EGL rendering backend (requires GPU)
  %env MUJOCO_GL=egl

print(f"Using {device} device")

In [None]:
# From the Google Deepmind dm_control colab notebook:
# General
import copy
import os
import itertools
from IPython.display import clear_output
import numpy as np

# Graphics-related
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from IPython.display import HTML, display
import PIL.Image
# Internal loading of video libraries.

# Use svg backend for figure rendering
%config InlineBackend.figure_format = 'svg'

%matplotlib notebook

# Font sizes
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12

# (rc group, option name, point size) triples, applied in this order.
_FONT_RC_SETTINGS = (
    ('font', 'size', SMALL_SIZE),          # controls default text sizes
    ('axes', 'titlesize', SMALL_SIZE),     # fontsize of the axes title
    ('axes', 'labelsize', MEDIUM_SIZE),    # fontsize of the x and y labels
    ('xtick', 'labelsize', SMALL_SIZE),    # fontsize of the tick labels
    ('ytick', 'labelsize', SMALL_SIZE),    # fontsize of the tick labels
    ('legend', 'fontsize', SMALL_SIZE),    # legend fontsize
    ('figure', 'titlesize', BIGGER_SIZE),  # fontsize of the figure title
)
for _rc_group, _rc_option, _rc_size in _FONT_RC_SETTINGS:
    plt.rc(_rc_group, **{_rc_option: _rc_size})


def display_video(frames, framerate=30):
  """Encode a sequence of image frames as an inline HTML5 video.

  Args:
    frames: Non-empty sequence of arrays whose shape unpacks as
      (height, width, _); the first frame determines the figure size.
    framerate: Playback rate in frames per second.

  Returns:
    An IPython `HTML` object embedding the encoded animation.
  """
  height, width, _ = frames[0].shape
  dpi = 70
  orig_backend = matplotlib.get_backend()
  matplotlib.use('Agg')  # Switch to headless 'Agg' to inhibit figure rendering.
  try:
    fig, ax = plt.subplots(1, 1, figsize=(width / dpi, height / dpi), dpi=dpi)
  finally:
    # Always restore the caller's backend, even if figure creation fails.
    matplotlib.use(orig_backend)
  try:
    ax.set_axis_off()
    ax.set_aspect('equal')
    ax.set_position([0, 0, 1, 1])
    im = ax.imshow(frames[0])

    def update(frame):
      im.set_data(frame)
      return [im]

    interval = 1000/framerate
    anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                   interval=interval, blit=True, repeat=False)
    return HTML(anim.to_html5_video())
  finally:
    # The video is fully encoded by now; release the figure so that repeated
    # calls (one per training episode) do not accumulate open figures.
    plt.close(fig)

# Seed numpy's global RNG so that cell outputs are deterministic. We also try to
# use RandomState instances that are local to a single cell wherever possible.
np.random.seed(42)

In [None]:
import time
import cv2
from stretch_mujoco.enums.actuators import Actuators

# Visualisation options passed to `physics.render(...)` by the training
# environment; the joint visualisation flag is switched on.
scene_option = mujoco.wrapper.core.MjvOption()
scene_option.flags[enums.mjtVisFlag.mjVIS_JOINT] = True

# Joint names used to index the named qpos/qvel arrays when building observations.
arm_joints = ['joint_arm_l0', 'joint_arm_l1', 'joint_arm_l2', 'joint_arm_l3', 'joint_gripper_slide', 'joint_lift', 'joint_wrist_pitch', 'joint_wrist_roll', 'joint_wrist_yaw']
# Gripper finger joints. NOTE(review): not referenced elsewhere in the visible notebook.
finger_joints = ['joint_gripper_finger_left_open', 'joint_gripper_finger_right_open',] #'rubber_left_x', 'rubber_left_y', 'rubber_right_x', 'rubber_right_y']

class StretchPushCubeTraining:
    """Minimal RL environment: move the "object1" body to a target position.

    The observation is the concatenation of the arm joint positions, the arm
    joint velocities and a 3-vector cube position. The reward is the negative
    Euclidean distance between the cube and the target.

    Args:
        physics: dm_control physics instance holding the scene.
        push_cube_by: Offset added to the cube position (read at construction
            time) to obtain the target. Defaults to ``[0.5, 0.0, 0.0]``.
    """

    def __init__(self, physics: mujoco.Physics, push_cube_by=None):
        # A None sentinel avoids sharing one array object across all instances.
        if push_cube_by is None:
            push_cube_by = np.array([0.5, 0.0, 0.0])

        self.physics = physics
        # The addition allocates a new array, so the target does not alias xpos.
        self.target_position = self._get_cube_pos() + push_cube_by

        # State: joint positions + joint velocities + 3 cube coordinates.
        self.state_size = len(arm_joints) * 2 + 3

        # Action: one continuous value per entry in `arm_joints`.
        self.action_size = len(arm_joints)

        self.frames = []
        self.render_rate = 1/30  # Minimum wall-clock seconds between captured frames.
        self.time_last_render = time.perf_counter()
        self.last_step_time = time.perf_counter()

        # Updated by reward(); check_is_done() reads the most recent value.
        self.current_distance_to_target = float('inf')

    def _get_cube_id(self):
        """Return the body id of "object1"."""
        return self.physics.model.name2id("object1", "body")

    def _get_cube_pos(self):
        """Return the cube's current entry in `physics.data.xpos`."""
        return self.physics.data.xpos[self._get_cube_id()]

    def _get_cube_original_pos(self):
        """Return the cube position stored on the model (`body("object1").pos`)."""
        return self.physics.model.body("object1").pos

    def arm_joint_pos(self):
        """Return qpos for every joint listed in `arm_joints`."""
        return self.physics.named.data.qpos[arm_joints]

    def arm_joint_vel(self):
        """Return qvel for every joint listed in `arm_joints`."""
        return self.physics.named.data.qvel[arm_joints]

    def _observation(self, cube_pos):
        """Assemble the state vector: joint positions, joint velocities, cube position."""
        return np.concatenate([self.arm_joint_pos(), self.arm_joint_vel(), cube_pos])

    def reset(self, use_home_pose = True):
        """Reset the simulation and return the initial observation.

        Args:
            use_home_pose: When True, reset with keyframe id 0 and then drive
                the actuators with the "home" keyframe controls for 400 steps.

        Returns:
            The initial state vector.
        """
        self.frames = []

        self.physics.reset(0 if use_home_pose else None)

        if use_home_pose:
            # Reset isn't working, so we go to the home pose manually.
            self.physics.data.ctrl = self.physics.model.keyframe("home").ctrl
            for _ in range(400):
                self.physics.step()
                self.render()

        self.current_distance_to_target = float('inf')

        # NOTE(review): this uses the model-defined cube position, whereas
        # step() reports the simulated position from data.xpos. Confirm that
        # the difference is intentional.
        return self._observation(self._get_cube_original_pos())

    def reward(self):
        """Return the negative cube-to-target distance and cache the distance."""
        object_pos = self._get_cube_pos()
        self.current_distance_to_target = np.linalg.norm(object_pos - self.target_position)
        # Negative because we want to minimize the distance.
        return -self.current_distance_to_target

    def check_is_done(self):
        """Return True if the distance cached by the last reward() call is below 0.05."""
        return self.current_distance_to_target < 0.05

    def step(self, action):
        """Apply `action` to the arm actuators, advance one step, return the state.

        Args:
            action: Indexable sequence of control values, read in the order of
                `Actuators.get_arm_joints()`.
        """
        # Pace the simulation: wait until one model timestep of wall-clock
        # time has passed since the previous step finished.
        time_until_next_step = self.physics.model.opt.timestep - (time.perf_counter() - self.last_step_time)
        if time_until_next_step > 0:
            time.sleep(time_until_next_step)

        # NOTE(review): the length of Actuators.get_arm_joints() is not visible
        # here; confirm it does not exceed `action_size`.
        for index, actuator in enumerate(Actuators.get_arm_joints()):
            self.physics.data.actuator(actuator.name).ctrl = action[index]

        self.physics.step()

        self.last_step_time = time.perf_counter()

        return self._observation(self._get_cube_pos())

    def render(self):
        """Capture a frame if more than `render_rate` seconds passed since the last one."""
        elapsed = time.perf_counter() - self.time_last_render
        if elapsed > self.render_rate:
            self.time_last_render = time.perf_counter()

            pixels = self.physics.render(scene_option=scene_option)

            self.frames.append(pixels)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym


# Hyperparameters
# These are module-level globals read directly by the functions and classes
# defined in later cells.
gamma = 0.99  # Discount factor (used by train2 and Ppo.get_gae)
lr_actor = 0.001  # Actor learning rate (used by train2 and Ppo)
lr_critic = 0.001  # Critic learning rate (used by Ppo)
clip_ratio = 0.2  # PPO clip ratio (used by ppo_loss)
epochs = 10  # Number of optimization epochs (used by ppo_loss)
batch_size = 64  # Batch size for optimization (used by Ppo.train)

# Actor and Critic networks
class ActorCritic(nn.Module):
    """Small network with one shared hidden layer feeding a policy and a value head.

    `forward` returns `(logits, value)` for a batch of states.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        self.dense1 = nn.Linear(state_size, hidden_units)
        self.policy_logits = nn.Linear(hidden_units, action_size)
        # Registered as a sub-module but not used by forward().
        self.dense2 = nn.Linear(hidden_units, hidden_units)
        self.value = nn.Linear(hidden_units, 1)

    def forward(self, state):
        """Return the policy logits and the state-value estimate for `state`."""
        hidden = self.dense1(state).relu()
        return self.policy_logits(hidden), self.value(hidden)

# PPO algorithm
def ppo_loss(old_logits, old_values, advantages, states, actions, returns, optimizer, action_size, model):
    """Run `epochs` clipped-PPO gradient steps on one batch and return the last loss.

    Args:
        old_logits: Policy logits for `states` from before the update, shape (N, A).
        old_values: Value estimates recorded during the rollout, (N,) or (N, 1).
        advantages: Ignored. Advantages are recomputed as normalised
            `returns - old_values` (the previous implementation also discarded
            this argument); it is kept for interface compatibility.
        states: Batch of states, shape (N, state_size).
        actions: Integer action indices, shape (N,).
        returns: Discounted returns, shape (N,).
        optimizer: Optimizer over the parameters of `model`.
        action_size: Number of discrete actions (kept for interface compatibility).
        model: Network returning `(logits, values)` for a batch of states.

    Returns:
        The loss tensor of the final epoch, or None when `epochs` is 0.
    """
    # Rollout statistics are constants during optimisation. Detaching them
    # keeps every epoch's backward pass on its own graph; without this the
    # second epoch fails with "backward through the graph a second time".
    old_logits = old_logits.detach()
    # Flatten to (N,) so (N, 1) and (N,) operands cannot broadcast into (N, N).
    old_values = old_values.detach().reshape(-1)
    returns = returns.detach().reshape(-1)
    action_index = actions.reshape(-1, 1).long()

    advantages = returns - old_values
    advantages = advantages - advantages.mean()
    if advantages.numel() > 1:
        # std() of a single element is NaN, so only scale real batches.
        advantages = advantages / (advantages.std() + 1e-8)

    # gather() selects each taken action's log-probability on whatever device
    # the logits live on; no CPU-only one-hot matrix is needed.
    old_log_probs = torch.log_softmax(old_logits, dim=1).gather(1, action_index).squeeze(1)

    loss = None
    for _ in range(epochs):
        model.train()
        logits, values = model(states)
        log_policy = torch.log_softmax(logits, dim=1)
        log_probs = log_policy.gather(1, action_index).squeeze(1)

        # Clipped surrogate policy objective.
        ratio = torch.exp(log_probs - old_log_probs)
        clipped_ratio = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio)
        policy_loss = -torch.mean(torch.min(ratio * advantages, clipped_ratio * advantages))

        # Value regression against the returns.
        value_loss = torch.mean((values.reshape(-1) - returns) ** 2)

        # Entropy is -sum(p * log p). Subtracting it from the loss rewards
        # exploration; the previous sign penalised it.
        entropy = -torch.sum(log_policy.exp() * log_policy, dim=1).mean()

        loss = policy_loss + 0.5 * value_loss - 0.01 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss

# Training Loop
def train2(env:StretchPushCubeTraining, max_episodes = 1000,
max_steps_per_episode = 1000):
    """Train an `ActorCritic` with `ppo_loss`, one update per episode.

    NOTE(review): the policy samples a single discrete action index, while
    `StretchPushCubeTraining.step` indexes into its `action` argument. Confirm
    how a discrete index is meant to map onto the arm controls before using
    this loop with that environment.

    Args:
        env: Environment providing reset/step/reward/render/check_is_done.
        max_episodes: Number of episodes to run.
        max_steps_per_episode: Step limit for each episode.
    """
    # Initialize actor-critic model and optimizer
    model = ActorCritic(env.state_size, env.action_size)
    optimizer = optim.Adam(model.parameters(), lr=lr_actor)

    for episode in range(max_episodes):
        states, actions, rewards, values = [], [], [], []
        state = env.reset()
        for _ in range(max_steps_per_episode):
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            # Rollout statistics are data, not part of the loss graph.
            with torch.no_grad():
                logits, value = model(state)

            # Sample action from the policy distribution
            action_probs = torch.softmax(logits, dim=1)
            action = torch.multinomial(action_probs, 1).item()

            next_state = env.step(action)
            reward = env.reward()

            env.render()

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            values.append(value)

            state = next_state

            if env.check_is_done():
                break

        # Update after every episode that produced data. Previously the update
        # ran only when the goal was reached, so an agent that never succeeded
        # never learned.
        if rewards:
            returns_batch = []
            discounted_sum = 0
            for r in reversed(rewards):
                discounted_sum = r + gamma * discounted_sum
                returns_batch.append(discounted_sum)
            returns_batch.reverse()

            states_batch = torch.cat(states, dim=0)
            actions_batch = torch.tensor(actions, dtype=torch.int64)
            # Flatten (N, 1) critic outputs to (N,) so they line up with the returns.
            values_batch = torch.cat(values, dim=0).squeeze(1)
            returns_batch = torch.tensor(returns_batch, dtype=torch.float32)
            with torch.no_grad():
                old_logits, _ = model(states_batch)

            loss = ppo_loss(old_logits, values_batch, returns_batch - values_batch, states_batch, actions_batch, returns_batch, optimizer=optimizer, action_size=env.action_size, model=model)
            print(f"Episode: {episode + 1}, Loss: {loss.item()}")

        print(f"Episode {episode}: Training step complete.")
        display(display_video(env.frames))



In [None]:

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise `layer` in place and return it.

    The weight receives an orthogonal initialisation with gain `std`; the bias
    is filled with `bias_const`.
    """
    init = torch.nn.init
    init.orthogonal_(layer.weight, gain=std)
    init.constant_(layer.bias, val=bias_const)
    return layer

# PPO Algorithm Implementation
class PPO:
    """PPO agent for continuous actions with separate actor and critic networks.

    The policy is a diagonal Gaussian: the actor network outputs the mean and a
    learnable, state-independent `log_std` gives the standard deviation.

    Args:
        state_size: Length of the state vector.
        action_size: Number of continuous action dimensions.
        hidden_size: Width of the hidden layers.
        lr: Learning rate for both optimizers.
        gamma: Discount factor.
        lam: GAE lambda.
        epsilon: PPO clipping range.
    """

    def __init__(self, state_size, action_size, hidden_size=64, lr=3e-4, gamma=0.99, lam=0.95, epsilon=0.2):
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size
        self.gamma = gamma
        self.lam = lam
        self.epsilon = epsilon

        # Create policy network (actor) and value network (critic)
        self.actor = self.create_network()
        self.critic = self.create_critic()
        # Log standard deviation of the Gaussian policy, trained with the actor.
        self.log_std = nn.Parameter(torch.zeros(action_size))

        # Create optimizers for the actor and critic
        self.actor_optimizer = optim.Adam(list(self.actor.parameters()) + [self.log_std], lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

    def create_network(self):
        """Build the actor: a two-hidden-layer ReLU MLP producing the action mean."""
        return nn.Sequential(
            nn.Linear(self.state_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.action_size),
        )

    def create_critic(self):
        """Build the critic: a two-hidden-layer Tanh MLP producing one value."""
        return nn.Sequential(
            layer_init(nn.Linear(self.state_size, self.hidden_size)),
            nn.Tanh(),
            layer_init(nn.Linear(self.hidden_size, self.hidden_size)),
            nn.Tanh(),
            layer_init(nn.Linear(in_features=self.hidden_size, out_features=1), std=1.0),
        )

    def _action_distribution(self, states):
        """Return the Gaussian policy distribution for a state or batch of states."""
        return torch.distributions.Normal(self.actor(states), self.log_std.exp())

    def select_action(self, state):
        """Sample a continuous action for `state` and return it as a numpy array."""
        state_tensor = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            action = self._action_distribution(state_tensor).sample()
        return action.numpy()

    def compute_returns(self, rewards, values, next_value, done):
        """Compute returns and advantages with Generalized Advantage Estimation.

        Args:
            rewards: Per-step rewards of one trajectory, in time order.
            values: Critic estimates for the visited states (floats or
                one-element tensors), in time order.
            next_value: Critic estimate for the state after the last step.
            done: Whether the last step ended the episode; if so the bootstrap
                value after the final step is zero.

        Returns:
            `(returns, advantages)` as two lists of floats in time order.
        """
        values = [float(v) for v in values]
        returns = []
        advantages = []
        advantage = 0.0
        # `done` describes only the final transition, so it masks only the
        # bootstrap value that follows the last recorded step.
        following_value = float(next_value) * (1.0 - float(done))
        # Walk backwards so each step sees the value of its actual successor.
        for reward, value in zip(reversed(rewards), reversed(values)):
            delta = float(reward) + self.gamma * following_value - value
            advantage = delta + self.gamma * self.lam * advantage
            advantages.append(advantage)
            returns.append(advantage + value)
            following_value = value

        return list(reversed(returns)), list(reversed(advantages))

    def update(self, states, actions, returns, advantages, epochs=1):
        """Take `epochs` clipped-PPO gradient steps on one trajectory.

        Args:
            states: Sequence of state vectors, length N.
            actions: Sequence of the action vectors taken, length N.
            returns: Sequence of N return targets for the critic.
            advantages: Sequence of N advantage estimates.
            epochs: Number of gradient steps on this batch.
        """
        if len(states) == 0:
            return

        states_tensor = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        actions_tensor = torch.as_tensor(np.asarray(actions), dtype=torch.float32)
        returns_tensor = torch.as_tensor([float(r) for r in returns], dtype=torch.float32)
        advantages_tensor = torch.as_tensor([float(a) for a in advantages], dtype=torch.float32)

        # Log-probability of the taken actions under the pre-update policy.
        with torch.no_grad():
            old_log_probs = self._action_distribution(states_tensor).log_prob(actions_tensor).sum(dim=-1)

        for _ in range(epochs):
            # Critic regression; squeeze (N, 1) -> (N,) to match the returns.
            values = self.critic(states_tensor).squeeze(-1)
            value_loss = (returns_tensor - values).pow(2).mean()

            # Clipped surrogate objective on the per-sample probability ratio.
            log_probs = self._action_distribution(states_tensor).log_prob(actions_tensor).sum(dim=-1)
            ratio = torch.exp(log_probs - old_log_probs)
            surrogate_loss = ratio * advantages_tensor
            clipped_loss = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantages_tensor
            policy_loss = -torch.min(surrogate_loss, clipped_loss).mean()

            # The networks share no parameters, so one backward pass on the
            # summed loss gives each optimizer exactly its own gradients.
            loss = policy_loss + value_loss

            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()


In [None]:
# Training Loop
def train(env:StretchPushCubeTraining, agent: PPO, episodes=1000, batch_size=64, steps_per_update=2048):
    """Collect one trajectory per episode and update `agent` on it.

    Args:
        env: Environment providing reset/step/reward/render/check_is_done.
        agent: Agent providing select_action, critic, compute_returns and update.
        episodes: Number of collect-then-update iterations.
        batch_size: Unused by this loop; kept for interface compatibility.
        steps_per_update: Maximum environment steps collected per episode.
    """
    for episode in range(episodes):
        states, actions, rewards, values = [], [], [], []
        next_state = env.reset(use_home_pose=True)

        for _ in range(steps_per_update):
            state = next_state
            action = agent.select_action(state)
            next_state = env.step(action)
            reward = env.reward()

            env.render()

            # Record the trajectory
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            # Store the estimate as a plain float. Keeping the critic's output
            # tensor would retain one autograd graph per step and feed
            # grad-tracking tensors into the return computation.
            with torch.no_grad():
                values.append(agent.critic(torch.tensor(state, dtype=torch.float32)).item())

            if env.check_is_done():
                break

        # Compute returns and advantages
        with torch.no_grad():
            next_value = agent.critic(torch.tensor(next_state, dtype=torch.float32)).item()
        returns, advantages = agent.compute_returns(rewards, values, next_value, env.check_is_done())

        # Update the agent
        agent.update(states, actions, returns, advantages)

        print(f"Episode {episode}: Training step complete.")
        display(display_video(env.frames))

In [None]:
import torch
import torch.nn as nn
import gym
import torch
import numpy as np
from collections import deque

import torch.optim as optim
import torch
import numpy as np


# Module-level hyperparameters read by the `Ppo` class.
epsilon = 0.2  # Half-width of the probability-ratio clamp in Ppo.train
l2_rate = 0.001  # weight_decay passed to the critic's Adam optimizer
lambd = 0.98  # Multiplies the running advantage (together with gamma) in Ppo.get_gae

class Actor(nn.Module):
    """Gaussian policy network.

    Two tanh hidden layers feed a `mu` head (mean) and a `sigma` head whose
    output is exponentiated to give the standard deviation.
    """

    def __init__(self,N_S,N_A):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(N_S, width)
        self.fc2 = nn.Linear(width, width)
        self.sigma = nn.Linear(width, N_A)
        self.mu = nn.Linear(width, N_A)
        # Scale the mean head's weights by 0.1 and its bias by 0.
        with torch.no_grad():
            self.mu.weight.mul_(0.1)
            self.mu.bias.mul_(0.0)
        self.distribution = torch.distributions.Normal

    def set_init(self,layers):
        """Re-initialise `layers`: weights ~ N(0, 0.1), biases set to 0."""
        for module in layers:
            nn.init.normal_(module.weight, mean=0., std=0.1)
            nn.init.constant_(module.bias, 0.)

    def forward(self,s):
        """Return `(mu, sigma)` for the state batch `s`."""
        hidden = torch.tanh(self.fc2(torch.tanh(self.fc1(s))))
        mean = self.mu(hidden)
        std = torch.exp(self.sigma(hidden))
        return mean, std

    def choose_action(self,s):
        """Sample an action from the policy for `s` and return it as a numpy array."""
        policy = self.distribution(*self.forward(s))
        return policy.sample().numpy()

class Critic(nn.Module):
    """State-value network: two tanh hidden layers followed by a scalar output."""

    def __init__(self,N_S):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(N_S, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)
        # Scale the output layer's weights by 0.1 and its bias by 0.
        with torch.no_grad():
            self.fc3.weight.mul_(0.1)
            self.fc3.bias.mul_(0.0)

    def set_init(self,layers):
        """Re-initialise `layers`: weights ~ N(0, 0.1), biases set to 0."""
        for module in layers:
            nn.init.normal_(module.weight, mean=0., std=0.1)
            nn.init.constant_(module.bias, 0.)

    def forward(self,s):
        """Return the value estimate for the state batch `s`."""
        hidden = torch.tanh(self.fc1(s))
        hidden = torch.tanh(self.fc2(hidden))
        return self.fc3(hidden)
    
class Ppo:
    """PPO trainer pairing a Gaussian `Actor` with a `Critic`.

    Reads the module-level hyperparameters `lr_actor`, `lr_critic`, `l2_rate`,
    `batch_size`, `epsilon`, `gamma` and `lambd`.
    """

    def __init__(self,N_S,N_A):
        self.actor_net =Actor(N_S,N_A)
        self.critic_net = Critic(N_S)
        self.actor_optim = optim.Adam(self.actor_net.parameters(),lr=lr_actor)
        self.critic_optim = optim.Adam(self.critic_net.parameters(),lr=lr_critic,weight_decay=l2_rate)
        self.critic_loss_func = torch.nn.MSELoss()

    def train(self,memory):
        """Run one shuffled pass of minibatch PPO updates over `memory`.

        Args:
            memory: Sequence of `[state, action, reward, mask]` transitions,
                where `mask` is 0 on the step that ended an episode and 1
                otherwise. Trailing samples that do not fill a whole
                `batch_size` minibatch are skipped.
        """
        if len(memory) == 0:
            return

        # Split the transitions column by column. Stacking the mixed rows into
        # a single ndarray raises on recent NumPy versions because the rows
        # are ragged (arrays next to scalars).
        state_rows, action_rows, reward_rows, mask_rows = zip(*memory)
        states = torch.tensor(np.vstack(state_rows), dtype=torch.float32)
        actions = torch.tensor(np.vstack(action_rows), dtype=torch.float32)
        rewards = torch.tensor(np.asarray(reward_rows, dtype=np.float32))
        masks = torch.tensor(np.asarray(mask_rows, dtype=np.float32))

        # Everything computed from the pre-update networks is a constant.
        with torch.no_grad():
            returns, advants = self.get_gae(rewards, masks, self.critic_net(states))
            old_mu, old_std = self.actor_net(states)
            old_log_prob = self.actor_net.distribution(old_mu, old_std).log_prob(actions).sum(1, keepdim=True)

        n = len(states)
        arr = np.arange(n)
        for epoch in range(1):
            np.random.shuffle(arr)
            for i in range(n//batch_size):
                b_index = arr[batch_size*i:batch_size*(i+1)]
                b_states = states[b_index]
                b_advants = advants[b_index].unsqueeze(1)
                b_actions = actions[b_index]
                b_returns = returns[b_index].unsqueeze(1)

                # Critic step: regress the value estimate onto the returns.
                critic_loss = self.critic_loss_func(self.critic_net(b_states), b_returns)
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # Actor step: clipped surrogate objective.
                mu,std = self.actor_net(b_states)
                pi = self.actor_net.distribution(mu,std)
                new_prob = pi.log_prob(b_actions).sum(1,keepdim=True)
                ratio = torch.exp(new_prob - old_log_prob[b_index])

                surrogate_loss = ratio*b_advants
                clipped_loss = torch.clamp(ratio,1.0-epsilon,1.0+epsilon)*b_advants
                actor_loss = -torch.min(surrogate_loss,clipped_loss).mean()

                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

    def kl_divergence(self,old_mu,old_sigma,mu,sigma):
        """Return KL(old || new) between diagonal Gaussians, summed over dim 1.

        The old distribution's parameters are detached, so gradients flow only
        through `mu` and `sigma`.
        """
        old_mu = old_mu.detach()
        old_sigma = old_sigma.detach()

        kl = torch.log(old_sigma) - torch.log(sigma) + (old_sigma.pow(2) + (old_mu - mu).pow(2)) / \
             (2.0 * sigma.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    def get_gae(self,rewards, masks, values):
        """Compute discounted returns and normalised GAE advantages.

        Args:
            rewards: Per-step rewards, length N.
            masks: Per-step masks, 0 where an episode ended and 1 otherwise.
            values: Critic estimates for the N states, shape (N,) or (N, 1).

        Returns:
            `(returns, advants)`, both tensors of shape (N,).
        """
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        masks = torch.as_tensor(masks, dtype=torch.float32)
        values = values.detach().reshape(-1)
        returns = torch.zeros_like(rewards)
        advants = torch.zeros_like(rewards)
        running_returns = 0
        previous_value = 0
        running_advants = 0

        # Accumulate backwards in time; a zero mask cuts the recursion at an
        # episode boundary.
        for t in reversed(range(len(rewards))):
            running_returns = rewards[t] + gamma * running_returns * masks[t]
            running_tderror = rewards[t] + gamma * previous_value * masks[t] - values[t]
            running_advants = running_tderror + gamma * lambd * running_advants * masks[t]

            returns[t] = running_returns
            previous_value = values[t]
            advants[t] = running_advants

        advants = advants - advants.mean()
        if advants.numel() > 1:
            # std() is NaN for a single sample and may be 0 for constant
            # advantages; guard both so the actor never trains on NaN/inf.
            advants = advants / (advants.std() + 1e-8)
        return returns, advants



# Seed torch and numpy for this cell's experiments. This re-seeds numpy's
# global RNG, replacing the np.random.seed(42) call made in an earlier cell.
torch.manual_seed(500)
np.random.seed(500)

class Nomalize:
    """Running observation normaliser.

    Each call folds the new sample into a running mean and sum of squared
    deviations, then returns the sample standardised with the updated
    statistics and clipped to [-5, 5].
    """

    def __init__(self, N_S):
        self.mean = np.zeros((N_S,))
        self.std = np.zeros((N_S, ))
        self.stdd = np.zeros((N_S, ))  # Running sum of squared deviations.
        self.n = 0

    def __call__(self, x):
        sample = np.asarray(x)
        self.n += 1

        if self.n == 1:
            # First sample: it becomes the mean, and std mirrors the mean.
            self.mean = sample
            self.std = self.mean
        else:
            previous_mean = self.mean.copy()
            self.mean = previous_mean + (sample - previous_mean) / self.n
            self.stdd = self.stdd + (sample - previous_mean) * (sample - self.mean)
            self.std = np.sqrt(self.stdd / (self.n - 1))

        standardised = (sample - self.mean) / (self.std + 1e-8)
        return np.clip(standardised, -5, +5)


def train3(env:StretchPushCubeTraining, max_episodes,
max_steps_per_episode):
    """Train a `Ppo` agent on `env`, updating once after every episode.

    Observations pass through a running `Nomalize` instance before reaching
    the policy. After each episode the captured frames are displayed as a video.

    Args:
        env: Environment providing reset/step/reward/render/check_is_done.
        max_episodes: Number of episodes to run.
        max_steps_per_episode: Step limit for each episode.
    """
    # References https://github.com/qingshi9974/PPO-pytorch-Mujoco

    ppo = Ppo(env.state_size,env.action_size)
    nomalize = Nomalize(env.state_size)

    scores = []
    for episode in range(max_episodes):
        # PPO is on-policy, so start every episode with an empty buffer. A
        # buffer shared across episodes keeps retraining on transitions from
        # outdated policies and grows without bound.
        memory = deque()
        s = nomalize(env.reset())
        score = 0
        for step in range(max_steps_per_episode):
            state_tensor = torch.from_numpy(np.array(s).astype(np.float32)).unsqueeze(0)
            a = ppo.actor_net.choose_action(state_tensor)[0]

            next_state = env.step(a)
            # env.reward() is already the negative distance to the target.
            # Using it as-is makes "maximise reward" mean "approach the
            # target"; negating it trained the agent to move away.
            reward = env.reward()

            env.render()

            done = env.check_is_done()

            s_ = nomalize(next_state)

            # Mask is 0 on the terminal transition so returns do not bootstrap past it.
            mask = 0 if done else 1
            memory.append([s,a,reward,mask])

            score += reward
            s = s_
            if done:
                break

        clear_output(wait=True)

        print(f"Episode {episode}: Training complete. Reward: {score}. Last reward{scores[-1] if scores else 'n/a'}. Steps: {step}. Distance of object: {env.current_distance_to_target}")

        display(display_video(env.frames))

        scores.append(score)

        ppo.train(memory)







# Do Training

In [None]:
import importlib.resources

# Locate the scene description shipped inside the stretch_mujoco package.
models_path = str(importlib.resources.files("stretch_mujoco") / "models")
xml_path = f"{models_path}/scene.xml"
physics = mujoco.Physics.from_xml_path(xml_path)

# Drive the actuators with the "home" keyframe controls and step the
# simulation 400 times before taking a preview render.
home_keyframe = physics.model.keyframe("home")
physics.data.ctrl = home_keyframe.ctrl
for x in range(400):
    physics.step()

pixels = physics.render()
# Last expression of the cell: the notebook displays the rendered image.
PIL.Image.fromarray(pixels)

In [None]:
# Show the loaded model's timestep and gravity options. The timestep is used
# by a later cell to convert seconds of simulation into a step count.
print('timestep', physics.model.opt.timestep)
print('gravity', physics.model.opt.gravity)

In [None]:
# Per-episode step budget, derived from a duration in simulated seconds and
# the model timestep.
seconds_of_sim_per_epoch = 5
# NOTE(review): the trailing "/ 2" halves the budget, so this is the step count
# for 2.5 s (not 5 s) of simulation; confirm this is intended.
max_steps_per_episode=int(seconds_of_sim_per_epoch * (1/ physics.model.opt.timestep) / 2)

# Build the training environment around the physics instance loaded earlier.
env = StretchPushCubeTraining(physics)


In [None]:
# Run the train3 loop for 100 episodes with the step budget computed in the previous cell.
train3(env, max_episodes=100, max_steps_per_episode=max_steps_per_episode)

In [None]:
# agent = PPO(state_size=env.state_size, action_size=env.action_size)

# train(env, agent, steps_per_update=max_steps_per_episode)