In [1]:
import distutils.util
import subprocess
if subprocess.run('nvidia-smi').returncode:
    raise RuntimeError(
      'Cannot communicate with GPU. '
      'Make sure you are using a GPU Colab runtime. '
      'Go to the Runtime menu and select Choose runtime type.')

print('Installing dm_control...')
#!pip install -q dm_control>=1.0.8

# Configure dm_control to use the EGL rendering backend (requires GPU)
%env MUJOCO_GL=egl

print('Checking that the dm_control installation succeeded...')
try:
    from dm_control import suite
    env = suite.load('cartpole', 'swingup')
    pixels = env.physics.render()
except Exception as e:
    raise e from RuntimeError(
      'Something went wrong during installation. Check the shell output above '
      'for more information.\n'
      'If using a hosted Colab runtime, make sure you enable GPU acceleration '
      'by going to the Runtime menu and selecting "Choose runtime type".')
else:
    del pixels, suite

!echo Installed dm_control $(pip show dm_control | grep -Po "(?<=Version: ).+")

# %pip -q install git+https://github.com/deepmind/acme.git#egg=dm-acme[jax,tf,envs]
# %pip -q install imageio-ffmpeg
# %pip -q install gdown

IMG_HEIGHT = 256
IMG_WIDTH = 256
# Removed unnecessary generated file
! rm -r "=1.0.8"

import os
import random
import math
import pickle
import numpy as np
import tree
# plot the activations
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
import imageio
from PIL import Image
import io

from sklearn.decomposition import PCA


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from torch.optim import Adam

#Run to install MuJoCo and `dm_control`
# import this first to resolve the issue.
import sys
sys.path.insert(1, '../source/')
from acme import wrappers
from model import *
from utils import *
# Soft-Actor-Critic Model
from sac import *
from replay_memory import *

# try out the wrappers
from acme import wrappers
from dm_control import suite

Sun Feb 12 11:11:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.39.01    Driver Version: 510.39.01    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:18:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#@title Environment wrappers
from dm_env import specs


# environment wrappers
class NormilizeActionSpecWrapper(wrappers.EnvironmentWrapper):
    """Present a [-1, 1] action spec and map such actions onto the wrapped bounds.

    Actions passed to `step` are interpreted as lying in [-1, 1] per dimension
    and are affinely rescaled to the wrapped environment's [minimum, maximum]
    before being forwarded.
    """

    def __init__(self, environment):
        super().__init__(environment)

        native_spec = environment.action_spec()
        lower = native_spec.minimum

        # Parameters of the affine map: native = unit * scale + offset,
        # where `unit` is the action expressed in [0, 1].
        self._offset = lower
        self._scale = native_spec.maximum - lower

        # Multiplying the native bound by zero yields a zero array of the
        # same shape, from which the new -1 / +1 bounds are built.
        zeros = lower * 0
        unit_minimum = zeros - 1.
        unit_maximum = zeros + 1.
        self._action_spec = specs.BoundedArray(
            native_spec.shape,
            native_spec.dtype,
            unit_minimum,
            unit_maximum,
            name=native_spec.name)

    def _from_normal_actions(self, actions):
        """Map actions from [-1, 1] to the wrapped environment's native range."""
        # First shift/scale into [0, 1] ...
        unit_interval = (actions + 1.0) * 0.5
        # ... then stretch to [minimum, maximum].
        return self._offset + unit_interval * self._scale

    def step(self, action):
        """Rescale `action` to native bounds and step the wrapped environment."""
        return self._environment.step(self._from_normal_actions(action))

    def action_spec(self):
        """Return the [-1, 1] bounded spec built in `__init__`."""
        return self._action_spec


class MujocoActionNormalizer(wrappers.EnvironmentWrapper):
    """Squash actions into [-1, 1] before they reach the wrapped environment.

    Intended for control environments whose actions are bounded in [-1, 1]:
    the actor network may emit unbounded values and this adaptor brings them
    into range. `rescale` selects how:

      * 'tanh' applies `np.tanh` to every leaf of the action structure.
      * 'clip' applies `np.clip(a, -1., 1.)` to every leaf.

    Any other value makes `step` raise `ValueError`.
    """

    def __init__(self, environment, rescale='clip'):
        super().__init__(environment)
        self._rescale = rescale

    def step(self, action):
        """Squash `action` into [-1, 1], then step the wrapped environment."""
        mode = self._rescale
        if mode == 'tanh':
            squash = np.tanh
        elif mode == 'clip':
            def squash(a):
                return np.clip(a, -1., 1.)
        else:
            raise ValueError('Unrecognized scaling option: %s' % mode)

        # `action` may be a nested structure; apply the squashing leaf-wise.
        bounded_action = tree.map_structure(squash, action)
        return self._environment.step(bounded_action)

In [8]:
# Build the walker/walk task from the control suite.
env = suite.load("walker", "walk")
# Stack the wrappers. Calls to `env.step` pass through the outermost wrapper
# first: actions are clipped to [-1, 1] by MujocoActionNormalizer, then
# rescaled to the task's native bounds by NormilizeActionSpecWrapper.
env = NormilizeActionSpecWrapper(environment=env)
env = MujocoActionNormalizer(env, rescale='clip')
env = wrappers.SinglePrecisionWrapper(env)


class Args:
    """Plain attribute bag of settings passed to `SAC` when building the agent.

    NOTE(review): `SAC` is defined outside this notebook, so how each field is
    consumed is not visible here; the grouping below follows the field names
    only. Confirm against the `sac` module.
    """

    # --- run / mode ---
    env_name = 'whatever'
    policy = 'Gaussian'
    eval = True
    seed = 42
    cuda = True

    # --- learning hyperparameters ---
    gamma = 0.99
    tau = 0.005
    lr = 3e-4
    alpha = 0.2
    automatic_entropy_tuning = True

    # --- network / batch sizes ---
    batch_size = 512
    hidden_size = 1024

    # --- schedule and replay buffer ---
    num_steps = 1_000_000
    start_steps = 10_000
    updates_per_step = 1
    target_update_interval = 1
    replay_size = 1_000_000


args = Args()

# Show the observation spec, then flatten it into a list of per-entry specs.
print(env.observation_spec())
flat_obs = tree.flatten(env.observation_spec())
print(flat_obs[1:])

# Observation size = sum of first-axis lengths; an entry with an empty shape
# (a scalar) contributes one element.
obs_dim = sum(
    spec.shape[0] if len(spec.shape) > 0 else 1
    for spec in flat_obs
)

# Build the Soft-Actor-Critic agent and restore its weights.
agent = SAC(obs_dim, env.action_spec(), args)
# Checkpoint to load - UPLOAD YOUR FILE HERE!
model_path = 'sac_checkpoint_walker_walk_batch512_hidden1024_1123_500'
agent.load_checkpoint(model_path, evaluate=True)

# The policy network is the model whose activations get recorded.
model = agent.policy
hook_dict = init_hook_dict(model)

# Attach a recording forward hook to every Linear layer of the policy.
for layer_name, layer in model.named_modules():
    if not isinstance(layer, torch.nn.Linear):
        continue
    recorder = recordtodict_hook(name=layer_name, hook_dict=hook_dict)
    layer.register_forward_hook(recorder)

OrderedDict([('orientations', Array(shape=(14,), dtype=dtype('float32'), name='orientations')), ('height', Array(shape=(), dtype=dtype('float32'), name='height')), ('velocity', Array(shape=(9,), dtype=dtype('float32'), name='velocity'))])
[Array(shape=(14,), dtype=dtype('float32'), name='orientations'), Array(shape=(9,), dtype=dtype('float32'), name='velocity')]
Loading models from sac_checkpoint_walker_walk_batch512_hidden1024_1123_500


In [9]:
# Display the flattened observation-spec list built in the previous cell.
# Note the flattened order ('height' first) differs from the OrderedDict order
# printed there.
flat_obs

[Array(shape=(), dtype=dtype('float32'), name='height'),
 Array(shape=(14,), dtype=dtype('float32'), name='orientations'),
 Array(shape=(9,), dtype=dtype('float32'), name='velocity')]

In [10]:
# Run a few episodes with the loaded agent to collect activations
# (presumably recorded through the forward hooks registered on the policy).
# Per-step rewards, states and actions are accumulated across all episodes.
num_episodes_to_run = 42

rewards = []
states = []
actions = []
for i in range(num_episodes_to_run):
    time_step = env.reset()
    episode_reward = 0

    while not time_step.last():  # or env.get_termination()
        # Build a single 1-D state vector from the observation structure.
        # `np.ravel` turns scalar entries (shape ()) into length-1 arrays and
        # leaves 1-D entries as they are, so no entry position is hard-coded
        # and no entry can be truncated.
        flat_obs = [np.ravel(part) for part in tree.flatten(time_step.observation)]
        state = np.concatenate(flat_obs)

        # sample an action and advance the environment
        action = agent.select_action(state)
        time_step = env.step(action)

        # record reward
        time_step_reward = time_step.reward
        rewards.append(time_step_reward)
        episode_reward += time_step_reward

        # record states and actions
        states.append(state)
        actions.append(action)

    # Report the return of every 10th episode (1, 11, 21, ...).
    if i % 10 == 0:
        print('Episode: {} Reward: {}'.format(i+1, episode_reward))

Episode: 1 Reward: 956.8336200937629
Episode: 11 Reward: 932.5251256353222
Episode: 21 Reward: 953.6426235754043
Episode: 31 Reward: 982.009455576539
Episode: 41 Reward: 939.4821482943371
