<a href="https://colab.research.google.com/github/joyparikh/ELEN6885_Project/blob/main/dqn_tf-agents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
# Mount Google Drive at /content/gdrive. `google.colab` is only available
# inside a Google Colab runtime, so this cell must be run within Colab.
# Later cells import code from, and save policies to, paths under this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [35]:
import sys

# Directory on the mounted Drive that is added to the module search path.
# NOTE(review): presumably this is where the `gym_Rubiks_Cube` package lives --
# confirm.
# Previous location: sys.path.append('/content/gdrive/My Drive/RL/.')
RUBIKS_DIR = '/content/gdrive/My Drive/Rubiks'

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if RUBIKS_DIR not in sys.path:
  sys.path.append(RUBIKS_DIR)

In [36]:
# !sudo apt-get update
# !sudo apt-get install -y xvfb ffmpeg freeglut3-dev
# !pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
!pip install tf-agents[reverb]
# !pip install pyglet



In [37]:
# from __future__ import absolute_import, division, print_function

import base64
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
# import pyvirtualdisplay
import reverb
import os

In [38]:
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment 
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import PolicySaver
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

In [39]:
# --- Training length ---------------------------------------------------------
num_iterations = 24000  # @param {type:"integer"}

# --- Experience collection and replay ----------------------------------------
initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration = 50  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

# --- Optimisation ------------------------------------------------------------
batch_size = 64  # @param {type:"integer"}

# Piecewise-constant learning rate: `values` holds one rate per interval
# delimited by `boundaries` (expressed in optimizer steps).
# Former constant setting: learning_rate = 1e-3
boundaries = [50000, 100000]
values = [1e-3, (1e-3) / 3, 1e-4]
learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=boundaries, values=values)

# --- Logging and evaluation cadence (in train steps) -------------------------
log_interval = 500  # @param {type:"integer"}
num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 2000  # @param {type:"integer"}

# Passed to the DQN agent as `gamma`.
discount = 0.98

In [40]:
import gym.spaces
# NOTE(review): presumably importing gym_Rubiks_Cube registers "RubiksCube-v0"
# with gym as a side effect, and the package is found through the Drive
# directory appended to sys.path earlier -- confirm both.
import gym_Rubiks_Cube
import gym

env_name = "RubiksCube-v0"
# Build the environment once directly through gym. The result is discarded, so
# this only verifies that the id can be constructed.
gym.make(env_name)

# Three independent Python environments loaded through the TF-Agents gym suite.
# `env` is the one stepped by the data-collection drivers.
env = suite_gym.load(env_name)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# TensorFlow wrappers around the training and evaluation environments.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [41]:
# Widths of the hidden Dense layers, in the order they are stacked.
fc_layer_params = (1000, 1000, 1000, 300)

# Number of discrete actions, derived from the action spec bounds
# (maximum - minimum + 1).
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = 1 + action_tensor_spec.maximum - action_tensor_spec.minimum

def dense_layer(num_units):
  """Build one hidden layer of the Q-network.

  Args:
    num_units: number of units of the Dense layer.

  Returns:
    A `tf.keras.layers.Dense` layer with ReLU activation whose kernel is
    initialised by fan-in variance scaling (scale 0.002) from a truncated
    normal distribution.
  """
  kernel_init = tf.keras.initializers.VarianceScaling(
      scale=0.002, mode='fan_in', distribution='truncated_normal')
  return tf.keras.layers.Dense(
      units=num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=kernel_init)

# Q-network = the hidden Dense stack followed by a linear output layer with
# `num_actions` units, i.e. one Q-value per available action.
dense_layers = list(map(dense_layer, fc_layer_params))

# The output layer has no activation and starts from very small uniform
# weights and a small negative constant bias.
q_values_layer = tf.keras.layers.Dense(
    units=num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.00003, maxval=0.00003),
    bias_initializer=tf.keras.initializers.Constant(-0.00002))

q_net = sequential.Sequential([*dense_layers, q_values_layer])

In [42]:
# Adam driven by the learning-rate schedule from the hyperparameter cell.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Handed to the agent as its train-step counter; the training loop reads it
# back through `agent.train_step_counter`.
train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=discount,
    train_step_counter=train_step_counter)
agent.initialize()

# The agent's main policy (used for evaluation), its collect policy (used to
# gather experience) and a uniformly random policy (used to seed the buffer).
eval_policy = agent.policy
collect_policy = agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec())


In [43]:
table_name = 'uniform_table'

# Table signature: the agent's collect data spec with one extra outer
# dimension added.
replay_buffer_signature = tensor_spec.add_outer_dim(
    tensor_spec.from_spec(agent.collect_data_spec))

# Uniform sampling, FIFO removal, capacity `replay_buffer_max_length`, and a
# rate limiter that only requires a single stored item.
table = reverb.Table(
    table_name,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    max_size=replay_buffer_max_length,
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature,
)

# In-process Reverb server hosting the table.
reverb_server = reverb.Server([table])

# Replay buffer view over that server; items are sequences of length 2.
replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    sequence_length=2,
    table_name=table_name,
    local_server=reverb_server,
)

# Observer that the drivers call to write length-2 trajectories to the table.
rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
    replay_buffer.py_client,
    table_name=table_name,
    sequence_length=2,
)

# Seed the replay buffer with `initial_collect_steps` steps of random-policy
# experience. The starting time step has to come from the same environment the
# driver steps (`env`): starting from a reset of a different environment would
# pair an observation of one cube with a transition taken on another.
py_driver.PyDriver(
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
      random_policy, use_tf_function=True),
    [rb_observer],
    max_steps=initial_collect_steps).run(env.reset())

# Each element drawn from the dataset is a batch of 2-step trajectories,
# i.e. tensors shaped [B x 2 x ...].
dataset = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3,
).prefetch(3)

# The training loop pulls batches from this iterator with next().
iterator = iter(dataset)
print(iterator)

<tensorflow.python.data.ops.iterator_ops.OwnedIterator object at 0x7fbc3bfec610>


In [44]:
def two_move(policy):
  """Report which forced first moves `policy` recovers from.

  A fresh environment is loaded and configured with `setScramble(1, 1, False)`
  and `setStepMax(4)`. For each of the 12 action indices the environment is
  reset, that action is applied as the first step, and `policy` then acts
  until the episode ends. Every (first move, chosen action) pair is printed.

  NOTE(review): the meaning of the setScramble / setStepMax arguments is
  defined by the gym_Rubiks_Cube environment and is not visible here.

  Args:
    policy: object whose `action(time_step)` returns a step exposing `.action`.

  Returns:
    List of first-move indices for which the reward accumulated after the
    first move reached at least 1.
  """
  py_env = suite_gym.load(env_name)
  py_env.setScramble(1, 1, False)
  py_env.setStepMax(4)
  tf_env = tf_py_environment.TFPyEnvironment(py_env)

  recovered = []
  for first_move in range(12):
    tf_env.reset()
    time_step = tf_env.step(first_move)

    # The reward returned by the forced first move is not counted.
    reward_sum = 0.0
    while not time_step.is_last():
      chosen = policy.action(time_step)
      print(first_move, chosen.action)
      time_step = tf_env.step(chosen.action)
      reward_sum += time_step.reward

    if reward_sum >= 1:
      recovered.append(first_move)

  return recovered

def compute_avg_return(environment, policy, num_episodes=10):
  """Average the undiscounted episode return of `policy` on `environment`.

  Args:
    environment: object with `reset()` and `step(action)` that return time
      steps exposing `is_last()` and `reward` (e.g. a TFPyEnvironment).
    policy: object whose `action(time_step)` returns a step exposing `.action`.
    num_episodes: number of episodes to run; must be at least 1.

  Returns:
    The first element of the averaged return after `.numpy()` conversion. If
    no episode took a single step (nothing was accumulated), the plain float
    average 0.0 is returned.

  Raises:
    ValueError: if `num_episodes` is smaller than 1.
  """
  # Fail early with a clear message instead of a ZeroDivisionError (0) or an
  # AttributeError on a float (negative values) further down.
  if num_episodes < 1:
    raise ValueError(
        'num_episodes must be a positive integer, got {!r}'.format(num_episodes))

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  if isinstance(avg_return, float):
    # Every episode was already finished at reset, so no reward object was
    # ever added and the accumulator is still a plain Python float.
    return avg_return
  return avg_return.numpy()[0]

In [45]:
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

returns = {i:[0,0] for i in range(9)}

for i in range(1, 9):
  # Reset the environment.
  time_step = train_py_env.reset()

  env = suite_gym.load(env_name)
  env.setScramble(1, i)
  env.setStepMax(3*i + 1)
  eval_env = tf_py_environment.TFPyEnvironment(env)

  # Create a driver to collect experience.
  collect_driver = py_driver.PyDriver(
      env,
      py_tf_eager_policy.PyTFEagerPolicy(
        agent.collect_policy, use_tf_function=True),
      [rb_observer],
      max_steps=collect_steps_per_iteration)

  while returns[i][-2] < 0.93 or returns[i][-1] < 0.93:

    # Collect a few steps and save to the replay buffer.
    time_step, _ = collect_driver.run(time_step)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
      print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
      avg_return = compute_avg_return(eval_env, agent.policy, i*100)
      print('scramble = {0}: step = {1}: Average Return = {2}'.format(i, step, avg_return))
      returns[i].append(avg_return)

      if i == 1:
        solutions = two_move(agent.policy)
        print('step = {0}: Solutions = {1}'.format(step, solutions))
      
      if returns[i][-2] > 0.93 and avg_return > 0.93:
        saver = PolicySaver(agent.collect_policy, batch_size=None)
        saver.save(os.path.join('/content/gdrive/My Drive/Rubiks',f'policy_{i}'))
        break

      

step = 500: loss = 0.06076683849096298
step = 1000: loss = 0.03781650960445404
step = 1500: loss = 0.05192701518535614
step = 2000: loss = 0.04798615723848343
scramble = 1: step = 2000: Average Return = 0.44999998807907104
0 tf.Tensor([0], shape=(1,), dtype=int64)
0 tf.Tensor([10], shape=(1,), dtype=int64)
0 tf.Tensor([10], shape=(1,), dtype=int64)
0 tf.Tensor([10], shape=(1,), dtype=int64)
1 tf.Tensor([10], shape=(1,), dtype=int64)
1 tf.Tensor([4], shape=(1,), dtype=int64)
1 tf.Tensor([10], shape=(1,), dtype=int64)
1 tf.Tensor([4], shape=(1,), dtype=int64)
2 tf.Tensor([0], shape=(1,), dtype=int64)
2 tf.Tensor([0], shape=(1,), dtype=int64)
2 tf.Tensor([0], shape=(1,), dtype=int64)
2 tf.Tensor([0], shape=(1,), dtype=int64)
3 tf.Tensor([9], shape=(1,), dtype=int64)
4 tf.Tensor([4], shape=(1,), dtype=int64)
4 tf.Tensor([4], shape=(1,), dtype=int64)
4 tf.Tensor([4], shape=(1,), dtype=int64)
5 tf.Tensor([0], shape=(1,), dtype=int64)
5 tf.Tensor([0], shape=(1,), dtype=int64)
5 tf.Tensor([0],



7 tf.Tensor([7], shape=(1,), dtype=int64)
7 tf.Tensor([1], shape=(1,), dtype=int64)
8 tf.Tensor([2], shape=(1,), dtype=int64)
9 tf.Tensor([3], shape=(1,), dtype=int64)
10 tf.Tensor([4], shape=(1,), dtype=int64)
11 tf.Tensor([5], shape=(1,), dtype=int64)
step = 8000: Solutions = [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11]




INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_1/assets


INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_1/assets


step = 8500: loss = 0.010713191702961922
step = 9000: loss = 0.009588990360498428
step = 9500: loss = 0.0162886381149292
step = 10000: loss = 0.027351953089237213
scramble = 2: step = 10000: Average Return = 0.824999988079071
step = 10500: loss = 0.027921758592128754
step = 11000: loss = 0.02967274934053421
step = 11500: loss = 0.034893862903118134
step = 12000: loss = 0.03953687846660614
scramble = 2: step = 12000: Average Return = 0.8349999785423279
step = 12500: loss = 0.02025412768125534
step = 13000: loss = 0.016409965232014656
step = 13500: loss = 0.033575497567653656
step = 14000: loss = 0.041828546673059464
scramble = 2: step = 14000: Average Return = 0.875
step = 14500: loss = 0.0159151554107666
step = 15000: loss = 0.007495248690247536
step = 15500: loss = 0.010179983451962471
step = 16000: loss = 0.02602536603808403
scramble = 2: step = 16000: Average Return = 0.9399999976158142
step = 16500: loss = 0.036865442991256714
step = 17000: loss = 0.013967138715088367
step = 17500:



scramble = 2: step = 18000: Average Return = 0.9399999976158142




INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_2/assets


INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_2/assets


step = 18500: loss = 0.011672209948301315
step = 19000: loss = 0.017299098894000053
step = 19500: loss = 0.020323317497968674
step = 20000: loss = 0.03703823685646057
scramble = 3: step = 20000: Average Return = 0.4566666781902313
step = 20500: loss = 0.021304139867424965
step = 21000: loss = 0.013763321563601494
step = 21500: loss = 0.015739407390356064
step = 22000: loss = 0.027103040367364883
scramble = 3: step = 22000: Average Return = 0.8799999952316284
step = 22500: loss = 0.016139136627316475
step = 23000: loss = 0.012004636228084564
step = 23500: loss = 0.019659975543618202
step = 24000: loss = 0.02396976947784424
scramble = 3: step = 24000: Average Return = 0.9333333373069763
step = 24500: loss = 0.02618854120373726
step = 25000: loss = 0.03434820473194122
step = 25500: loss = 0.012148233130574226
step = 26000: loss = 0.011418061330914497
scramble = 3: step = 26000: Average Return = 0.8666666746139526
step = 26500: loss = 0.02549433335661888
step = 27000: loss = 0.006964586675



scramble = 3: step = 30000: Average Return = 0.95333331823349




INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_3/assets


INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_3/assets


step = 30500: loss = 0.03865417093038559
step = 31000: loss = 0.010133224539458752
step = 31500: loss = 0.008871367201209068
step = 32000: loss = 0.0345495231449604
scramble = 4: step = 32000: Average Return = 0.8224999904632568
step = 32500: loss = 0.01809445396065712
step = 33000: loss = 0.034994952380657196
step = 33500: loss = 0.018082264810800552
step = 34000: loss = 0.0180258397012949
scramble = 4: step = 34000: Average Return = 0.8700000047683716
step = 34500: loss = 0.04020567238330841
step = 35000: loss = 0.02343600243330002
step = 35500: loss = 0.020180966705083847
step = 36000: loss = 0.020971914753317833
scramble = 4: step = 36000: Average Return = 0.875
step = 36500: loss = 0.013558254577219486
step = 37000: loss = 0.01824658177793026
step = 37500: loss = 0.03399565443396568
step = 38000: loss = 0.02891034260392189
scramble = 4: step = 38000: Average Return = 0.8274999856948853
step = 38500: loss = 0.013666462153196335
step = 39000: loss = 0.01227203942835331
step = 39500:



scramble = 4: step = 62000: Average Return = 0.9350000023841858




INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_4/assets


INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Rubiks/policy_4/assets


step = 62500: loss = 0.006577222608029842
step = 63000: loss = 0.011899495497345924
step = 63500: loss = 0.004915254656225443
step = 64000: loss = 0.013994383625686169
scramble = 5: step = 64000: Average Return = 0.8360000252723694
step = 64500: loss = 0.011863237246870995
step = 65000: loss = 0.007667264435440302
step = 65500: loss = 0.016197752207517624
step = 66000: loss = 0.010943952947854996
scramble = 5: step = 66000: Average Return = 0.8759999871253967
step = 66500: loss = 0.010921532288193703
step = 67000: loss = 0.018347319215536118
step = 67500: loss = 0.004422393627464771
step = 68000: loss = 0.007972448132932186
scramble = 5: step = 68000: Average Return = 0.8700000047683716
step = 68500: loss = 0.015141978859901428
step = 69000: loss = 0.020199043676257133
step = 69500: loss = 0.018161466345191002
step = 70000: loss = 0.010683323256671429
scramble = 5: step = 70000: Average Return = 0.8679999709129333
step = 70500: loss = 0.008037020452320576
step = 71000: loss = 0.0284623

KeyboardInterrupt: ignored

Attempted approaches:

- Naive DQN
- DQN with increasing scramble number + parameter grid search
- DQN with increasing scramble number and max episode length

In [None]:
# Reload the policy that the training loop exported as `policy_1` from its
# SavedModel directory on Drive.
policy_1_dir = os.path.join('/content/gdrive/My Drive/Rubiks', 'policy_1')
saved_policy = tf.compat.v2.saved_model.load(policy_1_dir)