In [52]:
"""
Implements BitFlip env from Andrychowicz et al. '17
(https://arxiv.org/abs/1707.01495)
"""

import gym
import numpy as np

from gym import spaces
from gym.utils import seeding


class BitFlip(gym.GoalEnv):
    """
    Simple bit-flipping environment for goal-conditioned reinforcement
    learning.

    The agent must turn the current bit string into a goal bit string. In the
    default mode (select_bit=True) the reward is sparse: -1 while the goal is
    not reached and 0 once it is.

    The environment follows the toy motivating example presented in
    Andrychowicz et al. '17, where at each step the agent chooses which bit
    of the string to flip. The episode terminates when either the goal state
    has been reached, or num_bits steps have been taken.

    An alternate version of this environment (select_bit=False) has a
    two-action space: at step i the agent can only decide whether or not to
    flip the ith bit, where i is the step count in the episode.

    See Section 3.1 from Andrychowicz et al. for the full description.
    """

    def __init__(self, num_bits=10, select_bit=True, succ_bonus=11):
        """
        Initializes Bit Flipping environment instance
        Args:
          num_bits: length of 1D binary string for state and goal
          select_bit: if True the action is the index of the bit to flip
            (Discrete(num_bits)); if False the action is 0/1 and decides
            whether the bit at the current step index is flipped
            (Discrete(2))
          succ_bonus: constant added to every reward when select_bit is
            False (see compute_reward); unused when select_bit is True
        """
        self.num_bits = num_bits
        self.select_bit = select_bit
        self.succ_bonus = succ_bonus
        self.observation_space = spaces.Dict({
            "observation": spaces.MultiBinary(num_bits),
            "desired_goal": spaces.MultiBinary(num_bits),
            "achieved_goal": spaces.MultiBinary(num_bits)})
        if select_bit:
            self.action_space = spaces.Discrete(num_bits)
        else:
            self.action_space = spaces.Discrete(2)
        self.state, self.goal, self.current_step = None, None, None
        self.done = False
        self.seed()
        self.reset()

    def seed(self, seed=None):
        """Seeds the environment's random generator and returns [seed]."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _get_obs(self):
        """
        Builds the observation dict from the current state and goal.

        Copies are returned because step() flips bits of self.state in place;
        handing out the internal arrays would let previously returned
        observations change retroactively.
        """
        return {'observation': self.state.copy(),
                'desired_goal': self.goal.copy(),
                'achieved_goal': self.state.copy()}

    def reset(self):
        """Samples a fresh random state and goal and returns the observation."""
        self.done = False
        self.current_step = 0
        self.state = self.np_random.randint(2, size=self.num_bits)
        self.goal = self.np_random.randint(2, size=self.num_bits)
        return self._get_obs()

    def _is_success(self, achieved_goal, desired_goal):
        """Returns True when every bit of achieved_goal matches desired_goal."""
        return np.all(achieved_goal == desired_goal)

    def step(self, action):
        """
        Applies one action and returns (obs, reward, done, info).

        If the episode has already ended, a message is printed and the state
        is left untouched. info carries 'is_success' and 'current_step' (the
        index of the step that was just taken).
        """
        assert action in self.action_space, 'action {} invalid'.format(action)
        if self.done:
            print('Episode has ended, need to call reset()')
        else:
            select_idx = None
            if self.select_bit:
                select_idx = action
            elif action:
                # Two-action mode: action 1 flips the bit at the current step
                # index, action 0 leaves the string unchanged.
                select_idx = self.current_step
            if select_idx is not None:
                self.state[select_idx] = 1 - self.state[select_idx]
            self.current_step += 1
        obs = self._get_obs()
        is_success = bool(
            self._is_success(obs['achieved_goal'], obs['desired_goal']))
        info = {
            'is_success': is_success,
            'current_step': self.current_step - 1
        }
        reward = self.compute_reward(
            obs['achieved_goal'], obs['desired_goal'], info)
        # Termination is decided from the success flag, not from
        # `reward == 0`: with select_bit=False the reward is shifted by
        # succ_bonus, so a reward of 0 does not coincide with success there.
        self.done = done = is_success or (self.current_step >= self.num_bits)
        return obs, reward, done, info

    def compute_reward(self, achieved_goal, desired_goal, info):
        """
        Returns the reward for having achieved_goal while aiming at
        desired_goal.

        With select_bit=True this is 0 on success and -1 otherwise. With
        select_bit=False, succ_bonus is added to both values (succ_bonus on
        success, succ_bonus - 1 otherwise). info is accepted for GoalEnv
        compatibility and is not used.
        """
        reward = int(self._is_success(achieved_goal, desired_goal)) - 1
        if not self.select_bit:
            reward += self.succ_bonus
        return reward


In [53]:
from stable_baselines import HER, DQN

import gym
import os
import os.path as osp
import time

# Each run logs to its own timestamped directory:
# ./experiments/HER_BitFlip/<YYYY-mm-dd_HH-MM-SS>
exp_root = './experiments'
hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
exp_name = 'HER_BitFlip'

exp_dir = osp.join(exp_root, exp_name, hms_time)
# exist_ok=True: re-running this cell within the same second yields the same
# directory name and must not fail with FileExistsError.
os.makedirs(exp_dir, exist_ok=True)

# Second positional argument is select_bit; False selects the two-action
# variant where the agent decides whether to flip the bit at the current step.
env = BitFlip(10, False)

model = HER('MlpPolicy', env, DQN, n_sampled_goal=4,
            tensorboard_log=exp_dir,
            goal_selection_strategy='episode',
            verbose=1)

model.learn(int(3e4))

--------------------------------------
| % time spent exploring  | 67       |
| episodes                | 100      |
| mean 100 episode reward | 100      |
| steps                   | 990      |
| success rate            | 0.0101   |
--------------------------------------
--------------------------------------
| % time spent exploring  | 34       |
| episodes                | 200      |
| mean 100 episode reward | 100      |
| steps                   | 1990     |
| success rate            | 0        |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 300      |
| mean 100 episode reward | 100      |
| steps                   | 2990     |
| success rate            | 0        |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 400      |
| mean 100 episode reward | 100      |
| steps                  

<stable_baselines.deepq.dqn.DQN at 0x7f59eb9aa8d0>

In [5]:
from stable_baselines import HER, DQN, SAC, DDPG, TD3
from stable_baselines.her import GoalSelectionStrategy, HERGoalEnvWrapper
from stable_baselines.common.bit_flipping_env import BitFlippingEnv

N_BITS = 10

# DQN is used here; SAC, DDPG and TD3 can be substituted
model_class = DQN

# The continuous flag is switched on only when one of DDPG/SAC/TD3 is chosen
env = BitFlippingEnv(
    N_BITS,
    continuous=model_class in (DDPG, SAC, TD3),
    max_steps=N_BITS,
)

# Goal selection strategies listed in the paper: future, final, episode, random
goal_selection_strategy = 'future'  # same as GoalSelectionStrategy.FUTURE

# HER wraps the underlying off-policy model class
model = HER(
    'MlpPolicy',
    env,
    model_class,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    verbose=1,
)

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 100      |
| mean 100 episode reward | -10      |
| steps                   | 990      |
| success rate            | 0        |
--------------------------------------


In [35]:

# Single source of truth for the checkpoint location used by save and load
her_model_path = './experiments/her_bit_env'
model.save(her_model_path)

# WARNING: HER.load needs the env passed in (or the environment wrapped with
# HERGoalEnvWrapper) for the predict method to be usable afterwards
model = HER.load(her_model_path, env=env)


In [28]:
# Train the HER model for 100,000 timesteps
model.learn(total_timesteps=int(1e5))

--------------------------------------
| % time spent exploring  | 90       |
| episodes                | 100      |
| mean 100 episode reward | -10      |
| steps                   | 988      |
| success rate            | 0.0101   |
--------------------------------------
--------------------------------------
| % time spent exploring  | 80       |
| episodes                | 200      |
| mean 100 episode reward | -9.9     |
| steps                   | 1985     |
| success rate            | 0.02     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 70       |
| episodes                | 300      |
| mean 100 episode reward | -9.9     |
| steps                   | 2976     |
| success rate            | 0.01     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 61       |
| episodes                | 400      |
| mean 100 episode reward | -9.7     |
| steps                  

KeyboardInterrupt: 

In [41]:
# Roll out the trained policy for a fixed number of episodes and count how
# many of them end in success.
n_eval_episodes = 20
succ = 0
n_eps = 0
obs = env.reset()
while n_eps < n_eval_episodes:
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)

    if done:
        n_eps += 1
        # Prefer the env's own success flag: `reward == 0` only identifies
        # success for envs whose success reward is exactly 0. Fall back to
        # the reward test when the env does not report 'is_success'.
        succ += bool(info.get('is_success', reward == 0))
        obs = env.reset()

In [42]:
# Display the number of successful episodes counted by the evaluation loop
succ

20