In [1]:
%matplotlib inline
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import numpy as np
import gym

In [2]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
%matplotlib notebook

Finished:
- Baseline BRS with large parameters range: **std & v - \[1e-3, 1\]** (all buffer)
- Baseline BRS with small parameters range **std & v\[1e-3, 10\]** (all buffer)
- Baseline BRS with random sampling (n_samples random samples from the buffer) `std=1e-3`
- Baseline BRS with sorted sampling (n_samples sorted samples from the buffer)

TODO:
List of things to check:
- VARIABLES:
    - Random seed vs total_mean_reward(TMR)
    - N_ROLLOUTS vs TMR
    - TOTAL_STEPS vs TMR
    - b=2
    - alpha=1e-3 
    - std=1
    - v=1e-3
    - States norm (minmax)

COMPONENTS:
- sample number unsorted vs sorted by reward 
- rollouts
- input data normalization

# Hyper Parameters and Ranges

In [4]:
# PARAMETERS

# Random seed (passed to NumPy and to the environment by train/evaluate)
SEED = 42
# Number of perturbation rollouts collected between two policy updates
N_ROLLOUTS = 100
# Number of policy updates performed during training
TOTAL_STEPS = 1000
# Number of environment steps in one evaluation run
EVAL_STEPS = 1000
# Step size that scales the policy update (used by the ARS optimizers)
# ALPHA = 1e-3 
# STD for normal dist. for exploration noise
# STD = 1
# Amount of noise we add (weight multiplier)
# V = 1e-3

# Parameters ranges
# (min, max, step)
# SEED_RANGE = (0, 1024, 1)
# TOTAL_STEPS_RANGE = (1, 1e+5, 1)
# N_ROLLOUTS_RANGE = (0, TOTAL_STEPS_RANGE[1], 1)
# ALPHA_RANGE = (0, 1, 1e-3)
# STD_RANGE = (0, 1, 1e-3)
# V_RANGE = (0, 1, 1e-3)


# Model
All experiments use the same kind of model: a linear policy.

In [5]:
class Linear_model:
    """Linear policy: the action is a weight matrix applied to the state."""

    def __init__(self, n_state, n_action):
        # M holds the policy parameters, one row per action dimension.
        weights_shape = (n_action, n_state)
        self.M = np.zeros(weights_shape)

    def __call__(self, state, noise=0):
        """Return the action for `state`, with `noise` added to the weights first."""
        perturbed_weights = self.M + noise
        return perturbed_weights @ state

In [6]:
class Linear_model_v2:
    """Linear policy applied to standardised states.

    The action is ``(M + noise) @ ((state - mu) / sigma)``. ``mu`` and
    ``sigma`` are per-dimension state statistics; they start as the identity
    normalisation (zeros / ones) and are overwritten by the optimizer.
    """

    def __init__(self, n_state, n_action):
        # M stands for parameters of this model
        self.M = np.zeros((n_action, n_state))
        # Identity normalisation until statistics are supplied.
        self.mu = np.zeros((n_state))
        self.sigma = np.ones(n_state)

    def __call__(self, state, noise=0):
        """Return the action for `state`, with `noise` added to the weights first."""
        sigma = np.asarray(self.sigma, dtype=float)
        # Whitening divides by the standard deviation (multiplying by it, as
        # before, amplifies the high-variance dimensions instead of equalising
        # them). Dimensions with zero or non-finite spread are left unscaled so
        # a constant state component cannot produce inf/NaN actions.
        safe_sigma = np.where(np.isfinite(sigma) & (sigma > 0), sigma, 1.0)
        normalized_state = (state - self.mu) / safe_sigma
        return (self.M + noise) @ normalized_state

# Optimizers

In [7]:
class BaseRandomSearch:
    '''
    BRS_v1    (V1 model with simple update)

    Keeps a linear policy, explores with symmetric Gaussian perturbations of
    its weights and updates the weights from every buffered rollout.
    '''
    def __init__(self, n_state, n_action, std):
        self.policy = Linear_model(n_state, n_action)
        self.std = std

        # One entry per rollout pair; learn() reads the last three items of
        # each entry as (noise, positive reward, negative reward).
        self.buffer = []

    def _remap_actions(self, action):
        '''Squash raw policy outputs with tanh.'''
        return np.tanh(action)

    def remember(self, memory):
        '''Store one rollout record for the next learn() call.'''
        self.buffer.append(memory)

    def get_actions(self, state, is_train=False):
        '''
        Evaluation (default): return the squashed action of the current policy.
        Training: return (action with +noise, action with -noise, noise).
        '''
        if not is_train:
            return self._remap_actions(self.policy(state))

        noise = np.random.normal(0, self.std, size=self.policy.M.shape)
        action_pos = self._remap_actions(self.policy(state, noise))
        action_neg = self._remap_actions(self.policy(state, -noise))
        return action_pos, action_neg, noise

    def learn(self):
        '''Add the reward-weighted noise of every buffered rollout to M, then clear the buffer.'''
        update = np.zeros_like(self.policy.M)
        for step in self.buffer:
            noise, reward_pos, reward_neg = step[-3], step[-2], step[-1]
            update += ((reward_pos - reward_neg) * noise) / self.std

        self.policy.M = self.policy.M + update

        self.buffer = []
            

In [8]:
class BRS_RS(BaseRandomSearch):
    '''
    BRS_v1_RS (V1 model with random subset buffer update)

    Same policy and exploration as BaseRandomSearch, but each update uses
    `n_samples` buffer entries drawn with np.random.choice at its default
    settings (i.e. with replacement).
    '''
    def __init__(self, n_state, n_action, std, n_samples):
        # The parent sets up the policy, std and the empty buffer.
        super().__init__(n_state, n_action, std)
        self.n_samples = n_samples

    def learn(self):
        '''Update M from a random draw of buffered rollouts, then clear the buffer.'''
        delta = np.zeros_like(self.policy.M)

        candidate_idxs = np.arange(len(self.buffer))
        chosen_idxs = np.random.choice(candidate_idxs, self.n_samples)
        chosen_steps = np.array(self.buffer, dtype=object)[chosen_idxs]

        for step in chosen_steps:
            noise, reward_pos, reward_neg = step[-3], step[-2], step[-1]
            delta += ((reward_pos - reward_neg) * noise) / self.std

        self.policy.M = self.policy.M + delta

        self.buffer = []
        
            

In [9]:
class BRS_SS(BRS_RS):
    '''
    BRS_v1_SS (V1 model with sorted subset buffer update)

    Each update uses only the `n_samples` buffer entries whose summed
    (positive + negative) reward is highest.
    '''
    def _sort_directions(self):
        '''Return the `n_samples` buffer rows with the largest reward sum, in ascending order.'''
        steps = np.array(self.buffer, dtype=object)
        # The last two columns of every row are the positive and negative reward.
        pair_rewards = steps[:, -2:].sum(-1)

        ascending_idxs = np.argsort(pair_rewards)
        return steps[ascending_idxs][-self.n_samples:]

    def learn(self):
        '''Update M from the best-rewarded rollouts, then clear the buffer.'''
        delta = np.zeros_like(self.policy.M)

        best_steps = self._sort_directions()

        for step in best_steps:
            noise, reward_pos, reward_neg = step[-3], step[-2], step[-1]
            delta += ((reward_pos - reward_neg) * noise) / self.std

        self.policy.M = self.policy.M + delta

        self.buffer = []
        
            

In [10]:
class BaseRandomSearch_v2:
    '''
    BRS_v2    (V2 model with simple update)

    Like BaseRandomSearch, but drives a Linear_model_v2 policy and refreshes
    the policy's `mu` / `sigma` from `self.states` on every update.
    '''
    def __init__(self, n_state, n_action, std):
        self.policy = Linear_model_v2(n_state, n_action)
        self.std = std

        self.buffer = []
        # Observations appended by the training loop; never cleared here.
        self.states = []

    def _remap_actions(self, action):
        '''Squash raw policy outputs with tanh.'''
        return np.tanh(action)

    def remember(self, memory):
        '''Store one record: pos_action, neg_action, noise, pos_reward, neg_reward.'''
        self.buffer.append(memory)

    def get_actions(self, state, is_train=False):
        '''
        Evaluation (default): return the squashed action of the current policy.
        Training: return (action with +noise, action with -noise, noise).
        '''
        if not is_train:
            return self._remap_actions(self.policy(state))

        noise = np.random.normal(0, self.std, size=self.policy.M.shape)
        action_pos = self._remap_actions(self.policy(state, noise))
        action_neg = self._remap_actions(self.policy(state, -noise))
        return action_pos, action_neg, noise

    def learn(self):
        '''Update M from every buffered rollout, refresh state statistics, clear the buffer.'''
        delta = np.zeros_like(self.policy.M)
        for step in self.buffer:
            noise, reward_pos, reward_neg = step[-3], step[-2], step[-1]
            delta += ((reward_pos - reward_neg) * noise) / self.std

        self.policy.M = self.policy.M + delta

        # Per-dimension statistics over all observations collected so far.
        self.policy.mu = np.mean(self.states, 0)
        self.policy.sigma = np.std(self.states, 0)

        self.buffer = []
            

In [11]:
class BRS_v2_RS(BaseRandomSearch_v2):
    '''
    BRS_v2_RS (V2 model with random subset buffer update)

    Same as BaseRandomSearch_v2, but each update uses `n_samples` buffer
    entries drawn with np.random.choice at its default settings (i.e. with
    replacement).
    '''
    def __init__(self, n_state, n_action, std, n_samples):
        # The parent sets up the policy, std, buffer and states list.
        super().__init__(n_state, n_action, std)
        self.n_samples = n_samples

    def learn(self):
        '''Update M from a random draw of rollouts, refresh state statistics, clear the buffer.'''
        delta = np.zeros_like(self.policy.M)

        candidate_idxs = np.arange(len(self.buffer))
        chosen_idxs = np.random.choice(candidate_idxs, self.n_samples)
        chosen_steps = np.array(self.buffer, dtype=object)[chosen_idxs]

        for step in chosen_steps:
            noise, reward_pos, reward_neg = step[-3], step[-2], step[-1]
            delta += ((reward_pos - reward_neg) * noise) / self.std

        self.policy.M = self.policy.M + delta

        # Per-dimension statistics over all observations collected so far.
        self.policy.mu = np.mean(self.states, 0)
        self.policy.sigma = np.std(self.states, 0)

        self.buffer = []

In [12]:
class BRS_v2_SS(BRS_v2_RS):
    '''
    BRS_v2_SS (V2 model with sorted subset buffer update)

    Each update uses only the `n_samples` buffer entries whose summed
    (positive + negative) reward is highest.
    '''
    def _sort_directions(self):
        '''Return the `n_samples` buffer rows with the largest reward sum, in ascending order.'''
        steps = np.array(self.buffer, dtype=object)
        # The last two columns of every row are the positive and negative reward.
        pair_rewards = steps[:, -2:].sum(-1)

        ascending_idxs = np.argsort(pair_rewards)
        return steps[ascending_idxs][-self.n_samples:]

    def learn(self):
        '''Update M from the best-rewarded rollouts, refresh state statistics, clear the buffer.'''
        delta = np.zeros_like(self.policy.M)

        best_steps = self._sort_directions()

        for step in best_steps:
            noise, reward_pos, reward_neg = step[-3], step[-2], step[-1]
            delta += ((reward_pos - reward_neg) * noise) / self.std

        self.policy.M = self.policy.M + delta

        # Per-dimension statistics over all observations collected so far.
        self.policy.mu = np.mean(self.states, 0)
        self.policy.sigma = np.std(self.states, 0)

        self.buffer = []
        

In [13]:
class ARS_v1:
    '''
    ARS_v1

    Random search on a linear policy where each update
      * keeps only the `n_samples` rollout pairs with the largest summed reward,
      * scales the step by alpha / (n_samples * std of the kept rewards).
    If both `state_low` and `state_high` are given, states are min-max scaled
    before they reach the policy.
    '''
    def __init__(self, n_state, n_action, std, n_samples, alpha=1e-3, state_low=None, state_high=None):
        self.policy = Linear_model(n_state,n_action)
        self.std = std
        self.alpha = alpha
        self.n_samples = n_samples

        self.buffer = []

        self.state_low=state_low
        self.state_high=state_high

    def _normalize_state(self, state):
        '''Min-max scale `state` with the configured bounds, if any.

        Dimensions whose bounds are infinite, overflow, or do not span a
        positive range are passed through unchanged: the plain
        (value - min) / (max - min) formula would turn them into NaN/inf.
        '''
        if self.state_high is None or self.state_low is None:
            return state

        low = np.asarray(self.state_low)
        high = np.asarray(self.state_high)
        with np.errstate(invalid='ignore', over='ignore'):
            span = high - low
            usable = np.isfinite(low) & np.isfinite(high) & np.isfinite(span) & (span > 0)
        safe_low = np.where(usable, low, 0)
        safe_span = np.where(usable, span, 1)
        # MINMAX NORM {value - min}/{max - min}
        return (state - safe_low) / safe_span

    def get_actions(self, state, is_train=False):
        '''
        Evaluation (default): return the squashed action of the current policy.
        Training: return (action with +noise, action with -noise, noise).
        '''
        state = self._normalize_state(state)

        if is_train:
            noise = np.random.normal(0, self.std, size=self.policy.M.shape)
            action_pos = self.policy(state,  noise)
            action_neg = self.policy(state, -noise)
            action_pos = self._remap_actions(action_pos)
            action_neg = self._remap_actions(action_neg)
            return action_pos, action_neg, noise
        else:
            action = self.policy(state)
            return self._remap_actions(action)

    def _sort_directions(self):
        '''Return the `n_samples` buffer rows with the largest reward sum, in ascending order.'''
        buffer = np.array(self.buffer, dtype=object)
        b_rewards = buffer[:, -2:].sum(-1)

        # idxs from low to high
        b_idxs = np.argsort(b_rewards)
        b_buffer = buffer[b_idxs][-self.n_samples:]

        return b_buffer

    def learn(self):
        '''Apply one ARS update from the buffered rollouts, then clear the buffer.'''
        if not self.buffer:
            # Nothing was collected since the last update.
            return

        b_buffer = self._sort_directions()
        update = np.zeros_like(self.policy.M)

        for step in b_buffer:
            r_p = step[-2]
            r_n = step[-1]
            noise = step[-3]
            update += ((r_p - r_n) * noise)

        reward_std = float(b_buffer[:, -2:].std())
        # When every kept reward is identical the std is 0 and the update
        # direction is 0 as well; dividing would give inf * 0 = NaN and poison
        # M for good, so the step is skipped instead.
        if np.isfinite(reward_std) and reward_std > 0:
            norm = self.alpha / (self.n_samples * reward_std)
            self.policy.M = self.policy.M + (norm * update)

        self.buffer = []

    def remember(self, memory):
        # pos_action, neg_action, noise, pos_reward, neg_reward
        self.buffer.append(memory)

    def _remap_actions(self, action):
        '''Squash raw policy outputs with tanh.'''
        return np.tanh(action)
            

In [14]:
class ARS_v2(ARS_v1):
    '''
    ARS_v2

    ARS_v1 with a Linear_model_v2 policy whose `mu` / `sigma` are refreshed
    from `self.states` on every update.
    '''
    def __init__(self, n_state, n_action, std, n_samples, alpha=1e-3, state_low=None, state_high=None):
        super().__init__(n_state, n_action, std, n_samples,
                         alpha=alpha, state_low=state_low, state_high=state_high)
        # Swap in the normalising policy and track the observations it needs.
        self.policy = Linear_model_v2(n_state,n_action)
        self.states = []

    def learn(self):
        '''Apply one ARS update, refresh the state statistics, then clear the buffer.'''
        if not self.buffer:
            # Nothing was collected since the last update.
            return

        b_buffer = self._sort_directions()
        update = np.zeros_like(self.policy.M)

        for step in b_buffer:
            r_p = step[-2]
            r_n = step[-1]
            noise = step[-3]
            update += ((r_p - r_n) * noise)

        # The reward std is a scalar step normaliser. It must not be rescaled
        # with the *state* bounds: doing so turned it into a per-state vector
        # (possibly zero, negative or NaN) and distorted the update.
        reward_std = float(b_buffer[:, -2:].std())
        # Identical rewards give std 0 and a zero update; skip the step rather
        # than produce inf * 0 = NaN weights.
        if np.isfinite(reward_std) and reward_std > 0:
            norm = self.alpha / (self.n_samples * reward_std)
            self.policy.M = self.policy.M + (norm * update)

        # Keep the previous statistics when no observation has been recorded
        # (mean/std of an empty list would be NaN).
        if len(self.states) > 0:
            self.policy.mu = np.mean(self.states, 0)
            self.policy.sigma = np.std(self.states, 0)

        self.buffer = []
    

## Help functions

In [15]:
def train(env, t_model, total_steps, n_rollouts, tseed, version='v1'):
    """Train `t_model` on `env` with paired (+noise / -noise) rollouts.

    Parameters
    ----------
    env : environment with the classic gym API used below
        (`reset`, `step` returning 4 values, `close`).
    t_model : optimizer exposing `get_actions(state, is_train=True)`,
        `remember(record)` and `learn()`; for ``version='v2'`` also a
        `states` list.
    total_steps : number of `learn()` updates.
    n_rollouts : number of paired rollouts collected before each update.
    tseed : seed for NumPy's global RNG and for the environment.
    version : 'v1', or 'v2' to additionally record visited observations in
        `t_model.states`.

    Returns
    -------
    The same `t_model`, after training.

    Raises
    ------
    ValueError
        If `version` is neither 'v1' nor 'v2'.
    """
    if version not in ('v1', 'v2'):
        raise ValueError(f"version must be 'v1' or 'v2', got {version!r}")

    np.random.seed(tseed)
    try:
        observation = env.reset(seed=tseed)
    except TypeError:
        # Older gym API: reset() takes no seed argument, seed separately.
        env.seed(tseed)
        observation = env.reset()

    for _ in range(total_steps):
        for _ in range(n_rollouts):
            pos_action, neg_action, noise = t_model.get_actions(observation, is_train=True)
            # NOTE(review): both steps run in the same env, so the negative
            # step starts from the state reached by the positive step.
            pos_observation, pos_reward, pos_done, pos_info = env.step(pos_action)
            neg_observation, neg_reward, neg_done, neg_info = env.step(neg_action)

            if version == 'v2':
                t_model.states += pos_observation, neg_observation

            # Store the full noise matrix. Keeping only noise[0] credited every
            # action row of M with the first row's perturbation whenever the
            # environment has more than one action dimension.
            t_model.remember([pos_action[0], neg_action[0], noise, pos_reward, neg_reward])

            # Continue from either branch with equal probability.
            if np.random.rand() < 0.5:
                observation = pos_observation
            else:
                observation = neg_observation

            if pos_done or neg_done:
                observation = env.reset()

        t_model.learn()

    # NOTE(review): this closes an environment the caller passed in and, in
    # this notebook, keeps using afterwards for evaluation.
    env.close()
    return t_model

In [16]:
def evaluate(env, model, eval_steps, eseed):
    """Run `model` greedily for `eval_steps` environment steps and sum the rewards.

    Parameters
    ----------
    env : environment with the classic gym API used below
        (`reset`, `step` returning 4 values, `close`).
    model : object whose `get_actions(observation)` returns the action to take.
    eval_steps : number of environment steps to run; episodes that end are
        restarted with `env.reset()` and keep contributing to the same score.
    eseed : seed for NumPy's global RNG and for the environment.

    Returns
    -------
    The accumulated reward over all `eval_steps` steps.
    """
    np.random.seed(eseed)
    try:
        observation = env.reset(seed=eseed)
    except TypeError:
        # Older gym API: reset() takes no seed argument, seed separately.
        # Any other error is a real failure and is allowed to propagate.
        env.seed(eseed)
        observation = env.reset()

    score = 0

    for _ in range(eval_steps):
        action = model.get_actions(observation)
        observation, reward, done, info = env.step(action)

        score += reward

        if done:
            observation = env.reset()

    env.close()
    return score

In [17]:
def search_4_params(STD_RANGE, V_RANGE, TOTAL_STEPS, N_ROLLOUTS, SEED, EVAL_STEPS):
    """Grid-search `std` and `v` for BaseRandomSearch on Pendulum-v1.

    STD_RANGE and V_RANGE are unpacked into np.arange, i.e. (start, stop, step).
    For every (std, v) pair a fresh model is trained and then evaluated with
    the same SEED.

    Returns (monitor, saved_polices): `monitor` is an array with one
    [std, v, score] row per pair, `saved_polices` the list of the matching
    policy matrices `model.policy.M`.
    """
    # The environment and the policy dimensions are hard-coded for Pendulum.
    env = gym.make('Pendulum-v1', g=9.81)
    n_action = 1 
    n_state = 3

    monitor = []
    saved_polices = []
    for std in tqdm(np.arange(*STD_RANGE)):
        for v in tqdm(np.arange(*V_RANGE), leave=False):
            # NOTE(review): BaseRandomSearch.__init__ as defined in this
            # notebook takes (n_state, n_action, std) only, so passing `v`
            # raises TypeError. The intended use of `v` is not visible here -
            # restore the parameter on the class or drop it from this search.
            model = BaseRandomSearch(n_state, n_action, std=std, v=v)

            # The single env created above is reused for every train/evaluate call.
            trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED)
            score = evaluate(env, trained_model,EVAL_STEPS, SEED)
            monitor.append([std, v, score])
            saved_polices += [model.policy.M]
    monitor = np.array(monitor)
    return monitor, saved_polices

In [18]:
def display_std__v_r(monitor):
    """Show a 3D scatter of the search results.

    `monitor` is indexed as a 2-D array: column 0 goes on the STD axis,
    column 1 on the V axis and column 2 on the Reward axis (and sets the colour).
    """
    std_values = monitor[:, 0]
    v_values = monitor[:, 1]
    rewards = monitor[:, 2]

    fig = plt.figure(figsize=(10, 10))
    ax = plt.axes(projection='3d')
    ax.scatter3D(std_values, v_values, rewards, c=rewards, cmap='viridis_r')

    for set_label, text in ((ax.set_xlabel, 'STD'),
                            (ax.set_ylabel, 'V'),
                            (ax.set_zlabel, 'Reward')):
        set_label(text)

    plt.show()

# Ablation study

Train BRS

In [19]:
# STD_RANGE = (1e-3, 1, 8e-2)
# V_RANGE = (1e-3, 1, 8e-2)
# monitor_1, sp1 = search_4_params(STD_RANGE, V_RANGE, TOTAL_STEPS, N_ROLLOUTS, SEED, EVAL_STEPS)

# STD_RANGE = (1e-3, 0.02, 1e-3)
# V_RANGE = (1, 110, 10)
# monitor_2, sp2 = search_4_params(STD_RANGE, V_RANGE, TOTAL_STEPS, N_ROLLOUTS, SEED, EVAL_STEPS)

# STD_RANGE = (1e-3, 0.002, 1e-4)
# V_RANGE = (10, 1100, 100)
# monitor_3, sp3 = search_4_params(STD_RANGE, V_RANGE, TOTAL_STEPS, N_ROLLOUTS, SEED, EVAL_STEPS)


In [20]:
# display_std__v_r(monitor_1)

In [21]:
# display_std__v_r(monitor_2)

In [22]:
# display_std__v_r(monitor_3)

In [23]:
# fig, ax = plt.subplots()
# D = [monitor_1[:,2], monitor_2[:,2], monitor_3[:,2]]
# b = ax.boxplot(D)

# ax.set_xlabel('Model')
# ax.set_ylabel('Reward')
# plt.show()

We have now shown that the optimal parameter values for the simple BRS model are:  
```
STD = 1.5e-3
V = 100
```  
We can proceed with these values fixed, for reproducibility and for a fair comparison.  
We can also check the differences between the policies learned by each model.

In [24]:
# print('Sample of learned polices:')
# print('Monitor 1:')
# [print(v) for v in sp1[:3]]
# print('Monitor 2:')
# [print(v) for v in sp2[:3]]
# print('Monitor 3:')
# [print(v) for v in sp3[:3]]

# Actual models testing 

In [28]:
# Exploration-noise standard deviation shared by every optimizer below
STD = 1
# Shorter schedule than in the parameter cell: 100 updates of 100 rollouts each
TOTAL_STEPS = 100
N_ROLLOUTS = 100 
# Subset size for the *_RS / *_SS / ARS variants: half of the rollouts per update
N_SAMPLES_UPDATE = N_ROLLOUTS//2

# Index into `envs`: 0 -> Pendulum, 1 -> LunarLanderContinuous
env_id = 1
envs = ['Pendulum-v1', 'LunarLanderContinuous-v2']


# Draw 50 evaluation seeds in [0, 10000), reproducibly from SEED
np.random.seed(SEED)
RANDOM_EVAL_SEEDS = np.random.randint(10000, size=50)

# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# gym.envs.box2d, so LunarLander presumably needs the box2d extra installed first
env = gym.make(envs[env_id])

# Policy dimensions are read from the environment's spaces
n_action = env.action_space.shape[0]
n_state = env.observation_space.shape[0]
print(n_action, n_state)

# Observation bounds, passed to the *_norm ARS runs for min-max scaling
state_high = env.observation_space.high
state_low = env.observation_space.low

ModuleNotFoundError: No module named 'gym.envs.box2d'

In [26]:
!pip install gym[box2d]



## BRS_v1

In [None]:
# Train the plain BRS baseline, then score it once per evaluation seed.
model = BaseRandomSearch(n_state, n_action, std=STD)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED)

BRS_v1_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    BRS_v1_rewards.append(score)

In [None]:
# Draw on a dedicated figure (an implicit pyplot call reuses whichever figure
# is currently active) and label it so the plot stands on its own.
fig, ax = plt.subplots()
ax.plot(BRS_v1_rewards)
ax.set_xlabel('Evaluation number')
ax.set_ylabel('Reward')
ax.set_title('BRS_v1')
plt.show()

## BRS_v1_RS

In [None]:
# Train BRS with random-subset updates, then score it once per evaluation seed.
model = BRS_RS(n_state, n_action, std=STD, n_samples=N_SAMPLES_UPDATE)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED)

BRS_v1_RS_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    BRS_v1_RS_rewards.append(score)

## BRS_v1_SS

In [None]:
# Train BRS with sorted-subset updates, then score it once per evaluation seed.
model = BRS_SS(n_state, n_action, std=STD, n_samples=N_SAMPLES_UPDATE)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED)

BRS_v1_SS_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    BRS_v1_SS_rewards.append(score)

## BRS_v2

In [None]:
# Train the V2 (state-statistics) BRS baseline, then score it once per evaluation seed.
model = BaseRandomSearch_v2(n_state, n_action, std=STD)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v2')

BRS_v2_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    BRS_v2_rewards.append(score)

## BRS_v2_RS

In [None]:
# Train V2 BRS with random-subset updates, then score it once per evaluation seed.
model = BRS_v2_RS(n_state, n_action, std=STD, n_samples=N_SAMPLES_UPDATE)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v2')

BRS_v2_RS_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    BRS_v2_RS_rewards.append(score)

## BRS_v2_SS

In [None]:
# Train V2 BRS with sorted-subset updates, then score it once per evaluation seed.
model = BRS_v2_SS(n_state, n_action, std=STD, n_samples=N_SAMPLES_UPDATE)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v2')

BRS_v2_SS_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    BRS_v2_SS_rewards.append(score)


## ARS_v1

In [None]:
# Train ARS_v1 without state bounds, then score it once per evaluation seed.
model = ARS_v1(n_state, n_action, std=STD, n_samples=N_SAMPLES_UPDATE)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v1')

ARS_v1_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    ARS_v1_rewards.append(score)


## ARS_v2

In [None]:
# Train ARS_v2 without state bounds, then score it once per evaluation seed.
model = ARS_v2(n_state, n_action, std=STD, n_samples=N_SAMPLES_UPDATE)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v2')

ARS_v2_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    ARS_v2_rewards.append(score)


## ARS_v1_norm

In [None]:
# Train ARS_v1 with min-max state scaling, then score it once per evaluation seed.
model = ARS_v1(
    n_state,
    n_action,
    std=STD,
    n_samples=N_SAMPLES_UPDATE,
    state_low=state_low,
    state_high=state_high,
)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v1')

ARS_v1_norm_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    ARS_v1_norm_rewards.append(score)


## ARS_v2_norm

In [None]:
# Train ARS_v2 with min-max state scaling, then score it once per evaluation seed.
model = ARS_v2(
    n_state,
    n_action,
    std=STD,
    n_samples=N_SAMPLES_UPDATE,
    state_low=state_low,
    state_high=state_high,
)
trained_model = train(env, model, TOTAL_STEPS, N_ROLLOUTS, SEED, version='v2')

ARS_v2_norm_rewards = []
for eseed in tqdm(RANDOM_EVAL_SEEDS):
    score = evaluate(env, trained_model, EVAL_STEPS, int(eseed))
    ARS_v2_norm_rewards.append(score)


In [None]:
# Per-seed evaluation scores of every model, in the same order as `labels`.
D = [
    BRS_v1_rewards, BRS_v1_RS_rewards, BRS_v1_SS_rewards,
    BRS_v2_rewards, BRS_v2_RS_rewards, BRS_v2_SS_rewards,
    ARS_v1_rewards, ARS_v2_rewards,
    ARS_v1_norm_rewards, ARS_v2_norm_rewards,
]

labels = [
    'BRS_v1', 'BRS_v1_RS', 'BRS_v1_SS',
    'BRS_v2', 'BRS_v2_RS', 'BRS_v2_SS',
    'ARS_v1', 'ARS_v2',
    'ARS_v1_norm', 'ARS_v2_norm',
]

# One line per model, all on a single figure.
plt.figure(figsize=(10, 5))
for series, series_label in zip(D, labels):
    plt.plot(series, '.-', label=series_label)

plt.ylabel('Reward')
plt.xlabel('Evaluation number')
plt.legend()
plt.show()

In [None]:
# Distribution of the evaluation scores, one box per model.
fig, ax = plt.subplots(figsize=(12, 5))
b = ax.boxplot(D)

ax.set(
    xlabel='Model',
    ylabel='Reward',
    title='Evaluation reward scores, for tested models',
)
ax.set_xticklabels(labels)

plt.show()
