In [1]:
import gymenv_v2
from gymenv_v2 import make_multiple_env
import numpy as np
from config import custom_config, easy_config, hard_config
from layers import Embedding
import tensorflow as tf
from policy import Policy
from rollout import rollout, rollout_envs

import wandb
# Authenticate this session with Weights & Biases.
wandb.login()
# Open a W&B run in the "finalproject" project under the "orcs4529" entity,
# tagged "training-easy"; the run handle is kept in `run`.
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-easy"])

%load_ext autoreload
%autoreload 2

[34m[1mwandb[0m: Currently logged in as: [33mjrferraz[0m ([33morcs4529[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Build the training environment from `easy_config` (imported from `config`),
# unpacked as keyword arguments. Later cells use `env.reset()` and `env.envs`.
env = make_multiple_env(**easy_config) 

loading training instances, dir instances/train_10_n60_m60 idx 0
loading training instances, dir instances/train_10_n60_m60 idx 1
loading training instances, dir instances/train_10_n60_m60 idx 2
loading training instances, dir instances/train_10_n60_m60 idx 3
loading training instances, dir instances/train_10_n60_m60 idx 4
loading training instances, dir instances/train_10_n60_m60 idx 5
loading training instances, dir instances/train_10_n60_m60 idx 6
loading training instances, dir instances/train_10_n60_m60 idx 7
loading training instances, dir instances/train_10_n60_m60 idx 8
loading training instances, dir instances/train_10_n60_m60 idx 9


## Params

In [3]:
# --- Policy network -------------------------------------------------------
# Both lists are handed to Policy(units, activations).
units, activations = [64, 64], ['tanh', 'tanh']

# --- Optimisation ---------------------------------------------------------
lr = 1e-3               # step size applied to the gradient estimate  ### CHANGE
delta_std = 1e-2        # scale of the Gaussian noise added to the weights
num_episodes = 50       # number of weight updates in the training loop
num_trajectories = 10   # perturbations sampled per update  ### CHANGE

# --- Rollouts -------------------------------------------------------------
gamma = 0.98            # passed to rollout_envs as `gamma`
rollout_length = 1      # rollout_length used for the training rollouts
time_limit = 10         # rollout_length used for the evaluation rollouts  ### CHANGE

## Rollouts

In [38]:
# initialize policy and test
# Build a fresh policy from the layer sizes / activations in the params cell.
policy = Policy(units, activations)
# Reset the environment and keep the returned state.
s = env.reset()
# Run the policy once on that state; the result is discarded.
# NOTE(review): presumably this forward pass is what creates the network
# weights so that get_weights() works afterwards — confirm in policy.py.
_ = policy.compute_prob(s)
# Mean evaluation reward per episode is appended here by later cells.
rewards_record = []

In [86]:
# for each episode
# Snapshot the current (unperturbed) weights; get_weights() returns two
# collections, unpacked here as the "cons" and "cuts" weights.
w_orig_cons, w_orig_cuts = policy.get_weights()
# epsilon_table = []
# One entry per sampled trajectory: the noise applied to each weight group.
epsilon_table_cons, epsilon_table_cuts = [], []
# One entry per sampled trajectory: the mean reward obtained with that noise.
train_rewards_table = []

In [91]:
# for each trajectory
# Sample one perturbation of the policy, roll it out, and record the noise
# together with the mean reward it achieved.

# Draw Gaussian noise with the same shape as every weight array, scaled by
# delta_std. The shapes come from the snapshot taken in the per-episode cell
# (w_orig_*), not from w_cons / w_cuts, which only exist after an update.
epsilon_cons = [np.random.randn(*w.shape) * delta_std for w in w_orig_cons]
epsilon_cuts = [np.random.randn(*w.shape) * delta_std for w in w_orig_cuts]

# Add each array's own noise to the matching original array; the cuts weights
# are perturbed from w_orig_cuts (not w_orig_cons) so shapes line up.
w_new_cons = [w + eps for w, eps in zip(w_orig_cons, epsilon_cons)]
w_new_cuts = [w + eps for w, eps in zip(w_orig_cuts, epsilon_cuts)]
policy.set_weights(w_new_cons, w_new_cuts)

# Roll out the perturbed policy once on every environment.
rewards, times = rollout_envs(envs=env.envs, policy=policy, num_rollouts=1, rollout_length=rollout_length, gamma=gamma)

# Keep the noise and its score so the gradient cell can combine them.
epsilon_table_cons.append(epsilon_cons)
epsilon_table_cuts.append(epsilon_cuts)
train_rewards_table.append(np.mean(rewards))

In [92]:
# accumulate gradients
# Standardise the per-trajectory rewards: subtract the mean and divide by the
# standard deviation (1e-8 keeps the division finite when all rewards agree).
train_rewards_table = np.array(train_rewards_table)
reward_mean = np.mean(train_rewards_table)
reward_std = np.std(train_rewards_table)
train_rewards_table = (train_rewards_table - reward_mean) / (reward_std + 1e-8)

# For every weight array, sum the noise samples weighted by their standardised
# reward, then divide by (number of samples * delta_std).
grads_cons, grads_cuts = [], []
num_samples = len(epsilon_table_cons)
for layer in range(len(w_orig_cons)):
    acc_cons = np.zeros(epsilon_table_cons[0][layer].shape)
    acc_cuts = np.zeros(epsilon_table_cuts[0][layer].shape)
    for sample in range(num_samples):
        score = train_rewards_table[sample]
        acc_cons += epsilon_table_cons[sample][layer] * score
        acc_cuts += epsilon_table_cuts[sample][layer] * score
    acc_cons /= (num_samples * delta_std)
    acc_cuts /= (len(epsilon_table_cuts) * delta_std)
    grads_cons.append(acc_cons)
    grads_cuts.append(acc_cuts)

In [116]:
# assign back original weights and update
# Step every weight array from its unperturbed value (w_orig_*) by lr times
# the matching gradient estimate.
# NOTE(review): the step is subtracted. grads_* weight each noise sample by
# its standardised reward, so if rollout_envs returns rewards to be maximised
# this moves away from the better perturbations — confirm the intended sign.
w_cons = [w_orig_cons[i] - lr*grads_cons[i] for i in range(len(w_orig_cons))]
w_cuts = [w_orig_cuts[i] - lr*grads_cuts[i] for i in range(len(w_orig_cuts))]

# Load the updated weights into the policy, replacing whatever perturbed
# weights were set during sampling.
policy.set_weights(w_cons, w_cuts)

In [120]:
# evaluate rewards
# Roll out the updated policy with num_rollouts=10 and rollout_length=time_limit
# (training rollouts use num_rollouts=1 and rollout_length instead).
# The second return value is discarded.
evaluated_rewards, _ = rollout_envs(envs=env.envs, policy=policy, num_rollouts=10, rollout_length=time_limit, gamma=gamma)

In [122]:
# Summarise the evaluation rewards: mean, max, min and standard deviation.
x = evaluated_rewards
print('mean',np.mean(x),'max',np.max(x),'min',np.min(x),'std',np.std(x))

mean 0.13378659235710277 max 0.8532251308604202 min 0.012964513668684861 std 0.18596293519985257


In [125]:
# Store the mean evaluation reward of this update in the running record.
rewards_record.append(np.mean(evaluated_rewards))

## Running loop over episodes

In [6]:
def es_gradient(noise_table, scores, sigma):
    """Combine sampled noise and per-sample scores into a gradient estimate.

    Args:
        noise_table: list with one entry per sample; each entry is a list of
            numpy arrays (one per weight array) holding the noise applied.
        scores: one scalar per sample, indexed in the same order as noise_table.
        sigma: noise scale; the weighted sum is divided by len(noise_table) * sigma.

    Returns:
        A list of numpy arrays, one per weight array, each equal to
        sum_i(noise_table[i][k] * scores[i]) / (len(noise_table) * sigma).
    """
    num_samples = len(noise_table)
    grads = []
    for layer in range(len(noise_table[0])):
        acc = np.zeros(noise_table[0][layer].shape)
        for sample, score in zip(noise_table, scores):
            acc += sample[layer] * score
        acc /= (num_samples * sigma)
        grads.append(acc)
    return grads


# initialize policy and test
policy = Policy(units, activations)
s = env.reset()
# Run the policy once on the initial state; the result is discarded.
# NOTE(review): presumably needed so the weights exist before get_weights() — confirm.
_ = policy.compute_prob(s)
# Mean evaluation reward after each episode's update.
rewards_record = []

for e in range(num_episodes):
    # Snapshot the unperturbed weights for this episode.
    w_orig_cons, w_orig_cuts = policy.get_weights()
    epsilon_table_cons, epsilon_table_cuts = [], []
    train_rewards_table = []

    for t in range(num_trajectories):
        # Gaussian noise shaped like each weight array, scaled by delta_std.
        epsilon_cons = [np.random.randn(*w.shape) * delta_std for w in w_orig_cons]
        epsilon_cuts = [np.random.randn(*w.shape) * delta_std for w in w_orig_cuts]
        w_new_cons = [w + eps for w, eps in zip(w_orig_cons, epsilon_cons)]
        w_new_cuts = [w + eps for w, eps in zip(w_orig_cuts, epsilon_cuts)]
        policy.set_weights(w_new_cons, w_new_cuts)

        # Score the perturbed policy with one rollout per environment.
        rewards, _ = rollout_envs(envs=env.envs, policy=policy, num_rollouts=1, rollout_length=rollout_length, gamma=gamma)
        epsilon_table_cons.append(epsilon_cons)
        epsilon_table_cuts.append(epsilon_cuts)
        train_rewards_table.append(np.mean(rewards))

    # Standardise the rewards (1e-8 avoids dividing by a zero std).
    train_rewards_table = np.array(train_rewards_table)
    train_rewards_table = (train_rewards_table - np.mean(train_rewards_table)) / (np.std(train_rewards_table) + 1e-8)

    # Each weight group gets its own estimate, sized by its own noise arrays,
    # so cons and cuts need not have the same number of weight arrays.
    grads_cons = es_gradient(epsilon_table_cons, train_rewards_table, delta_std)
    grads_cuts = es_gradient(epsilon_table_cuts, train_rewards_table, delta_std)

    # assign back original weights and update
    # NOTE(review): the step is subtracted; if rollout_envs returns rewards to
    # be maximised this moves away from the better perturbations — confirm sign.
    w_cons = [w - lr * g for w, g in zip(w_orig_cons, grads_cons)]
    w_cuts = [w - lr * g for w, g in zip(w_orig_cuts, grads_cuts)]
    policy.set_weights(w_cons, w_cuts)

    # evaluate rewards
    eval_r, _ = rollout_envs(envs=env.envs, policy=policy, num_rollouts=10, rollout_length=time_limit, gamma=gamma)
    print(f"Episode {e}:")
    print('mean',np.mean(eval_r),'max',np.max(eval_r),'min',np.min(eval_r),'std',np.std(eval_r))
    print("")
    # Record THIS episode's evaluation (eval_r), not the stale
    # `evaluated_rewards` left over from the step-by-step cells.
    rewards_record.append(np.mean(eval_r))


  


ValueError: operands could not be broadcast together with shapes (61,64) (4,) 

In [9]:
# NOTE(review): leftover debugging cell. `wpsilon_con` is not defined anywhere
# in the visible notebook (looks like a typo of an epsilon_* name) and would
# raise NameError on a fresh kernel — remove or replace before sharing.
wpsilon_con

(61, 64)