In [1]:
import os
import gym
import json
import copy
import time
import numpy as np 
import itertools

In [2]:
# Create the MountainCar-v0 environment and keep the object stored in its `.env`
# attribute (presumably the unwrapped environment without gym's wrappers — TODO confirm).
env = gym.make("MountainCar-v0").env
# Per-dimension lower / upper bounds of the observation space.
low, high = env.observation_space.low, env.observation_space.high
# Cell output: the bounds and the number of observation dimensions.
low, high, env.observation_space.shape[0]

(array([-1.2 , -0.07], dtype=float32), array([0.6 , 0.07], dtype=float32), 2)

In [3]:
# Cell output: the environment's action space (recorded output is Discrete(3)).
env.action_space

Discrete(3)

### State Space Discretization

In [4]:
# sample_size: a sample count of 5000 (not referenced by the other visible cells).
# sample_dim: number of observation dimensions; norm() asserts its input has this shape.
sample_size, sample_dim = int(5e3), int(env.observation_space.shape[0])

In [5]:
# h-net parameters: number of grid points along observation dimension 0 and 1.
N_h0, N_h1 = 20, 20
N_h = N_h0 * N_h1

# Evenly spaced candidate centers per dimension. The arange-based spacing is kept
# exactly as it was because the deletion list below is matched against these
# values with exact float equality.
cluster_centers0 = np.arange(low[0], high[0] + 1e-3, (high[0] - low[0] - 1e-3) / float(N_h0 - 1))
cluster_centers1 = np.arange(low[1], high[1] + 1e-3, (high[1] - low[1] - 1e-3) / float(N_h1 - 1))
h_net = np.array(list(itertools.product(cluster_centers0, cluster_centers1)))

# centers_to_delete = np.loadtxt('centers2delete_10_10.csv', delimiter=',')
centers_to_delete = np.loadtxt('centers2delete_20_20_new.csv', delimiter=',')
# centers_to_delete = np.loadtxt('centers2delete_40_50_new.csv', delimiter=',')

# loadtxt returns a 1-D array for a single-row (or empty) file; force one row per
# center so the loop below always iterates over whole centers.
centers_to_delete = np.asarray(centers_to_delete).reshape(-1, h_net.shape[1])

num_missing = 0
for center in centers_to_delete:
    # Rows of h_net that are exactly equal to this center.
    matches = np.flatnonzero(np.all(h_net == center, axis=1))
    if matches.size == 0:
        # Previously a center that was not found caused the LAST row of h_net to
        # be deleted (the search index simply ran off the end). Skip it instead.
        num_missing += 1
        continue
    h_net = np.delete(h_net, matches[0], 0)
if num_missing:
    print ("{} center(s) to delete were not found in h_net and were skipped".format(num_missing))
h_net.shape

(288, 2)

In [6]:
# Ratio of the dimension-0 range to the dimension-1 range of the observation
# space; used to put both dimensions on a comparable scale inside norm().
scale = (high[0] - low[0]) / (high[1] - low[1])

def norm(x, scale = scale):
    """Scaled Euclidean length of a 2-D vector.

    Args:
        x: array of shape (sample_dim,).
        scale: multiplier applied to x[1] before squaring; defaults to the
            module-level `scale` captured when this function was defined.

    Returns:
        sqrt(x[0]**2 + (scale * x[1])**2)

    Raises:
        AssertionError: if x does not have shape (sample_dim,).
    """
    assert x.shape == (sample_dim,), x.shape
    first_term = x[0]**2
    second_term = (scale * x[1])**2
    return np.sqrt(first_term + second_term)

In [7]:
action_dim = env.action_space.n

# Z_h: every (center, action) combination, one row per pair laid out as the
# center's coordinates followed by the action index.
Z_h = np.array([
    np.concatenate([center, [action]])
    for center, action in itertools.product(h_net, np.arange(action_dim))
])
Z_h.shape

(864, 3)

### Exploration Policy

In [8]:
class ExplorationAgent(object):
    """Heuristic exploration policy driven by the first observation component.

    The agent keeps a "current" action (0 or 2). On the first call after a
    reset it picks 2 when obs[0] < -0.5 and 0 otherwise. On later calls it
    flips 2 -> 0 when obs[0] grew by less than .001 since the previous call,
    and 0 -> 2 when obs[0] shrank by less than .001. With probability
    `epsilon` a uniformly random action from `ac_space` is returned;
    otherwise the result is drawn uniformly from {current action, 1}.
    """

    def __init__(self, epsilon = .2):
        self.reset()
        self.ac_space = [0, 1, 2]
        self.epsilon = epsilon

    def reset(self):
        """Clear the tracked positions and the current action."""
        self.pos = self.prev_pos = self.cur_ac = None

    def get_ac(self, obs):
        """Update the internal state from `obs` and sample an action."""
        if self.cur_ac is None:
            # First call since reset: choose the initial action from the position.
            self.pos = obs[0]
            self.cur_ac = 2 if self.pos < -0.5 else 0
        else:
            self.prev_pos, self.pos = self.pos, obs[0]
            increase = self.pos - self.prev_pos
            decrease = self.prev_pos - self.pos
            if self.cur_ac == 2:
                if increase < .001:
                    self.cur_ac = 0
            elif self.cur_ac == 0 and decrease < .001:
                self.cur_ac = 2

        explore = np.random.uniform(0, 1) < self.epsilon
        if explore:
            return np.random.choice(self.ac_space)
        return np.random.choice([self.cur_ac, 1])

In [9]:
# Shared exploration policy instance, created with the default epsilon (0.2).
exploration_agent = ExplorationAgent()

In [10]:
# Roll out one rendered episode with the exploration policy and report the
# wall-clock time, the accumulated reward and the number of steps taken.
obs = env.reset()
exploration_agent.reset()
rewards = 0
i = 0
try:
    env.render()
    time.sleep(2)  # presumably lets the first frame show before stepping starts
    startTime = time.time()
    while True:
        i += 1
        ac = exploration_agent.get_ac(obs)
        obs, reward, done, _ = env.step(ac)
        env.render()
        rewards += reward
        print (reward)
        if done:
            break
    print ("total time: {} sec, rewards: {}, {}".format(time.time() - startTime, rewards, i))
finally:
    # Close the environment even when the rollout raises or is interrupted;
    # previously env.close() was skipped in that case.
    env.close()

-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0


### NNQL (Nearest-Neighbor Q-Learning)

In [10]:
# Counters advanced by the training loop: t counts environment steps,
# k counts completed Q-table updates.
t, k = 0, 0
# K bounds the number of Q-table updates; T is not read by the visible training loop.
T, K = 2e7, 15
alpha = 1.                 # mixing weight used when Q is refreshed from future_Q
gamma = 0.9                # discount factor
beta = 1. / (1. - gamma)   # drives the alpha schedule: alpha = beta / (beta + k)

h = 0.15                   # radius (under norm) within which a center counts as close

Ns = []
# Visit counts and Q estimates, keyed by the (center..., action) tuple of each row of Z_h.
cur_N = {tuple(pair): 0 for pair in Z_h}
Q = {tuple(pair): 0 for pair in Z_h}
num_unfilled = len(cur_N)  # number of pairs not yet visited since the last update

# Running one-step-ahead (biased) target estimates, filled in by the training loop.
future_Q = {}

# Cache: tuple(observation) -> list of centers lying within h of it.
global_close_neighbors_tracker = {}


In [11]:
## Nearest Neighbor Regression
def Gaussian_kernel(x):
    """Unnormalized Gaussian kernel exp(-x**2 / 2); works on scalars and arrays."""
    half_square = x**2 / 2.
    return np.exp(-half_square)

def NNR(obs, ac, Q, global_close_neighbors_tracker, mode = 'fr-knn'):
    """Nearest-neighbor estimate of Q(obs, ac) from the values stored at nearby centers.

    Args:
        obs: observation array; tuple(obs) is used as the cache key.
        ac: action index; it is cast to float when building the Q keys.
        Q: dict keyed by (center coordinates..., float(action)) tuples.
        global_close_neighbors_tracker: dict caching, per tuple(obs), the list
            of centers of `h_net` whose `norm` distance to obs is below `h`.
            A missing entry is computed here and stored.
        mode: 'fr-knn' averages the Q values of the close centers uniformly;
            'kernel-reg' weights them by Gaussian_kernel(distance / h),
            normalized over the close centers.

    Returns:
        The estimated value; 0. when no center is close to obs.

    Raises:
        ValueError: if mode is neither 'fr-knn' nor 'kernel-reg'.
    """
    # The original tested `model == 'fr-knn' or 'kernel-reg'` (undefined name,
    # always-truthy expression) and used `assert True`, so an unsupported mode
    # was never rejected. Validate explicitly instead.
    if mode not in ('fr-knn', 'kernel-reg'):
        raise ValueError('mode = {} not implemented'.format(mode))

    obs_key = tuple(obs)
    if obs_key not in global_close_neighbors_tracker:
        global_close_neighbors_tracker[obs_key] = [
            center for center in h_net if norm(center - obs) < h
        ]
    close_centers = global_close_neighbors_tracker[obs_key]
    num_close_centers = len(close_centers)

    rst = 0.
    if mode == 'fr-knn':
        for center in close_centers:
            center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
            rst += Q[center_ac_pair] / float(num_close_centers)
    else:  # 'kernel-reg'
        kernel_values = [Gaussian_kernel(norm(center - obs) / h) for center in close_centers]
        denom = sum(kernel_values, 0.)
        for center, kernel_value in zip(close_centers, kernel_values):
            center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
            rst += (kernel_value / denom) * Q[center_ac_pair]
    return rst

def NNR_best_ac(obs, Q, global_close_neighbors_tracker, mode = 'fr-knn', return_ac = False):
    """Maximum over actions of the nearest-neighbor Q estimate at `obs`.

    Args:
        obs: observation array; tuple(obs) is used as the cache key.
        Q: dict keyed by (center coordinates..., float(action)) tuples.
        global_close_neighbors_tracker: dict caching, per tuple(obs), the list
            of centers of `h_net` whose `norm` distance to obs is below `h`.
            A missing entry is computed here and stored.
        mode: 'fr-knn' averages the Q values of the close centers uniformly;
            'kernel-reg' weights them by Gaussian_kernel(distance / h),
            normalized over the close centers.
        return_ac: when True, also return the maximizing action.

    Returns:
        max_rst, or (max_rst, best_ac) when return_ac is True. Ties keep the
        lowest action index. With no close centers every action scores 0.

    Raises:
        ValueError: if mode is neither 'fr-knn' nor 'kernel-reg'.
    """
    # `mode == 'fr-knn' or 'kernel-reg'` was always truthy and the fallback
    # `assert True` could never fail, so an unsupported mode silently produced
    # a value of 0. Reject it explicitly.
    if mode not in ('fr-knn', 'kernel-reg'):
        raise ValueError('mode = {} not implemented'.format(mode))

    obs_key = tuple(obs)
    if obs_key not in global_close_neighbors_tracker:
        global_close_neighbors_tracker[obs_key] = [
            center for center in h_net if norm(center - obs) < h
        ]
    close_centers = global_close_neighbors_tracker[obs_key]
    num_close_centers = len(close_centers)

    # Kernel coefficients depend only on the centers, not on the action, so
    # they are computed once instead of once per action.
    coeffs = None
    if mode == 'kernel-reg':
        kernel_values = [Gaussian_kernel(norm(center - obs) / h) for center in close_centers]
        denom = sum(kernel_values, 0.)
        coeffs = [kernel_value / denom for kernel_value in kernel_values]

    max_rst, best_ac = -float('inf'), None
    for ac in np.arange(env.action_space.n):
        rst = 0.
        if mode == 'fr-knn':
            for center in close_centers:
                center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
                rst += Q[center_ac_pair] / float(num_close_centers)
        else:  # 'kernel-reg'
            for center, coeff in zip(close_centers, coeffs):
                center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
                rst += coeff * Q[center_ac_pair]

        if rst > max_rst:
            max_rst = rst
            best_ac = ac

    if return_ac:
        return max_rst, best_ac
    return max_rst

In [11]:
# Training run: collect transitions with the exploration policy, accumulate
# one-step targets per (center, action) pair in future_Q, and refresh Q once
# every pair has been visited since the previous refresh.
exp_dir = './training_kernel_reg_h_15e-2_20_20_modified_rw/'

mode = 'kernel-reg'

# Number of consecutive steps that visit no new (center, action) pair after
# which Q is refreshed even though some pairs are still unvisited. This name
# was used below without ever being defined (NameError on the first step).
# NOTE(review): the intended value is not recoverable from this notebook;
# infinity disables the early refresh, so Q is refreshed only when every pair
# has been visited. Set a finite value to enable it.
num_unchanged_termination = float('inf')

if not os.path.exists(exp_dir):
    os.makedirs(exp_dir)


def _save_Q(file_name):
    """Write the current Q table to exp_dir/file_name as JSON (keys stringified)."""
    Q_str_keys = {str(key): Q[key] for key in Q}
    with open(os.path.join(exp_dir, file_name), 'w') as q_file:
        json.dump(Q_str_keys, q_file)


obs = env.reset()
exploration_agent.reset()
done = False
num_unchanged = 0
# The context manager closes the log even if the loop raises or is interrupted.
with open(os.path.join(exp_dir, "log.txt"), 'w') as log_file:
    while k < K:
        if done:
            obs = env.reset()
            exploration_agent.reset()
            done = False
        ac = exploration_agent.get_ac(obs)
        next_obs, reward, done, _ = env.step(ac)

        # The environment's reward is replaced by a shaped reward computed
        # from the current observation.
        reward = ((obs[0] + 0.5)) ** 2 + obs[1] ** 2 * 100.

        obs_key = tuple(obs)
        if obs_key not in global_close_neighbors_tracker:
            global_close_neighbors_tracker[obs_key] = [
                center for center in h_net if norm(center - obs) < h
            ]
        close_centers = global_close_neighbors_tracker[obs_key]

        # No center lies within h of the current state: abandon this episode
        # (the step is not counted) and start over from a reset.
        if not close_centers:
            done = True
            continue

        # One-step target. It does not depend on the center being updated, so
        # it is evaluated once per step rather than once per close center.
        target = reward + gamma * NNR_best_ac(next_obs, Q, global_close_neighbors_tracker, mode = mode)

        unchanged = True
        # Update every center whose h-ball contains the current state.
        for center in close_centers:
            center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
            assert center_ac_pair in cur_N
            count = cur_N[center_ac_pair]
            if count > 0:
                # Running average over the visits since the last Q refresh.
                eta = 1. / float(count + 1.)
                future_Q[center_ac_pair] = (1 - eta) * future_Q[center_ac_pair] + eta * target
            else:
                num_unfilled -= 1
                unchanged = False
                future_Q[center_ac_pair] = target
            cur_N[center_ac_pair] += 1

        if unchanged:
            num_unchanged += 1
        else:
            num_unchanged = 0

        if t % 1e4 == 0:
            print (t, k, num_unfilled)
            if t % 1e5 == 0:
                _save_Q('t{}_Q_values.txt'.format(t))

        if num_unfilled == 0 or num_unchanged >= num_unchanged_termination:
            # Each entry depends only on its own previous value, so the table
            # can be refreshed in place without copying it first.
            for state_ac_pair in Z_h:
                key = tuple(state_ac_pair)
                Q[key] = (1. - alpha) * Q[key] + alpha * future_Q.get(key, 0.)
                cur_N[key] = 0
            num_unfilled = len(cur_N)

            _save_Q('k{}_Q_values.txt'.format(k))

            log_file.write("k: {} t: {} \n".format(k, t))
            k += 1
            alpha = beta / float(beta + k)

        obs = next_obs
        t += 1

NameError: name 'exploration_agent' is not defined