In [1]:
import os
import gym
import json
import copy
import time
import numpy as np 
import itertools

In [2]:
# Create the MountainCar environment and read the bounds of its observation space.
env = gym.make("MountainCar-v0")
low = env.observation_space.low
high = env.observation_space.high
# Display the lower bounds, upper bounds and the observation dimensionality.
low, high, env.observation_space.shape[0]

(array([-1.2 , -0.07], dtype=float32), array([0.6 , 0.07], dtype=float32), 2)

In [3]:
# Display the environment's action space (the recorded output is Discrete(3)).
env.action_space

Discrete(3)

In [4]:
# Sample count; not referenced by any of the cells visible in this notebook excerpt.
sample_size = int(5e3)
# Dimensionality of an observation; norm() asserts that its input has this length.
sample_dim = int(env.observation_space.shape[0])


In [5]:
# h-net parameters: the net starts as an N_h0 x N_h1 grid of candidate centers
# spanning the observation space.
N_h0, N_h1 = 20, 20
N_h = N_h0 * N_h1

# Grid coordinates along each observation dimension. The expressions are kept
# exactly as they were so the resulting center coordinates do not change
# (the Q-tables loaded later are keyed by these coordinates).
# NOTE(review): the 1e-3 padding of stop/step is presumably there so that
# np.arange includes the upper bound -- confirm before changing it.
cluster_centers0 = np.arange(low[0], high[0] + 1e-3, (high[0] - low[0] - 1e-3) / float(N_h0 - 1))
cluster_centers1 = np.arange(low[1], high[1] + 1e-3, (high[1] - low[1] - 1e-3) / float(N_h1 - 1))
h_net = np.array(list(itertools.product(cluster_centers0, cluster_centers1)))

# Drop the centers listed in the CSV file from the net.
# (Alternative list for a 40 x 50 grid: 'centers2delete_40_50_new.csv'.)
# reshape(-1, dim) keeps a single-row (or empty) file two-dimensional so the loop
# below always iterates over whole centers rather than over scalars.
centers_to_delete = np.loadtxt('centers2delete_20_20_new.csv', delimiter=',').reshape(-1, h_net.shape[1])

keep_mask = np.ones(len(h_net), dtype=bool)
unmatched_centers = []
for center in centers_to_delete:
    # Exact coordinate comparison against the rows that are still present.
    matches = np.flatnonzero(keep_mask & np.all(h_net == center, axis=1))
    if matches.size == 0:
        # Previously a miss fell through the search loop and silently deleted
        # the LAST row of h_net; now the center is skipped and reported.
        unmatched_centers.append(center)
        continue
    keep_mask[matches[0]] = False

if unmatched_centers:
    print("WARNING: {} center(s) to delete were not found in h_net and were skipped".format(len(unmatched_centers)))

h_net = h_net[keep_mask]
h_net.shape

(288, 2)

In [6]:
# Ratio between the extent of observation dimension 0 and that of dimension 1;
# used by norm() as the default weight of the second coordinate.
scale = (high[0] - low[0]) / (high[1] - low[1])

def norm(x, scale = scale):
    """Return the scaled Euclidean length of a single observation-space vector.

    Args:
        x: array of shape (sample_dim,).
        scale: factor applied to x[1] before squaring; defaults to the
            module-level `scale` as it was when this function was defined.

    Returns:
        sqrt(x[0]**2 + (scale * x[1])**2)

    Raises:
        AssertionError: if x does not have shape (sample_dim,).
    """
    assert x.shape == (sample_dim,), x.shape
    first_term = x[0]**2
    second_term = (scale * x[1])**2
    return np.sqrt(first_term + second_term)

In [7]:
## Nearest Neighbor Regression
def Gaussian_kernel(x):
    """Unnormalised Gaussian kernel: exp(-x^2 / 2).

    Accepts anything np.exp accepts (scalars or arrays) and is applied
    element-wise.
    """
    exponent = -(x ** 2) / 2.
    return np.exp(exponent)

def NNR(obs, ac, Q, global_close_neighbors_tracker, mode = 'fr-knn'):
    """Estimate Q(obs, ac) by nearest-neighbour regression over the h-net centers.

    The centers of the module-level `h_net` whose `norm` distance to `obs` is
    strictly below the module-level bandwidth `h` are the "close" centers.

    Args:
        obs: observation vector (must be accepted by `norm(center - obs)`).
        ac: action; converted with float(ac) when building the Q key.
        Q: mapping keyed by tuples (*center, float(ac)).
        global_close_neighbors_tracker: dict used as a cache from tuple(obs)
            to its list of close centers; it is updated in place. The cache
            key does not include `h`, so do not reuse a tracker across
            different bandwidths.
        mode: 'fr-knn' averages Q over the close centers with equal weights;
            'kernel-reg' weights each center by Gaussian_kernel(distance / h),
            normalised over the close centers.

    Returns:
        The regression estimate; 0. when no center is close to `obs`.

    Raises:
        ValueError: if `mode` is not one of the two supported values.
    """
    # The original check (`x == 'fr-knn' or 'kernel-reg'`) was always truthy and
    # its `assert True` could never fail, so bad modes were never rejected.
    if mode not in ('fr-knn', 'kernel-reg'):
        raise ValueError('mode = {} not implemented'.format(mode))

    # Look up (or compute and cache) the centers within distance h of obs.
    obs_key = tuple(obs)
    if obs_key in global_close_neighbors_tracker:
        close_centers = global_close_neighbors_tracker[obs_key]
    else:
        close_centers = [center for center in h_net if norm(center - obs) < h]
        global_close_neighbors_tracker[obs_key] = close_centers
    num_close_centers = len(close_centers)

    rst = 0.
    if mode == 'fr-knn':
        for center in close_centers:
            center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
            rst += Q[center_ac_pair] / float(num_close_centers)
    else:  # mode == 'kernel-reg'
        # Evaluate the kernel once per center, then normalise by the total.
        kernel_values = [Gaussian_kernel(norm(center - obs) / h) for center in close_centers]
        denom = 0.
        for value in kernel_values:
            denom += value
        for center, value in zip(close_centers, kernel_values):
            center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
            coeff = value / denom
            rst += coeff * Q[center_ac_pair]
    return rst

def NNR_best_ac(obs, Q, global_close_neighbors_tracker, mode = 'fr-knn', return_ac = False):
    """Return the greedy value (and optionally the greedy action) at `obs`.

    For every action in range(env.action_space.n) the Q-value at `obs` is
    estimated by nearest-neighbour regression over the centers of the
    module-level `h_net` that lie strictly within `norm` distance `h` (module
    level) of `obs`; the largest estimate wins. Ties keep the lowest action.

    Args:
        obs: observation vector (must be accepted by `norm(center - obs)`).
        Q: mapping keyed by tuples (*center, float(ac)).
        global_close_neighbors_tracker: dict used as a cache from tuple(obs)
            to its list of close centers; it is updated in place. The cache
            key does not include `h`, so do not reuse a tracker across
            different bandwidths.
        mode: 'fr-knn' (equal weights) or 'kernel-reg' (Gaussian weights on
            distance / h, normalised over the close centers).
        return_ac: when True, return (max_value, best_action) instead of just
            max_value.

    Returns:
        max_value, or (max_value, best_action) if `return_ac` is True. With no
        close centers every estimate is 0., so the result is 0. / action 0.

    Raises:
        ValueError: if `mode` is not one of the two supported values.
    """
    # The original check (`mode == 'fr-knn' or 'kernel-reg'`) was always truthy
    # and its `assert True` could never fail, so unknown modes silently
    # returned (0., 0).
    if mode not in ('fr-knn', 'kernel-reg'):
        raise ValueError('mode = {} not implemented'.format(mode))

    # Look up (or compute and cache) the centers within distance h of obs.
    obs_key = tuple(obs)
    if obs_key in global_close_neighbors_tracker:
        close_centers = global_close_neighbors_tracker[obs_key]
    else:
        close_centers = [center for center in h_net if norm(center - obs) < h]
        global_close_neighbors_tracker[obs_key] = close_centers
    num_close_centers = len(close_centers)

    # Kernel coefficients depend only on (obs, center), not on the action, so
    # they are computed once here instead of once per action.
    coeffs = None
    if mode == 'kernel-reg':
        kernel_values = [Gaussian_kernel(norm(center - obs) / h) for center in close_centers]
        denom = 0.
        for value in kernel_values:
            denom += value
        coeffs = [value / denom for value in kernel_values]

    max_rst, best_ac = -float('inf'), None
    for ac in np.arange(env.action_space.n):
        rst = 0.
        if mode == 'fr-knn':
            for center in close_centers:
                center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
                rst += Q[center_ac_pair] / float(num_close_centers)
        else:  # mode == 'kernel-reg'
            for center, coeff in zip(close_centers, coeffs):
                center_ac_pair = tuple(np.concatenate([center, [float(ac)]]))
                rst += coeff * Q[center_ac_pair]

        if rst > max_rst:
            max_rst = rst
            best_ac = ac

    if return_ac:
        return max_rst, best_ac
    return max_rst

In [8]:
def eval_N_times(mode, h, N = 100):
    """Run N greedy episodes in `env` and summarise the undiscounted returns.

    Each episode starts with env.reset() and, at every step, takes the action
    returned by NNR_best_ac for the current observation until the environment
    reports `done`. The module-level `Q` and `env` are used.

    Args:
        mode: regression mode forwarded to NNR_best_ac.
        h: bandwidth; it is NOT forwarded by this function -- NNR_best_ac is
            called without it, so the bandwidth in effect is whatever
            NNR_best_ac itself uses.
        N: number of episodes to run (previously ignored: the loop was
            hard-coded to 100 episodes).

    Returns:
        (mean, std) of the per-episode sum of rewards.
    """
    # Fresh neighbour cache per evaluation run; shared by all N episodes.
    global_close_neighbors_tracker = {}

    results = []
    for _ in range(N):
        obs = env.reset()
        rewards = 0
        while True:
            _, ac = NNR_best_ac(obs, Q, global_close_neighbors_tracker, return_ac = True, mode = mode)
            obs, reward, done, _ = env.step(ac) # take the greedy action
            rewards += reward
            if done:
                break
        results.append(rewards)
    return np.mean(results), np.std(results)

In [9]:
mode = 'kernel-reg'
# mode = 'fr-knn'
h = 0.15

# Load one saved Q-table. The context manager closes the file as soon as the
# JSON has been parsed (the handle used to be left open).
with open("./training_kernel_reg_h_15e-2_20_20_modified_rw/k1_Q_values.txt") as f:
    Q = json.load(f)
# JSON object keys are strings; turn each one back into the Python object it spells.
# NOTE(review): eval() executes whatever the file contains. If the keys are
# plain tuples of numbers, ast.literal_eval would be a safer drop-in -- confirm
# the key format before switching.
Q = {eval(key): value for key, value in Q.items()}

# eval_N_times reads the module-level Q assigned above.
eval_N_times(mode, h, N = 100)

KeyboardInterrupt: 

In [14]:
mode = 'kernel-reg'
# mode = 'fr-knn'
h = 0.15
N = 100

exp_name = "./training_kernel_reg_h_15e-2_20_20_modified_rw"

# Mean / standard deviation of the episode returns, one entry per checkpoint k.
means = []
stds = []

for k in range(15):
    filename = os.path.join(exp_name, "k{}_Q_values.txt".format(k))
    # Close each checkpoint file right after parsing instead of leaking one
    # open handle per iteration.
    with open(filename) as f:
        Q = json.load(f)
    # JSON object keys are strings; turn each one back into the Python object it
    # spells. The comprehension variable no longer reuses the loop name `k`.
    # NOTE(review): eval() executes whatever the file contains. If the keys are
    # plain tuples of numbers, ast.literal_eval would be a safer drop-in --
    # confirm the key format before switching.
    Q = {eval(key): value for key, value in Q.items()}
    # eval_N_times reads the module-level Q that was just rebound.
    mean, std = eval_N_times(mode, h, N)
    print ("k: {}, mean: {}, std: {}".format(k, mean, std))
    means.append(mean)
    stds.append(std)

k: 0, mean: -200.0, std: 0.0
k: 1, mean: -146.15, std: 32.47318124237292
k: 2, mean: -147.43, std: 34.528323156504435
k: 3, mean: -147.24, std: 32.80796244816188
k: 4, mean: -150.74, std: 27.58572819412241
k: 5, mean: -141.85, std: 33.713610011388575
k: 6, mean: -149.77, std: 34.46617327177475
k: 7, mean: -155.07, std: 32.82019347901532
k: 8, mean: -153.0, std: 37.01945434497921
k: 9, mean: -156.29, std: 32.921207450517365
k: 10, mean: -152.65, std: 34.10054984893939
k: 11, mean: -149.78, std: 32.12587119441277
k: 12, mean: -149.86, std: 35.259897901156776
k: 13, mean: -162.0, std: 25.649171526581515
k: 14, mean: -153.22, std: 30.502321223146282
