In [15]:
import gym
env = gym.make('MountainCar-v0')
print(env.reset())
print(env.step(env.action_space.sample()))
# print(env.observation_space.low)
env.close()

[-0.44971572  0.        ]
(array([-0.45126531, -0.0015496 ]), -1.0, False, {})


In [7]:
# Tiling
import random

def create_tiling_grid(low, high, bins=(10, 10), offsets=(0.0, 0.0)):
    assert len(low) == len(high) == len(bins) == len(offsets)
    
    dims = len(low)
    
    split_points = []
    for i in range(dims):
        step = (high[i] - low[i])/bins[i]
        
        split_points.append(np.linspace(low[i]+offsets[i]+step, high[i]+offsets[i], bins[i]-1, False))
                            
    return np.array(split_points)

def tile_encode(sample, high, low, tiling_specs, flatten=False):
    tilings = np.array([create_tiling_grid(low, high, tiling_specs[i][0], tiling_specs[i][1]) for i in range(len(tiling_specs))])
    encoded_sample = [[int(np.digitize(s, g)) for s, g in zip(sample, grid)] for grid in tilings]
    features = []
    for s, spec in zip(encoded_sample, tiling_specs):
        f = np.zeros(spec[0])
        f[tuple(s)] = 1
        features.append(f.flatten())
        
#     print(features)
    return np.concatenate(features)
#     one_hot = [[1 if i == index else 0 for index, bins in zip(sample, spec[0]) for i in range(bins)] for sample, spec in zip(encoded_sample, tiling_specs)]    
#     return np.concatenate(one_hot) if flatten else one_hot

def action_value_delta(state, action, weights, high, low, tiling_specs):
    state_action = (state[0], state[1], action)
    return tile_encode(state_action, high, low, tiling_specs, True)

def action_value_approx(state, action, weights, high, low, tiling_specs):
    # State = (float, float)
    # Action = 0 <= int <= 2
    state_action = (state[0], state[1], action)
    x = tile_encode(state_action, high, low, tiling_specs, True)
    return x * weights

def epsilon_greedy(state, actions, Q, weights, high, low, tiling_specs, epsilon):
    if random.random() < epsilon:
        return random.choice(actions)
    else :
        action_values = [(action, sum(Q(state, action, weights, high, low, tiling_specs))) for action in actions]
        max_action_value = None
        max_actions = []
        for a, v in action_values:
            if max_action_value == None or v > max_action_value:
                max_action_value = v
                max_actions = [a]
            elif v == max_action_value:
                max_actions.append(a)
#         print(action_values)
        return random.choice(max_actions)

# Test with some sample values
# samples = [(-0.2 , 0.067, 1)]
# TILINGS = 8
# tiling_specs = [((TILINGS, TILINGS, 3), (-0.15, -0.015, 0)),
#             ((TILINGS, TILINGS, 3), (0.0, 0.0, 0)),
#             ((TILINGS, TILINGS, 3), (0.15, 0.015, 0))]
# low = [-1.2,  -0.07, 0]
# high = [0.6, 0.07, 2]
# encoded_samples = [tile_encode(sample, high, low, tiling_specs, True) for sample in samples]
# print("\nSamples:", repr(samples), sep="\n")
# print("\nEncoded samples:", repr(encoded_samples), sep="\n")

In [32]:
import numpy as np
import gym
from operator import mul
from tqdm import tqdm

env = gym.make('MountainCar-v0')
env._max_episode_steps = 2000

EPISODES = 1000
NUM_LOGS = 10

Q = action_value_approx
DELTA_Q = action_value_delta
POLICY = epsilon_greedy

ALPHA = 0.5 / 4
EPSILON = 0
GAMMA = 1
ACTIONS = [0, 1, 2]

TILES = (8, 8, 3)
TILINGS = 8
MIN_TILE_OFFSET = (-0.15, -0.015, 0)
MAX_TILE_OFFSET = (0.15, 0.015, 0)
LOW = (-1.2, -0.07, 0)
HIGH = (1.2, 0.07, 2)

w = np.zeros(np.prod(TILES) * TILINGS)
print(w.shape)
tiling_specs = [(TILES, tuple(min_off + (max_off - min_off)*i/(TILINGS-1 if TILINGS > 1 else 1)
                                          for min_off, max_off in zip(MIN_TILE_OFFSET, MAX_TILE_OFFSET))) 
                for i in range(TILINGS)]

total_reward = 0


for e in tqdm(range(EPISODES)):
    s = env.reset()
    a = POLICY(s, ACTIONS, Q, w, HIGH, LOW, tiling_specs, EPSILON)
    done = False
    episode_reward = 0
    step = 0
    while not done:
#         env.render()
        s_prime, r, done, info = env.step(a)
#         done = False
        if done:
            w = w + ALPHA * (r - Q(s, a, w, HIGH, LOW, tiling_specs)) * DELTA_Q(s, a, w, HIGH, LOW, tiling_specs)
        else:
            a_prime = POLICY(s_prime, ACTIONS, Q, w, HIGH, LOW, tiling_specs, EPSILON)
            w = w + ALPHA * (r + GAMMA * sum(Q(s_prime, a_prime, w, HIGH, LOW, tiling_specs)) - sum(Q(s, a, w, HIGH, LOW, tiling_specs))) * DELTA_Q(s, a, w, HIGH, LOW, tiling_specs)
            s = s_prime
            a = a_prime
        
        episode_reward += r
        total_reward += r
        step += 1

        if r != -1:
            print(r)
            print(step)
        
    if e % (EPISODES // NUM_LOGS) == 0:
        print("Episode {} with reward {}. Average reward: {}".format(e, episode_reward, total_reward / (EPISODES // NUM_LOGS)))
        total_reward = 0




  0%|          | 0/1000 [00:00<?, ?it/s][A[A

(1536,)




  0%|          | 1/1000 [00:02<44:45,  2.69s/it][A[A

Episode 0 with reward -822.0. Average reward: -8.22




  0%|          | 2/1000 [00:05<42:54,  2.58s/it][A[A

  0%|          | 3/1000 [00:06<37:01,  2.23s/it][A[A

  0%|          | 4/1000 [00:08<34:08,  2.06s/it][A[A

  0%|          | 5/1000 [00:08<27:06,  1.63s/it][A[A

  1%|          | 6/1000 [00:10<25:39,  1.55s/it][A[A

  1%|          | 7/1000 [00:10<22:24,  1.35s/it][A[A

  1%|          | 8/1000 [00:12<21:22,  1.29s/it][A[A

  1%|          | 9/1000 [00:13<20:25,  1.24s/it][A[A

  1%|          | 10/1000 [00:14<19:06,  1.16s/it][A[A

  1%|          | 11/1000 [00:15<20:12,  1.23s/it][A[A

  1%|          | 12/1000 [00:16<18:22,  1.12s/it][A[A

  1%|▏         | 13/1000 [00:16<15:24,  1.07it/s][A[A

  1%|▏         | 14/1000 [00:17<13:17,  1.24it/s][A[A

  2%|▏         | 15/1000 [00:18<14:31,  1.13it/s][A[A

  2%|▏         | 16/1000 [00:19<13:02,  1.26it/s][A[A

  2%|▏         | 17/1000 [00:19<13:11,  1.24it/s][A[A

  2%|▏         | 18/1000 [00:20<13:05,  1.25it/s][A[A

  2%|▏         | 19/1000 [00:21<12:56

Episode 100 with reward -114.0. Average reward: -205.64




 10%|█         | 102/1000 [01:12<06:19,  2.36it/s][A[A

 10%|█         | 103/1000 [01:13<06:21,  2.35it/s][A[A

 10%|█         | 104/1000 [01:13<06:10,  2.42it/s][A[A

 10%|█         | 105/1000 [01:14<06:20,  2.35it/s][A[A

 11%|█         | 106/1000 [01:14<06:18,  2.36it/s][A[A

 11%|█         | 107/1000 [01:18<21:13,  1.43s/it][A[A

 11%|█         | 108/1000 [01:18<16:36,  1.12s/it][A[A

 11%|█         | 109/1000 [01:19<13:17,  1.12it/s][A[A

 11%|█         | 110/1000 [01:19<10:58,  1.35it/s][A[A

 11%|█         | 111/1000 [01:19<09:33,  1.55it/s][A[A

 11%|█         | 112/1000 [01:20<09:29,  1.56it/s][A[A

 11%|█▏        | 113/1000 [01:21<09:47,  1.51it/s][A[A

 11%|█▏        | 114/1000 [01:21<09:40,  1.53it/s][A[A

 12%|█▏        | 115/1000 [01:22<09:34,  1.54it/s][A[A

 12%|█▏        | 116/1000 [01:23<09:18,  1.58it/s][A[A

 12%|█▏        | 117/1000 [01:23<09:09,  1.61it/s][A[A

 12%|█▏        | 118/1000 [01:24<09:06,  1.61it/s][A[A

 12%|█▏     

Episode 200 with reward -165.0. Average reward: -175.3




 20%|██        | 202/1000 [02:11<07:02,  1.89it/s][A[A

 20%|██        | 203/1000 [02:12<07:14,  1.83it/s][A[A

 20%|██        | 204/1000 [02:12<07:34,  1.75it/s][A[A

 20%|██        | 205/1000 [02:13<07:20,  1.80it/s][A[A

 21%|██        | 206/1000 [02:14<07:19,  1.81it/s][A[A

 21%|██        | 207/1000 [02:14<07:15,  1.82it/s][A[A

 21%|██        | 208/1000 [02:15<07:11,  1.83it/s][A[A

 21%|██        | 209/1000 [02:15<07:13,  1.83it/s][A[A

 21%|██        | 210/1000 [02:16<07:12,  1.83it/s][A[A

 21%|██        | 211/1000 [02:16<07:11,  1.83it/s][A[A

 21%|██        | 212/1000 [02:17<07:28,  1.76it/s][A[A

 21%|██▏       | 213/1000 [02:17<07:22,  1.78it/s][A[A

 21%|██▏       | 214/1000 [02:18<07:32,  1.74it/s][A[A

 22%|██▏       | 215/1000 [02:18<06:43,  1.94it/s][A[A

 22%|██▏       | 216/1000 [02:19<06:41,  1.95it/s][A[A

 22%|██▏       | 217/1000 [02:19<06:08,  2.13it/s][A[A

 22%|██▏       | 218/1000 [02:20<06:25,  2.03it/s][A[A

 22%|██▏    

Episode 300 with reward -113.0. Average reward: -140.79




 30%|███       | 302/1000 [02:58<05:21,  2.17it/s][A[A

 30%|███       | 303/1000 [02:59<05:35,  2.08it/s][A[A

 30%|███       | 304/1000 [02:59<05:41,  2.04it/s][A[A

 30%|███       | 305/1000 [03:00<05:09,  2.25it/s][A[A

 31%|███       | 306/1000 [03:00<04:40,  2.47it/s][A[A

 31%|███       | 307/1000 [03:00<04:29,  2.57it/s][A[A

 31%|███       | 308/1000 [03:01<04:19,  2.67it/s][A[A

 31%|███       | 309/1000 [03:01<04:13,  2.73it/s][A[A

 31%|███       | 310/1000 [03:02<04:09,  2.76it/s][A[A

 31%|███       | 311/1000 [03:02<04:02,  2.84it/s][A[A

 31%|███       | 312/1000 [03:02<04:01,  2.85it/s][A[A

 31%|███▏      | 313/1000 [03:03<04:00,  2.86it/s][A[A

 31%|███▏      | 314/1000 [03:03<03:59,  2.87it/s][A[A

 32%|███▏      | 315/1000 [03:03<03:47,  3.01it/s][A[A

 32%|███▏      | 316/1000 [03:04<03:51,  2.95it/s][A[A

 32%|███▏      | 317/1000 [03:04<03:59,  2.85it/s][A[A

 32%|███▏      | 318/1000 [03:04<04:34,  2.48it/s][A[A

 32%|███▏   

Episode 400 with reward -141.0. Average reward: -144.1




 40%|████      | 402/1000 [03:47<05:10,  1.93it/s][A[A

 40%|████      | 403/1000 [03:47<05:02,  1.97it/s][A[A

 40%|████      | 404/1000 [03:48<05:11,  1.92it/s][A[A

 40%|████      | 405/1000 [03:48<05:25,  1.83it/s][A[A

 41%|████      | 406/1000 [03:49<04:56,  2.00it/s][A[A

 41%|████      | 407/1000 [03:49<04:59,  1.98it/s][A[A

 41%|████      | 408/1000 [03:50<05:13,  1.89it/s][A[A

 41%|████      | 409/1000 [03:50<05:13,  1.89it/s][A[A

 41%|████      | 410/1000 [03:51<05:02,  1.95it/s][A[A

 41%|████      | 411/1000 [03:51<05:01,  1.95it/s][A[A

 41%|████      | 412/1000 [03:52<04:36,  2.13it/s][A[A

 41%|████▏     | 413/1000 [03:52<04:20,  2.26it/s][A[A

 41%|████▏     | 414/1000 [03:52<04:26,  2.20it/s][A[A

 42%|████▏     | 415/1000 [03:53<04:13,  2.30it/s][A[A

 42%|████▏     | 416/1000 [03:53<04:11,  2.32it/s][A[A

 42%|████▏     | 417/1000 [03:54<04:06,  2.37it/s][A[A

 42%|████▏     | 418/1000 [03:54<04:35,  2.12it/s][A[A

 42%|████▏  

Episode 500 with reward -154.0. Average reward: -133.36




 50%|█████     | 502/1000 [04:32<04:13,  1.96it/s][A[A

 50%|█████     | 503/1000 [04:32<04:07,  2.01it/s][A[A

 50%|█████     | 504/1000 [04:33<04:14,  1.95it/s][A[A

 50%|█████     | 505/1000 [04:33<04:43,  1.75it/s][A[A

 51%|█████     | 506/1000 [04:34<04:38,  1.77it/s][A[A

 51%|█████     | 507/1000 [04:34<04:31,  1.82it/s][A[A

 51%|█████     | 508/1000 [04:35<04:33,  1.80it/s][A[A

 51%|█████     | 509/1000 [04:36<05:45,  1.42it/s][A[A

 51%|█████     | 510/1000 [04:37<06:04,  1.35it/s][A[A

 51%|█████     | 511/1000 [04:37<05:26,  1.50it/s][A[A

 51%|█████     | 512/1000 [04:38<05:03,  1.61it/s][A[A

 51%|█████▏    | 513/1000 [04:38<04:46,  1.70it/s][A[A

 51%|█████▏    | 514/1000 [04:39<04:20,  1.87it/s][A[A

 52%|█████▏    | 515/1000 [04:39<04:25,  1.83it/s][A[A

 52%|█████▏    | 516/1000 [04:40<04:24,  1.83it/s][A[A

 52%|█████▏    | 517/1000 [04:40<04:23,  1.83it/s][A[A

 52%|█████▏    | 518/1000 [04:41<04:10,  1.93it/s][A[A

 52%|█████▏ 

Episode 600 with reward -100.0. Average reward: -142.32




 60%|██████    | 602/1000 [05:19<02:22,  2.80it/s][A[A

 60%|██████    | 603/1000 [05:19<02:41,  2.46it/s][A[A

 60%|██████    | 604/1000 [05:20<02:53,  2.28it/s][A[A

 60%|██████    | 605/1000 [05:20<02:42,  2.43it/s][A[A

 61%|██████    | 606/1000 [05:21<02:56,  2.23it/s][A[A

 61%|██████    | 607/1000 [05:21<02:45,  2.38it/s][A[A

 61%|██████    | 608/1000 [05:21<02:55,  2.24it/s][A[A

 61%|██████    | 609/1000 [05:22<02:42,  2.41it/s][A[A

 61%|██████    | 610/1000 [05:22<02:34,  2.53it/s][A[A

 61%|██████    | 611/1000 [05:23<02:41,  2.40it/s][A[A

 61%|██████    | 612/1000 [05:23<02:33,  2.53it/s][A[A

 61%|██████▏   | 613/1000 [05:23<02:49,  2.29it/s][A[A

 61%|██████▏   | 614/1000 [05:24<02:38,  2.44it/s][A[A

 62%|██████▏   | 615/1000 [05:24<02:31,  2.54it/s][A[A

 62%|██████▏   | 616/1000 [05:25<02:44,  2.33it/s][A[A

 62%|██████▏   | 617/1000 [05:25<02:33,  2.50it/s][A[A

 62%|██████▏   | 618/1000 [05:26<02:46,  2.29it/s][A[A

 62%|██████▏

Episode 700 with reward -134.0. Average reward: -136.83




 70%|███████   | 702/1000 [06:04<02:25,  2.05it/s][A[A

 70%|███████   | 703/1000 [06:05<02:22,  2.09it/s][A[A

 70%|███████   | 704/1000 [06:05<02:30,  1.97it/s][A[A

 70%|███████   | 705/1000 [06:06<02:28,  1.98it/s][A[A

 71%|███████   | 706/1000 [06:06<02:21,  2.07it/s][A[A

 71%|███████   | 707/1000 [06:07<02:20,  2.08it/s][A[A

 71%|███████   | 708/1000 [06:07<02:25,  2.01it/s][A[A

 71%|███████   | 709/1000 [06:07<02:15,  2.15it/s][A[A

 71%|███████   | 710/1000 [06:08<02:06,  2.29it/s][A[A

 71%|███████   | 711/1000 [06:08<02:01,  2.38it/s][A[A

 71%|███████   | 712/1000 [06:09<01:59,  2.41it/s][A[A

 71%|███████▏  | 713/1000 [06:09<01:53,  2.52it/s][A[A

 71%|███████▏  | 714/1000 [06:09<01:50,  2.59it/s][A[A

 72%|███████▏  | 715/1000 [06:10<01:46,  2.69it/s][A[A

 72%|███████▏  | 716/1000 [06:10<01:44,  2.71it/s][A[A

 72%|███████▏  | 717/1000 [06:10<01:42,  2.77it/s][A[A

 72%|███████▏  | 718/1000 [06:11<01:39,  2.83it/s][A[A

 72%|███████

Episode 800 with reward -160.0. Average reward: -140.98




 80%|████████  | 802/1000 [06:52<01:48,  1.83it/s][A[A

 80%|████████  | 803/1000 [06:52<01:43,  1.90it/s][A[A

 80%|████████  | 804/1000 [06:53<01:41,  1.94it/s][A[A

 80%|████████  | 805/1000 [06:53<01:44,  1.86it/s][A[A

 81%|████████  | 806/1000 [06:54<01:46,  1.81it/s][A[A

 81%|████████  | 807/1000 [06:54<01:42,  1.88it/s][A[A

 81%|████████  | 808/1000 [06:55<01:43,  1.86it/s][A[A

 81%|████████  | 809/1000 [06:56<01:41,  1.88it/s][A[A

 81%|████████  | 810/1000 [06:56<01:38,  1.94it/s][A[A

 81%|████████  | 811/1000 [06:56<01:28,  2.13it/s][A[A

 81%|████████  | 812/1000 [06:57<01:28,  2.12it/s][A[A

 81%|████████▏ | 813/1000 [06:57<01:32,  2.03it/s][A[A

 81%|████████▏ | 814/1000 [06:58<01:27,  2.13it/s][A[A

 82%|████████▏ | 815/1000 [06:58<01:20,  2.29it/s][A[A

 82%|████████▏ | 816/1000 [06:58<01:15,  2.42it/s][A[A

 82%|████████▏ | 817/1000 [06:59<01:07,  2.70it/s][A[A

 82%|████████▏ | 818/1000 [06:59<01:06,  2.75it/s][A[A

 82%|███████

Episode 900 with reward -137.0. Average reward: -125.88




 90%|█████████ | 902/1000 [07:35<00:45,  2.16it/s][A[A

 90%|█████████ | 903/1000 [07:36<00:48,  2.00it/s][A[A

 90%|█████████ | 904/1000 [07:36<00:50,  1.92it/s][A[A

 90%|█████████ | 905/1000 [07:37<00:49,  1.92it/s][A[A

 91%|█████████ | 906/1000 [07:37<00:49,  1.91it/s][A[A

 91%|█████████ | 907/1000 [07:38<00:43,  2.12it/s][A[A

 91%|█████████ | 908/1000 [07:38<00:40,  2.25it/s][A[A

 91%|█████████ | 909/1000 [07:39<00:41,  2.19it/s][A[A

 91%|█████████ | 910/1000 [07:39<00:39,  2.31it/s][A[A

 91%|█████████ | 911/1000 [07:40<00:41,  2.14it/s][A[A

 91%|█████████ | 912/1000 [07:40<00:38,  2.29it/s][A[A

 91%|█████████▏| 913/1000 [07:40<00:36,  2.41it/s][A[A

 91%|█████████▏| 914/1000 [07:41<00:39,  2.18it/s][A[A

 92%|█████████▏| 915/1000 [07:41<00:40,  2.11it/s][A[A

 92%|█████████▏| 916/1000 [07:42<00:36,  2.27it/s][A[A

 92%|█████████▏| 917/1000 [07:42<00:37,  2.21it/s][A[A

 92%|█████████▏| 918/1000 [07:43<00:34,  2.35it/s][A[A

 92%|███████

In [35]:
val_file_name = "grad-sarsa-out/mc"
import pickle
with open("{}-weights.pickle".format(val_file_name), 'wb') as handle:
    pickle.dump(w, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [38]:
sum(w_save)

-8848.486284904206

In [9]:
from math import floor

def hash_coords(coordinates, m, read_only=False):
    if isinstance(m, int): return hash(tuple(coordinates)) % m
    if m is None: return coordinates

def tiles(iht_or_size, num_tilings, floats, ints=None, read_only=False):
    """returns num-tilings tile indices corresponding to the floats and ints"""
    if ints is None:
        ints = []
    qfloats = [floor(f * num_tilings) for f in floats]
    tiles = []
    for tiling in range(num_tilings):
        tilingX2 = tiling * 2
        coords = [tiling]
        b = tiling
        for q in qfloats:
            coords.append((q + b) // num_tilings)
            b += tilingX2
        coords.extend(ints)
        tiles.append(hash_coords(coords, iht_or_size, read_only))
    return tiles

tiles(10, 2, [1.2, 0.02], [1])

[1, 9]