In [6]:
import numpy as np
import gym
from matplotlib import pyplot as plt

In [7]:
from tiles3 import tiles, IHT  # This is the package for constructing the feature tilings.

In [8]:
def action_selector(actions, values, epsilon):
    """Pick an action epsilon-greedily.

    Parameters
    ----------
    actions : sequence
        Candidate actions; ``actions[i]`` is the action whose estimated
        value is ``values[i]``.
    values : sequence of float
        Estimated value of each action, same length and order as ``actions``.
    epsilon : float
        Probability (in [0, 1]) of NOT acting greedily, i.e. of picking an
        action uniformly at random instead.

    Returns
    -------
    One element of ``actions``.
    """
    greedy = np.random.choice([1, 0], p=[1 - epsilon, epsilon])
    if greedy:
        # argmax over the reversed values so that ties are broken in favour of
        # the LAST action (right rather than left). The offset is derived from
        # len(values) rather than a hard-coded 3 so any action count works.
        n_values = len(values)
        return actions[n_values - np.argmax(values[::-1]) - 1]
    else:
        return np.random.choice(actions)

In [4]:
class mountain_car_td():
    """Linear, tile-coded TD(lambda) control agent for gym's ``MountainCar-v0``.

    Action values are a dot product between a weight vector ``theta`` and a
    binary feature vector produced by tile coding the observation. ``theta``
    holds one block of ``max_tile_size`` weights per action, concatenated.

    The weight/trace update in ``run_sim`` appears to follow the true online
    TD(lambda) update of van Seijen & Sutton (2014) applied to state-action
    values -- NOTE(review): confirm against the reference.
    """

    def __init__(self, n_episodes=100, rewards_list=None):
        """Create the environment, the tile-coding index table and zero weights.

        Parameters
        ----------
        n_episodes : int
            Number of episodes each call to ``run_sim`` trains for.
        rewards_list : list or None
            List that ``run_sim`` appends its per-run results to. ``None``
            (the default) creates a fresh list for this instance.
        """
        self.n_episodes = n_episodes
        self.n_timesteps = 1500  # Has to be this long to let it win by accident a few times, to start learning.
        self.n_tilings = 10
        self.tiling_dim = 10
        # Size of the index hash table, i.e. number of weights per action.
        # NOTE(review): 10 tilings over a 10 x 10 grid may need more than 1024
        # distinct tiles, in which case the table would hash-collide -- confirm.
        self.max_tile_size = 1024
        self.iht = IHT(self.max_tile_size)

        self.env = gym.make('MountainCar-v0')
        self.env._max_episode_steps = self.n_timesteps
        self.n_action = self.env.action_space.n
        self.obs_highs = self.env.observation_space.high
        self.obs_lows = self.env.observation_space.low
        # Per-dimension factor mapping the observation range onto tiling_dim units.
        self.scales = [self.tiling_dim / (self.obs_highs[i] - self.obs_lows[i]) for i in range(len(self.obs_highs))]

        # init weights: basically a full set of weights per action, all concatenated together
        self.theta = np.zeros(self.max_tile_size * self.n_action)

        # A continually growing list with one entry per run_sim call.
        # A fresh list is created per instance: a mutable default argument would
        # be shared by every instance, mixing runs with different episode counts.
        self.rewards_list = [] if rewards_list is None else rewards_list

    # Tiling helper function
    # Because the tiles code only divides at integers, we must scale our observations to tiling_dim x tiling_dim space
    # We also must shift to the section of theta corresponding to the considered action.
    def mytiles(self, pt, a):
        """Return the binary feature vector for a state-action pair.

        Parameters
        ----------
        pt : sequence of float
            Point in observation space.
        a : int
            Action index; selects which block of ``theta`` the features hit.

        Returns
        -------
        numpy.ndarray
            Vector shaped like ``self.theta`` with ones at the active tiles of
            ``pt`` inside the block belonging to ``a`` and zeros elsewhere.
        """
        pt_ = np.array([p * self.scales[i] for i, p in enumerate(list(pt))])
        features = list(np.array(tiles(self.iht, self.n_tilings, pt_)) + a * self.max_tile_size)
        phi = np.zeros_like(self.theta)
        phi[features] = 1
        return phi

    def run_sim(self, alpha0=1, render=False, verbose=False):
        """Train from scratch for ``self.n_episodes`` episodes.

        Parameters
        ----------
        alpha0 : float
            Learning-rate setting; the step size actually used is ``alpha0 / 10``.
        render : bool
            Render the environment at every timestep.
        verbose : bool
            Print the length of every finished episode.

        Returns
        -------
        list of int
            Number of timesteps each episode lasted. The same list is also
            appended to ``self.rewards_list``.
        """
        # Reset all weights, the hash table and the environment, but keep the
        # accumulated rewards_list.
        self.__init__(self.n_episodes, self.rewards_list)

        # params
        epsilon = .2  # exploration probability, decayed every episode
        gamma = .99
        lam = .9
        alpha = alpha0 / 10.
        actions = range(self.n_action)

        # Run the learning loop
        rewards = []

        for _ in range(self.n_episodes):
            epsilon *= .92
            epsilon = max(epsilon, .04)
            observation = self.env.reset()

            # Choose an action epsilon-greedily
            phis = [self.mytiles(observation, a) for a in actions]  # get features for observation
            q_vals = [self.theta.dot(phi) for phi in phis]
            action = action_selector(actions, q_vals, epsilon)
            phi_sa = phis[action]
            q_sa = q_vals[action]

            # initialize e = 0
            e = np.zeros_like(self.theta)

            for t in range(self.n_timesteps):
                if render:
                    self.env.render()

                # Take action, observe change, choose new a, get new features and q_sa_ and do update
                observation, reward, done, _info = self.env.step(action)

                # Choose next action epsilon-greedily
                phis = [self.mytiles(observation, a) for a in actions]
                q_vals = [self.theta.dot(phi) for phi in phis]
                action = action_selector(actions, q_vals, epsilon)
                phi_sa_ = phis[action]
                q_sa_ = q_vals[action]

                # The value of a terminal state is 0 by definition, so the TD
                # target must not bootstrap from the goal state's features.
                # Finishing before the step limit means the goal was reached;
                # an episode cut off at the limit keeps its bootstrap value.
                if done and t + 1 < self.n_timesteps:
                    q_sa_ = 0.0

                # Perform updates. q_sa was computed with the weights from
                # before the previous update, which the last term corrects for.
                d = reward + gamma * q_sa_ - q_sa
                e = gamma * lam * e + alpha * (1 - gamma * lam * e.dot(phi_sa)) * phi_sa
                self.theta = self.theta + d * e + alpha * (q_sa - self.theta.dot(phi_sa)) * phi_sa

                q_sa = q_sa_
                phi_sa = phi_sa_

                if done:
                    rewards.append(t + 1)
                    if verbose:
                        print("Episode finished after {} timesteps".format(t + 1))
                    break
            else:
                # The environment never reported `done`: still record the
                # episode so every run has exactly n_episodes entries.
                rewards.append(self.n_timesteps)

        self.env.close()
        self.rewards_list.append(rewards)
        return rewards

    def run_all_vals(self, n_runs=50, alphas=None):
        """Sweep the learning rate and report the average return per setting.

        Parameters
        ----------
        n_runs : int
            Number of independent ``run_sim`` runs averaged per alpha value.
        alphas : iterable of float or None
            ``alpha0`` values to try. ``None`` uses ``np.linspace(.2, 2, 10)``.

        Returns
        -------
        list of float
            For each alpha, minus the mean episode length over all episodes of
            all ``n_runs`` runs.

        Notes
        -----
        ``self.rewards_list`` is cleared before the sweep and after every alpha.
        """
        if alphas is None:
            alphas = np.linspace(.2, 2, 10)

        # Drop results of earlier run_sim calls so they cannot leak into the
        # average of the first alpha value.
        self.rewards_list = []

        average_rs = []
        for a0 in alphas:
            print(a0)
            for _ in range(n_runs):
                self.run_sim(alpha0=a0)
            # Mean over episodes within each run, then mean over runs.
            r_average = -1 * np.mean([np.mean(run) for run in self.rewards_list])
            average_rs.append(r_average)
            self.rewards_list = []

        return average_rs
                
        
        


In [5]:
# Demo: train one agent for 70 episodes, rendering the environment and
# printing how many timesteps every episode took.
mc = mountain_car_td(n_episodes=70)
rs = mc.run_sim(render=True, verbose=True)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode finished after 1131 timesteps
Episode finished after 713 timesteps
Episode finished after 571 timesteps
Episode finished after 592 timesteps
Episode finished after 525 timesteps
Episode finished after 245 timesteps
Episode finished after 344 timesteps
Episode finished after 203 timesteps
Episode finished after 199 timesteps
Episode finished after 236 timesteps
Episode finished after 137 timesteps
Episode finished after 209 timesteps
Episode finished after 155 timesteps
Episode finished after 212 timesteps
Episode finished after 115 timesteps
Episode finished after 119 timesteps
Episode finished after 207 timesteps
Episode finished after 201 timesteps
Episode finished after 119 timesteps
Episode finished after 142 timesteps
Episode finished after 196 timesteps
Episod

## Plot effect of learning rate on average rewards over first few episodes

In [None]:
# This will take some time to run, with their parameters.

# An explicit fresh rewards_list keeps results from earlier cells (which used a
# different number of episodes) out of this sweep's averages.
mc = mountain_car_td(n_episodes=10, rewards_list=[]) # 20 episodes, they did.  Ours might need more, doesn't learn as fast.
average_rs = mc.run_all_vals(n_runs=5) # 100 runs...  or fewer.

In [None]:
# Plot the average return from the learning-rate sweep (average_rs) against alpha_0.
# The x grid is sized from average_rs so it always has one point per value swept
# by run_all_vals over the range [.2, 2].
alpha_grid = np.linspace(.2, 2, len(average_rs))
plt.plot(alpha_grid, average_rs, color='blue')
plt.scatter(alpha_grid, average_rs, color='blue')
plt.xlabel(r'$\alpha_0$')
plt.ylabel('return')
plt.title('Effect of Learning Rate')
# plt.savefig('mc_alpha_returns')
plt.show()

## Plot rewards per episode, averaged over several independent runs

In [None]:
# Train n_runs independent agents for 30 episodes each; every run appends its
# list of episode lengths to mc.rewards_list.
n_runs = 50
# An explicit fresh rewards_list keeps runs from earlier cells (with a different
# number of episodes) out of the per-episode average computed afterwards.
mc = mountain_car_td(n_episodes=30, rewards_list=[])
for i in range(n_runs):
    _ = mc.run_sim(verbose=True)


In [None]:
# Average the episode lengths across runs (axis 0) for every episode index and
# negate them -- presumably because each timestep yields a reward of -1; verify.
rs = -1 * np.mean(np.array(mc.rewards_list), axis=0)
plt.plot(rs)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Mountain Car Rewards')
plt.savefig('mc_rewards.png')

In [None]:
# Scratch check of the reversed-argmax tie-breaking used in action_selector:
# with all values tied, the index of the LAST action should come out.
values = [0., 0., 0.]
rightmost_best = len(values) - np.argmax(values[::-1]) - 1
rightmost_best

In [None]:
# Scratch: sum of 150 draws from a normal distribution with mean 0 and std 0.1.
sum(np.random.normal(loc=0, scale=.1, size=150))

In [None]:
# Scratch: mean(axis=0) averages down the rows, giving one value per column.
a = np.array([[1, 1],
              [2, 2]])
np.mean(a, axis=0)