In [1]:
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

In [2]:
import gym
import os
import sys
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from q_learning import plot_running_avg, plot_cost_to_go, FeatureTransformer

In [3]:
# so you can test different architectures
class HiddenLayer:
    """One fully-connected layer: ``f(X W + b)``, with the bias term optional.

    Args:
        M1: number of input units (rows of the weight matrix).
        M2: number of output units (columns of the weight matrix).
        f: activation applied to the affine output.
        use_bias: when truthy, a zero-initialised bias of length M2 is created
            and added in ``forward``.
        zeros: when truthy, the weights start at zero; otherwise they are drawn
            from a standard normal and scaled by ``sqrt(2 / M1)``.
    """

    def __init__(self, M1, M2, f=tf.nn.tanh, use_bias= True, zeros= False):
        if not zeros:
            # random normal weights scaled by sqrt(2 / fan_in)
            initial_W = tf.random_normal(shape=(M1, M2)) * np.sqrt(2. / M1, dtype=np.float32)
        else:
            initial_W = np.zeros((M1, M2), dtype=np.float32)
        self.W = tf.Variable(initial_W)

        self.use_bias = use_bias
        if use_bias:
            # self.b only exists when a bias was requested
            self.b = tf.Variable(np.zeros(M2).astype(np.float32))

        self.f = f

    def forward(self, X):
        """Return the layer output tensor for input tensor ``X``."""
        pre_activation = tf.matmul(X, self.W)
        if self.use_bias:
            pre_activation = pre_activation + self.b
        return self.f(pre_activation)

In [4]:
# approximates pi(a | s)
class PolicyModel:
    """Gaussian policy: one network head for the mean, one for the standard deviation.

    Args:
        D: width of the transformed feature vector fed to the network.
        ft: feature transformer; its ``transform`` is applied to raw
            observations before they are fed to the graph.
        hidden_layer_sizes: output width of each hidden layer, in order.
    """

    def __init__(self, D, ft, hidden_layer_sizes= []):
        self.ft = ft

        # hidden stack: consecutive (fan_in, fan_out) pairs starting from D
        widths = [D] + list(hidden_layer_sizes)
        self.hidden_layers = [
            HiddenLayer(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ]
        last_width = widths[-1]

        # mean head: identity activation, no bias, weights start at zero
        self.mean_layer = HiddenLayer(last_width, 1, lambda x: x, use_bias = False, zeros = True)

        # standard-deviation head: softplus activation, no bias, random weights
        self.stdv_layer = HiddenLayer(last_width, 1, tf.nn.softplus, use_bias = False, zeros = False)

        # graph inputs
        self.X = tf.placeholder(tf.float32, shape= (None, D), name= 'X')
        self.actions = tf.placeholder(tf.float32, shape= (None,), name='actions')
        self.advantages = tf.placeholder(tf.float32, shape= (None,), name= 'advantages')

        # push the input through the hidden stack
        hidden = self.X
        for hidden_layer in self.hidden_layers:
            hidden = hidden_layer.forward(hidden)

        # both heads are flattened to 1-D; a small constant is added to the
        # standard deviation as smoothing
        mean = tf.reshape(self.mean_layer.forward(hidden), [-1])
        stdv = tf.reshape(self.stdv_layer.forward(hidden) + 10e-5, [-1])

        norm = tf.contrib.distributions.Normal(mean, stdv)
        # sampled action, clipped to [-1, 1]
        self.predict_op = tf.clip_by_value(norm.sample(), -1, 1)

        # policy-gradient objective plus an entropy bonus weighted by 0.1
        log_probs = norm.log_prob(self.actions)
        objective = self.advantages * log_probs + 0.1*norm.entropy()
        cost = -tf.reduce_sum(objective)
        self.train_op = tf.train.AdamOptimizer(1e-3).minimize(cost)

    def set_session(self, session):
        """Store the session used by ``partial_fit`` and ``predict``."""
        self.session = session

    def partial_fit(self, X, actions, advantages):
        """Run one optimiser step on the given observation(s), action(s) and advantage(s)."""
        features = self.ft.transform(np.atleast_2d(X))
        feed = {
            self.X: features,
            self.actions: np.atleast_1d(actions),
            self.advantages: np.atleast_1d(advantages),
        }
        self.session.run(self.train_op, feed_dict=feed)

    def predict(self, X):
        """Return the result of running ``predict_op`` on the transformed observation(s)."""
        features = self.ft.transform(np.atleast_2d(X))
        return self.session.run(self.predict_op, feed_dict={self.X: features})

    def sample_action(self, X):
        """Return the first element of ``predict(X)``."""
        return self.predict(X)[0]
    

In [5]:
# approximates V(s)
class ValueModel:
    """State-value network trained on a summed squared error.

    Args:
        D: width of the transformed feature vector fed to the network.
        ft: feature transformer; its ``transform`` is applied to raw
            observations before they are fed to the graph.
        hidden_layer_sizes: output width of each hidden layer, in order.
    """

    def __init__(self, D, ft, hidden_layer_sizes= []):
        self.ft = ft
        self.costs = []  # one cost value appended per partial_fit call

        # hidden stack followed by a single linear output unit
        widths = [D] + list(hidden_layer_sizes)
        self.layers = [
            HiddenLayer(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ]
        self.layers.append(HiddenLayer(widths[-1], 1, lambda x: x))

        # graph inputs
        self.X = tf.placeholder(tf.float32, shape= (None, D), name= 'X')
        self.Y = tf.placeholder(tf.float32, shape= (None, ), name= 'Y')

        # forward pass, flattened to 1-D
        activations = self.X
        for current_layer in self.layers:
            activations = current_layer.forward(activations)
        self.predict_op = tf.reshape(activations, [-1])

        # summed squared error against the targets
        self.cost = tf.reduce_sum(tf.square(self.Y - self.predict_op))
        self.train_op = tf.train.AdamOptimizer(1e-1).minimize(self.cost)

    def set_session(self, session):
        """Store the session used by ``partial_fit`` and ``predict``."""
        self.session = session

    def partial_fit(self, X, Y):
        """Run one optimiser step, then evaluate the cost on the same batch and record it."""
        feed = {
            self.X: self.ft.transform(np.atleast_2d(X)),
            self.Y: np.atleast_1d(Y),
        }
        self.session.run(self.train_op, feed_dict=feed)
        # the cost is evaluated in a second run, i.e. after the update
        self.costs.append(self.session.run(self.cost, feed_dict=feed))

    def predict(self, X):
        """Return the result of running ``predict_op`` on the transformed observation(s)."""
        features = self.ft.transform(np.atleast_2d(X))
        return self.session.run(self.predict_op, feed_dict= {self.X: features})
        

In [6]:
def play_one_td(env, pmodel, vmodel, gamma):
    """Play one episode, updating both models after every step with a one-step TD target.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(observation, reward, done, info)``.
        pmodel: policy exposing ``sample_action(observation)`` and
            ``partial_fit(observation, action, advantage)``.
        vmodel: value function exposing ``predict(observation)`` and
            ``partial_fit(observation, target)``.
        gamma: discount factor applied to the next state's value.

    Returns:
        ``(totalreward, iters)``: the undiscounted sum of rewards and the
        number of steps taken.
    """
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0

    while not done and iters < 2000:
        # if we reach 2000, just quit, don't want this going forever
        # the 200 limit seems a bit early
        action = pmodel.sample_action(observation)
        prev_observation = observation
        observation, reward, done, info = env.step([action])

        totalreward += reward

        # A terminal state has no future return, so the bootstrap term must be
        # dropped there. An episode that was merely cut off by a step limit
        # (reported by gym's TimeLimit wrapper through this info key) is not
        # terminal, so it keeps the bootstrap.
        truncated = isinstance(info, dict) and bool(info.get('TimeLimit.truncated', False))
        terminal = done and not truncated

        # update the models
        V_next = vmodel.predict(observation)
        if terminal:
            # scale to zero rather than replace, so the target keeps the same
            # shape as on non-terminal steps
            V_next = 0.0 * V_next
        G = reward + gamma * V_next
        advantage = G - vmodel.predict(prev_observation)
        pmodel.partial_fit(prev_observation, action, advantage)
        vmodel.partial_fit(prev_observation, G)

        iters += 1

    return totalreward, iters

In [7]:
def main():
    """Train the policy and value models on MountainCarContinuous-v0 and plot the results.

    Builds the environment, feature transformer and both models, runs N
    episodes of ``play_one_td`` while printing per-episode statistics, then
    plots the raw rewards, their running average and the cost-to-go. The
    TensorFlow session is closed on exit, including when an error occurs.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components= 100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft, [])
    vmodel = ValueModel(D, ft, [])
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    try:
        session.run(init)
        pmodel.set_session(session)
        vmodel.set_session(session)
        gamma = 0.95

        if 'monitor' in sys.argv:
            # NOTE(review): __file__ is normally undefined inside a notebook
            # kernel, so this branch presumably only works when the code is
            # run as a script - confirm before relying on it here.
            filename = os.path.basename(__file__).split('.')[0]
            monitor_dir = './' + filename + '_' + str(datetime.now())
            env = wrappers.Monitor(env, monitor_dir)

        N = 50
        window = 100  # number of most recent episodes in the running average
        totalrewards = np.empty(N)
        for n in range(N):
            totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)
            totalrewards[n] = totalreward
            # exactly `window` episodes ending at n (fewer while n < window - 1)
            recent = totalrewards[max(0, n - window + 1):(n + 1)]
            print("episode:", n, "total reward: %.1f" % totalreward, "num steps: %d" % num_steps,
                  "avg reward (last 100): %.1f" % recent.mean())

        print("avg reward for last 100 episodes:", totalrewards[-window:].mean())

        plt.plot(totalrewards)
        plt.title("Rewards")
        plt.show()

        plot_running_avg(totalrewards)
        plot_cost_to_go(env, vmodel)
    finally:
        # release the session's resources even if training or plotting fails
        session.close()

In [None]:
# Entry point: run the training loop only when this code executes as the main
# module, not when it is imported.
if __name__ == '__main__':
    main()

episode: 0 total reward: 59.7 num steps: 586 avg reward (last 100): 59.7
episode: 1 total reward: -80.4 num steps: 999 avg reward (last 100): -10.3
episode: 2 total reward: 42.2 num steps: 746 avg reward (last 100): 7.2
episode: 3 total reward: 21.9 num steps: 944 avg reward (last 100): 10.9
episode: 4 total reward: 63.6 num steps: 430 avg reward (last 100): 21.4
episode: 5 total reward: 57.5 num steps: 475 avg reward (last 100): 27.4
episode: 6 total reward: 60.6 num steps: 443 avg reward (last 100): 32.2
episode: 7 total reward: 52.0 num steps: 544 avg reward (last 100): 34.6
episode: 8 total reward: 63.5 num steps: 409 avg reward (last 100): 37.9
episode: 9 total reward: 80.2 num steps: 231 avg reward (last 100): 42.1
episode: 10 total reward: 77.4 num steps: 262 avg reward (last 100): 45.3
episode: 11 total reward: 37.8 num steps: 707 avg reward (last 100): 44.7
episode: 12 total reward: 65.5 num steps: 394 avg reward (last 100): 46.3
episode: 13 total reward: 62.9 num steps: 442 a