In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
import sklearn
import sklearn.preprocessing

# Create environment
env = gym.make("MountainCarContinuous-v0")


In [8]:

# Sample from state space for state normalization
state_space_samples = np.array([env.observation_space.sample() for _ in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)


class PolicyNetwork(nn.Module):
    def __init__(self, input_dims, n_actions):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dims, 40)
        self.fc2 = nn.Linear(40, 40)
        self.mu = nn.Linear(40, n_actions)
        self.sigma = nn.Linear(40, n_actions)

    def forward(self, state):
        x = torch.tanh(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        mu = self.mu(x)
        sigma = torch.nn.functional.softplus(self.sigma(x)) + 1e-5
        return mu, sigma


class ValueNetwork(nn.Module):
    def __init__(self, input_dims):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dims, 400)
        self.fc2 = nn.Linear(400, 400)
        self.V = nn.Linear(400, 1)

    def forward(self, state):
        x = torch.tanh(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        V = self.V(x)
        return V


# Function to normalize states
def scale_state(state):
    scaled = scaler.transform([state])
    return scaled[0]


# Hyperparameters
lr_actor = 0.00002
lr_critic = 0.001
gamma = 0.99
num_episodes = 300
input_dims = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

# Initialize networks and optimizers
policy_net = PolicyNetwork(input_dims, n_actions)
value_net = ValueNetwork(input_dims)
optimizer_actor = optim.Adam(policy_net.parameters(), lr=lr_actor)
optimizer_critic = optim.Adam(value_net.parameters(), lr=lr_critic)

# Training loop
episode_history = []
for episode in range(num_episodes):
    state = env.reset()
    reward_total = 0
    steps = 0
    done = False
    while not done:
        # Sample action according to current policy
        state_tensor = torch.tensor(scale_state(state), dtype=torch.float32)
        mu, sigma = policy_net(state_tensor)
        norm_dist = torch.distributions.Normal(mu, sigma)
        action_tensor = norm_dist.sample()
        action = action_tensor.detach().numpy()

        # Execute action and observe reward & next state
        next_state, reward, done, _ = env.step(action)
        steps += 1
        reward_total += reward

        # Compute TD target
        next_state_tensor = torch.tensor(scale_state(next_state), dtype=torch.float32)
        V_next_state = value_net(next_state_tensor)
        target = reward + gamma * V_next_state

        # Compute TD error
        V_state = value_net(state_tensor)
        td_error = target - V_state

        # Update actor (policy) network
        loss_actor = -norm_dist.log_prob(action_tensor) * td_error.detach()
        optimizer_actor.zero_grad()
        loss_actor.mean().backward()
        optimizer_actor.step()

        # Update critic (value) network
        loss_critic = nn.functional.mse_loss(V_state, target.detach())
        optimizer_critic.zero_grad()
        loss_critic.backward()
        optimizer_critic.step()

        state = next_state

    episode_history.append(reward_total)
    print("Episode: {}, Number of Steps: {}, Cumulative reward: {:.2f}".format(
        episode, steps, reward_total))

    # Check for solving criteria
    if np.mean(episode_history[-100:]) > 90 and len(episode_history) >= 101:
        print("****************Solved***************")
        print("Mean cumulative reward over 100 episodes: {:.2f}".format(
            np.mean(episode_history[-100:])))


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part.

In [3]:
import tensorflow as tf
import numpy as np
import gym  #requires OpenAI gym installed
env = gym.envs.make("MountainCarContinuous-v0") 

tf.compat.v1.reset_default_graph()

input_dims = 2
state_placeholder = tf.placeholder(tf.float32, [None, input_dims]) 

def value_function(state):
    n_hidden1 = 400  
    n_hidden2 = 400
    n_outputs = 1
    
    with tf.variable_scope("value_network"):
        init_xavier = tf.contrib.layers.xavier_initializer()
        
        hidden1 = tf.layers.dense(state, n_hidden1, tf.nn.elu, init_xavier)
        hidden2 = tf.layers.dense(hidden1, n_hidden2, tf.nn.elu, init_xavier) 
        V = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
    return V


def policy_network(state):
    n_hidden1 = 40
    n_hidden2 = 40
    n_outputs = 1
    
    with tf.variable_scope("policy_network"):
        init_xavier = tf.contrib.layers.xavier_initializer()
        
        hidden1 = tf.layers.dense(state, n_hidden1, tf.nn.elu, init_xavier)
        hidden2 = tf.layers.dense(hidden1, n_hidden2, tf.nn.elu, init_xavier)
        mu = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
        sigma = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
        sigma = tf.nn.softplus(sigma) + 1e-5
        norm_dist = tf.contrib.distributions.Normal(mu, sigma)
        action_tf_var = tf.squeeze(norm_dist.sample(1), axis=0)
        action_tf_var = tf.clip_by_value(
            action_tf_var, env.action_space.low[0], 
            env.action_space.high[0])
    return action_tf_var, norm_dist

################################################################
#sample from state space for state normalization
import sklearn
import sklearn.preprocessing
                                    
state_space_samples = np.array(
    [env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)

#function to normalize states
def scale_state(state):                 #requires input shape=(2,)
    scaled = scaler.transform([state])
    return scaled                       #returns shape =(1,2)   
###################################################################

lr_actor = 0.00002  #set learning rates
lr_critic = 0.001

# define required placeholders
action_placeholder = tf.placeholder(tf.float32)
delta_placeholder = tf.placeholder(tf.float32)
target_placeholder = tf.placeholder(tf.float32)

action_tf_var, norm_dist = policy_network(state_placeholder)
V = value_function(state_placeholder)

# define actor (policy) loss function
loss_actor = -tf.log(norm_dist.prob(action_placeholder) + 1e-5) * delta_placeholder
training_op_actor = tf.train.AdamOptimizer(
    lr_actor, name='actor_optimizer').minimize(loss_actor)

# define critic (state-value) loss function
loss_critic = tf.reduce_mean(tf.squared_difference(
                             tf.squeeze(V), target_placeholder))
training_op_critic = tf.train.AdamOptimizer(
        lr_critic, name='critic_optimizer').minimize(loss_critic)
################################################################
#Training loop
gamma = 0.99        #discount factor
num_episodes = 300

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    episode_history = []
    for episode in range(num_episodes):
        #receive initial state from E
        state = env.reset()   # state.shape -> (2,)
        reward_total = 0 
        steps = 0
        done = False
        while (not done):
                
            #Sample action according to current policy
            #action.shape = (1,1)
            action  = sess.run(action_tf_var, feed_dict={
                          state_placeholder: scale_state(state)})
            #Execute action and observe reward & next state from E
            # next_state shape=(2,)    
            #env.step() requires input shape = (1,)
            next_state, reward, done, _ = env.step(
                                    np.squeeze(action, axis=0)) 
            steps +=1
            reward_total += reward
            #V_of_next_state.shape=(1,1)
            V_of_next_state = sess.run(V, feed_dict = 
                    {state_placeholder: scale_state(next_state)})  
            #Set TD Target
            #target = r + gamma * V(next_state)     
            target = reward + gamma * np.squeeze(V_of_next_state) 
            
            # td_error = target - V(s)
            #needed to feed delta_placeholder in actor training
            td_error = target - np.squeeze(sess.run(V, feed_dict = 
                        {state_placeholder: scale_state(state)})) 
            
            #Update actor by minimizing loss (Actor training)
            _, loss_actor_val  = sess.run(
                [training_op_actor, loss_actor], 
                feed_dict={action_placeholder: np.squeeze(action), 
                state_placeholder: scale_state(state), 
                delta_placeholder: td_error})
            #Update critic by minimizinf loss  (Critic training)
            _, loss_critic_val  = sess.run(
                [training_op_critic, loss_critic], 
                feed_dict={state_placeholder: scale_state(state), 
                target_placeholder: target})
            
            state = next_state
            #end while
        episode_history.append(reward_total)
        print("Episode: {}, Number of Steps : {}, Cumulative reward: {:0.2f}".format(
            episode, steps, reward_total))
        
        if np.mean(episode_history[-100:]) > 90 and len(episode_history) >= 101:
            print("****************Solved***************")
            print("Mean cumulative reward over 100 episodes:{:0.2f}" .format(
                np.mean(episode_history[-100:])))
            
            

AttributeError: module 'tensorflow' has no attribute 'placeholder'

In [4]:
import numpy as np
import gym
import matplotlib.pyplot as plt

# Import and initialize Mountain Car Environment
env = gym.make('MountainCar-v0')
env.reset()

# Define Q-learning function
def QLearning(env, learning, discount, epsilon, min_eps, episodes):
    # Determine size of discretized state space
    num_states = (env.observation_space.high - env.observation_space.low)*\
                    np.array([10, 100])
    num_states = np.round(num_states, 0).astype(int) + 1
    
    # Initialize Q table
    Q = np.random.uniform(low = -1, high = 1, 
                          size = (num_states[0], num_states[1], 
                                  env.action_space.n))
    
    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []
    
    # Calculate episodic reduction in epsilon
    reduction = (epsilon - min_eps)/episodes
    
    # Run Q learning algorithm
    for i in range(episodes):
        # Initialize parameters
        done = False
        tot_reward, reward = 0,0
        state = env.reset()
        
        # Discretize state
        state_adj = (state - env.observation_space.low)*np.array([10, 100])
        state_adj = np.round(state_adj, 0).astype(int)
    
        while done != True:   
            # Render environment for last five episodes
            if i >= (episodes - 20):
                env.render()
                
            # Determine next action - epsilon greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]]) 
            else:
                action = np.random.randint(0, env.action_space.n)
                
            # Get next state and reward
            state2, reward, done, info = env.step(action) 
            
            # Discretize state2
            state2_adj = (state2 - env.observation_space.low)*np.array([10, 100])
            state2_adj = np.round(state2_adj, 0).astype(int)
            
            #Allow for terminal states
            if done and state2[0] >= 0.5:
                Q[state_adj[0], state_adj[1], action] = reward
                
            # Adjust Q value for current state
            else:
                delta = learning*(reward + 
                                 discount*np.max(Q[state2_adj[0], 
                                                   state2_adj[1]]) - 
                                 Q[state_adj[0], state_adj[1],action])
                Q[state_adj[0], state_adj[1],action] += delta
                                     
            # Update variables
            tot_reward += reward
            state_adj = state2_adj
        
        # Decay epsilon
        if epsilon > min_eps:
            epsilon -= reduction
        
        # Track rewards
        reward_list.append(tot_reward)
        
        if (i+1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            
    
    
    return ave_reward_list