In [None]:
import numpy as np

from src.rl_agent import RLAgent
from src.state_representation import StateRepresentation

from src.env_bandit import EnvironmentBandit
from src.env_parkingworld import ParkingWorld
from src.env_cliffwalk import EnvironmentCliffWalk
from src.env_maze import EnvironmentMaze
from src.env_randomwalk import EnvironmentRandomWalk
from src.env_mountaincar import EnvironmentMountainCar
from src.env_pendulum import EnvironmentPendulum
from src.env_lunarlanding import EnvironmentLunarLanding

from src.rl_experiments import RLExperiments

## Bandits

In [None]:
# define environment: a bandit with n_arms arms
n_arms = 10
env = EnvironmentBandit(n_arms=n_arms)

# define agent: one action label per arm (1..n_arms); no states and no
# initial policy are supplied
actions = [arm for arm in np.arange(1, n_arms + 1)]
states = None
policy = None
agent = RLAgent(
    agent_type='bandit',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
)

# run experiment (the 'value' step size is passed as None)
rl_experiments = RLExperiments()
df, fig = rl_experiments.bandit(
    agent,
    env,
    num_episodes=200,
    num_steps=1000,
    step_size={'value': None},
    epsilon=0.01,
)

## Dynamic Programming - Policy Evaluation

In [None]:
# define environment
num_spaces = 10
num_prices = 4
env = ParkingWorld(num_spaces, num_prices)

# define agent: the action and state sets are taken from the environment
actions = env.actions
states = env.states

# deterministic policy: in every state, probability 1 goes to the action at
# position 1 of `actions` and probability 0 to all other actions
policy = {
    state: {a: (1 if position == 1 else 0) for position, a in enumerate(actions)}
    for state in states
}

agent = RLAgent(
    agent_type='dp',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
)

# run experiment
rl_experiments = RLExperiments()
fig = rl_experiments.policy_evaluation(
    agent,
    env,
    discount=0.9,
    theta=0.1,
)

## Dynamic Programming - Policy Iteration

In [None]:
# define environment
num_spaces = 10
num_prices = 4
env = ParkingWorld(num_spaces, num_prices)

# define agent: the action and state sets are taken from the environment
actions = env.actions
states = env.states

# initial policy: uniform over all actions in every state
policy = {
    state: {a: 1 / len(actions) for a in actions}
    for state in states
}

agent = RLAgent(
    agent_type='dp',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
)

# run experiment
rl_experiments = RLExperiments()
fig = rl_experiments.policy_iteration(
    agent,
    env,
    discount=0.9,
    theta=0.1,
)

## Dynamic Programming - Value Iteration

In [None]:
# define environment
num_spaces = 10
num_prices = 4
env = ParkingWorld(num_spaces, num_prices)

# define agent: the action and state sets are taken from the environment
actions = env.actions
states = env.states

# initial policy: uniform over all actions in every state
policy = {
    state: {a: 1 / len(actions) for a in actions}
    for state in states
}

agent = RLAgent(
    agent_type='dp',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
)

# run experiment
rl_experiments = RLExperiments()
fig = rl_experiments.value_iteration(
    agent,
    env,
    discount=0.9,
    theta=0.1,
)

## TD

In [None]:
# define environment
grid_height = 4
grid_width = 12
env = EnvironmentCliffWalk(grid_height, grid_width)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# baseline policy: uniform over the four actions in every state
policy = {
    state: {a: 1 / len(actions) for a in actions}
    for state in states
}


def _mostly(preferred):
    """Return a distribution with 0.9 on `preferred` and 0.1/3 on each other action."""
    return {a: (0.9 if a == preferred else 0.1/3) for a in actions}


# overrides for selected states: state 36 prefers 'up', states 24..34 prefer
# 'right', state 35 prefers 'down'
# NOTE(review): these hard-coded ids presumably rely on how env.get_state
# numbers the 4x12 grid -- confirm before changing the grid size
policy[36] = _mostly('up')
policy.update({state_id: _mostly('right') for state_id in range(24, 35)})
policy[35] = _mostly('down')

agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()
df, fig = rl_experiments.td_lambda(
    agent,
    env,
    num_episodes=10000,
    step_size={'value': 0.1},
    discount=0.9,
)

## SARSA

In [None]:
# define environment
grid_height = 4
grid_width = 12
env = EnvironmentCliffWalk(grid_height, grid_width)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

agent = RLAgent(
    agent_type='sarsa',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()
df, fig = rl_experiments.q_learning_sarsa(
    agent,
    env,
    num_runs=100,
    num_episodes=1000,
    epsilon=0.1,
    step_size={'value': 0.5},
    discount=1.0,
)

## Q-Learning

In [None]:
# define environment
grid_height = 4
grid_width = 12
env = EnvironmentCliffWalk(grid_height, grid_width)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

agent = RLAgent(
    agent_type='q_learning',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()
df, fig = rl_experiments.q_learning_sarsa(
    agent,
    env,
    num_runs=100,
    num_episodes=1000,
    epsilon=0.1,
    step_size={'value': 0.5},
    discount=1.0,
)

## Expected SARSA

In [None]:
# define environment
grid_height = 4
grid_width = 12
env = EnvironmentCliffWalk(grid_height, grid_width)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

agent = RLAgent(
    agent_type='expected_sarsa',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
)

# run experiment
rl_experiments = RLExperiments()
df, fig = rl_experiments.q_learning_sarsa(
    agent,
    env,
    num_runs=100,
    num_episodes=1000,
    epsilon=0.1,
    step_size={'value': 0.5},
    discount=1.0,
)

## Planning - Dyna-Q with Different Numbers of Planning Steps (0 = No Planning)

In [None]:
# define environment
grid_height = 6
grid_width = 9
env = EnvironmentMaze(grid_height, grid_width)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

agent = RLAgent(
    agent_type='q_learning',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    planning_type='dyna',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment once per entry of planning_steps_list
rl_experiments = RLExperiments()
results, fig = rl_experiments.dyna_q_planning_num_steps(
    agent,
    env,
    num_runs=30,
    num_episodes=40,
    epsilon=0.1,
    step_size={'value': 0.125},
    discount=0.95,
    planning_steps_list=[0, 5, 50],
)

## Planning - Dyna-Q Planning

In [None]:
# define environment
grid_height = 6
grid_width = 9
env = EnvironmentMaze(grid_height, grid_width)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

agent = RLAgent(
    agent_type='q_learning',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    planning_type='dyna',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()
results, fig = rl_experiments.dyna_q_planning_state_visits(
    agent,
    env,
    num_runs=5,
    num_episodes=500,
    epsilon=0.1,
    step_size={'value': 0.125},
    discount=0.95,
    planning_steps=30,
)

## Planning - Dyna-Q vs. Dyna-Q+ in Changing Environment

In [None]:
# define environment (constructed with obstacle_switch_time=1000)
grid_height = 6
grid_width = 9
env = EnvironmentMaze(grid_height, grid_width, obstacle_switch_time=1000)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

agent = RLAgent(
    agent_type='q_learning',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    planning_type='dyna',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment with obstacle_switch enabled
rl_experiments = RLExperiments()
df, fig = rl_experiments.dyna_q_planning_state_visits(
    agent,
    env,
    num_runs=5,
    num_episodes=500,
    epsilon=0.1,
    step_size={'value': 0.125},
    discount=0.95,
    planning_steps=30,
    obstacle_switch=True,
)

In [None]:
# define environment (constructed with obstacle_switch_time=1000)
grid_height = 6
grid_width = 9
env = EnvironmentMaze(grid_height, grid_width, obstacle_switch_time=1000)

# define agent
actions = ['up', 'left', 'down', 'right']

# one state id per grid cell, visiting h in the outer loop and w in the inner loop
states = [
    env.get_state((w, h))
    for h in range(grid_height)
    for w in range(grid_width)
]

# no initial policy is handed to the agent
policy = None

# same agent setup as a 'dyna' planner except for planning_type='dyna_plus'
# and the extra planning_kappa argument
agent = RLAgent(
    agent_type='q_learning',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='tabular',
    planning_type='dyna_plus',
    planning_kappa=0.01,
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment with obstacle_switch enabled
rl_experiments = RLExperiments()
df, fig = rl_experiments.dyna_q_planning_state_visits(
    agent,
    env,
    num_runs=5,
    num_episodes=500,
    epsilon=0.1,
    step_size={'value': 0.125},
    discount=0.95,
    planning_steps=30,
    obstacle_switch=True,
)

## Semi-Gradient TD with State Aggregation

In [None]:
# define environment
num_states = 500
start_state = 250
left_terminal_state = 1
right_terminal_state = 500
max_movement = 100
env = EnvironmentRandomWalk(
    num_states,
    start_state,
    left_terminal_state,
    right_terminal_state,
    max_movement,
)

# define agent with function approximation: the states are split into
# num_groups groups of equal size
num_groups = 10
num_states_in_group = num_states // num_groups
state_representation = StateRepresentation(
    num_states_in_group=num_states_in_group,
    num_groups=num_groups,
)

actions = ['left', 'right']
# zero column vector with one row per group
states = np.zeros((num_groups, 1))
policy = None

agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='linear',
    value_update_type='stochastic_gradient_descent',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment, using state aggregation as the state representation
rl_experiments = RLExperiments()
fig = rl_experiments.td_semigradient(
    agent,
    env,
    state_representation=state_representation.state_aggregation,
    num_runs=10,
    num_episodes=2000,
    epsilon=1.0,
    step_size={'value': 0.05},
    discount=1.0,
)

## Semi-Gradient TD with Neural Network (Stochastic Gradient Descent Update)

In [None]:
# define environment
num_states = 50
start_state = 25
left_terminal_state = 1
right_terminal_state = 50
max_movement = 10
env = EnvironmentRandomWalk(
    num_states,
    start_state,
    left_terminal_state,
    right_terminal_state,
    max_movement,
)

# define agent with function approximation
state_representation = StateRepresentation(num_states=num_states)

actions = ['left', 'right']
# zero column vector with one row per state
states = np.zeros((num_states, 1))
policy = None

# value network: input layer sized to the number of rows in `states`,
# one hidden layer of 10 relu nodes, a single linear output node
value_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 10, 'activation_function': 'relu'},
        'output': {'nodes': 1, 'activation_function': 'linear'},
    }
}

agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='neural_network',
    value_nn_config=value_nn_config,
    value_update_type='stochastic_gradient_descent',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment, using one-hot encoding as the state representation
rl_experiments = RLExperiments()
fig = rl_experiments.td_semigradient(
    agent,
    env,
    state_representation=state_representation.one_hot_encode,
    num_runs=10,
    num_episodes=2000,
    epsilon=1.0,
    step_size={'value': 0.02},
    discount=1.0,
)

## Semi-Gradient TD with Neural Network (Adam Update)

In [None]:
# define environment
num_states = 50
start_state = 25
left_terminal_state = 1
right_terminal_state = 50
max_movement = 10
env = EnvironmentRandomWalk(
    num_states,
    start_state,
    left_terminal_state,
    right_terminal_state,
    max_movement,
)

# define agent with function approximation
state_representation = StateRepresentation(num_states=num_states)

actions = ['left', 'right']
# zero column vector with one row per state
states = np.zeros((num_states, 1))
policy = None

# value network: input layer sized to the number of rows in `states`,
# one hidden layer of 10 relu nodes, a single linear output node
value_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 10, 'activation_function': 'relu'},
        'output': {'nodes': 1, 'activation_function': 'linear'},
    }
}

# value updates use 'adam' with the beta/epsilon settings given below
agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='neural_network',
    value_nn_config=value_nn_config,
    value_update_type='adam',
    beta_m_adam=0.9,
    beta_v_adam=0.999,
    epsilon_adam=1e-8,
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment, using one-hot encoding as the state representation
rl_experiments = RLExperiments()
fig = rl_experiments.td_semigradient(
    agent,
    env,
    state_representation=state_representation.one_hot_encode,
    num_runs=10,
    num_episodes=2000,
    epsilon=1.0,
    step_size={'value': 0.001},
    discount=1.0,
)

## Semi-Gradient TD with Neural Network (Adam Update) - PyTorch

In [None]:
# define environment
num_states = 50
start_state = 25
left_terminal_state = 1
right_terminal_state = 50
max_movement = 10
env = EnvironmentRandomWalk(
    num_states,
    start_state,
    left_terminal_state,
    right_terminal_state,
    max_movement,
)

# define agent with function approximation
state_representation = StateRepresentation(num_states=num_states)

actions = ['left', 'right']
# zero column vector with one row per state
states = np.zeros((num_states, 1))
policy = None

# value network: input layer sized to the number of rows in `states`,
# one hidden layer of 10 relu nodes, a single linear output node
value_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 10, 'activation_function': 'relu'},
        'output': {'nodes': 1, 'activation_function': 'linear'},
    }
}

# value_type='nn_pytorch' with 'adam' value updates
agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='nn_pytorch',
    value_nn_config=value_nn_config,
    value_update_type='adam',
    beta_m_adam=0.9,
    beta_v_adam=0.999,
    epsilon_adam=1e-8,
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment, using one-hot encoding as the state representation
rl_experiments = RLExperiments()
fig = rl_experiments.td_semigradient(
    agent,
    env,
    state_representation=state_representation.one_hot_encode,
    num_runs=10,
    num_episodes=2000,
    epsilon=1.0,
    step_size={'value': 0.001},
    discount=1.0,
)

## Semi-Gradient TD with Neural Network (Adam Update) - Keras

In [None]:
# define environment
num_states = 50
start_state = 25
left_terminal_state = 1
right_terminal_state = 50
max_movement = 10
env = EnvironmentRandomWalk(
    num_states,
    start_state,
    left_terminal_state,
    right_terminal_state,
    max_movement,
)

# define agent with function approximation
state_representation = StateRepresentation(num_states=num_states)

actions = ['left', 'right']
# zero column vector with one row per state
states = np.zeros((num_states, 1))
policy = None

# value network: input layer sized to the number of rows in `states`,
# one hidden layer of 10 relu nodes, a single linear output node
value_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 10, 'activation_function': 'relu'},
        'output': {'nodes': 1, 'activation_function': 'linear'},
    }
}

# value_type='nn_keras' with 'adam' value updates
agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='nn_keras',
    value_nn_config=value_nn_config,
    value_update_type='adam',
    beta_m_adam=0.9,
    beta_v_adam=0.999,
    epsilon_adam=1e-8,
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment (a single run of 300 episodes), using one-hot encoding as
# the state representation
rl_experiments = RLExperiments()
fig = rl_experiments.td_semigradient(
    agent,
    env,
    state_representation=state_representation.one_hot_encode,
    num_runs=1,
    num_episodes=300,
    epsilon=1.0,
    step_size={'value': 0.001},
    discount=1.0,
)

## Semi-Gradient SARSA with Tile Coding

In [None]:
# define environment
env = EnvironmentMountainCar()

# define agent with function approximation: tile coder bounded by the
# environment's position limits and a symmetric velocity limit
num_tiles = 8
num_tilings = 8
iht_size = 4096
state_representation = StateRepresentation(
    num_tiles=num_tiles,
    num_tilings=num_tilings,
    iht_size=iht_size,
    min_pose=env.min_position,
    max_pose=env.max_position,
    min_vel=-1 * env.min_max_velocity,
    max_vel=env.min_max_velocity,
)

actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']
# zero column vector with iht_size rows
states = np.zeros((iht_size, 1))
policy = None

agent = RLAgent(
    agent_type='sarsa',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='tabular',
    value_type='linear',
    value_update_type='stochastic_gradient_descent',
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment; the value step size is divided by the number of tilings
rl_experiments = RLExperiments()
df, fig = rl_experiments.sarsa_semigradient(
    agent,
    env,
    state_representation=state_representation.dynamicstate_tilecoding,
    num_runs=3,
    num_episodes=100,
    epsilon=0.1,
    step_size={'value': 0.1 / num_tilings},
    discount=1.0,
)

## TD Actor-Critic with Eligibility Traces (Linear Policy)

In [None]:
# define environment
env = EnvironmentPendulum()

# define agent with function approximation: tile coder with the first
# dimension bounded by [-pi, pi] and a symmetric velocity limit from the env
num_tiles = 8
num_tilings = 32
iht_size = 4096
state_representation = StateRepresentation(
    num_tiles=num_tiles,
    num_tilings=num_tilings,
    iht_size=iht_size,
    min_pose=-np.pi,
    max_pose=np.pi,
    min_vel=-1 * env.min_max_velocity,
    max_vel=env.min_max_velocity,
)

actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']
# zero column vector with iht_size rows
states = np.zeros((iht_size, 1))
policy = None

# linear policy and linear value function, both updated by stochastic
# gradient descent, with traces enabled (lambda = 0.75) for both
agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    use_average_reward=True,
    policy_type='linear',
    policy_softmax_tau=1.0,
    policy_update_type='stochastic_gradient_descent',
    value_type='linear',
    value_update_type='stochastic_gradient_descent',
    use_value_trace=True,
    value_trace_lambda=0.75,
    use_policy_trace=True,
    policy_trace_lambda=0.75,
)

# run experiment
rl_experiments = RLExperiments()

# value and policy step sizes are divided by the number of tilings
step_sizes = {
    'value': 2e-2 / num_tilings,
    'policy': 2e-2 / num_tilings,
    'avg_reward': 2e-6,
}

df, fig = rl_experiments.actor_critic(
    agent,
    env,
    state_representation=state_representation.dynamicstate_tilecoding,
    num_runs=10,
    max_steps=20000,
    step_size=step_sizes,
)

## TD Actor-Critic with Eligibility Traces (Neural Network Policy)

In [None]:
# define environment
env = EnvironmentPendulum()

# define agent with function approximation: tile coder with the first
# dimension bounded by [-pi, pi] and a symmetric velocity limit from the env
num_tiles = 8
num_tilings = 32
iht_size = 4096
state_representation = StateRepresentation(
    num_tiles=num_tiles,
    num_tilings=num_tilings,
    iht_size=iht_size,
    min_pose=-np.pi,
    max_pose=np.pi,
    min_vel=-1 * env.min_max_velocity,
    max_vel=env.min_max_velocity,
)

actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']
# zero column vector with iht_size rows
states = np.zeros((iht_size, 1))
policy = None

# policy network: input layer sized to the number of rows in `states`,
# one hidden layer of 100 relu nodes, one linear output node per action
policy_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 100, 'activation_function': 'relu'},
        'output': {'nodes': len(actions), 'activation_function': 'linear'},
    }
}

# neural-network policy updated with 'adam'; linear value function updated
# by stochastic gradient descent; traces enabled (lambda = 0.75) for both
agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    use_average_reward=True,
    policy_type='neural_network',
    policy_nn_config=policy_nn_config,
    policy_softmax_tau=1.0,
    policy_update_type='adam',
    value_type='linear',
    value_update_type='stochastic_gradient_descent',
    use_value_trace=True,
    value_trace_lambda=0.75,
    use_policy_trace=True,
    policy_trace_lambda=0.75,
)

# run experiment
rl_experiments = RLExperiments()

# value and policy step sizes are divided by the number of tilings
step_sizes = {
    'value': 2e-2 / num_tilings,
    'policy': 2e-3 / num_tilings,
    'avg_reward': 2e-6,
}

df, fig = rl_experiments.actor_critic(
    agent,
    env,
    state_representation=state_representation.dynamicstate_tilecoding,
    num_runs=1,
    max_steps=20000,
    step_size=step_sizes,
)

## TD Actor-Critic (Neural Network Policy - PyTorch)

In [None]:
# define environment
env = EnvironmentPendulum()

# define agent with function approximation: tile coder with the first
# dimension bounded by [-pi, pi] and a symmetric velocity limit from the env
num_tiles = 8
num_tilings = 32
iht_size = 4096
state_representation = StateRepresentation(
    num_tiles=num_tiles,
    num_tilings=num_tilings,
    iht_size=iht_size,
    min_pose=-np.pi,
    max_pose=np.pi,
    min_vel=-1 * env.min_max_velocity,
    max_vel=env.min_max_velocity,
)

actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']
# zero column vector with iht_size rows
states = np.zeros((iht_size, 1))
policy = None

# policy network: input layer sized to the number of rows in `states`,
# one hidden layer of 100 relu nodes, one linear output node per action
policy_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 100, 'activation_function': 'relu'},
        'output': {'nodes': len(actions), 'activation_function': 'linear'},
    }
}

# policy_type='nn_pytorch' with a linear value function; both traces are
# switched off in this configuration
agent = RLAgent(
    agent_type='td',
    states=states,
    actions=actions,
    policy=policy,
    use_average_reward=True,
    policy_type='nn_pytorch',
    policy_nn_config=policy_nn_config,
    policy_softmax_tau=1.0,
    policy_update_type='stochastic_gradient_descent',
    value_type='linear',
    value_update_type='stochastic_gradient_descent',
    use_value_trace=False,
    value_trace_lambda=0,
    use_policy_trace=False,
    policy_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()

# value and policy step sizes are divided by the number of tilings
step_sizes = {
    'value': 2e-2 / num_tilings,
    'policy': 2e-3 / num_tilings,
    'avg_reward': 2e-6,
}

df, fig = rl_experiments.actor_critic(
    agent,
    env,
    state_representation=state_representation.dynamicstate_tilecoding,
    num_runs=1,
    max_steps=50000,
    step_size=step_sizes,
)

## DQN with Experience Replay

In [None]:
# define environment
env = EnvironmentLunarLanding()

# define agent with function approximation
actions = ['main_thruster', 'left_thruster', 'right_thruster', 'no_thruster']

# state: (velocity_x, velocity_y, angle, position_x, position_y, landing_zone_x, landing_zone_y, fuel)
states = np.zeros((8, 1))

policy = None

# value network: input layer sized to the number of rows in `states`,
# one hidden layer of 256 relu nodes, one linear output node per action
value_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 256, 'activation_function': 'relu'},
        'output': {'nodes': len(actions), 'activation_function': 'linear'},
    }
}

# single switch shared by the agent constructor and the experiment call
use_experience_replay = True

agent = RLAgent(
    agent_type='expected_sarsa',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='softmax',
    policy_softmax_tau=0.001,
    value_type='neural_network',
    value_nn_config=value_nn_config,
    value_update_type='adam',
    beta_m_adam=0.9,
    beta_v_adam=0.999,
    epsilon_adam=1e-8,
    use_experience_replay=use_experience_replay,
    replay_buffer_size=50000,
    replay_buffer_minibatch_size=8,
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()
df, fig = rl_experiments.dqn_replay_buffer(
    agent,
    env,
    num_runs=3,
    num_episodes=300,
    step_size={'value': 1e-3},
    discount=0.99,
    experience_replay=use_experience_replay,
    experience_replay_steps=4,
)

## DQN with Experience Replay - PyTorch

In [None]:
# define environment
env = EnvironmentLunarLanding()

# define agent with function approximation
actions = ['main_thruster', 'left_thruster', 'right_thruster', 'no_thruster']

# state: (velocity_x, velocity_y, angle, position_x, position_y, landing_zone_x, landing_zone_y, fuel)
states = np.zeros((8, 1))

policy = None

# value network: input layer sized to the number of rows in `states`,
# one hidden layer of 256 relu nodes, one linear output node per action
value_nn_config = {
    'layers': {
        'input': {'nodes': states.shape[0], 'activation_function': 'linear'},
        'hidden_1': {'nodes': 256, 'activation_function': 'relu'},
        'output': {'nodes': len(actions), 'activation_function': 'linear'},
    }
}

# single switch shared by the agent constructor and the experiment call
use_experience_replay = True

# same configuration as a 'neural_network' value agent except for
# value_type='nn_pytorch'
agent = RLAgent(
    agent_type='expected_sarsa',
    states=states,
    actions=actions,
    policy=policy,
    policy_type='softmax',
    policy_softmax_tau=0.001,
    value_type='nn_pytorch',
    value_nn_config=value_nn_config,
    value_update_type='adam',
    beta_m_adam=0.9,
    beta_v_adam=0.999,
    epsilon_adam=1e-8,
    use_experience_replay=use_experience_replay,
    replay_buffer_size=50000,
    replay_buffer_minibatch_size=8,
    use_value_trace=False,
    value_trace_lambda=0,
)

# run experiment
rl_experiments = RLExperiments()
df, fig = rl_experiments.dqn_replay_buffer(
    agent,
    env,
    num_runs=3,
    num_episodes=300,
    step_size={'value': 1e-3},
    discount=0.99,
    experience_replay=use_experience_replay,
    experience_replay_steps=4,
)