In [31]:
import numpy as np
import plotly.graph_objects as go
import random

def generate_plot():
    """
    Build the synthetic hill terrain and a 3D Plotly surface figure of it.

    The terrain is a sum of Gaussian bumps on a size x size grid: 60 small
    background hills with random centre, height and width, plus one taller
    "prominent" hill. Both the ``random`` and ``numpy.random`` global generators
    are re-seeded with a fixed value on every call, so repeated calls return the
    same terrain (and reset the global random state as a side effect).

    Returns:
    - fig: plotly Figure containing the terrain surface.
    - x, y: 2D meshgrid coordinate arrays of shape (size, size).
    - z: 2D array of terrain heights, same shape as x and y.
    - prominent_hill_x, prominent_hill_y: centre of the prominent hill.
    """
    # Set random seeds for reproducibility
    seed_value = 20
    random.seed(seed_value)
    np.random.seed(seed_value)

    # Set the size of the terrain
    size = 200  # Number of grid points per axis; coordinates span [0, size]

    # Create a 2D grid of coordinates
    x = np.linspace(0, size, size)
    y = np.linspace(0, size, size)
    x, y = np.meshgrid(x, y)

    # Parameters of the small background hills
    z = np.zeros((size, size))
    num_hills = 60  # Number of random hills
    min_hill_height = 0.5
    max_hill_height = 1
    min_hill_width = 5
    max_hill_width = 15

    # Parameters of the single prominent hill. Its width is drawn BEFORE the
    # background hills; keep this order, the seeded random sequence depends on it.
    prominent_hill_height = 5
    prominent_hill_width = random.uniform(8, 12)

    for _ in range(num_hills):
        # Randomize hill parameters (draw order matters for reproducibility)
        hill_height = random.uniform(min_hill_height, max_hill_height)  # Heights between 0.5 and 1
        hill_x = random.uniform(0, size)
        hill_y = random.uniform(0, size)
        hill_width = random.uniform(min_hill_width, max_hill_width)  # Controls hill spread

        # Add Gaussian hill to the terrain
        z += hill_height * np.exp(-((x - hill_x) ** 2 + (y - hill_y) ** 2) / (2 * hill_width ** 2))

    # Add one prominent hill that contributes prominent_hill_height (5) at its centre
    prominent_hill_x = random.uniform(0, size)
    prominent_hill_y = random.uniform(0, size)

    z += prominent_hill_height * np.exp(-((x - prominent_hill_x) ** 2 + (y - prominent_hill_y) ** 2) / (2 * prominent_hill_width ** 2))

    # Create a 3D surface plot
    fig = go.Figure(data=[go.Surface(z=z, x=x, y=y, colorscale="Viridis")])

    # Set plot title and axis labels
    fig.update_layout(
        title="Randomized Hill Terrain with One Prominent Hill",
        scene=dict(
            xaxis_title="X Axis",
            yaxis_title="Y Axis",
            zaxis_title="Height",
            zaxis=dict(range=[0, 10])
        ),
        autosize=False,
        width=800,
        height=800,
        margin=dict(l=65, r=50, b=65, t=90),
    )

    return fig, x, y, z, prominent_hill_x, prominent_hill_y

# Build the terrain once; x, y, z and the prominent hill centre are reused by later cells.
terrain_fig, x, y, z, prominent_hill_x, prominent_hill_y = generate_plot()
terrain_fig.show()
# Show the centre of the prominent hill (later used as the goal position).
print(prominent_hill_x, prominent_hill_y)

91.11859864448355 137.0969428108047


In [32]:
from terrain_helpers import discretize_terrain

# Define the grid resolution for x, y, and the number of elevation bins for z
xy_resolution = 5
num_z_bins = 20

# Discretize the terrain generated above (x, y, z).
# NOTE(review): discretize_terrain is defined in terrain_helpers and not visible here.
# Judging by the recorded output, state_map and inverse_map both have shape (200, 200, 3)
# and z_bins holds num_z_bins + 1 edges — confirm against the helper.
state_map, z_bins, inverse_map = discretize_terrain(x, y, z, xy_resolution=xy_resolution, num_z_bins=num_z_bins)

# Print the shape of the state_map and the bin edges for verification
print(f"State map shape: {state_map.shape}")
print(f"Inverse map shape: {inverse_map.shape}")
print(f"Z bin edges: {z_bins}")



[[0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
State map shape: (200, 200, 3)
Inverse map shape: (200, 200, 3)
Z bin edges: [2.49965458e-03 2.52253807e-01 5.02007960e-01 7.51762113e-01
 1.00151627e+00 1.25127042e+00 1.50102457e+00 1.75077872e+00
 2.00053288e+00 2.25028703e+00 2.50004118e+00 2.74979533e+00
 2.99954949e+00 3.24930364e+00 3.49905779e+00 3.74881195e+00
 3.99856610e+00 4.24832025e+00 4.49807440e+00 4.74782856e+00
 4.99758271e+00]


In [33]:
from OnlinePlanning import MonteCarloTreeSearch
from collections import deque

# The goal is the centre of the prominent hill, in terrain (x, y) coordinates.
goal_x, goal_y = prominent_hill_x, prominent_hill_y
goal_radius = 5  # get_reward treats a next state within this x-y distance of the goal as "at goal"
goal_reward = 10000  # value get_reward returns for a transition that enters the goal region

# Sliding window of the 10 most recent next-states seen by get_reward.
# get_reward both reads it (oscillation penalty) and appends to it, so it is shared mutable state.
recent_states = deque(maxlen=10)

def get_reward(s, s_prime, steps=0, exploration_phase=False):
    """
    Score the transition s -> s_prime for the hill-climbing task.

    Both states are index pairs into inverse_map, which supplies the (x, y, z)
    coordinates used for every distance and elevation computation below.

    A transition whose next state lies within goal_radius of (goal_x, goal_y)
    earns goal_reward outright. Any other transition earns a shaped reward:
    10 x (distance-to-goal reduction), minus 0.1 x |elevation change|, minus
    0.01 x steps, plus 5 when exploration_phase is set and the move gets closer
    to the goal, minus 100 when s_prime is already in recent_states.

    Side effect: every non-goal call appends s_prime to the module-level
    recent_states deque.

    Parameters:
    - s: Current state (pair of indices).
    - s_prime: Next state (pair of indices).
    - steps: Number of steps taken so far (drives the time penalty).
    - exploration_phase: Whether the exploration bonus may be granted.

    Returns:
    - reward: Computed reward for the transition.
    """
    (cur_i, cur_j), (nxt_i, nxt_j) = s, s_prime

    # Look up coordinates and elevation for both ends of the move
    x_curr, y_curr, z_curr = inverse_map[cur_i, cur_j]
    x_next, y_next, z_next = inverse_map[nxt_i, nxt_j]

    # Planar distance to the goal before and after the move
    dist_before = np.sqrt((x_curr - goal_x)**2 + (y_curr - goal_y)**2)
    dist_after = np.sqrt((x_next - goal_x)**2 + (y_next - goal_y)**2)

    # Landing inside the goal region ends the shaping logic immediately
    if dist_after <= goal_radius:
        return goal_reward

    # Shaping term: positive when the move closes in on the goal
    progress = dist_before - dist_after

    # Any change in elevation, up or down, costs a little
    elevation_reward = -0.1 * abs(z_next - z_curr)

    # Later steps are worth slightly less
    time_penalty = steps * 0.01

    if exploration_phase and dist_after < dist_before:
        exploration_bonus = 5
    else:
        exploration_bonus = 0

    # Discourage returning to a recently visited state
    if s_prime in recent_states:
        oscillation_penalty = -100
    else:
        oscillation_penalty = 0

    # Remember this state only after the membership check above
    recent_states.append(s_prime)

    return (
        10 * progress
        + elevation_reward
        - time_penalty
        + exploration_bonus
        + oscillation_penalty
    )


def deterministic_transition(s, a):
    """
    Apply action a (an index offset pair) to state s without any noise.

    Returns the resulting index pair, or None when s is None or when the move
    would leave the grid described by the first two dimensions of state_map.
    """
    # An invalid (None) state stays invalid
    if s is None:
        return None

    (i, j), (di, dj) = s, a
    candidate = (i + di, j + dj)

    # Accept the move only if both indices stay inside the grid
    inside = 0 <= candidate[0] < state_map.shape[0] and 0 <= candidate[1] < state_map.shape[1]
    return candidate if inside else None

def gaussian_random_integer(low=-2, high=2, mean=0, std=1):
    """
    Draw one sample from a normal distribution with the given mean and std,
    round it to the nearest integer and clamp it to [low, high].

    With the defaults this yields an integer between -2 and 2 centred on 0,
    used as noise for the stochastic transition model.
    """
    sample = np.random.normal(mean, std)
    nearest = round(sample)
    clamped = np.clip(nearest, low, high)
    return int(clamped)

def stochastic_transition(s, a):
    """
    Determines the next state based on a stochastic action model with boundary handling.

    The commanded offset a = (dx, dy) is perturbed independently on each axis by
    gaussian_random_integer() before being applied to s = (x, y).

    Returns the resulting index pair, or None when s is None or when the noisy
    move would leave the grid described by the first two dimensions of state_map.
    """
    # Mirror deterministic_transition: an invalid state propagates as None
    # instead of failing on tuple unpacking. No random numbers are consumed here.
    if s is None:
        return None

    x, y = s
    dx, dy = a

    # Perturb each axis independently; dx is drawn before dy
    dx = dx + gaussian_random_integer()
    dy = dy + gaussian_random_integer()
    new_x, new_y = x + dx, y + dy

    # Check if the new state is out of bounds
    if not (0 <= new_x < state_map.shape[0] and 0 <= new_y < state_map.shape[1]):
        return None  # Indicate an invalid transition
    return (new_x, new_y)

# Define the custom TR function using deterministic_transition and get_reward
def custom_TR(s, a, steps):
    """
    Combined transition/reward function for the hill-climbing MDP.

    The next state comes from deterministic_transition; the reward for a valid
    move comes from get_reward.

    Parameters:
    - s: Current state.
    - a: Action taken.
    - steps: Number of steps taken so far in the trajectory (forwarded to get_reward).

    Returns:
    - next_state: State reached by taking a from s, or None if the move is invalid.
    - reward: get_reward's value for a valid move, -1e4 for an invalid one.
    """
    successor = deterministic_transition(s, a)
    if successor is not None:
        return successor, get_reward(s, successor, steps=steps)
    # Invalid move (e.g. off the grid): no next state and a large fixed penalty
    return None, -1e4



In [34]:
from MDP import MDP

# Set the discount factor (1 means future rewards are not discounted)
gamma = 1

# Define the state space (S): one state per (i, j) index pair of the discretized terrain (state_map)
S = [(i, j) for i in range(state_map.shape[0]) for j in range(state_map.shape[1])]

# Define the action space (A) as discrete steps in x and y directions (with max step size of 5).
# Every (dx, dy) offset in [-max_step, max_step]^2 except the no-op (0, 0): 11 * 11 - 1 = 120 actions.
max_step = 5
A = [(dx, dy) for dx in range(-max_step, max_step + 1) for dy in range(-max_step, max_step + 1) if dx != 0 or dy != 0]

# Transition model: noise-free grid moves
T = deterministic_transition

# NOTE(review): get_reward takes (s, s_prime), not (s, a) — confirm this is what the MDP class expects for R.
R = get_reward

# Combined transition + reward sampler, called as TR(s, a, steps)
TR = custom_TR

# Instantiate the MDP (class imported from the MDP module)
hill_climb_mdp = MDP(
    gamma=gamma,
    S=S,
    A=A,
    T=T,
    R=R,
    TR=TR
)

# Print the number of states and actions to verify the MDP instantiation
print(f"Number of states: {len(S)}")
print(f"Number of actions: {len(A)}")

Number of states: 40000
Number of actions: 120


In [35]:
# Initialize visit counts (N) and action value estimates (Q).
# Every (state, action) pair gets an entry up front, i.e. len(S) * len(A) keys per dict.
# The loop variables s and a remain defined as notebook globals afterwards.
N = {}
Q = {}
for s in hill_climb_mdp.S:
    for a in hill_climb_mdp.A:
        N[(s, a)] = 0
        Q[(s, a)] = 0.0

# Define MCTS parameters
# NOTE(review): meanings below are as stated by the author; MonteCarloTreeSearch is not visible here — confirm.
d = 10     # Depth of the search tree
m = 100    # Number of simulations per action
c = 1.0    # Exploration constant

# Define a simple value function estimate (U) as a heuristic
def U(s):
    """
    One-step lookahead value estimate handed to MCTS as its heuristic.

    Each action in hill_climb_mdp.A is scored as
    get_reward(s, s') + gamma * Q[(s', a)] (0.0 when the pair is absent from Q),
    where s' = deterministic_transition(s, a). Actions with no valid next state
    score -1e4. The best score is returned; if no score exceeds -inf (e.g. the
    action list is empty) the result is -1e6.

    Note: every valid candidate is passed through get_reward, so calling U has
    the same side effects as those get_reward calls.
    """
    def one_step_value(a):
        # Score of taking a from s, using the current Q table for the tail
        s_prime = deterministic_transition(s, a)
        if s_prime is None:
            return -1e4  # Invalid transition
        return get_reward(s, s_prime) + hill_climb_mdp.gamma * (Q.get((s_prime, a), 0.0))

    best_value = -np.inf
    for a in hill_climb_mdp.A:
        best_value = max(best_value, one_step_value(a))

    # Fallback when nothing improved on the initial -inf
    if not best_value > -np.inf:
        return -1e6
    return best_value

# Instantiate the MCTS planner with the MDP, the shared N/Q tables, the search
# parameters and the heuristic U. The resulting object is later called as mcts(state).
# NOTE(review): MonteCarloTreeSearch comes from OnlinePlanning and is not visible here;
# whether it mutates N and Q in place should be confirmed there.
mcts = MonteCarloTreeSearch(
    P=hill_climb_mdp,
    N=N,
    Q=Q,
    d=d,
    m=m,
    c=c,
    U=U
)



In [36]:
def execute_mcts_policy(initial_state, mcts, goal_radius, inverse_map, max_steps=200):
    """
    Roll out an online-planning episode: ask MCTS for an action, apply it, repeat.

    The loop stops when the current state lies within goal_radius of the goal,
    when the transition returns no next state, or after max_steps iterations.
    Besides its arguments, the function reads the globals goal_x, goal_y,
    goal_elevation and hill_climb_mdp, which must be defined before it is called.

    Parameters:
    - initial_state: Starting state (pair of indices into inverse_map).
    - mcts: Callable planner; mcts(state) returns the action to take.
    - goal_radius: x-y distance to the goal that counts as having arrived.
    - inverse_map: Array mapping a state's indices to its (x, y, z) values.
    - max_steps: Upper bound on the number of planning iterations.

    Returns a tuple:
    (trajectory, actions, steps, total_reward, path_length, elevation_change,
     excess_distance, excess_elevation)
    where excess_distance is path_length minus the straight-line start-to-goal
    distance and excess_elevation is elevation_change minus
    (goal_elevation - starting elevation).
    """
    def planar_gap(ax, ay, bx, by):
        # Straight-line separation of two points in the x-y plane
        return np.sqrt((ax - bx)**2 + (ay - by)**2)

    # Baselines measured from the starting point
    start_i, start_j = initial_state
    x_start, y_start, z_start = inverse_map[start_i, start_j]
    original_distance = planar_gap(x_start, y_start, goal_x, goal_y)
    original_elevation = goal_elevation - z_start  # goal_elevation is z at goal

    # Episode bookkeeping
    state = initial_state
    trajectory = [state]
    actions = []
    total_reward = 0
    steps = 0
    path_length = 0.0       # distance travelled in the x-y plane
    elevation_change = 0.0  # accumulated elevation change
    min_elevation = z_start  # lowest elevation seen so far

    for _ in range(max_steps):
        cur_i, cur_j = state
        x_curr, y_curr, z_curr = inverse_map[cur_i, cur_j]

        # Track the lowest point before deciding anything else this iteration
        min_elevation = min(min_elevation, z_curr)

        # Stop as soon as the current state is inside the goal region
        if planar_gap(x_curr, y_curr, goal_x, goal_y) <= goal_radius:
            print(f"Reached the goal in {steps} steps with cumulative reward {total_reward:.2f}!")
            break

        # Plan, then act through the MDP's combined transition/reward function
        best_action = mcts(state)
        actions.append(best_action)
        next_state, reward = hill_climb_mdp.TR(state, best_action, steps)
        total_reward += reward
        steps += 1

        # An invalid transition still counts as a step (and its reward) but ends the run
        if next_state is None:
            print("Reached boundary, stopping exploration.")
            break

        nxt_i, nxt_j = next_state
        x_next, y_next, z_next = inverse_map[nxt_i, nxt_j]

        path_length += planar_gap(x_next, y_next, x_curr, y_curr)

        # The signed elevation delta is accumulated only when the new point
        # sits above the lowest elevation seen so far
        if z_next > min_elevation:
            elevation_change += z_next - z_curr

        state = next_state
        trajectory.append(state)

    # How much the executed path exceeds the direct start-to-goal figures
    excess_distance = path_length - original_distance
    excess_elevation = elevation_change - original_elevation

    return (trajectory, actions, steps, total_reward, path_length, elevation_change,
            excess_distance, excess_elevation)


# Example usage
initial_state = 20, 160
# Matches prominent_hill_height in generate_plot. execute_mcts_policy reads this
# as a global, so it must be assigned before the call below.
goal_elevation = 5
trajectory, actions, steps, total_reward, path_length, elevation_change, excess_distance, excess_elevation = execute_mcts_policy(
    initial_state, mcts, goal_radius, inverse_map, max_steps=200
)

# Report the episode: visited states, chosen actions and summary metrics
print(initial_state)
print(f"Trajectory: {trajectory}")
print(f"Actions: {actions}")
print(f"Steps Taken: {steps}")
print(f"Cumulative Reward: {total_reward:.2f}")
print(f"Path Length: {path_length:.2f}")
print(f"Total Elevation Change: {elevation_change:.2f}")
print(f"Excess Distance: {excess_distance:.2f}")
print(f"Excess Elevation Change: {excess_elevation:.2f}")


(20, 160)
Trajectory: [(20, 160), (24, 157), (28, 154), (32, 151), (36, 148), (40, 145), (44, 142), (48, 139), (52, 136), (56, 133), (60, 130), (64, 127), (68, 124), (72, 121), (76, 118), (80, 115), (84, 112), (88, 109), (92, 106), (96, 103), (100, 100), (104, 97), (108, 94), (112, 91), (116, 88), (120, 85), (124, 82), (128, 79), (132, 76), (136, 73), (140, 70), (144, 67), (148, 64), (152, 61), (156, 58), (160, 55), (164, 52), (168, 49), (172, 46), (170, 51), (172, 56), (169, 57), (170, 53), (171, 56), (169, 57), (174, 56), (170, 51), (175, 55), (177, 59), (175, 64), (170, 59), (165, 62), (160, 61), (155, 63), (150, 63), (145, 64), (141, 62), (136, 67), (131, 69), (126, 71), (131, 67), (136, 67), (141, 70), (144, 67), (149, 65), (145, 70), (140, 70), (145, 75), (150, 74), (153, 69), (152, 70), (155, 70), (158, 73), (162, 70), (157, 70), (152, 68), (148, 66), (150, 63), (152, 61), (156, 66), (160, 68), (155, 68), (152, 73), (150, 69), (148, 65), (150, 65), (153, 63), (154, 67), (153, 69

In [37]:
# Convert the trajectory of index pairs into [x, y, z] triples via inverse_map.
# NOTE(review): in the recorded output, state (20, 160) maps to x=160.0, y=20.0, so the
# first index of a state appears to correspond to y rather than x — confirm in discretize_terrain.
trajectory_xyz = []
# Given trajectory from earlier (indices and values)
for x_idx, y_idx in trajectory:
    # These three names hold scalars here; a later cell rebinds them to lists
    trajectory_x, trajectory_y, trajectory_z = inverse_map[x_idx, y_idx]
    trajectory_xyz.append([trajectory_x, trajectory_y, trajectory_z])

# Prints every point of the trajectory
print(trajectory_xyz)

[[160.0, 20.0, 1.2512704182309122], [155.0, 20.0, 1.2512704182309122], [150.0, 25.0, 1.0015162654998193], [150.0, 30.0, 0.7517621127687265], [145.0, 35.0, 0.5020079600376337], [145.0, 40.0, 0.5020079600376337], [140.0, 40.0, 0.7517621127687265], [135.0, 45.0, 1.0015162654998193], [135.0, 50.0, 1.2512704182309122], [130.0, 55.0, 1.5010245709620051], [130.0, 60.0, 1.7507787236930978], [125.0, 60.0, 1.2512704182309122], [120.0, 65.0, 1.0015162654998193], [120.0, 70.0, 0.7517621127687265], [115.0, 75.0, 0.7517621127687265], [115.0, 80.0, 0.7517621127687265], [110.0, 80.0, 1.0015162654998193], [105.0, 85.0, 1.0015162654998193], [105.0, 90.0, 1.0015162654998193], [100.0, 95.0, 0.7517621127687265], [100.0, 100.0, 0.5020079600376337], [95.0, 100.0, 0.5020079600376337], [90.0, 105.0, 0.25225380730654084], [90.0, 110.0, 0.5020079600376337], [85.0, 115.0, 1.0015162654998193], [85.0, 120.0, 1.5010245709620051], [80.0, 120.0, 2.0005328764241908], [75.0, 125.0, 2.2502870291552832], [75.0, 130.0, 2.0

In [38]:
# Extract x, y, z values from trajectory
trajectory_x = [point[0] for point in trajectory_xyz]
trajectory_y = [point[1] for point in trajectory_xyz]
trajectory_z = [point[2] + 0.5  for point in trajectory_xyz] # raise the path by a fixed 0.5 so it shows on the graph

# Rebuild the terrain figure. generate_plot re-seeds the global random generators
# and this assignment rebinds x, y, z and the prominent hill centre.
terrain_fig, x, y, z, prominent_hill_x, prominent_hill_y = generate_plot()

# Add the trajectory as a 3D line (marker size is 0, so only the line is drawn)
terrain_fig.add_trace(go.Scatter3d(
    x=trajectory_x,
    y=trajectory_y,
    z=trajectory_z,
    mode='lines+markers',
    line=dict(color='red', width=4),
    marker=dict(size=0, color='red'),
    name = 'Final Trajectory'
))

# Add the initial state with a blue marker
terrain_fig.add_trace(go.Scatter3d(
    x=[trajectory_x[0]],
    y=[trajectory_y[0]],
    z=[trajectory_z[0]],
    mode='markers',
    marker=dict(size=8, color='blue', symbol='circle'),
    name="Initial State"
))


# Set plot title and axis labels. The scene, size and margin values repeat those
# already applied inside generate_plot; only the title and legend are new here.
terrain_fig.update_layout(
    title="Randomized Hill Terrain with Trajectory",
    scene=dict(
        xaxis_title="X Axis",
        yaxis_title="Y Axis",
        zaxis_title="Height",
        zaxis=dict(range=[0, 10])
    ),
    legend=dict(
        x=0.05,  # Horizontal position of the legend
        y=1,     # Vertical position of the legend
        traceorder="normal",
        title="Legend",
        font=dict(
            family="Arial",
            size=12,
            color="black"
        ),
        bgcolor="rgba(255, 255, 255, 0.7)",
        bordercolor="Black",
        borderwidth=1
    ),
    autosize=False,
    width=800,
    height=800,
    margin=dict(l=65, r=50, b=65, t=90),
)

# Show the plot
terrain_fig.show()


In [39]:
# Show the goal position (the prominent hill centre) for comparison with the plotted trajectory
print(goal_x, goal_y)

91.11859864448355 137.0969428108047
