In [1]:
from objects import EnvironmentManager
from helpers import *

env = EnvironmentManager()
env.start_vehicles()
env.route.load_all_pax()

observation, reward, terminated, truncated, info = env.step(action=None)
while not terminated:
    action = get_action('RA')
    observation, reward, terminated, truncated, info = env.step(action=action)

history = env.get_history()
# for key in history:
#     history[key]['scenario'] = scenario
#     history[key]['episode'] = i
#     results[key].append(history[key])

In [1]:
from rl_env import FlexSimEnv
import numpy as np

In [2]:
rl_env  = FlexSimEnv()

In [5]:
rl_env.reset()

({'control_stop_idx': array([1], dtype=int32),
  'n_requests': array([1], dtype=int32),
  'headway': array([666.], dtype=float32),
  'schedule_deviation': array([-18.], dtype=float32)},
 {'skipped_requests': 0,
  'off_schedule_trips': 0,
  'time': 894.0,
  'veh_idx': 1,
  'direction': 'out'})

In [19]:
rl_env.step(action=0)

({'control_stop_idx': array([0], dtype=int32),
  'n_requests': array([3], dtype=int32),
  'headway': array([593.], dtype=float32),
  'schedule_deviation': array([-21.], dtype=float32)},
 -2.0,
 0,
 0,
 {'skipped_requests': 2,
  'off_schedule_trips': 0,
  'time': 5537.0,
  'veh_idx': 1,
  'direction': 'out'})

In [20]:
from params import REWARD_WEIGHTS
REWARD_WEIGHTS

{'skipped_requests': -1.0, 'off_schedule_trips': -1.0}

In [1]:
# train
from training_claude import train_vehicles

agent, scores = train_vehicles(reward_weights={'off_schedule_trips': -4.0, 'skipped_requests': -1.0})

Training with shared policy...
Episode 0/500, Avg Score: -0.48, Epsilon: 1.0000
Episode 20/500, Avg Score: -0.69, Epsilon: 0.8521
Episode 40/500, Avg Score: -0.66, Epsilon: 0.7261
Episode 60/500, Avg Score: -0.63, Epsilon: 0.6187
Episode 80/500, Avg Score: -0.70, Epsilon: 0.5280
Episode 100/500, Avg Score: -0.57, Epsilon: 0.4501
Episode 120/500, Avg Score: -0.51, Epsilon: 0.3836
Episode 140/500, Avg Score: -0.48, Epsilon: 0.3268
Episode 160/500, Avg Score: -0.60, Epsilon: 0.2785
Episode 180/500, Avg Score: -0.47, Epsilon: 0.2374
Episode 200/500, Avg Score: -0.56, Epsilon: 0.2023
Episode 220/500, Avg Score: -0.42, Epsilon: 0.1724
Episode 240/500, Avg Score: -0.44, Epsilon: 0.1469
Episode 260/500, Avg Score: -0.38, Epsilon: 0.1252
Episode 280/500, Avg Score: -0.41, Epsilon: 0.1067
Episode 300/500, Avg Score: -0.43, Epsilon: 0.0909
Episode 320/500, Avg Score: -0.41, Epsilon: 0.0774
Episode 340/500, Avg Score: -0.47, Epsilon: 0.0660
Episode 360/500, Avg Score: -0.34, Epsilon: 0.0562
Episod

In [81]:
from training_claude import load_agent
import numpy as np

# Example of loading and using the model
loaded_agent = load_agent('shared_dqn_agent.pth')

# Example of getting an action from a state
# The state should be a dictionary with the expected keys
state = {
    "control_stop_idx": np.array([0]),      # Make sure this is an array
    "n_requests": np.array([2]),            # Make sure this is an array
    "headway": np.array([700.5]),            # Make sure this is an array
    "schedule_deviation": np.array([10])  # Make sure this is an array
}
vehicle_idx = 0  # Example vehicle index

# Get action from loaded model
action = loaded_agent.act(state, eval_mode=True)
print(f"For state {state}, the model recommends action: {action}")

For state {'control_stop_idx': array([0]), 'n_requests': array([2]), 'headway': array([700.5]), 'schedule_deviation': array([10])}, the model recommends action: 0


In [36]:
from rl_env import FlexSimEnv
import pandas as pd

In [83]:
def evaluate_agent(env, agent, num_episodes=10):
    """Evaluate the agent's performance in the environment."""
    results = {'pax': [], 'vehicles': [], 'state': [], 'idle': []}

    np.random.seed(0)
    for episode in range(num_episodes):
        # Initialize state tracking for each vehicle
        vehicle_observations = {}
        vehicle_actions = {}  # Track previous actions
        
        # Start the episode
        next_observation, info = env.reset()
        vehicle_idx = info['veh_idx']

        # update observation
        observation = next_observation
        vehicle_observations[vehicle_idx] = observation
        
        # select action
        action = agent.act(observation, vehicle_idx)
        vehicle_actions[vehicle_idx] = action

        # take action in environment
        next_observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        while not done:
            # Get current vehicle index
            vehicle_idx = info['veh_idx']
            
            # update observation
            observation = next_observation
            vehicle_observations[vehicle_idx] = observation

            # Select action using shared policy
            action = agent.act(observation, vehicle_idx)
            vehicle_actions[vehicle_idx] = action
            
            # Take action in environment
            next_observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
        # recordings
        history = env.env.get_history()
        for key in history:
            history[key]['scenario'] = 'RL'
            history[key]['episode'] = episode
            results[key].append(history[key])
    for df_key in results:
        results[df_key] = pd.concat(results[df_key])
    return results

In [55]:
from params import *

In [84]:
env = FlexSimEnv()
results = evaluate_agent(env, loaded_agent, num_episodes=40)

In [85]:
from utils import create_field_from_list_column

In [93]:
state = results['state'].copy()
state = state[state['time'].between(RESULTS_START_TIME_MINUTES*60, RESULTS_END_TIME_MINUTES*60)]
state['unweighted_rewards'] = state['unweighted_rewards'].astype(str)
create_field_from_list_column(state, 0, 'skipped_requests', field_name='unweighted_rewards')
create_field_from_list_column(state, 1, 'off_schedule_trips', field_name='unweighted_rewards')
state['weighted_reward'] = state['skipped_requests'] * -1.0 + state['off_schedule_trips'] * -4.0

In [95]:
state.groupby(['scenario'])['action'].sum()

scenario
RL    718
Name: action, dtype: int64

In [96]:
state['skipped_requests'].sum()

1327.0

In [98]:
state['off_schedule_trips'].sum()

111.0

In [99]:
state.shape

(2360, 11)

In [101]:
state['weighted_reward'].sum()

-1771.0

In [26]:
recorded_actions = []
recorded_observations = []
rl_env = FlexSimEnv()

observation, info = rl_env.reset()
vehicle_idx = info['veh_idx']
action = loaded_agent.act(observation, vehicle_idx, eval_mode=True)

observation, reward, terminated, truncated, info = rl_env.step(action=action)
while not terminated:
    # Use the loaded agent to predict the action based on current observation
    action = loaded_agent.act(observation, vehicle_idx, eval_mode=True)
    recorded_actions.append(action)
    recorded_observations.append(observation)
    observation, reward, terminated, truncated, info = rl_env.step(action=action)

In [27]:
for item in rl_env.env.route.inter_event:
    print(item)

{'skipped_requests': 0, 'off_schedule_trips': 0}
{'skipped_requests': 0, 'off_schedule_trips': 0}


In [28]:
zero_idxs = [i for i in range(len(recorded_actions)) if recorded_actions[i] == 0]

In [29]:
zero_idxs

[0,
 3,
 4,
 5,
 6,
 7,
 8,
 10,
 11,
 12,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 22,
 23,
 24,
 26,
 27,
 28,
 32,
 33,
 34,
 35,
 36,
 37,
 40,
 41,
 44,
 45,
 48,
 49,
 50,
 52,
 55,
 58,
 59,
 60,
 61,
 62,
 63,
 64]

In [None]:
rl_env = FlexSimEnv()

observation, info = rl_env.reset()
action = model.predict(observation, deterministic=True)
action = int(action[0])
observation, reward, terminated, truncated, info = rl_env.step(action=action)
while not terminated:
    # Use the loaded agent to predict the action based on current observation
    action = model.predict(observation, deterministic=True)
    action = int(action[0])
    observation, reward, terminated, truncated, info = rl_env.step(action=action)

history = rl_env.env.get_history()
for key in history:
    history[key]['scenario'] = 'RL_' + model_name
    history[key]['episode'] = i
    results[key].append(history[key])