In [1]:
from rl_env import FlexSimEnv
import numpy as np
import pandas as pd

## Training with best parameters

In [8]:
from training_claude import train_vehicles

In [9]:
# Hyperparameters shared by every training run in this cell.
learning_rate = 5e-3
gamma = 0.98
n_steps = 18_000

# Each reward weight is declared next to the checkpoint it is saved to, so the
# weight and its path cannot drift out of sync when an entry is added or removed.
training_runs = [
    (-4.5, 'models/dqn_weight_4_5.pth'),
    (-5.0, 'models/dqn_weight_5_0.pth'),
    (-5.5, 'models/dqn_weight_5_5.pth'),
]
# Kept as separate lists because they were notebook-level names before.
reward_weights = [weight for weight, _ in training_runs]
model_paths = [path for _, path in training_runs]

# NOTE(review): `env` is not passed to train_vehicles in this cell -- confirm
# whether it is still needed here before removing it.
env = FlexSimEnv()

# One training run per (reward weight, checkpoint path) pair.
for reward_weight, save_path in training_runs:
    train_vehicles(reward_weight=reward_weight,
                   learning_rate=learning_rate,
                   gamma=gamma,
                   n_steps=n_steps,
                   save_path=save_path)

Training with shared policy...

Training complete! Final average reward: -2.46

Model saved as models/dqn_weight_4_5.pth
Training with shared policy...

Training complete! Final average reward: -2.57

Model saved as models/dqn_weight_5_0.pth
Training with shared policy...

Training complete! Final average reward: -2.69

Model saved as models/dqn_weight_5_5.pth


## Slope evaluations

In [4]:
from eval import evaluate_slopes

In [5]:
# Reward weights handed to evaluate_slopes; the log printed by this cell shows
# one evaluation per (slope, reward_weight) pair.
# NOTE(review): these weights differ from the ones used by the training cell in
# this notebook (-4.5, -5.0, -5.5) -- confirm which models evaluate_slopes uses.
reward_weights = [-2.0, -2.5, -3.0]
slope_results = evaluate_slopes(reward_weights=reward_weights)

Evaluating slope 0.0, reward_weight -2.0
Evaluating slope 0.0, reward_weight -2.5
Evaluating slope 0.0, reward_weight -3.0
Evaluating slope 0.5, reward_weight -2.0
Evaluating slope 0.5, reward_weight -2.5
Evaluating slope 0.5, reward_weight -3.0
Evaluating slope 1.0, reward_weight -2.0
Evaluating slope 1.0, reward_weight -2.5
Evaluating slope 1.0, reward_weight -3.0
Evaluating slope 1.5, reward_weight -2.0
Evaluating slope 1.5, reward_weight -2.5
Evaluating slope 1.5, reward_weight -3.0
Evaluating slope 2.0, reward_weight -2.0
Evaluating slope 2.0, reward_weight -2.5
Evaluating slope 2.0, reward_weight -3.0
Evaluating slope 2.5, reward_weight -2.0
Evaluating slope 2.5, reward_weight -2.5
Evaluating slope 2.5, reward_weight -3.0
Evaluating slope 3.0, reward_weight -2.0
Evaluating slope 3.0, reward_weight -2.5
Evaluating slope 3.0, reward_weight -3.0


In [7]:
# Tabulate the slope-evaluation results as a DataFrame.
slope_df = pd.DataFrame(slope_results)

In [9]:
# pd.DataFrame(slope_results).sort_values(by=['slope','mean_reward'])

## grid search

In [1]:
from training_claude import grid_search_dqn

In [2]:
# Hyperparameter grid, keyed by the keyword arguments passed to grid_search_dqn.
search_space = {
    'lr_values': [6e-3],                          # learning rate
    'n_steps_values': [12_000, 16_000, 18_000],   # timesteps
    'gamma_values': [0.985, 0.995],               # discount factor
    'reward_weights': [-3.0, -3.2, -3.5],         # reward weights
}
grid_search_results = grid_search_dqn(verbose=True, **search_space)

Training with shared policy...

Training complete! Final average reward: -1.72

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-3.0
Reward: -1.285 +/- 1.475
------------------------
Training with shared policy...

Training complete! Final average reward: -1.66

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-3.2
Reward: -2.568 +/- 2.533
------------------------
Training with shared policy...

Training complete! Final average reward: -1.78

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-3.5
Reward: -1.075 +/- 1.458
------------------------
Training with shared policy...

Training complete! Final average reward: -1.73

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.995, reward_weight=-3.0
Reward: -2.682 +/- 2.408
------------------------
Training with shared policy...

Training complete! Final average reward: -1.67

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0

In [5]:
# Tabulate the grid-search results as a DataFrame.
grid_search_df = pd.DataFrame(grid_search_results)

In [6]:
grid_search_df.sort_values(by=['weight', 'mean_reward'])

Unnamed: 0,weight,lr,n_steps,gamma,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
5,-3.5,0.006,12000,0.995,-2.649,2.737,38.7,38.7,1.7,4.4,49.1
14,-3.5,0.006,18000,0.985,-2.643,2.725,39.6,39.6,1.8,4.3,48.2
17,-3.5,0.006,18000,0.995,-2.617,2.763,43.8,41.9,1.7,4.4,51.4
11,-3.5,0.006,16000,0.995,-1.333,2.194,56.2,43.6,1.7,4.5,25.3
2,-3.5,0.006,12000,0.985,-1.075,1.458,60.8,25.6,1.9,8.7,3.0
8,-3.5,0.006,16000,0.985,-0.897,1.564,58.7,34.5,1.9,6.4,8.1
16,-3.2,0.006,18000,0.995,-2.694,2.733,41.4,35.3,1.8,4.4,52.9
13,-3.2,0.006,18000,0.985,-2.648,2.539,37.6,37.6,1.8,4.3,52.4
1,-3.2,0.006,12000,0.985,-2.568,2.533,40.7,40.7,1.7,4.2,53.4
4,-3.2,0.006,12000,0.995,-2.533,2.577,44.6,41.8,1.7,4.8,56.1


In [21]:
from pathlib import Path

# Create the output directory on demand so the export also works on a fresh
# checkout where 'outputs/' does not exist yet.
grid_search_csv = Path('outputs/grid_search_results.csv')
grid_search_csv.parent.mkdir(parents=True, exist_ok=True)
grid_search_df.to_csv(grid_search_csv, index=False)

## test default training

In [1]:
# train
from training_claude import train_vehicles

# Only the per-component reward weights are passed; every other argument is left
# at its default. The result is unpacked into `agent` and `scores`.
# NOTE(review): another training cell in this notebook passes a scalar
# `reward_weight=` keyword instead of this `reward_weights=` dict -- confirm which
# signature training_claude.train_vehicles currently exposes.
agent, scores = train_vehicles(reward_weights={'off_schedule_trips': -4.0, 'skipped_requests': -1.0})

Training with shared policy...
Episode 0/500, Avg Score: -0.48, Epsilon: 1.0000
Episode 20/500, Avg Score: -0.69, Epsilon: 0.8521
Episode 40/500, Avg Score: -0.66, Epsilon: 0.7261
Episode 60/500, Avg Score: -0.63, Epsilon: 0.6187
Episode 80/500, Avg Score: -0.70, Epsilon: 0.5280
Episode 100/500, Avg Score: -0.57, Epsilon: 0.4501
Episode 120/500, Avg Score: -0.51, Epsilon: 0.3836
Episode 140/500, Avg Score: -0.48, Epsilon: 0.3268
Episode 160/500, Avg Score: -0.60, Epsilon: 0.2785
Episode 180/500, Avg Score: -0.47, Epsilon: 0.2374
Episode 200/500, Avg Score: -0.56, Epsilon: 0.2023
Episode 220/500, Avg Score: -0.42, Epsilon: 0.1724
Episode 240/500, Avg Score: -0.44, Epsilon: 0.1469
Episode 260/500, Avg Score: -0.38, Epsilon: 0.1252
Episode 280/500, Avg Score: -0.41, Epsilon: 0.1067
Episode 300/500, Avg Score: -0.43, Epsilon: 0.0909
Episode 320/500, Avg Score: -0.41, Epsilon: 0.0774
Episode 340/500, Avg Score: -0.47, Epsilon: 0.0660
Episode 360/500, Avg Score: -0.34, Epsilon: 0.0562
Episod

## TODO: turn this into a function that, given a control stop and a number of requests, creates scatterplots for headway and schedule deviation

In [226]:
from training_claude import load_agent
import numpy as np

# Restore the agent from its saved checkpoint.
loaded_agent = load_agent('shared_dqn_agent.pth')

# Hand-built sample observation: a dictionary with the expected keys, where
# every value is wrapped in a one-element array.
state = dict(
    control_stop_idx=np.array([0]),
    n_requests=np.array([2]),
    headway=np.array([700.5]),
    schedule_deviation=np.array([10]),
)
vehicle_idx = 0  # example vehicle index

# Query the loaded model with eval_mode=True and report the action it picks.
action = loaded_agent.act(state, eval_mode=True)
print(f"For state {state}, the model recommends action: {action}")

For state {'control_stop_idx': array([0]), 'n_requests': array([2]), 'headway': array([700.5]), 'schedule_deviation': array([10])}, the model recommends action: 0


In [227]:
from rl_env import FlexSimEnv
import pandas as pd

In [228]:
def evaluate_agent(env, agent, num_episodes=10, seed=0):
    """Roll out ``agent`` in ``env`` and collect the per-episode history tables.

    Args:
        env: Environment exposing ``reset()`` -> ``(observation, info)``,
            ``step(action)`` -> ``(observation, reward, terminated, truncated,
            info)``, and an inner ``env`` attribute providing ``get_history()``.
        agent: Policy exposing ``act(observation, eval_mode=True)``.
        num_episodes: Number of episodes to run.
        seed: Seed for NumPy's global RNG, applied once before the first
            episode. Pass ``None`` to leave the RNG state untouched.

    Returns:
        dict with keys ``'pax'``, ``'vehicles'``, ``'state'`` and ``'idle'``.
        Each value is the concatenation of that history table over all
        episodes, with ``scenario`` (always ``'RL'``) and ``episode`` columns
        added. When no episode is run, every value is an empty DataFrame.
    """
    results = {'pax': [], 'vehicles': [], 'state': [], 'idle': []}

    if seed is not None:
        np.random.seed(seed)

    for episode in range(num_episodes):
        observation, _info = env.reset()
        done = False

        # Every episode takes at least one step, so a single loop covers both
        # the first decision after reset and all subsequent ones.
        while not done:
            # Ask for the evaluation-mode action explicitly, matching the
            # `act(state, eval_mode=True)` call used elsewhere in this notebook.
            # NOTE(review): a vehicle index passed as the second positional
            # argument would land in the `eval_mode` slot if the signature is
            # act(state, eval_mode=...) -- confirm against the agent class.
            action = agent.act(observation, eval_mode=True)
            observation, _reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated

        # Tag this episode's history tables and queue them for concatenation.
        history = env.env.get_history()
        for key in history:
            history[key]['scenario'] = 'RL'
            history[key]['episode'] = episode
            results[key].append(history[key])

    # pd.concat raises on an empty list, so fall back to an empty frame.
    return {
        key: pd.concat(frames) if frames else pd.DataFrame()
        for key, frames in results.items()
    }

In [229]:
from params import *

In [231]:
# Build a fresh environment and run the loaded agent through 40 evaluation episodes.
env = FlexSimEnv()
results = evaluate_agent(env, loaded_agent, num_episodes=40)

In [85]:
from utils import create_field_from_list_column

In [93]:
# Keep only the rows inside the reporting window; the minute-based bounds from
# params are scaled by 60 before being compared with `time`.
in_window = results['state']['time'].between(RESULTS_START_TIME_MINUTES*60, RESULTS_END_TIME_MINUTES*60)
# .copy() makes the filtered frame independent of results['state'], so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
state = results['state'].loc[in_window].copy()

# The helper is given the reward list as a string column; it appears to add the
# named field to `state` in place (its return value is not used here).
state['unweighted_rewards'] = state['unweighted_rewards'].astype(str)
create_field_from_list_column(state, 0, 'skipped_requests', field_name='unweighted_rewards')
create_field_from_list_column(state, 1, 'off_schedule_trips', field_name='unweighted_rewards')

# Same weights as the train_vehicles call in the default-training section:
# -1.0 per skipped request, -4.0 per off-schedule trip.
state['weighted_reward'] = state['skipped_requests'] * -1.0 + state['off_schedule_trips'] * -4.0

In [236]:
# Unfiltered copy of the state log.
state_raw = results['state'].copy()
# state_raw.groupby(['episode']).size().head() # confirm it's 65 control steps per episode

# Whole episodes that fit into a 10,000-step budget at 65 control steps each.
num_time_steps = 10_000
time_steps_per_episode = 65
num_episodes = num_time_steps // time_steps_per_episode

In [237]:
num_episodes

153

## toy with environment

In [6]:
rl_env  = FlexSimEnv()

In [None]:
# Show the environment object's repr.
rl_env

<rl_env.FlexSimEnv at 0x171bcb690>

In [None]:
rl_env.reset()

({'control_stop_idx': array([0], dtype=int32),
  'n_requests': array([0], dtype=int32),
  'headway': array([594.], dtype=float32),
  'schedule_deviation': array([-55.], dtype=float32)},
 {'skipped_requests': 0,
  'off_schedule_trips': 0,
  'time': 705.0,
  'veh_idx': 1,
  'direction': 'out'})

In [None]:
rl_env.step(action=0)

({'control_stop_idx': array([1], dtype=int32),
  'n_requests': array([1], dtype=int32),
  'headway': array([495.], dtype=float32),
  'schedule_deviation': array([-75.], dtype=float32)},
 -3.0,
 0,
 0,
 {'skipped_requests': 3,
  'off_schedule_trips': 0,
  'time': 6837.0,
  'veh_idx': 1,
  'direction': 'out'})

In [None]:
from params import REWARD_WEIGHTS
REWARD_WEIGHTS

{'skipped_requests': -1.0, 'off_schedule_trips': -1.0}