In [6]:
from rl_env import FlexSimEnv
import numpy as np
import pandas as pd

## Training with best parameters

In [17]:
from training_claude import train_vehicles

In [18]:
# Hyperparameters shared by every training run in this cell.
learning_rate = 5e-3
gamma = 0.98
n_steps = 12_000

# One training run per reward weight, each saved to its own checkpoint file.
# (The earlier per-call comments named weights -6.0 / -5.0 that did not match
# the values actually passed; the table below is the single source of truth.)
training_runs = [
    (-2.0, 'models/dqn_weight_2.pth'),
    (-2.5, 'models/dqn_weight_2_5.pth'),
    (-3.0, 'models/dqn_weight_3.pth'),
]

for run_reward_weight, run_save_path in training_runs:
    # `agent` and `score` are rebound on every iteration, so after the loop
    # they refer to the last run (reward weight -3.0), exactly as before.
    agent, score = train_vehicles(
        learning_rate=learning_rate,
        gamma=gamma,
        n_steps=n_steps,
        reward_weight=run_reward_weight,
        save_path=run_save_path,
    )

Training with shared policy...

Training complete! Final average reward: -1.02

Model saved as models/dqn_weight_2.pth
Training with shared policy...

Training complete! Final average reward: -1.18

Model saved as models/dqn_weight_2_5.pth
Training with shared policy...

Training complete! Final average reward: -1.27

Model saved as models/dqn_weight_3.pth


## Slope evaluations

In [4]:
from eval import evaluate_slopes

In [5]:
# Same three reward weights that were used for the training runs above.
# The log printed below shows one evaluation per (slope, reward_weight) pair.
reward_weights = [-2.0, -2.5, -3.0]
slope_results = evaluate_slopes(reward_weights=reward_weights)

Evaluating slope 0.0, reward_weight -2.0
Evaluating slope 0.0, reward_weight -2.5
Evaluating slope 0.0, reward_weight -3.0
Evaluating slope 0.5, reward_weight -2.0
Evaluating slope 0.5, reward_weight -2.5
Evaluating slope 0.5, reward_weight -3.0
Evaluating slope 1.0, reward_weight -2.0
Evaluating slope 1.0, reward_weight -2.5
Evaluating slope 1.0, reward_weight -3.0
Evaluating slope 1.5, reward_weight -2.0
Evaluating slope 1.5, reward_weight -2.5
Evaluating slope 1.5, reward_weight -3.0
Evaluating slope 2.0, reward_weight -2.0
Evaluating slope 2.0, reward_weight -2.5
Evaluating slope 2.0, reward_weight -3.0
Evaluating slope 2.5, reward_weight -2.0
Evaluating slope 2.5, reward_weight -2.5
Evaluating slope 2.5, reward_weight -3.0
Evaluating slope 3.0, reward_weight -2.0
Evaluating slope 3.0, reward_weight -2.5
Evaluating slope 3.0, reward_weight -3.0


In [7]:
slope_df = pd.DataFrame(slope_results)

In [9]:
# pd.DataFrame(slope_results).sort_values(by=['slope','mean_reward'])

## grid search

In [11]:
from training_claude import grid_search_dqn

In [12]:
# Sweep over 1 learning rate x 2 step budgets x 2 discount factors x 3 reward
# weights; the results table further down has indices 0-11, i.e. one row per
# combination (12 runs).
grid_search_results = grid_search_dqn(
    lr_values=[5e-3], # learning rate
    n_steps_values=[12_000,18_000], # timesteps
    gamma_values=[0.98, 0.99], # discount factor
    reward_weights=[-2.0, -2.5, -3.0], # reward weights
    verbose=True
)

Training with shared policy...

Training complete! Final average reward: -0.97

Evaluation summary:
Params: lr=0.005, n_steps=12000, gamma=0.98, reward_weight=-2.0
Reward: -0.857 +/- 1.328
------------------------
Training with shared policy...

Training complete! Final average reward: -1.19

Evaluation summary:
Params: lr=0.005, n_steps=12000, gamma=0.98, reward_weight=-2.5
Reward: -0.622 +/- 1.103
------------------------
Training with shared policy...

Training complete! Final average reward: -1.26

Evaluation summary:
Params: lr=0.005, n_steps=12000, gamma=0.98, reward_weight=-3.0
Reward: -0.71 +/- 1.432
------------------------
Training with shared policy...

Training complete! Final average reward: -0.95

Evaluation summary:
Params: lr=0.005, n_steps=12000, gamma=0.99, reward_weight=-2.0
Reward: -0.53 +/- 1.083
------------------------
Training with shared policy...

Training complete! Final average reward: -1.20

Evaluation summary:
Params: lr=0.005, n_steps=12000, gamma=0.99, r

In [13]:
grid_search_df = pd.DataFrame(grid_search_results)

In [14]:
slope_df.sort_values(by=['reward_weight','mean_reward'])

Unnamed: 0,slope,reward_weight,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
2,0.0,-3.0,-1.322,1.993,35.6,35.6,1.5,6.9,23.7
5,0.5,-3.0,-1.08,1.867,36.3,34.5,1.5,6.9,17.9
8,1.0,-3.0,-0.908,1.687,38.1,34.1,1.6,6.5,14.1
11,1.5,-3.0,-0.824,1.605,38.8,33.0,1.6,6.6,11.0
14,2.0,-3.0,-0.785,1.533,38.6,31.9,1.7,6.6,9.6
20,3.0,-3.0,-0.739,1.507,39.2,31.8,1.6,7.0,8.1
17,2.5,-3.0,-0.731,1.505,38.7,31.6,1.6,6.3,8.5
1,0.0,-2.5,-1.101,1.661,35.6,35.6,1.5,6.9,23.7
4,0.5,-2.5,-0.905,1.568,36.3,34.5,1.5,6.9,17.9
7,1.0,-2.5,-0.768,1.423,38.1,34.1,1.6,6.5,14.1


In [15]:
grid_search_df.sort_values(by=['weight','mean_reward'])

Unnamed: 0,weight,lr,n_steps,gamma,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
8,-3.0,0.005,18000,0.98,-1.604,1.545,44.3,8.8,1.6,13.2,1.6
11,-3.0,0.005,18000,0.99,-1.422,2.059,34.4,34.4,1.6,6.7,25.1
5,-3.0,0.005,12000,0.99,-0.842,1.305,40.9,21.1,1.8,7.9,2.2
2,-3.0,0.005,12000,0.98,-0.71,1.432,37.8,29.8,1.6,7.4,6.5
7,-2.5,0.005,18000,0.98,-1.114,1.726,36.8,35.4,1.5,5.5,24.6
4,-2.5,0.005,12000,0.99,-0.975,1.604,35.4,35.4,1.5,5.7,21.6
1,-2.5,0.005,12000,0.98,-0.622,1.103,40.9,27.5,1.7,6.4,3.8
10,-2.5,0.005,18000,0.99,-0.531,1.035,38.4,27.3,1.7,6.2,4.2
9,-2.0,0.005,18000,0.99,-0.946,1.37,33.6,33.6,1.6,7.2,25.0
0,-2.0,0.005,12000,0.98,-0.857,1.328,35.2,35.2,1.5,6.1,24.0


## test default training

In [1]:
# train
from training_claude import train_vehicles

# NOTE(review): this call passes `reward_weights` (a dict keyed by reward term),
# whereas the "Training with best parameters" cell passes a scalar
# `reward_weight`. The execution counts (In [1] here vs In [18] there) suggest
# the two cells ran against different versions of `train_vehicles` — confirm
# which signature is current before re-running the notebook top to bottom.
agent, scores = train_vehicles(reward_weights={'off_schedule_trips': -4.0, 'skipped_requests': -1.0})

Training with shared policy...
Episode 0/500, Avg Score: -0.48, Epsilon: 1.0000
Episode 20/500, Avg Score: -0.69, Epsilon: 0.8521
Episode 40/500, Avg Score: -0.66, Epsilon: 0.7261
Episode 60/500, Avg Score: -0.63, Epsilon: 0.6187
Episode 80/500, Avg Score: -0.70, Epsilon: 0.5280
Episode 100/500, Avg Score: -0.57, Epsilon: 0.4501
Episode 120/500, Avg Score: -0.51, Epsilon: 0.3836
Episode 140/500, Avg Score: -0.48, Epsilon: 0.3268
Episode 160/500, Avg Score: -0.60, Epsilon: 0.2785
Episode 180/500, Avg Score: -0.47, Epsilon: 0.2374
Episode 200/500, Avg Score: -0.56, Epsilon: 0.2023
Episode 220/500, Avg Score: -0.42, Epsilon: 0.1724
Episode 240/500, Avg Score: -0.44, Epsilon: 0.1469
Episode 260/500, Avg Score: -0.38, Epsilon: 0.1252
Episode 280/500, Avg Score: -0.41, Epsilon: 0.1067
Episode 300/500, Avg Score: -0.43, Epsilon: 0.0909
Episode 320/500, Avg Score: -0.41, Epsilon: 0.0774
Episode 340/500, Avg Score: -0.47, Epsilon: 0.0660
Episode 360/500, Avg Score: -0.34, Epsilon: 0.0562
Episod

## TODO: turn this into a function that, for a given control stop and number of requests, creates scatterplots for headway and schedule deviation

In [226]:
from training_claude import load_agent
import numpy as np

# Load the agent stored in 'shared_dqn_agent.pth' and query it once on a
# hand-built observation.
loaded_agent = load_agent('shared_dqn_agent.pth')

# The observation is a dict holding one length-1 NumPy array per field, with
# the same four keys that `FlexSimEnv.reset()` returns in the "toy with
# environment" section of this notebook.
# NOTE(review): the environment emits int32/float32 arrays there, while these
# literals use NumPy's default dtypes — confirm the agent does not depend on dtype.
state = {
    "control_stop_idx": np.array([0]),
    "n_requests": np.array([2]),
    "headway": np.array([700.5]),
    "schedule_deviation": np.array([10])
}
vehicle_idx = 0  # Example vehicle index. NOTE(review): not passed to act() below — confirm whether it is still needed.

# Get action from loaded model (eval_mode=True is passed explicitly here).
action = loaded_agent.act(state, eval_mode=True)
print(f"For state {state}, the model recommends action: {action}")

For state {'control_stop_idx': array([0]), 'n_requests': array([2]), 'headway': array([700.5]), 'schedule_deviation': array([10])}, the model recommends action: 0


In [227]:
from rl_env import FlexSimEnv
import pandas as pd

In [228]:
def evaluate_agent(env, agent, num_episodes=10, seed=0):
    """Roll out ``agent`` in ``env`` and collect the per-episode history tables.

    Each episode resets the environment and then repeatedly asks the agent for
    an action on the current observation until the environment reports
    ``terminated`` or ``truncated``. After every episode the tables returned by
    ``env.env.get_history()`` are tagged with ``scenario='RL'`` and the episode
    number, then accumulated.

    Args:
        env: Environment exposing ``reset() -> (observation, info)``,
            ``step(action) -> (observation, reward, terminated, truncated, info)``
            and an inner ``env`` attribute that provides ``get_history()``.
        agent: Policy exposing ``act(observation, eval_mode=True)``.
        num_episodes: Number of episodes to roll out.
        seed: Value passed to ``np.random.seed`` once before the first episode
            (default 0, as before). ``None`` leaves the global NumPy RNG alone.

    Returns:
        dict with the keys 'pax', 'vehicles', 'state' and 'idle', each mapped
        to the concatenation of that history table over all episodes (an empty
        DataFrame when no episode contributed a table).

    Raises:
        KeyError: if ``get_history()`` returns a key other than the four above.
    """
    results = {'pax': [], 'vehicles': [], 'state': [], 'idle': []}

    if seed is not None:
        np.random.seed(seed)

    for episode in range(num_episodes):
        observation, _ = env.reset()
        done = False

        while not done:
            # Query the shared policy in evaluation mode, using the same call
            # form as the direct `loaded_agent.act(state, eval_mode=True)`
            # query in this notebook. The vehicle index used to be passed as a
            # second positional argument.
            # NOTE(review): confirm act() takes no vehicle-index parameter;
            # if its second parameter is `eval_mode`, the old call silently
            # used the vehicle index as that flag.
            action = agent.act(observation, eval_mode=True)
            observation, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

        # Tag this episode's history tables and keep them for concatenation.
        history = env.env.get_history()
        for key in history:
            history[key]['scenario'] = 'RL'
            history[key]['episode'] = episode
            results[key].append(history[key])

    # pd.concat raises on an empty list, so fall back to an empty frame.
    return {
        key: pd.concat(frames) if frames else pd.DataFrame()
        for key, frames in results.items()
    }

In [229]:
from params import *

In [231]:
env = FlexSimEnv()
results = evaluate_agent(env, loaded_agent, num_episodes=40)

In [85]:
from utils import create_field_from_list_column

In [93]:
# Restrict the recorded states to the results window. The bounds are given in
# minutes and multiplied by 60 — presumably `time` is in seconds; confirm
# against the simulator.
# The explicit .copy() after filtering makes `state` an independent frame, so
# the column assignments below neither touch `results['state']` nor trigger
# pandas' SettingWithCopyWarning.
state = results['state']
state = state.loc[state['time'].between(RESULTS_START_TIME_MINUTES*60, RESULTS_END_TIME_MINUTES*60)].copy()

# NOTE(review): the helper's return value is ignored and the columns
# 'skipped_requests' / 'off_schedule_trips' are read right afterwards, so it
# apparently adds them to `state` in place from positions 0 and 1 of
# 'unweighted_rewards' (cast to str first) — confirm against utils.
state['unweighted_rewards'] = state['unweighted_rewards'].astype(str)
create_field_from_list_column(state, 0, 'skipped_requests', field_name='unweighted_rewards')
create_field_from_list_column(state, 1, 'off_schedule_trips', field_name='unweighted_rewards')

# Same weights as the dict passed to train_vehicles in the "test default
# training" cell: skipped_requests -1.0, off_schedule_trips -4.0.
state['weighted_reward'] = state['skipped_requests'] * -1.0 + state['off_schedule_trips'] * -4.0

In [236]:
# How many whole episodes fit into a fixed budget of control steps.
state_raw = results['state'].copy()
# state_raw.groupby(['episode']).size().head() # confirm it's 65 control steps per episode
time_steps_per_episode = 65
num_time_steps = 10_000
# Floor division keeps only complete episodes (10_000 // 65 == 153).
num_episodes = num_time_steps // time_steps_per_episode

In [237]:
num_episodes

153

## toy with environment

In [6]:
rl_env  = FlexSimEnv()

In [None]:
# Display the environment object. The previous `rl_env.route.` was an unfinished
# attribute access (a syntax error); the recorded output is the repr of `rl_env`.
rl_env

<rl_env.FlexSimEnv at 0x171bcb690>

In [None]:
rl_env.reset()

({'control_stop_idx': array([0], dtype=int32),
  'n_requests': array([0], dtype=int32),
  'headway': array([594.], dtype=float32),
  'schedule_deviation': array([-55.], dtype=float32)},
 {'skipped_requests': 0,
  'off_schedule_trips': 0,
  'time': 705.0,
  'veh_idx': 1,
  'direction': 'out'})

In [None]:
rl_env.step(action=0)

({'control_stop_idx': array([1], dtype=int32),
  'n_requests': array([1], dtype=int32),
  'headway': array([495.], dtype=float32),
  'schedule_deviation': array([-75.], dtype=float32)},
 -3.0,
 0,
 0,
 {'skipped_requests': 3,
  'off_schedule_trips': 0,
  'time': 6837.0,
  'veh_idx': 1,
  'direction': 'out'})

In [None]:
from params import REWARD_WEIGHTS
REWARD_WEIGHTS

{'skipped_requests': -1.0, 'off_schedule_trips': -1.0}