In [1]:
from rl_env import FlexSimEnv
import numpy as np
import pandas as pd

## Training with best parameters

In [2]:
from training_claude import train_vehicles

In [3]:
def float_to_str(f):
    """Return ``str(f)`` with every '.' replaced by '_' (e.g. ``-2.0`` -> ``'-2_0'``).

    Used below to embed a reward weight in a model file name.
    """
    text = str(f)
    return '_'.join(text.split('.'))

In [4]:
# Hyperparameters shared by every training run in this cell.
learning_rate = 6e-3
gamma = 0.995
demand_scenario = ['peak', 'off_peak']

# One entry per batch: (reward weights to train, number of training steps).
# The larger weight (-7.0) is trained with a longer step budget than the others.
training_batches = [
    ([-2.0, -3.0, -5.0], 12_000),
    ([-7.0], 14_000),
]

# Train and save one model per (reward weight, demand scenario) pair.
# Iteration order matches the original cell: weights outermost, scenarios innermost.
for reward_weights, n_steps in training_batches:
    for weight in reward_weights:
        for scenario in demand_scenario:
            # e.g. models/peak/dqn_-2_0.pth
            model_path = f'models/{scenario}/dqn_{float_to_str(weight)}.pth'
            train_vehicles(reward_weight=weight,
                           learning_rate=learning_rate,
                           gamma=gamma,
                           n_steps=n_steps,
                           save_path=model_path,
                           demand_scenario=scenario)

Training with shared policy...

Training complete! Final average reward: -1.66

Model saved as models/peak/dqn_-2_0.pth
Training with shared policy...

Training complete! Final average reward: -1.89

Model saved as models/off_peak/dqn_-2_0.pth
Training with shared policy...

Training complete! Final average reward: -1.52

Model saved as models/peak/dqn_-3_0.pth
Training with shared policy...

Training complete! Final average reward: -1.74

Model saved as models/off_peak/dqn_-3_0.pth
Training with shared policy...

Training complete! Final average reward: -1.59

Model saved as models/peak/dqn_-5_0.pth
Training with shared policy...

Training complete! Final average reward: -1.82

Model saved as models/off_peak/dqn_-5_0.pth
Training with shared policy...

Training complete! Final average reward: -1.62

Model saved as models/peak/dqn_-7_0.pth
Training with shared policy...

Training complete! Final average reward: -1.88

Model saved as models/off_peak/dqn_-7_0.pth


## Grid search

In [4]:
from training_claude import grid_search_dqn
import pandas as pd

In [5]:
# Candidate values handed to grid_search_dqn, keyed by its keyword arguments.
search_space = dict(
    lr_values=[6e-3],                         # learning rate
    n_steps_values=[12_000, 14_000, 16_000],  # timesteps
    gamma_values=[0.995],                     # discount factor
    reward_weights=[-2.0, -3.0, -5.0, -7.0],  # reward weights
    demand_scenarios=['peak', 'off_peak'],
)
grid_search_results = grid_search_dqn(**search_space, verbose=True)

Training with shared policy...

Training complete! Final average reward: -1.61

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.995, reward_weight=-2.0 demand_scenario=peak
Reward: -0.99 +/- 1.646
------------------------
Training with shared policy...

Training complete! Final average reward: -1.78

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.995, reward_weight=-2.0 demand_scenario=off_peak
Reward: -1.118 +/- 1.809
------------------------
Training with shared policy...

Training complete! Final average reward: -1.58

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.995, reward_weight=-3.0 demand_scenario=peak
Reward: -1.001 +/- 1.597
------------------------
Training with shared policy...

Training complete! Final average reward: -1.71

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.995, reward_weight=-3.0 demand_scenario=off_peak
Reward: -1.034 +/- 1.719
------------------------
Training with shared policy...

Training complete

In [8]:
# Wrap the grid-search output in a DataFrame so it can be sorted and inspected below.
grid_search_df = pd.DataFrame(grid_search_results)

In [None]:
# grid_search_df.groupby(['demand_scenario'])['avg_picked_requests'].describe()

In [9]:
# Show the results ordered by weight, then demand scenario, then mean reward.
sort_columns = ['weight', 'demand_scenario', 'mean_reward']
grid_search_df.sort_values(by=sort_columns)

Unnamed: 0,weight,lr,n_steps,gamma,demand_scenario,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
7,-7.0,0.006,12000,0.995,off_peak,-1.019,1.704,56.6,35.2,1.9,11.2,9.7
15,-7.0,0.006,14000,0.995,off_peak,-0.994,1.699,57.3,35.9,1.8,10.4,9.7
23,-7.0,0.006,16000,0.995,off_peak,-0.981,1.561,58.3,29.8,2.0,12.5,5.3
6,-7.0,0.006,12000,0.995,peak,-1.116,1.877,55.7,39.8,1.7,9.0,12.6
22,-7.0,0.006,16000,0.995,peak,-1.052,1.709,58.8,33.5,1.9,11.4,8.5
14,-7.0,0.006,14000,0.995,peak,-1.036,1.697,58.5,35.2,1.9,10.5,8.6
5,-5.0,0.006,12000,0.995,off_peak,-1.146,1.865,57.0,39.8,1.7,10.1,13.1
21,-5.0,0.006,16000,0.995,off_peak,-1.13,1.834,56.3,37.8,1.7,10.5,11.9
13,-5.0,0.006,14000,0.995,off_peak,-1.003,1.703,57.8,34.1,1.9,11.0,9.3
20,-5.0,0.006,16000,0.995,peak,-1.1,1.86,56.0,39.6,1.8,9.1,12.6


In [None]:
# grid_search_df.to_csv('outputs/grid_search_results.csv', index=False)

## Test default training

In [1]:
# train
from training_claude import train_vehicles

# Train with every train_vehicles argument left at its default except the reward
# weights; the call returns the trained agent and the training scores.
# NOTE(review): this call passes `reward_weights` (a dict keyed by reward component),
# whereas the "best parameters" cell passes `reward_weight` (a single float) —
# confirm which keyword the current train_vehicles signature accepts.
agent, scores = train_vehicles(reward_weights={'off_schedule_trips': -4.0, 'skipped_requests': -1.0})

Training with shared policy...
Episode 0/500, Avg Score: -0.48, Epsilon: 1.0000
Episode 20/500, Avg Score: -0.69, Epsilon: 0.8521
Episode 40/500, Avg Score: -0.66, Epsilon: 0.7261
Episode 60/500, Avg Score: -0.63, Epsilon: 0.6187
Episode 80/500, Avg Score: -0.70, Epsilon: 0.5280
Episode 100/500, Avg Score: -0.57, Epsilon: 0.4501
Episode 120/500, Avg Score: -0.51, Epsilon: 0.3836
Episode 140/500, Avg Score: -0.48, Epsilon: 0.3268
Episode 160/500, Avg Score: -0.60, Epsilon: 0.2785
Episode 180/500, Avg Score: -0.47, Epsilon: 0.2374
Episode 200/500, Avg Score: -0.56, Epsilon: 0.2023
Episode 220/500, Avg Score: -0.42, Epsilon: 0.1724
Episode 240/500, Avg Score: -0.44, Epsilon: 0.1469
Episode 260/500, Avg Score: -0.38, Epsilon: 0.1252
Episode 280/500, Avg Score: -0.41, Epsilon: 0.1067
Episode 300/500, Avg Score: -0.43, Epsilon: 0.0909
Episode 320/500, Avg Score: -0.41, Epsilon: 0.0774
Episode 340/500, Avg Score: -0.47, Epsilon: 0.0660
Episode 360/500, Avg Score: -0.34, Epsilon: 0.0562
Episod

## TODO: turn this into a function that, for a given control stop and number of requests, creates scatterplots of headway and schedule deviation

In [226]:
from training_claude import load_agent
import numpy as np

# Load a previously saved agent and query it for a single hand-built observation.
loaded_agent = load_agent('shared_dqn_agent.pth')

# Raw values of the example observation; the agent expects a dictionary with
# exactly these keys.
example_values = {
    "control_stop_idx": 0,
    "n_requests": 2,
    "headway": 700.5,
    "schedule_deviation": 10,
}
# Every field must be an array, so wrap each scalar in a length-1 array.
state = {field: np.array([value]) for field, value in example_values.items()}
vehicle_idx = 0  # Example vehicle index

# Ask the loaded model for its action in evaluation mode and report it.
action = loaded_agent.act(state, eval_mode=True)
print(f"For state {state}, the model recommends action: {action}")

For state {'control_stop_idx': array([0]), 'n_requests': array([2]), 'headway': array([700.5]), 'schedule_deviation': array([10])}, the model recommends action: 0


In [227]:
from rl_env import FlexSimEnv
import pandas as pd

In [228]:
def evaluate_agent(env, agent, num_episodes=10, seed=0):
    """Roll out ``agent`` in ``env`` and collect the simulator history of every episode.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and whose
            ``step(action)`` returns ``(observation, reward, terminated, truncated,
            info)``. After each episode ``env.env.get_history()`` is read; it must
            return a dict of DataFrames.
        agent: Policy queried with ``agent.act(observation, eval_mode=True)``, the
            same call form used when the loaded agent is queried directly.
        num_episodes: Number of episodes to run.
        seed: Seed applied once to NumPy's global RNG before the first episode
            (default 0, as before). Pass ``None`` to leave the RNG untouched.

    Returns:
        dict mapping each history key ('pax', 'vehicles', 'state', 'idle', plus any
        other key the history contains) to one DataFrame holding all episodes, with
        added columns ``scenario`` (always 'RL') and ``episode``. A key for which no
        frame was collected maps to an empty DataFrame.
    """
    results = {'pax': [], 'vehicles': [], 'state': [], 'idle': []}

    if seed is not None:
        np.random.seed(seed)

    for episode in range(num_episodes):
        observation, _ = env.reset()
        done = False
        while not done:
            # Evaluate with eval_mode=True. The previous code passed the vehicle
            # index as the second positional argument, which does not match the
            # act(state, eval_mode=True) call used elsewhere in this notebook.
            action = agent.act(observation, eval_mode=True)
            observation, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

        # Tag this episode's history frames and stash them for concatenation.
        history = env.env.get_history()
        for key in history:
            frame = history[key]
            frame['scenario'] = 'RL'
            frame['episode'] = episode
            results.setdefault(key, []).append(frame)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    return {
        key: pd.concat(frames) if frames else pd.DataFrame()
        for key, frames in results.items()
    }

In [229]:
from params import *

In [231]:
# Fresh simulation environment for the evaluation run.
env = FlexSimEnv()
# Roll out the loaded agent for 40 episodes and keep the per-key history frames.
results = evaluate_agent(env=env, agent=loaded_agent, num_episodes=40)

In [85]:
from utils import create_field_from_list_column

In [93]:
# Keep only the control states recorded inside the results window. The window
# bounds are given in minutes and multiplied by 60 before comparing with 'time'
# (presumably 'time' is in seconds — TODO confirm).
state_all = results['state']
in_results_window = state_all['time'].between(RESULTS_START_TIME_MINUTES*60, RESULTS_END_TIME_MINUTES*60)
# Explicit .copy(): the assignments below must write to an independent frame, not
# to a boolean-filtered selection (which triggers SettingWithCopyWarning).
state = state_all.loc[in_results_window].copy()

# Cast the list column to str, then extract its elements 0 and 1 into the named
# columns used below.
state['unweighted_rewards'] = state['unweighted_rewards'].astype(str)
create_field_from_list_column(state, 0, 'skipped_requests', field_name='unweighted_rewards')
create_field_from_list_column(state, 1, 'off_schedule_trips', field_name='unweighted_rewards')

# Same weights as passed to train_vehicles in the default-training cell:
# skipped_requests -> -1.0, off_schedule_trips -> -4.0.
state['weighted_reward'] = state['skipped_requests'] * -1.0 + state['off_schedule_trips'] * -4.0

In [236]:
# Unfiltered copy of the recorded control states.
state_raw = results['state'].copy()
# state_raw.groupby(['episode']).size().head() # confirm it's 65 control steps per episode

# Number of whole episodes that fit into a budget of 10,000 control steps.
num_time_steps = 10_000
time_steps_per_episode = 65
num_episodes = num_time_steps // time_steps_per_episode

In [237]:
# Display the number of whole episodes computed in the previous cell.
num_episodes

153