In [7]:
from rl_env import FlexSimEnv
import numpy as np
import pandas as pd

## Training with best parameters

In [12]:
from training_claude import train_vehicles

In [13]:
def float_to_str(f):
    """Return ``str(f)`` with every '.' swapped for '_' (e.g. -2.7 -> '-2_7').

    Used below to embed a reward weight in a model file name.
    """
    pieces = str(f).split('.')
    return '_'.join(pieces)

In [14]:
# Hyperparameters shared by every run in this sweep.
learning_rate = 6e-3
gamma = 0.995
n_steps = 16_000

# One model is trained and saved per (reward weight, demand scenario) pair.
reward_weights = [-2.7, -3.0, -3.2, -3.5, -3.8]
demand_scenario = ['peak', 'off_peak']

# Iterate over the values directly instead of indexing with range(len(...)).
# Order is unchanged: weights in the outer loop, scenarios in the inner loop.
for weight in reward_weights:
    for scenario in demand_scenario:
        # Produces paths such as models/peak/dqn_-2_7.pth
        model_path = f'models/{scenario}/dqn_{float_to_str(weight)}.pth'
        train_vehicles(
            reward_weight=weight,
            learning_rate=learning_rate,
            gamma=gamma,
            n_steps=n_steps,
            save_path=model_path,
            demand_scenario=scenario,
        )

Training with shared policy...

Training complete! Final average reward: -1.69

Model saved as models/peak/dqn_-2_7.pth
Training with shared policy...

Training complete! Final average reward: -1.91

Model saved as models/off_peak/dqn_-2_7.pth
Training with shared policy...

Training complete! Final average reward: -1.64

Model saved as models/peak/dqn_-3_0.pth
Training with shared policy...

Training complete! Final average reward: -2.09

Model saved as models/off_peak/dqn_-3_0.pth
Training with shared policy...

Training complete! Final average reward: -1.75

Model saved as models/peak/dqn_-3_2.pth
Training with shared policy...

Training complete! Final average reward: -2.00

Model saved as models/off_peak/dqn_-3_2.pth
Training with shared policy...

Training complete! Final average reward: -1.77

Model saved as models/peak/dqn_-3_5.pth
Training with shared policy...

Training complete! Final average reward: -1.95

Model saved as models/off_peak/dqn_-3_5.pth
Training with shared pol

## Slope evaluations

In [4]:
from eval import evaluate_slopes

In [5]:
# NOTE: this rebinds `reward_weights`, replacing the list used by the training
# sweep earlier in the notebook.
# NOTE(review): only -3.0 overlaps with the weights trained in that sweep;
# confirm evaluate_slopes has what it needs for -2.0 and -2.5.
reward_weights = [-2.0, -2.5, -3.0]
# Per the progress log of the recorded run, every reward weight is evaluated at
# each slope from 0.0 to 3.0 in steps of 0.5.
slope_results = evaluate_slopes(reward_weights=reward_weights)

Evaluating slope 0.0, reward_weight -2.0
Evaluating slope 0.0, reward_weight -2.5
Evaluating slope 0.0, reward_weight -3.0
Evaluating slope 0.5, reward_weight -2.0
Evaluating slope 0.5, reward_weight -2.5
Evaluating slope 0.5, reward_weight -3.0
Evaluating slope 1.0, reward_weight -2.0
Evaluating slope 1.0, reward_weight -2.5
Evaluating slope 1.0, reward_weight -3.0
Evaluating slope 1.5, reward_weight -2.0
Evaluating slope 1.5, reward_weight -2.5
Evaluating slope 1.5, reward_weight -3.0
Evaluating slope 2.0, reward_weight -2.0
Evaluating slope 2.0, reward_weight -2.5
Evaluating slope 2.0, reward_weight -3.0
Evaluating slope 2.5, reward_weight -2.0
Evaluating slope 2.5, reward_weight -2.5
Evaluating slope 2.5, reward_weight -3.0
Evaluating slope 3.0, reward_weight -2.0
Evaluating slope 3.0, reward_weight -2.5
Evaluating slope 3.0, reward_weight -3.0


In [7]:
slope_df = pd.DataFrame(slope_results)

In [9]:
# pd.DataFrame(slope_results).sort_values(by=['slope','mean_reward'])

## Grid search

In [1]:
from training_claude import grid_search_dqn
import pandas as pd

In [3]:
# Train and evaluate a DQN for the hyperparameter values listed below.
# The recorded log shows one "Training ... / Evaluation summary" pair per
# (lr, n_steps, gamma, reward_weight, demand_scenario) combination.
# NOTE(review): presumably a full cartesian product (1 x 3 x 2 x 5 x 2 = 60 runs) - confirm in training_claude.
grid_search_results = grid_search_dqn(
    lr_values=[6e-3], # learning rate
    n_steps_values=[12_000, 16_000, 18_000], # timesteps
    gamma_values=[0.985, 0.995], # discount factor
    reward_weights = [-2.7, -3.0, -3.2, -3.5, -3.8], # reward weights
    demand_scenarios=['peak', 'off_peak'],
    verbose=True
)

Training with shared policy...

Training complete! Final average reward: -1.76

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-2.7 demand_scenario=peak
Reward: -1.653 +/- 2.029
------------------------
Training with shared policy...

Training complete! Final average reward: -1.91

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-2.7 demand_scenario=off_peak
Reward: -0.936 +/- 1.603
------------------------
Training with shared policy...

Training complete! Final average reward: -1.70

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-3.0 demand_scenario=peak
Reward: -1.04 +/- 1.601
------------------------
Training with shared policy...

Training complete! Final average reward: -1.94

Evaluation summary:
Params: lr=0.006, n_steps=12000, gamma=0.985, reward_weight=-3.0 demand_scenario=off_peak
Reward: -1.652 +/- 1.966
------------------------
Training with shared policy...

Training complete

In [4]:
grid_search_df = pd.DataFrame(grid_search_results)

In [8]:
# Summary statistics of avg_picked_requests across the grid-search runs, split by demand scenario.
grid_search_df.groupby(['demand_scenario'])['avg_picked_requests'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
demand_scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
off_peak,30.0,1.696667,0.07184,1.6,1.625,1.7,1.7,1.8
peak,30.0,1.686667,0.116658,1.4,1.6,1.7,1.7,2.0


In [6]:
# Ascending sort: rows are ordered by weight, then demand scenario, with the
# lowest mean_reward first within each (weight, scenario) group.
grid_search_df.sort_values(by=['weight', 'demand_scenario','mean_reward'])

Unnamed: 0,weight,lr,n_steps,gamma,demand_scenario,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
59,-3.8,0.006,18000,0.995,off_peak,-1.357,1.89,54.2,39.6,1.7,14.0,16.3
19,-3.8,0.006,12000,0.995,off_peak,-1.162,1.83,54.2,41.3,1.6,13.6,12.5
29,-3.8,0.006,16000,0.985,off_peak,-1.082,1.718,54.4,37.9,1.8,12.5,11.3
49,-3.8,0.006,18000,0.985,off_peak,-1.075,1.743,56.3,38.5,1.7,13.9,9.8
9,-3.8,0.006,12000,0.985,off_peak,-1.005,1.757,55.0,40.9,1.6,12.1,10.5
39,-3.8,0.006,16000,0.995,off_peak,-0.997,1.638,56.5,38.4,1.8,14.0,10.1
58,-3.8,0.006,18000,0.995,peak,-2.094,2.063,46.6,44.9,1.6,10.5,34.6
18,-3.8,0.006,12000,0.995,peak,-1.737,2.109,51.6,38.7,1.7,11.2,25.5
48,-3.8,0.006,18000,0.985,peak,-1.388,2.02,54.5,45.6,1.6,12.0,18.4
28,-3.8,0.006,16000,0.985,peak,-1.358,1.921,57.3,40.7,1.7,12.3,16.5


In [3]:
grid_search_df = pd.DataFrame(grid_search_results)

In [4]:
grid_search_df.sort_values(by=['weight', 'mean_reward'])

Unnamed: 0,weight,lr,n_steps,gamma,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
5,-3.5,0.006,12000,0.995,-1.357,1.978,52.6,43.1,1.6,13.6,16.1
17,-3.5,0.006,18000,0.995,-1.269,1.906,54.9,45.4,1.6,11.9,16.5
14,-3.5,0.006,18000,0.985,-1.222,1.818,55.8,41.3,1.6,12.8,14.3
11,-3.5,0.006,16000,0.995,-1.126,1.857,52.5,42.9,1.6,12.0,13.0
2,-3.5,0.006,12000,0.985,-0.936,1.552,58.2,35.3,1.7,15.0,4.5
8,-3.5,0.006,16000,0.985,-0.875,1.594,54.6,36.6,1.7,14.0,4.8
13,-3.2,0.006,18000,0.985,-1.323,2.001,52.6,44.2,1.6,12.6,16.2
4,-3.2,0.006,12000,0.995,-1.282,1.866,56.0,40.7,1.7,13.9,14.1
10,-3.2,0.006,16000,0.995,-1.219,1.872,56.8,39.8,1.7,13.8,12.0
1,-3.2,0.006,12000,0.985,-1.014,1.503,59.7,30.3,1.8,18.4,2.1


In [6]:
grid_search_df.sort_values(by=['weight', 'mean_reward'])

Unnamed: 0,weight,lr,n_steps,gamma,mean_reward,std_reward,deviation_opportunities,deviations,avg_picked_requests,early_trips,late_trips
5,-3.5,0.006,12000,0.995,-2.649,2.737,38.7,38.7,1.7,4.4,49.1
14,-3.5,0.006,18000,0.985,-2.643,2.725,39.6,39.6,1.8,4.3,48.2
17,-3.5,0.006,18000,0.995,-2.617,2.763,43.8,41.9,1.7,4.4,51.4
11,-3.5,0.006,16000,0.995,-1.333,2.194,56.2,43.6,1.7,4.5,25.3
2,-3.5,0.006,12000,0.985,-1.075,1.458,60.8,25.6,1.9,8.7,3.0
8,-3.5,0.006,16000,0.985,-0.897,1.564,58.7,34.5,1.9,6.4,8.1
16,-3.2,0.006,18000,0.995,-2.694,2.733,41.4,35.3,1.8,4.4,52.9
13,-3.2,0.006,18000,0.985,-2.648,2.539,37.6,37.6,1.8,4.3,52.4
1,-3.2,0.006,12000,0.985,-2.568,2.533,40.7,40.7,1.7,4.2,53.4
4,-3.2,0.006,12000,0.995,-2.533,2.577,44.6,41.8,1.7,4.8,56.1


In [21]:
# Persist the grid-search table without the row index.
# NOTE(review): grid_search_df is rebuilt in several cells above and the displayed
# tables differ in columns (with and without demand_scenario); confirm which run
# this file is meant to capture.
grid_search_df.to_csv('outputs/grid_search_results.csv', index=False)

## Test default training

In [1]:
# train
from training_claude import train_vehicles

# Train with train_vehicles' own defaults, passing only the per-component reward weights.
# NOTE(review): the sweep cell earlier in the notebook calls train_vehicles with a
# scalar `reward_weight=` keyword, while this call passes a `reward_weights=` dict;
# confirm which signature the current training_claude module accepts.
agent, scores = train_vehicles(reward_weights={'off_schedule_trips': -4.0, 'skipped_requests': -1.0})

Training with shared policy...
Episode 0/500, Avg Score: -0.48, Epsilon: 1.0000
Episode 20/500, Avg Score: -0.69, Epsilon: 0.8521
Episode 40/500, Avg Score: -0.66, Epsilon: 0.7261
Episode 60/500, Avg Score: -0.63, Epsilon: 0.6187
Episode 80/500, Avg Score: -0.70, Epsilon: 0.5280
Episode 100/500, Avg Score: -0.57, Epsilon: 0.4501
Episode 120/500, Avg Score: -0.51, Epsilon: 0.3836
Episode 140/500, Avg Score: -0.48, Epsilon: 0.3268
Episode 160/500, Avg Score: -0.60, Epsilon: 0.2785
Episode 180/500, Avg Score: -0.47, Epsilon: 0.2374
Episode 200/500, Avg Score: -0.56, Epsilon: 0.2023
Episode 220/500, Avg Score: -0.42, Epsilon: 0.1724
Episode 240/500, Avg Score: -0.44, Epsilon: 0.1469
Episode 260/500, Avg Score: -0.38, Epsilon: 0.1252
Episode 280/500, Avg Score: -0.41, Epsilon: 0.1067
Episode 300/500, Avg Score: -0.43, Epsilon: 0.0909
Episode 320/500, Avg Score: -0.41, Epsilon: 0.0774
Episode 340/500, Avg Score: -0.47, Epsilon: 0.0660
Episode 360/500, Avg Score: -0.34, Epsilon: 0.0562
Episod

## TODO: turn this into a function that, for a given control stop and number of requests, creates scatterplots for headway and schedule deviation

In [226]:
from training_claude import load_agent
import numpy as np

# Load a previously saved shared-policy agent from disk.
loaded_agent = load_agent('shared_dqn_agent.pth')

# Build a single hand-written observation to query the policy with.
# The state is a dictionary with the four keys below; every value is wrapped
# in a one-element NumPy array rather than passed as a bare scalar.
state = {
    "control_stop_idx": np.array([0]),      # index of the control stop
    "n_requests": np.array([2]),            # number of requests
    "headway": np.array([700.5]),            # headway (units not shown here - TODO confirm)
    "schedule_deviation": np.array([10])  # schedule deviation (units not shown here - TODO confirm)
}
vehicle_idx = 0  # Example vehicle index (not passed to act() below)

# Query the loaded model for an action.
# eval_mode=True presumably disables exploration - confirm in training_claude.
action = loaded_agent.act(state, eval_mode=True)
print(f"For state {state}, the model recommends action: {action}")

For state {'control_stop_idx': array([0]), 'n_requests': array([2]), 'headway': array([700.5]), 'schedule_deviation': array([10])}, the model recommends action: 0


In [227]:
from rl_env import FlexSimEnv
import pandas as pd

In [228]:
def evaluate_agent(env, agent, num_episodes=10, seed=0):
    """Evaluate the agent's performance in the environment.

    Runs ``num_episodes`` full episodes with the shared policy and gathers the
    simulator history recorded for each one.

    Parameters
    ----------
    env
        Environment exposing ``reset() -> (observation, info)``,
        ``step(action) -> (observation, reward, terminated, truncated, info)``
        and ``env.env.get_history()`` returning a dict of DataFrames.
    agent
        Policy exposing ``act(observation, eval_mode=True)``.
    num_episodes : int
        Number of episodes to roll out.
    seed : int or None
        Seed applied once to NumPy's global RNG before the first episode
        (default 0, as before). Pass ``None`` to leave the RNG untouched.

    Returns
    -------
    dict
        Maps each history key ('pax', 'vehicles', 'state', 'idle', plus any
        other key the environment reports) to a DataFrame concatenated over
        all episodes, tagged with ``scenario == 'RL'`` and the ``episode``
        number. A key with no recorded data maps to an empty DataFrame.
    """
    results = {'pax': [], 'vehicles': [], 'state': [], 'idle': []}

    if seed is not None:
        np.random.seed(seed)

    for episode in range(num_episodes):
        observation, _info = env.reset()

        # At least one step is always taken, exactly as in the previous
        # "first step, then while-loop" layout.
        done = False
        while not done:
            # Use the evaluation call form that the load-agent example in this
            # notebook uses for this same agent. The previous positional
            # `agent.act(observation, vehicle_idx)` never requested eval mode.
            # NOTE(review): eval_mode=True presumably disables exploration -
            # confirm in training_claude.
            action = agent.act(observation, eval_mode=True)
            observation, _reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated

        # Tag this episode's history frames and stash them per key.
        history = env.env.get_history()
        for key in history:
            frame = history[key]
            frame['scenario'] = 'RL'
            frame['episode'] = episode
            # setdefault tolerates history keys beyond the four expected ones.
            results.setdefault(key, []).append(frame)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    return {
        key: pd.concat(frames) if frames else pd.DataFrame()
        for key, frames in results.items()
    }

In [229]:
from params import *

In [231]:
# Build the simulator with no constructor arguments and roll the loaded agent
# out for 40 episodes; `results` maps each history key to a concatenated DataFrame.
env = FlexSimEnv()
results = evaluate_agent(env, loaded_agent, num_episodes=40)

In [85]:
from utils import create_field_from_list_column

In [93]:
# Keep only the control states logged inside the results window. The bounds come
# from params and are expressed in minutes; they are multiplied by 60 to compare
# against the 'time' column (presumably seconds - TODO confirm).
in_results_window = results['state']['time'].between(
    RESULTS_START_TIME_MINUTES*60, RESULTS_END_TIME_MINUTES*60
)
# Copy AFTER filtering: the column assignments below then act on an independent
# frame rather than on a boolean-filtered slice, which is what triggers pandas'
# SettingWithCopyWarning. It also avoids copying rows that are dropped anyway.
state = results['state'][in_results_window].copy()

# The helper is given the list column as strings and is called for list
# positions 0 and 1; the two named columns are read right after, so it
# presumably adds them to `state` in place (confirm in utils).
state['unweighted_rewards'] = state['unweighted_rewards'].astype(str)
create_field_from_list_column(state, 0, 'skipped_requests', field_name='unweighted_rewards')
create_field_from_list_column(state, 1, 'off_schedule_trips', field_name='unweighted_rewards')

# Same per-component weights as the default-training call in this notebook.
# NOTE(review): confirm they match the weights the loaded agent was trained with.
state_reward_weights = {'skipped_requests': -1.0, 'off_schedule_trips': -4.0}
state['weighted_reward'] = (
    state['skipped_requests'] * state_reward_weights['skipped_requests']
    + state['off_schedule_trips'] * state_reward_weights['off_schedule_trips']
)

In [236]:
# Unfiltered copy of the per-step state log from the evaluation run.
state_raw = results['state'].copy()
# state_raw.groupby(['episode']).size().head() # confirm it's 65 control steps per episode

# Step budget and episode length used to derive how many whole episodes fit.
num_time_steps = 10_000
time_steps_per_episode = 65
# Floor division keeps only complete episodes (10_000 // 65 == 153).
num_episodes = num_time_steps // time_steps_per_episode

In [237]:
num_episodes

153