# Mode Prediction Experiment

This experiment simulate the mode choice of 1000 agents in Cambridge and compare them with real data

### Get Evaluation Data

In [2]:
from baseline.data import load_data
import random
import numpy as np
import os
# 设置全局随机种子
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)  # 确保哈希随机性也被控制

eval_file = 'data/eval/replica-cambridge_trips_eval.csv'
eval_df = load_data(eval_file)
eval_df.head(2)

Unnamed: 0,person_id,age,gender,employment_status,household_size,household_income,available_vehicles,industry,education,trip_purpose,start_time,primary_mode,duration_minutes,age_group,income_group
0,14941376504966255761,16,male,not_in_labor_force,4,140343,two,not_working,k_12,eat,14,walking,10-20,Under 18,$100k-$150k
1,741506727884677094,60,male,employed,1,37850,one,naics445110,bachelors_degree,eat,14,walking,50-60,55-64,$10k-$50k


###  Run experiments

In [3]:
import json
import os
import random
from tqdm import tqdm
import pandas as pd
import concurrent.futures
from functools import partial

from mobility_agent.agent import MobilityAgent
from baseline.eval import cal_group_kl_divergence

def process_row(row, num_samples, profile_columns,city):
    try:
        idx = row.name
        profile = row[profile_columns].to_dict()
        desire = row['trip_purpose']
        time = row['start_time']
        
        agent = MobilityAgent(profile=json.dumps(profile), sample_num=num_samples,city=city)
        agent.working_memory = ["Today is a normal weekday"]
        
        if num_samples == 0:
            mode_preference = agent.get_mode_prefernce(desire=desire, time=time, use_reference=False)
        else:
            mode_preference = agent.get_mode_prefernce(desire=desire, time=time, use_reference=True)
        
        choice_weights = mode_preference['choice_weights']
        modes = [choice['primary_mode'] for choice in choice_weights]
        weights = [choice['weight'] for choice in choice_weights]
        selected_mode_idx = random.choices(range(len(modes)), weights=weights, k=1)[0]
        selected_mode = choice_weights[selected_mode_idx]
        
        return {
            'idx': idx,
            'predicted_mode': selected_mode['primary_mode'],
            'predicted_duration': selected_mode['duration_minutes'],
            'selection_reason': json.dumps(mode_preference['think']),
            'choice_weights': json.dumps(choice_weights)
        }
    except Exception as e:
        print(f"Error processing row {row.name}: {e}")
        return None

def run_experiments(eval_df,max_workers,num_samples,save_path,city='Cambridge,MA'):
    # Set up parameters
    profile_columns = ['age_group', 'income_group', 'employment_status', 'household_size', 'available_vehicles', 'education']
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    print(f"=======Mobility Agent (num_samples={num_samples})=======")

    # Create a partial function with fixed parameters
    process_row_partial = partial(process_row, num_samples=num_samples, profile_columns=profile_columns,city=city)

    # Process rows in parallel
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all jobs
        future_to_idx = {executor.submit(process_row_partial, row): idx 
                        for idx, row in eval_df.iterrows()}
        
        # Process results as they complete with a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_idx), total=len(eval_df)):
            result = future.result()
            if result:
                results.append(result)
                
            # Save intermediate results every 100 completed rows
            if len(results) % 100 == 0:
                # Update the dataframe with results so far
                temp_df = eval_df.copy()
                for res in results:
                    idx = res['idx']
                    temp_df.loc[idx, "predicted_mode"] = res['predicted_mode']
                    temp_df.loc[idx, "predicted_duration"] = res['predicted_duration']
                    temp_df.loc[idx, "selection_reason"] = res['selection_reason']
                    temp_df.loc[idx, "choice_weights"] = res['choice_weights']
                temp_df.to_csv(save_path)

    # Update final dataframe with all results
    for res in results:
        idx = res['idx']
        eval_df.loc[idx, "predicted_mode"] = res['predicted_mode']
        eval_df.loc[idx, "predicted_duration"] = res['predicted_duration']
        eval_df.loc[idx, "selection_reason"] = res['selection_reason']
        eval_df.loc[idx, "choice_weights"] = res['choice_weights']

    # Save final results
    eval_df.to_csv(save_path)

    print(f"=======Evaluating model=======")
    result_df = eval_df.copy()
    kl_df, overall_kl, overall_mae = cal_group_kl_divergence(result_df=result_df)
    print(f"Overall average KL divergence: {overall_kl:.4f}")
    print(f"Overall mean absolute error: {overall_mae:.4f}")

### Get Results from local LLM

In [3]:
# # Change BASEURL and APIKEY in .env to use ollama
# l1 = [i*10 for i in range(11)]
# l2 = [i*100 for i in range(2,11)]
# all_samples = l1 + l2
# max_workers = min(os.cpu_count(), 8) 

# for num_samples in [800,900,1000]:
#     for i in range(3):
#         save_path = f"models/mobility_agent/mobility_agent_sim{i}_{num_samples}.csv"
#         run_experiments(eval_df,max_workers=max_workers,num_samples=num_samples,save_path=save_path)

### Get Results from OpenAI

In [4]:
# Change BASEURL and APIKEY in .env to use openai api
# l1 = [i*10 for i in range(11)]
# l2 = [i*100 for i in range(2,11)]
# all_samples = l1 + l2

all_samples = [i*100 for i in range(3,11)]
max_workers = min(os.cpu_count(), 8) 

for num_samples in all_samples:
    save_path = f"models/mobility_agent_gpt/mobility_agent_{num_samples}.csv"
    run_experiments(eval_df,max_workers=max_workers,num_samples=num_samples,save_path=save_path)



100%|██████████| 1000/1000 [35:04<00:00,  2.10s/it] 


Overall average KL divergence: 0.5823
Overall mean absolute error: 0.0840


100%|██████████| 1000/1000 [35:59<00:00,  2.16s/it] 


Overall average KL divergence: 0.6000
Overall mean absolute error: 0.0730


100%|██████████| 1000/1000 [35:08<00:00,  2.11s/it] 


Overall average KL divergence: 0.5897
Overall mean absolute error: 0.0697


 50%|████▉     | 497/1000 [17:52<37:31,  4.48s/it]  

Connection error.
Error processing row 489: 'NoneType' object is not subscriptable


 50%|████▉     | 499/1000 [17:53<20:33,  2.46s/it]

Connection error.
Error processing row 490: 'NoneType' object is not subscriptable


 50%|█████     | 502/1000 [17:56<12:54,  1.55s/it]

Connection error.
Error processing row 494: 'NoneType' object is not subscriptable


100%|██████████| 1000/1000 [34:38<00:00,  2.08s/it]


Overall average KL divergence: 0.6108
Overall mean absolute error: 0.0830


100%|██████████| 1000/1000 [34:44<00:00,  2.08s/it] 


Overall average KL divergence: 0.5101
Overall mean absolute error: 0.0793


100%|██████████| 1000/1000 [35:54<00:00,  2.15s/it] 


Overall average KL divergence: 0.5462
Overall mean absolute error: 0.0713


100%|██████████| 1000/1000 [36:15<00:00,  2.18s/it] 


Overall average KL divergence: 0.4748
Overall mean absolute error: 0.0929


100%|██████████| 1000/1000 [37:06<00:00,  2.23s/it] 

Overall average KL divergence: 0.6999
Overall mean absolute error: 0.1082





### Cambridge refrence predict SF

In [None]:
from baseline.data import load_data
import random
import numpy as np
import os
# 设置全局随机种子
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)  # 确保哈希随机性也被控制

# Change eval data to SF
eval_file = 'data/eval/replica-sf_trips_eval.csv'
eval_df = load_data(eval_file)
eval_df.head(2)

Unnamed: 0,person_id,age,gender,employment_status,household_size,household_income,available_vehicles,industry,education,trip_purpose,start_time,primary_mode,duration_minutes,age_group,income_group
0,1008394524663602144,41,male,employed,5,165219,two,naics722511,k_12,work,8,public_transit,40-50,35-44,$150k-$200k
1,17686081048863139877,57,female,employed,2,195539,one,naics5615,bachelors_degree,eat,11,auto_passenger,10-20,55-64,$150k-$200k


In [None]:
max_workers = 4
num_samples = 1000
save_path = f"models/mobility_agent/mobility_agent_gpt_{num_samples}.csv"
# change city to SF
run_experiments(eval_df,max_workers=max_workers,num_samples=num_samples,save_path=save_path,city="San Francisco,CA")

### SF refrence predict SF

In [4]:
from baseline.data import load_data
ref_file = 'data/reference/replica-sf_trips.csv'
eval_file = 'data/eval/replica-sf_trips_eval.csv'
eval_df = load_data(eval_file)
eval_df.head(2)

Unnamed: 0,person_id,age,gender,employment_status,household_size,household_income,available_vehicles,industry,education,trip_purpose,start_time,primary_mode,duration_minutes,age_group,income_group
0,1008394524663602144,41,male,employed,5,165219,two,naics722511,k_12,work,8,public_transit,40-50,35-44,$150k-$200k
1,17686081048863139877,57,female,employed,2,195539,one,naics5615,bachelors_degree,eat,11,auto_passenger,10-20,55-64,$150k-$200k


In [None]:
# Change REFERENCE_CITY and TRIP_FILE in .env to change the reference data

max_workers = 4
num_samples = 100
save_path = f"models/mobility_agent/mobility_agent_sf_refsf_{num_samples}.csv"
# change city to SF
run_experiments(eval_df,max_workers=max_workers,num_samples=num_samples,save_path=save_path,city="San Francisco,CA")



 98%|█████████▊| 985/1000 [3:22:57<01:42,  6.80s/it]  

Error processing row 985: list index out of range


100%|██████████| 1000/1000 [3:25:28<00:00, 12.33s/it]

Overall average KL divergence: 0.7991
Overall mean absolute error: 0.0632



