# Mode Prediction Experiment

This experiment simulates the mode choices of 1000 agents in Cambridge and compares them with real data.

### Get Evaluation Data

In [1]:
from baseline.data import load_data
# Path to the evaluation trips CSV, relative to the notebook's working directory.
eval_file = 'data/eval/replica-cambridge_trips_eval.csv'
# load_data comes from baseline.data; its result is used as a DataFrame below
# (.head here, .iterrows/.loc/.to_csv in the prediction cell).
eval_df = load_data(eval_file)
# Show the first two rows as a quick preview of the loaded table.
eval_df.head(2)

Unnamed: 0,person_id,age,gender,employment_status,household_size,household_income,available_vehicles,industry,education,trip_purpose,start_time,primary_mode,duration_minutes,age_group,income_group
0,14941376504966255761,16,male,not_in_labor_force,4,140343,two,not_working,k_12,eat,14,walking,10-20,Under 18,$100k-$150k
1,741506727884677094,60,male,employed,1,37850,one,naics445110,bachelors_degree,eat,14,walking,50-60,55-64,$10k-$50k


###  Get Results

In [2]:
import json
import os
import random
from tqdm import tqdm

from mobility_agent.agent import MobilityAgent
from baseline.eval import cal_group_kl_divergence,cal_topk_acc

# Columns of eval_df that are serialised to JSON and handed to the agent as its profile.
profile_columns = ['age_group','income_group','employment_status','household_size','available_vehicles','education']

# Features passed to cal_group_kl_divergence as `group_features`.
group_features = ['age_group','income_group', 'employment_status', 'household_size','available_vehicles', 'education', 'trip_purpose','start_time']

# Columns written by the prediction loop. They are cleared before every sweep
# step so that a row whose agent call fails never keeps the prediction made
# for the previous sample size.
prediction_columns = ["predicted_mode", "predicted_duration", "selection_reason", "choice_weights"]

# Metrics collected per sample size: {num_samples: {metric_name: value}}.
eval_results = {}
for i in range(1,11):
    num_samples = i*100
    print(f"=======Mobility Agent (num_samples={num_samples})=======")
    save_path = f"models/mobility_agent/mobility_agent_{num_samples}.csv"
    os.makedirs(os.path.dirname(save_path),exist_ok=True)

    # Start every sweep step from empty prediction columns (object dtype, all None).
    for column in prediction_columns:
        eval_df[column] = None
    failed_rows = []

    for idx, row in tqdm(eval_df.iterrows(),total=len(eval_df)):
        try:
            profile = row[profile_columns].to_dict()
            desire = row['trip_purpose']
            start_time = row['start_time']
            agent = MobilityAgent(profile=json.dumps(profile),sample_num=num_samples)
            agent.working_memory = ["Today is a normal weekday"]
            # NOTE(review): range(1, 11) never yields num_samples == 0, so the
            # no-reference branch is not exercised by this loop; it is kept so the
            # cell still works if the range is changed to start at 0.
            if num_samples == 0:
                mode_preference = agent.get_mode_prefernce_without_reference(desire=desire,time=start_time)
            else:
                mode_preference = agent.get_mode_prefernce(desire=desire,time=start_time)
            choice_weights = mode_preference['choice_weights']
            weights = [choice['weight'] for choice in choice_weights]
            # Sample one option in proportion to the weights returned by the agent.
            selected_mode = random.choices(choice_weights, weights=weights, k=1)[0]

            # Compute every value before touching eval_df so that an exception
            # cannot leave a row half-written.
            predicted_mode = selected_mode['primary_mode']
            predicted_duration = selected_mode['duration_minutes']
            selection_reason = json.dumps(mode_preference['think'])
            choice_weights_json = json.dumps(choice_weights)

            eval_df.loc[idx,"predicted_mode"] = predicted_mode
            eval_df.loc[idx,"predicted_duration"] = predicted_duration
            eval_df.loc[idx,"selection_reason"] = selection_reason
            eval_df.loc[idx,"choice_weights"] = choice_weights_json
            # Periodic checkpoint so a crash does not lose the whole run.
            if idx % 100 ==0:
                eval_df.to_csv(save_path)
        except Exception as e:
            # Best effort: keep going, but record which row failed and why.
            failed_rows.append(idx)
            print(f"row {idx} failed ({type(e).__name__}): {e}")
    eval_df.to_csv(save_path)
    if failed_rows:
        print(f"{len(failed_rows)} rows failed and have no prediction: {failed_rows}")
    print(f"=======Evaluating model=======")
    k = 3
    result_df = eval_df.copy()
    topk_accuracies = cal_topk_acc(result_df=result_df,k=k)
    kl_df, overall_kl,overall_mape = cal_group_kl_divergence(result_df=result_df,group_features=group_features)
    print(f"Top {k} accuracy: { topk_accuracies['average']:.4f}")
    print(f"Overall average KL divergence: {overall_kl:.4f}")
    print(f"Overall mean absolute percentage error: {overall_mape:.4f}")
    # Keep the metrics so they can be compared across sample sizes later.
    eval_results[num_samples] = {
        "topk_accuracy": topk_accuracies['average'],
        "kl_divergence": overall_kl,
        "mape": overall_mape,
        "failed_rows": failed_rows,
    }

  from .autonotebook import tqdm as notebook_tqdm




 67%|██████▋   | 672/1000 [53:24<22:32,  4.12s/it]  

list index out of range


 78%|███████▊  | 781/1000 [1:02:11<16:24,  4.49s/it]

list index out of range


100%|██████████| 1000/1000 [1:29:48<00:00,  5.39s/it]  


Top 3 accuracy: 0.8315
Overall average KL divergence: 0.9705
Overall mean absolute percentage error: 0.9118


 82%|████████▏ | 820/1000 [1:09:17<2:21:40, 47.23s/it]

list index out of range


100%|██████████| 1000/1000 [1:34:01<00:00,  5.64s/it]  


Top 3 accuracy: 0.8180
Overall average KL divergence: 0.5657
Overall mean absolute percentage error: 0.9293


100%|██████████| 1000/1000 [1:20:39<00:00,  4.84s/it]


Top 3 accuracy: 0.8075
Overall average KL divergence: 0.5186
Overall mean absolute percentage error: 0.9195


100%|██████████| 1000/1000 [1:20:39<00:00,  4.84s/it]


Top 3 accuracy: 0.8170
Overall average KL divergence: 0.5701
Overall mean absolute percentage error: 0.9649


100%|██████████| 1000/1000 [1:32:22<00:00,  5.54s/it] 


Top 3 accuracy: 0.8190
Overall average KL divergence: 0.5387
Overall mean absolute percentage error: 0.9772


100%|██████████| 1000/1000 [1:22:05<00:00,  4.93s/it]


Top 3 accuracy: 0.8265
Overall average KL divergence: 0.5395
Overall mean absolute percentage error: 0.9021


 52%|█████▏    | 517/1000 [41:53<35:53,  4.46s/it]  

list index out of range


100%|██████████| 1000/1000 [1:21:13<00:00,  4.87s/it]


Top 3 accuracy: 0.8015
Overall average KL divergence: 0.4380
Overall mean absolute percentage error: 0.9588


 16%|█▌        | 158/1000 [13:42<2:06:27,  9.01s/it]

1 validation error for TransportationChoice
  Invalid JSON: EOF while parsing a string at line 1 column 8297 [type=json_invalid, input_value='{"think": ["I am 45-54 y...formation. 😊</think>', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
'NoneType' object is not subscriptable


100%|██████████| 1000/1000 [1:34:04<00:00,  5.64s/it] 


Top 3 accuracy: 0.7805
Overall average KL divergence: 0.5633
Overall mean absolute percentage error: 0.9674


100%|██████████| 1000/1000 [1:41:34<00:00,  6.09s/it] 


Top 3 accuracy: 0.8100
Overall average KL divergence: 0.5008
Overall mean absolute percentage error: 0.9482


 97%|█████████▋| 966/1000 [1:18:39<02:18,  4.07s/it]

list index out of range


100%|██████████| 1000/1000 [1:21:27<00:00,  4.89s/it]

Top 3 accuracy: 0.7855
Overall average KL divergence: 0.5481
Overall mean absolute percentage error: 0.9672





In [4]:
import pandas as pd
# from mobility_agent.agent import MobilityAgent
from baseline.eval import cal_group_kl_divergence,cal_topk_acc

# Features handed to cal_group_kl_divergence as `group_features`.
group_features = [
    'age_group',
    'income_group',
    'employment_status',
    'household_size',
    'available_vehicles',
    'education',
    'trip_purpose',
    'start_time',
]

# Score the predictions stored in the mobility_agent_0.csv results file.
k = 3
result_path = "models/mobility_agent/mobility_agent_0.csv"
result_df = pd.read_csv(result_path)

topk_accuracies = cal_topk_acc(result_df=result_df, k=k)
kl_df, overall_kl, overall_mape = cal_group_kl_divergence(
    result_df=result_df,
    group_features=group_features,
)

# Report the same three headline metrics as the sweep cell.
average_topk = topk_accuracies['average']
print(f"Top {k} accuracy: {average_topk:.4f}")
print(f"Overall average KL divergence: {overall_kl:.4f}")
print(f"Overall mean absolute percentage error: {overall_mape:.4f}")

Top 3 accuracy: 0.4680
Overall average KL divergence: 1.3639
Overall mean absolute percentage error: 0.9989


### Visualization

### Comparison of Duration Choice Distribution

### Mode Prediction 2: Changing Conditions