# Random Forest

In [1]:
from baseline.random_forest import RandomForest
from tqdm import tqdm
import random
import numpy as np
import os
# Seed the global random number generators (Python `random` and NumPy).
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only reaches child processes spawned afterwards, not
# the kernel that is already running -- confirm that this is the intent.
np.random.seed(seed)
random.seed(seed)

eval_file = 'data/eval/replica-cambridge_trips_eval.csv'
train_file = "data/reference/replica-cambridge_trips.csv"

# Train and evaluate one Random Forest per `sample_num` value: 100, 200, ..., 1000.
for i in tqdm(range(1, 11)):
    num_sample = 100 * i
    RF_model = RandomForest(
        train_file=train_file,
        eval_file=eval_file,
        sample_num=num_sample,
        seed=seed,
    )
    RF_model.train()
    # Only the results of the last iteration remain bound after the loop.
    (topk_accuracies, overall_kl, overall_mae, kl_df) = RF_model.evaluate()
    # Visual separator between consecutive runs in the cell output.
    print("---" * 40)

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:11<01:39, 11.05s/it]

Best parameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -3.4574
Model saved to models/random_forest/randomforest_100.joblib
Top 3 accuracy: 0.8140
Overall average KL divergence: 1.8162
Overall mean absolute error: 0.0923
------------------------------------------------------------------------------------------------------------------------


 20%|██        | 2/10 [00:20<01:22, 10.26s/it]

Best parameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -1.5612
Model saved to models/random_forest/randomforest_200.joblib
Top 3 accuracy: 0.8170
Overall average KL divergence: 1.0187
Overall mean absolute error: 0.0804
------------------------------------------------------------------------------------------------------------------------


 30%|███       | 3/10 [00:30<01:09,  9.90s/it]

Best parameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -1.3985
Model saved to models/random_forest/randomforest_300.joblib
Top 3 accuracy: 0.8215
Overall average KL divergence: 0.9804
Overall mean absolute error: 0.0673
------------------------------------------------------------------------------------------------------------------------


 40%|████      | 4/10 [00:37<00:53,  8.93s/it]

Best parameters: {'estimator__max_depth': 20, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -0.9694
Model saved to models/random_forest/randomforest_400.joblib
Top 3 accuracy: 0.8280
Overall average KL divergence: 0.9671
Overall mean absolute error: 0.0706
------------------------------------------------------------------------------------------------------------------------


 50%|█████     | 5/10 [00:47<00:45,  9.09s/it]

Best parameters: {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -0.9437
Model saved to models/random_forest/randomforest_500.joblib
Top 3 accuracy: 0.8860
Overall average KL divergence: 1.3274
Overall mean absolute error: 0.0793
------------------------------------------------------------------------------------------------------------------------


 60%|██████    | 6/10 [00:54<00:34,  8.57s/it]

Best parameters: {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -0.7133
Model saved to models/random_forest/randomforest_600.joblib
Top 3 accuracy: 0.8815
Overall average KL divergence: 1.1899
Overall mean absolute error: 0.0832
------------------------------------------------------------------------------------------------------------------------
Best parameters: {'estimator__max_depth': 20, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 100}
Best cross-validation score: -0.6698
Model saved to models/random_forest/randomforest_700.joblib


 70%|███████   | 7/10 [01:06<00:29,  9.69s/it]

Top 3 accuracy: 0.9050
Overall average KL divergence: 1.0258
Overall mean absolute error: 0.0743
------------------------------------------------------------------------------------------------------------------------


 80%|████████  | 8/10 [01:15<00:18,  9.43s/it]

Best parameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -0.6211
Model saved to models/random_forest/randomforest_800.joblib
Top 3 accuracy: 0.8340
Overall average KL divergence: 0.8340
Overall mean absolute error: 0.0692
------------------------------------------------------------------------------------------------------------------------


 90%|█████████ | 9/10 [01:27<00:10, 10.10s/it]

Best parameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -0.5311
Model saved to models/random_forest/randomforest_900.joblib
Top 3 accuracy: 0.8330
Overall average KL divergence: 0.7124
Overall mean absolute error: 0.0684
------------------------------------------------------------------------------------------------------------------------


100%|██████████| 10/10 [01:36<00:00,  9.66s/it]

Best parameters: {'estimator__max_depth': 20, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 10}
Best cross-validation score: -0.5646
Model saved to models/random_forest/randomforest_1000.joblib
Top 3 accuracy: 0.8330
Overall average KL divergence: 0.7223
Overall mean absolute error: 0.0628
------------------------------------------------------------------------------------------------------------------------





# XGBoost

In [2]:
from baseline.xgboost import XGBoost
from tqdm import tqdm
import random
import numpy as np
import os
# Seed the global random number generators (Python `random` and NumPy).
seed = 42
# Also pin the hash seed via the environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only reaches child processes spawned afterwards, not
# the kernel that is already running -- confirm that this is the intent.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)

eval_file = 'data/eval/replica-cambridge_trips_eval.csv'
train_file = "data/reference/replica-cambridge_trips.csv"

# Train and evaluate one XGBoost model per `sample_num` value: 100, 200, ..., 1000.
for i in tqdm(range(1, 11)):
    num_sample = 100 * i
    XGB_model = XGBoost(
        train_file=train_file,
        eval_file=eval_file,
        sample_num=num_sample,
        seed=seed,
    )
    XGB_model.train()
    # NOTE(review): the fourth value is bound to `overall_mape` here, while the
    # Random Forest cell binds the same position to `overall_mae` and the
    # captured output labels the metric "mean absolute error" -- confirm which
    # metric evaluate() actually returns before relying on this name.
    (topk_accuracies, overall_kl, overall_mape, kl_df) = XGB_model.evaluate()
    # Visual separator between consecutive runs in the cell output.
    print("---" * 20)

  0%|          | 0/10 [00:00<?, ?it/s]

Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 0.001, 'estimator__max_depth': None, 'estimator__n_estimators': 10, 'estimator__subsample': 0.9}
Best cross-validation score: -3.5852
Model saved to models/xgboost/xgboost_100.joblib


 10%|█         | 1/10 [00:04<00:36,  4.02s/it]

Top 3 accuracy: 0.8500
Overall average KL divergence: 2.3192
Overall mean absolute error: 0.1018
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 1e-05, 'estimator__max_depth': None, 'estimator__n_estimators': 10, 'estimator__subsample': 0.9}
Best cross-validation score: -2.5917


 20%|██        | 2/10 [00:08<00:36,  4.53s/it]

Model saved to models/xgboost/xgboost_200.joblib
Top 3 accuracy: 0.8825
Overall average KL divergence: 2.4230
Overall mean absolute error: 0.0875
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 1e-05, 'estimator__max_depth': 10, 'estimator__n_estimators': 10, 'estimator__subsample': 0.9}
Best cross-validation score: -1.9420


 30%|███       | 3/10 [00:14<00:35,  5.10s/it]

Model saved to models/xgboost/xgboost_300.joblib
Top 3 accuracy: 0.8810
Overall average KL divergence: 1.5823
Overall mean absolute error: 0.0802
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 0.001, 'estimator__max_depth': 20, 'estimator__n_estimators': 10, 'estimator__subsample': 0.9}
Best cross-validation score: -1.5901


 40%|████      | 4/10 [00:21<00:34,  5.82s/it]

Model saved to models/xgboost/xgboost_400.joblib
Top 3 accuracy: 0.8810
Overall average KL divergence: 1.9369
Overall mean absolute error: 0.0866
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0.2, 'estimator__learning_rate': 0.001, 'estimator__max_depth': 20, 'estimator__n_estimators': 50, 'estimator__subsample': 0.9}
Best cross-validation score: -2.0379


 50%|█████     | 5/10 [00:31<00:37,  7.40s/it]

Model saved to models/xgboost/xgboost_500.joblib
Top 3 accuracy: 0.9035
Overall average KL divergence: 1.9673
Overall mean absolute error: 0.0918
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 0.001, 'estimator__max_depth': 20, 'estimator__n_estimators': 50, 'estimator__subsample': 0.9}
Best cross-validation score: -1.3760


 60%|██████    | 6/10 [00:42<00:34,  8.66s/it]

Model saved to models/xgboost/xgboost_600.joblib
Top 3 accuracy: 0.9005
Overall average KL divergence: 1.6758
Overall mean absolute error: 0.0888
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 0.001, 'estimator__max_depth': 20, 'estimator__n_estimators': 50, 'estimator__subsample': 0.9}
Best cross-validation score: -1.0462


 70%|███████   | 7/10 [00:54<00:29,  9.69s/it]

Model saved to models/xgboost/xgboost_700.joblib
Top 3 accuracy: 0.8995
Overall average KL divergence: 1.5925
Overall mean absolute error: 0.0867
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 1e-05, 'estimator__max_depth': 20, 'estimator__n_estimators': 10, 'estimator__subsample': 0.9}
Best cross-validation score: -1.1585


 80%|████████  | 8/10 [01:04<00:19,  9.63s/it]

Model saved to models/xgboost/xgboost_800.joblib
Top 3 accuracy: 0.9035
Overall average KL divergence: 1.5518
Overall mean absolute error: 0.0810
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 0.001, 'estimator__max_depth': 20, 'estimator__n_estimators': 10, 'estimator__subsample': 0.9}
Best cross-validation score: -0.7088


 90%|█████████ | 9/10 [01:14<00:09,  9.82s/it]

Model saved to models/xgboost/xgboost_900.joblib
Top 3 accuracy: 0.8985
Overall average KL divergence: 1.5888
Overall mean absolute error: 0.0876
------------------------------------------------------------
Best parameters: {'estimator__gamma': 0, 'estimator__learning_rate': 0.001, 'estimator__max_depth': 20, 'estimator__n_estimators': 100, 'estimator__subsample': 0.9}
Best cross-validation score: -1.1335


100%|██████████| 10/10 [01:33<00:00,  9.34s/it]

Model saved to models/xgboost/xgboost_1000.joblib
Top 3 accuracy: 0.9075
Overall average KL divergence: 1.6457
Overall mean absolute error: 0.0917
------------------------------------------------------------





# MLP

In [None]:
from baseline.multilayer_perceptron import MultilayerPerceptron
from tqdm import tqdm
import random
import numpy as np
import os
# Seed the global random number generators (Python `random` and NumPy).
seed = 42
# Also pin the hash seed via the environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only reaches child processes spawned afterwards, not
# the kernel that is already running -- confirm that this is the intent.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)

eval_file = 'data/eval/replica-cambridge_trips_eval.csv'
train_file = "data/reference/replica-cambridge_trips.csv"

# The sweep over sample_num = 100, 200, ..., 1000 is disabled below; only the
# single run with num_sample = 1000 that follows it is executed.
# for i in tqdm(range(1,11)):
#     num_sample = i*100
#     MLP_model = MultilayerPerceptron(train_file=train_file,eval_file=eval_file,sample_num=num_sample,seed=seed)
#     MLP_model.train()
#     topk_accuracies,overall_kl,overall_mape,kl_df = MLP_model.evaluate()
#     print("---"*20)


# Single train/evaluate run of the MLP baseline with sample_num fixed at 1000.
num_sample = 1000
MLP_model = MultilayerPerceptron(
    train_file=train_file,
    eval_file=eval_file,
    sample_num=num_sample,
    seed=seed,
)
MLP_model.train()
# NOTE(review): `overall_mape` occupies the position that the Random Forest
# cell names `overall_mae` -- confirm which metric evaluate() returns.
(topk_accuracies, overall_kl, overall_mape, kl_df) = MLP_model.evaluate()
print("---" * 20)