In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

import optuna
from optuna.visualization import plot_contour
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('df_train.csv', index_col=0)

In [3]:
df_train.head()

Unnamed: 0_level_0,route_distance_km,delta_time,is_afternoon,is_evening,is_morning,is_night,morning_peak_time,evening_peak_time,temperature_2m (°C),relativehumidity_2m (%),...,center_cluster_22,center_cluster_23,center_cluster_24,center_cluster_25,center_cluster_26,center_cluster_27,center_cluster_28,center_cluster_29,hour,pred_time
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-4773019581999572651,3.74,862.0,0,1,0,0,0,1,-3.1,83,...,0.000352,0.000223,0.000361,0.003389,0.001009,0.965341,0.000146,0.000343,18,445.785429
-7575630690398473489,3.526,753.0,0,0,1,0,0,0,-4.5,80,...,0.001438,0.000926,0.001176,0.006062,0.00263,0.0283,0.000603,0.002082,6,339.653051
-6264582368520213833,5.071,899.0,0,0,1,0,0,0,-2.8,73,...,0.003604,0.002392,0.00386,0.025585,0.0097,0.2321,0.001619,0.003763,10,555.16452
5964315354301636538,2.867,423.0,1,0,0,0,0,0,-2.6,74,...,0.000958,0.000814,0.001092,0.001724,0.001445,0.002135,0.000686,0.001092,14,327.574647
1372379574816145639,3.751,540.0,0,0,1,0,0,0,-3.1,74,...,0.003671,0.002517,0.001294,0.002202,0.001769,0.002392,0.001492,0.896434,11,410.407676


In [4]:
y = df_train['delta_time']
X = df_train.drop('delta_time', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

In [27]:
df_test = pd.read_csv('df_test.csv', index_col=0)
df_test.head()

Unnamed: 0_level_0,route_distance_km,is_afternoon,is_evening,is_morning,is_night,morning_peak_time,evening_peak_time,temperature_2m (°C),relativehumidity_2m (%),apparent_temperature (°C),...,center_cluster_22,center_cluster_23,center_cluster_24,center_cluster_25,center_cluster_26,center_cluster_27,center_cluster_28,center_cluster_29,hour,pred_time
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6198,4.744,0,0,0,1,0,0,-3.3,85,-9.6,...,0.045085,0.019603,0.02487,0.101205,0.06911,0.046614,0.009976,0.018058,3,391.092968
6417,6.279,0,0,0,1,0,0,-3.3,85,-9.6,...,0.010368,0.005796,0.011196,0.590644,0.052,0.07547,0.003472,0.006112,3,524.81971
7054,3.934,0,0,0,1,0,0,-3.3,85,-9.6,...,0.005112,0.003165,0.003691,0.018325,0.008171,0.056137,0.001991,0.008026,3,387.661475
9628,5.959,0,0,0,1,0,0,-3.6,84,-10.3,...,0.546919,0.073559,0.018087,0.009697,0.014032,0.005721,0.017462,0.006324,4,561.546514
10283,7.028,0,0,0,1,0,0,-3.6,84,-10.3,...,0.326604,0.063083,0.032142,0.017629,0.026947,0.009693,0.019732,0.009098,4,654.7533


# XGBoost

In [9]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
    }
    model = XGBRegressor(random_state=17, **param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

In [10]:
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

[32m[I 2023-03-05 09:25:38,347][0m A new study created in memory with name: regression[0m
[32m[I 2023-03-05 09:25:49,979][0m Trial 0 finished with value: 18435.112238059617 and parameters: {'max_depth': 10, 'learning_rate': 0.21589304811095197, 'n_estimators': 581, 'min_child_weight': 1, 'gamma': 0.7544808212624217, 'subsample': 0.4585877197385389, 'colsample_bytree': 0.46160038071506465, 'reg_alpha': 0.22729863981603193, 'reg_lambda': 0.04383732253185338}. Best is trial 0 with value: 18435.112238059617.[0m
[32m[I 2023-03-05 09:25:53,790][0m Trial 1 finished with value: 63607.17210388375 and parameters: {'max_depth': 4, 'learning_rate': 0.8778066251397444, 'n_estimators': 405, 'min_child_weight': 9, 'gamma': 0.6522447992664495, 'subsample': 0.45623574429873265, 'colsample_bytree': 0.5946318376852725, 'reg_alpha': 0.18187441775249596, 'reg_lambda': 0.6309652644366197}. Best is trial 0 with value: 18435.112238059617.[0m
[32m[I 2023-03-05 09:26:01,407][0m Trial 2 finished with 

In [11]:
model = XGBRegressor(**study.best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE: ', mean_absolute_error(y_test, y_pred))

MSE:  13463.47172018519
RMSE:  116.03220122097655
MAE:  90.10491123962403


In [12]:
model = XGBRegressor(**study.best_params)
model.fit(X, y)

In [30]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,Id,running_time,route_distance_km
0,4567450496676323264,2022-01-24 00:32:37,5.533
1,-1762687449608244695,2022-01-24 00:40:45,4.448
2,558209096572647780,2022-01-24 00:47:12,2.751
3,1091960336272216078,2022-01-24 01:03:47,4.435
4,-1321994599972009663,2022-01-24 01:07:56,2.394


In [29]:
df_test['Predicted'] = model.predict(df_test)

In [31]:
test['Predicted'] = test['Id'].map(df_test['Predicted'])
# test['Predicted'] = test['Predicted'].fillna(test['Predicted'].mean())
test['Id'] = test['Id'].astype(np.int32)

In [32]:
result = test[['Id', 'Predicted']]
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1000 non-null   int32  
 1   Predicted  0 non-null      float32
dtypes: float32(1), int32(1)
memory usage: 7.9 KB


In [37]:
df_test['Predicted'].to_csv('result.csv')

In [34]:
df_test.index = df_test.index.astype(np.int32)

In [35]:
df_test['Predicted']

Id
6198      603.538879
6417      696.316284
7054      573.016846
9628      736.266785
10283     815.242554
             ...    
525706    500.496582
526604    555.958435
527213    624.596558
527520    273.366669
527850    894.976196
Name: Predicted, Length: 1000, dtype: float32

In [None]:
plot_optimization_history(study)

In [None]:
plot_intermediate_values(study)

In [None]:
plot_parallel_coordinate(study)

In [None]:
plot_contour(study)

In [None]:
plot_slice(study)