In [2]:
!pip install pandas numpy lightgbm optuna scikit-learn
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import datetime as dt

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [9]:
train = pd.read_csv('train.csv', parse_dates=['doj'])
test = pd.read_csv('test.csv', parse_dates=['doj'])
transactions = pd.read_csv('transactions.csv', parse_dates=['doj', 'doi'])

In [10]:
trans_15 = transactions[transactions['dbd'] == 15].copy()

In [12]:
features = ['doj', 'srcid', 'destid', 'cumsum_seatcount', 'cumsum_searchcount',
            'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']

trans_15 = trans_15[features]

train_merged = train.merge(trans_15, on=['doj', 'srcid', 'destid'], how='left')
test_merged = test.merge(trans_15, on=['doj', 'srcid', 'destid'], how='left')

In [13]:
def add_date_features(df):
    df['dayofweek'] = df['doj'].dt.dayofweek
    df['month'] = df['doj'].dt.month
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    return df

train_merged = add_date_features(train_merged)
test_merged = add_date_features(test_merged)

train_merged['search_to_book'] = train_merged['cumsum_searchcount'] / (train_merged['cumsum_seatcount'] + 1)
test_merged['search_to_book'] = test_merged['cumsum_searchcount'] / (test_merged['cumsum_seatcount'] + 1)

for col in ['cumsum_seatcount', 'cumsum_searchcount', 'search_to_book']:
    train_merged[col] = train_merged[col].fillna(0)
    test_merged[col] = test_merged[col].fillna(0)

cat_cols = ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    train_merged[col] = le.fit_transform(train_merged[col].astype(str))
    test_merged[col] = le.transform(test_merged[col].astype(str))
    le_dict[col] = le

In [14]:
features = ['srcid', 'destid', 'cumsum_seatcount', 'cumsum_searchcount', 'search_to_book',
            'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier', 'dayofweek', 'month', 'is_weekend']

target = 'final_seatcount'
X = train_merged[features]
y = train_merged[target]
X_test = test_merged[features]

In [17]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 100)
    }

    cv = GroupKFold(n_splits=5)
    oof_preds = np.zeros(X.shape[0])
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=train_merged['doj'])):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(**params, n_estimators=1000)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])

        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration_)

    rmse = np.sqrt(mean_squared_error(y, oof_preds))
    print(f"Trial RMSE: {rmse:.4f}")
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2025-06-18 13:46:32,726] A new study created in memory with name: no-name-5e3cdc0c-7832-472f-9263-1b5cd562d06a
[I 2025-06-18 13:46:50,850] Trial 0 finished with value: 453.4814659835137 and parameters: {'learning_rate': 0.13722548128787346, 'num_leaves': 84, 'feature_fraction': 0.7673518966985811, 'bagging_fraction': 0.7492962233045292, 'bagging_freq': 5, 'min_data_in_leaf': 75}. Best is trial 0 with value: 453.4814659835137.


Trial RMSE: 453.4815


[I 2025-06-18 13:47:17,524] Trial 1 finished with value: 460.5655367650035 and parameters: {'learning_rate': 0.10518138278412673, 'num_leaves': 228, 'feature_fraction': 0.8415538702561272, 'bagging_fraction': 0.8101408619500473, 'bagging_freq': 5, 'min_data_in_leaf': 90}. Best is trial 0 with value: 453.4814659835137.


Trial RMSE: 460.5655


[I 2025-06-18 13:48:01,255] Trial 2 finished with value: 446.2120118492755 and parameters: {'learning_rate': 0.05211071044675759, 'num_leaves': 34, 'feature_fraction': 0.7786371785582827, 'bagging_fraction': 0.8026477662267294, 'bagging_freq': 10, 'min_data_in_leaf': 83}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 446.2120


[I 2025-06-18 13:48:27,665] Trial 3 finished with value: 453.32952491046063 and parameters: {'learning_rate': 0.07400757470412092, 'num_leaves': 228, 'feature_fraction': 0.7590649417595074, 'bagging_fraction': 0.9416592187747532, 'bagging_freq': 9, 'min_data_in_leaf': 66}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 453.3295


[I 2025-06-18 13:48:54,466] Trial 4 finished with value: 451.5569488835897 and parameters: {'learning_rate': 0.11670086615480346, 'num_leaves': 65, 'feature_fraction': 0.8802338865804542, 'bagging_fraction': 0.9814282986140785, 'bagging_freq': 6, 'min_data_in_leaf': 93}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 451.5569


[I 2025-06-18 13:49:20,836] Trial 5 finished with value: 452.24831124795514 and parameters: {'learning_rate': 0.09653064007564158, 'num_leaves': 78, 'feature_fraction': 0.9867284369951126, 'bagging_fraction': 0.8886544186567558, 'bagging_freq': 7, 'min_data_in_leaf': 85}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 452.2483


[I 2025-06-18 13:49:36,118] Trial 6 finished with value: 446.8893517231865 and parameters: {'learning_rate': 0.1703735388002166, 'num_leaves': 32, 'feature_fraction': 0.914329875691831, 'bagging_fraction': 0.9526987412445438, 'bagging_freq': 5, 'min_data_in_leaf': 61}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 446.8894


[I 2025-06-18 13:49:56,992] Trial 7 finished with value: 449.9915456163968 and parameters: {'learning_rate': 0.09052545883156868, 'num_leaves': 89, 'feature_fraction': 0.7897194240041641, 'bagging_fraction': 0.7701400997311039, 'bagging_freq': 7, 'min_data_in_leaf': 61}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 449.9915


[I 2025-06-18 13:50:05,689] Trial 8 finished with value: 454.4773641472358 and parameters: {'learning_rate': 0.18933435403832363, 'num_leaves': 139, 'feature_fraction': 0.7916742863294943, 'bagging_fraction': 0.8357676867778947, 'bagging_freq': 6, 'min_data_in_leaf': 20}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 454.4774


[I 2025-06-18 13:50:33,310] Trial 9 finished with value: 456.9081813127695 and parameters: {'learning_rate': 0.08527129662347821, 'num_leaves': 232, 'feature_fraction': 0.7657073831133567, 'bagging_fraction': 0.708686170784714, 'bagging_freq': 3, 'min_data_in_leaf': 72}. Best is trial 2 with value: 446.2120118492755.


Trial RMSE: 456.9082


[I 2025-06-18 13:51:51,545] Trial 10 finished with value: 444.0973623371996 and parameters: {'learning_rate': 0.017022738585740882, 'num_leaves': 161, 'feature_fraction': 0.7157517002179341, 'bagging_fraction': 0.8847604898689406, 'bagging_freq': 10, 'min_data_in_leaf': 41}. Best is trial 10 with value: 444.0973623371996.


Trial RMSE: 444.0974


[I 2025-06-18 13:53:17,554] Trial 11 finished with value: 443.5412603021604 and parameters: {'learning_rate': 0.01346328516615344, 'num_leaves': 175, 'feature_fraction': 0.7024848803486391, 'bagging_fraction': 0.8834302235683357, 'bagging_freq': 10, 'min_data_in_leaf': 39}. Best is trial 11 with value: 443.5412603021604.


Trial RMSE: 443.5413


[I 2025-06-18 13:54:50,733] Trial 12 finished with value: 443.8192977604172 and parameters: {'learning_rate': 0.011008363464232018, 'num_leaves': 173, 'feature_fraction': 0.7037729961881257, 'bagging_fraction': 0.8877771330332833, 'bagging_freq': 10, 'min_data_in_leaf': 39}. Best is trial 11 with value: 443.5412603021604.


Trial RMSE: 443.8193


[I 2025-06-18 13:56:03,132] Trial 13 finished with value: 445.49955820429454 and parameters: {'learning_rate': 0.014778069874288851, 'num_leaves': 176, 'feature_fraction': 0.8305808893811844, 'bagging_fraction': 0.887907308837241, 'bagging_freq': 1, 'min_data_in_leaf': 42}. Best is trial 11 with value: 443.5412603021604.


Trial RMSE: 445.4996


[I 2025-06-18 13:56:32,469] Trial 14 finished with value: 444.46466853848585 and parameters: {'learning_rate': 0.046283537738395206, 'num_leaves': 191, 'feature_fraction': 0.7020846309654244, 'bagging_fraction': 0.9116687219069126, 'bagging_freq': 8, 'min_data_in_leaf': 42}. Best is trial 11 with value: 443.5412603021604.


Trial RMSE: 444.4647


[I 2025-06-18 13:57:03,680] Trial 15 finished with value: 441.81727202742036 and parameters: {'learning_rate': 0.039309340296804024, 'num_leaves': 129, 'feature_fraction': 0.7293123039886377, 'bagging_fraction': 0.8567775574601856, 'bagging_freq': 9, 'min_data_in_leaf': 26}. Best is trial 15 with value: 441.81727202742036.


Trial RMSE: 441.8173


[I 2025-06-18 13:57:32,303] Trial 16 finished with value: 441.9003865042739 and parameters: {'learning_rate': 0.04230542164456722, 'num_leaves': 125, 'feature_fraction': 0.7302060144153768, 'bagging_fraction': 0.848064681720057, 'bagging_freq': 8, 'min_data_in_leaf': 20}. Best is trial 15 with value: 441.81727202742036.


Trial RMSE: 441.9004


[I 2025-06-18 13:57:58,514] Trial 17 finished with value: 441.71885713741716 and parameters: {'learning_rate': 0.04610864423645824, 'num_leaves': 128, 'feature_fraction': 0.734723742385569, 'bagging_fraction': 0.8395581112710775, 'bagging_freq': 8, 'min_data_in_leaf': 21}. Best is trial 17 with value: 441.71885713741716.


Trial RMSE: 441.7189


[I 2025-06-18 13:58:20,920] Trial 18 finished with value: 445.66372394192615 and parameters: {'learning_rate': 0.06476926274513242, 'num_leaves': 115, 'feature_fraction': 0.811197589971685, 'bagging_fraction': 0.815050838302152, 'bagging_freq': 8, 'min_data_in_leaf': 30}. Best is trial 17 with value: 441.71885713741716.


Trial RMSE: 445.6637


[I 2025-06-18 13:59:08,221] Trial 19 finished with value: 446.40713194295444 and parameters: {'learning_rate': 0.03472656315553155, 'num_leaves': 115, 'feature_fraction': 0.7380770665979183, 'bagging_fraction': 0.7674473709196906, 'bagging_freq': 9, 'min_data_in_leaf': 53}. Best is trial 17 with value: 441.71885713741716.


Trial RMSE: 446.4071


[I 2025-06-18 13:59:21,693] Trial 20 finished with value: 453.2737318886199 and parameters: {'learning_rate': 0.13322616681540708, 'num_leaves': 197, 'feature_fraction': 0.8752814627607195, 'bagging_fraction': 0.8485620830369931, 'bagging_freq': 7, 'min_data_in_leaf': 26}. Best is trial 17 with value: 441.71885713741716.


Trial RMSE: 453.2737


[I 2025-06-18 13:59:57,415] Trial 21 finished with value: 441.46127397810375 and parameters: {'learning_rate': 0.03550693558050911, 'num_leaves': 131, 'feature_fraction': 0.7381739363521873, 'bagging_fraction': 0.8524166599608531, 'bagging_freq': 8, 'min_data_in_leaf': 22}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 441.4613


[I 2025-06-18 14:00:35,098] Trial 22 finished with value: 442.37107108150366 and parameters: {'learning_rate': 0.033318993817338334, 'num_leaves': 147, 'feature_fraction': 0.7440222912555192, 'bagging_fraction': 0.9189528205833566, 'bagging_freq': 9, 'min_data_in_leaf': 31}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 442.3711


[I 2025-06-18 14:00:58,450] Trial 23 finished with value: 443.51673180184616 and parameters: {'learning_rate': 0.06233658288793828, 'num_leaves': 108, 'feature_fraction': 0.7372385566632457, 'bagging_fraction': 0.829254171141295, 'bagging_freq': 8, 'min_data_in_leaf': 28}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 443.5167


[I 2025-06-18 14:01:43,782] Trial 24 finished with value: 447.3714497273731 and parameters: {'learning_rate': 0.029162243320155684, 'num_leaves': 138, 'feature_fraction': 0.8095461754646324, 'bagging_fraction': 0.864150837772638, 'bagging_freq': 9, 'min_data_in_leaf': 53}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 447.3714


[I 2025-06-18 14:02:09,303] Trial 25 finished with value: 446.23753815874903 and parameters: {'learning_rate': 0.05813390477428908, 'num_leaves': 101, 'feature_fraction': 0.9433314811170783, 'bagging_fraction': 0.7880937445391396, 'bagging_freq': 4, 'min_data_in_leaf': 34}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 446.2375


[I 2025-06-18 14:02:26,297] Trial 26 finished with value: 445.6990439578973 and parameters: {'learning_rate': 0.07807095192527924, 'num_leaves': 151, 'feature_fraction': 0.7504351927423958, 'bagging_fraction': 0.8622679566749339, 'bagging_freq': 7, 'min_data_in_leaf': 24}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 445.6990


[I 2025-06-18 14:03:18,732] Trial 27 finished with value: 447.76347276628144 and parameters: {'learning_rate': 0.02845517579625884, 'num_leaves': 256, 'feature_fraction': 0.7280886334489655, 'bagging_fraction': 0.9199680289479817, 'bagging_freq': 8, 'min_data_in_leaf': 48}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 447.7635


[I 2025-06-18 14:03:39,976] Trial 28 finished with value: 446.85834590572136 and parameters: {'learning_rate': 0.07055702713133924, 'num_leaves': 128, 'feature_fraction': 0.8048577340774336, 'bagging_fraction': 0.8259916992935493, 'bagging_freq': 6, 'min_data_in_leaf': 34}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 446.8583


[I 2025-06-18 14:04:10,347] Trial 29 finished with value: 441.7966215664142 and parameters: {'learning_rate': 0.04589505767842613, 'num_leaves': 92, 'feature_fraction': 0.7709512368818159, 'bagging_fraction': 0.790705369981963, 'bagging_freq': 3, 'min_data_in_leaf': 24}. Best is trial 21 with value: 441.46127397810375.


Trial RMSE: 441.7966


In [18]:
best_params = study.best_trial.params
best_params.update({"objective": "regression", "metric": "rmse", "verbosity": -1})

final_model = lgb.LGBMRegressor(**best_params, n_estimators=1000)
final_model.fit(X, y)

In [19]:
test_preds = final_model.predict(X_test)

submission = pd.DataFrame({
    'route_key': test['route_key'],
    'final_seatcount': test_preds
})

submission.to_csv('submission_file.csv', index=False)
print("Submission saved as submission_file.csv")

Submission saved as submission_file.csv


In [21]:
oof_preds = final_model.predict(X)
total_rmse = np.sqrt(mean_squared_error(y, oof_preds))
print(f"Final RMSE on full training data: {total_rmse:.4f}")


Final RMSE on full training data: 307.2156


In [22]:
from google.colab import files
files.download('submission_file.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>