In [1]:
import pandas as pd

# Load CSVs
train = pd.read_csv("train/train.csv", parse_dates=["doj"])
test = pd.read_csv("test.csv", parse_dates=["doj"])
transactions = pd.read_csv("train/transactions.csv", parse_dates=["doj", "doi"])

# Filter transactions for exactly 15 days before journey
transactions_15 = transactions[transactions["dbd"] == 15].copy()

# Aggregate features for each (doj, srcid, destid)
agg_15 = transactions_15.groupby(["doj", "srcid", "destid"]).agg({
    "cumsum_seatcount": "sum",
    "cumsum_searchcount": "sum",
    "srcid_region": "first",
    "destid_region": "first",
    "srcid_tier": "first",
    "destid_tier": "first"
}).reset_index()

# Rename columns for clarity
agg_15.rename(columns={
    "cumsum_seatcount": "cumsum_seatcount_15",
    "cumsum_searchcount": "cumsum_searchcount_15"
}, inplace=True)

# Merge with train and test
train_enriched = pd.merge(train, agg_15, on=["doj", "srcid", "destid"], how="left")
test_enriched = pd.merge(test, agg_15, on=["doj", "srcid", "destid"], how="left")


In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Drop missing values
train_clean = train_enriched.dropna()

# Separate features and target
X = train_clean.drop(columns=["final_seatcount"])
y = train_clean["final_seatcount"]

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define feature types
categorical_features = ["srcid_region", "destid_region", "srcid_tier", "destid_tier"]
numerical_features = ["srcid", "destid", "cumsum_seatcount_15", "cumsum_searchcount_15"]

# Preprocessing
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Define the pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", lgb.LGBMRegressor(objective="regression", random_state=42))
])

# Define hyperparameter grid
param_grid = {
    "regressor__n_estimators": [100, 200],
    "regressor__learning_rate": [0.05, 0.1],
    "regressor__max_depth": [10, 20, -1]
}

# Grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring="neg_root_mean_squared_error",
    verbose=1,
    n_jobs=-1
)

# Fit model
grid_search.fit(X_train, y_train)

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred)

print("✅ Best Parameters:", grid_search.best_params_)
print("📉 Validation RMSE:", rmse)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total B



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000818 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000284 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002487 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002337 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.191685




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2000.468108




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35840, number of used features: 32
[LightGBM] [Info] Start training from score 2005.237807




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 32
[LightGBM] [Info] Start training from score 2003.632533
✅ Best Parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 20, 'regressor__n_estimators': 200}
📉 Validation RMSE: 287680.3864387683




In [3]:
# Drop any rows with missing values in test
test_ready = test_enriched.dropna()

# Predict using best model from GridSearchCV
test_preds = best_model.predict(test_ready)

# Build submission file
submission = pd.DataFrame({
    "route_key": test_ready["route_key"],
    "final_seatcount": test_preds.astype(int)  # rounding might help depending on leaderboard behavior
})

# Save to CSV
submission.to_csv("submission_file_gpt.csv", index=False)
print("✅ submission_file.csv saved!")


✅ submission_file.csv saved!


