In [1]:
!pip install pandas numpy lightgbm xgboost catboost scikit-learn
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from datetime import datetime
import gc
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
# Load Data
train = pd.read_csv('Train.csv', parse_dates=['doj'])
test = pd.read_csv('Test.csv', parse_dates=['doj'])
transactions = pd.read_csv('Transactions.csv', parse_dates=['doj', 'doi'])

# Filter transactions with dbd < 15
transactions = transactions[transactions['dbd'] < 15]

# Merge transactions: add week/month columns
transactions['week'] = transactions['doj'].dt.isocalendar().week.astype(int)
transactions['month'] = transactions['doj'].dt.month

# Combine train and test
test['final_seatcount'] = np.nan
train['dataset'] = 'train'
test['dataset'] = 'test'
df = pd.concat([train, test]).reset_index(drop=True)

# Ensure df also has 'week' and 'month' columns
df['week'] = df['doj'].dt.isocalendar().week.astype(int)
df['month'] = df['doj'].dt.month

In [6]:
# Optimized multi-level aggregation function with batching
def multi_level_aggregations(trans, group_cols, agg_cols):
    agg_df_list = []
    for col in tqdm(agg_cols, desc="Processing agg_cols"):
        for group in tqdm(group_cols, desc=f"Groups for {col}"):
            group_name = '_'.join(group)
            agg = trans.groupby(group)[col].agg(['mean','sum','std','min','max']).astype(np.float32).reset_index()
            agg.columns = group + [f"{group_name}_{col}_{stat}" for stat in ['mean','sum','std','min','max']]
            agg_df_list.append((agg, group))
            gc.collect()
    return agg_df_list

group_cols = [['srcid'], ['destid'], ['srcid','destid'], ['srcid','week'], ['destid','month'], ['srcid','destid','week']]
agg_cols = ['cumsum_seatcount', 'cumsum_searchcount']
agg_dfs = multi_level_aggregations(transactions, group_cols, agg_cols)

# Merge all aggregated features safely using their specific group keys
tqdm.pandas(desc="Merging features")
for agg_df, merge_keys in tqdm(agg_dfs, desc="Merging features"):
    for key in merge_keys:
        if key not in df.columns:
            df[key] = np.nan  # Ensure keys exist in df to avoid merge error
    df = df.merge(agg_df, on=merge_keys, how='left')
    del agg_df
    gc.collect()

Processing agg_cols:   0%|          | 0/2 [00:00<?, ?it/s]
Groups for cumsum_seatcount:   0%|          | 0/6 [00:00<?, ?it/s][A
Groups for cumsum_seatcount:  17%|█▋        | 1/6 [00:00<00:03,  1.52it/s][A
Groups for cumsum_seatcount:  33%|███▎      | 2/6 [00:01<00:02,  1.99it/s][A
Groups for cumsum_seatcount:  50%|█████     | 3/6 [00:02<00:02,  1.42it/s][A
Groups for cumsum_seatcount:  67%|██████▋   | 4/6 [00:02<00:01,  1.56it/s][A
Groups for cumsum_seatcount:  83%|████████▎ | 5/6 [00:02<00:00,  1.81it/s][A
Groups for cumsum_seatcount: 100%|██████████| 6/6 [00:03<00:00,  1.82it/s]
Processing agg_cols:  50%|█████     | 1/2 [00:03<00:03,  3.30s/it]
Groups for cumsum_searchcount:   0%|          | 0/6 [00:00<?, ?it/s][A
Groups for cumsum_searchcount:  17%|█▋        | 1/6 [00:00<00:01,  3.56it/s][A
Groups for cumsum_searchcount:  33%|███▎      | 2/6 [00:00<00:01,  3.79it/s][A
Groups for cumsum_searchcount:  50%|█████     | 3/6 [00:00<00:00,  3.81it/s][A
Groups for cumsum_searchcou

In [7]:
# Add date features
df['dayofweek'] = df['doj'].dt.dayofweek
df['weekofyear'] = df['doj'].dt.isocalendar().week.astype(int)
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

# Fill missing required base features before generating lag features
for col in ['cumsum_seatcount', 'cumsum_searchcount']:
    if col not in df.columns:
        df[col] = 0.0

# Create route_key
df['route_key'] = df['srcid'].astype(str) + '_' + df['destid'].astype(str)

# Lag and rolling window features
for col in ['cumsum_seatcount', 'cumsum_searchcount']:
    df.sort_values(['route_key', 'doj'], inplace=True)
    df[f'{col}_lag1'] = df.groupby(['route_key'])[col].shift(1)
    df[f'{col}_rolling_mean3'] = df.groupby(['route_key'])[col].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
    df[f'{col}_rolling_std3'] = df.groupby(['route_key'])[col].transform(lambda x: x.rolling(window=3, min_periods=1).std())

# Label Encode categorical if available
cat_cols = ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Split back to train/test
train_df = df[df['dataset'] == 'train'].copy()
test_df = df[df['dataset'] == 'test'].copy()

# Validation split before dropping datetime
val_start = pd.to_datetime("2023-11-01")
train_mask = train_df['doj'] < val_start
val_mask = train_df['doj'] >= val_start

# Drop datetime columns AFTER split
drop_cols = ['doj', 'doi'] if 'doi' in df.columns else ['doj']
train_df.drop(columns=drop_cols, inplace=True, errors='ignore')
test_df.drop(columns=drop_cols, inplace=True, errors='ignore')

X_full = train_df.drop(['final_seatcount','dataset','route_key'], axis=1)
y_full = train_df['final_seatcount']
X_test = test_df.drop(['final_seatcount','dataset','route_key'], axis=1)

In [8]:
# LightGBM for feature importance-based selection
lgb_sel = lgb.LGBMRegressor(n_estimators=500)
lgb_sel.fit(X_full.select_dtypes(exclude=['datetime64[ns]']), y_full)
feature_importance = pd.Series(lgb_sel.feature_importances_, index=X_full.select_dtypes(exclude=['datetime64[ns]']).columns)

# Select top features
top_1k = feature_importance.sort_values(ascending=False).head(1000).index.tolist()
top_2k = feature_importance.sort_values(ascending=False).head(2000).index.tolist()
top_3k = feature_importance.sort_values(ascending=False).head(3000).index.tolist()
top_6k = feature_importance.sort_values(ascending=False).head(6000).index.tolist()
feature_sets = {'1k': top_1k, '2k': top_2k, '3k': top_3k, '6k': top_6k}

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.237106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17212
[LightGBM] [Info] Number of data points in the train set: 67200, number of used features: 119
[LightGBM] [Info] Start training from score 2001.729464


In [10]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 

In [12]:
# Fill all remaining NaNs before model training
X_full.fillna(-1, inplace=True)
X_test.fillna(-1, inplace=True)
y_full.fillna(y_full.mean(), inplace=True)

In [13]:
# Train all models
preds_all = []
val_preds_all = []
for name, feat_set in feature_sets.items():
    X_train = X_full.loc[train_mask, feat_set]
    y_train = y_full[train_mask]
    X_val = X_full.loc[val_mask, feat_set]
    y_val = y_full[val_mask]
    X_t = X_test[feat_set]

    # LightGBM
    lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.035)
    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    preds_all.append(lgb_model.predict(X_t))
    val_preds_all.append(lgb_model.predict(X_val))

    # XGBoost (manual early stopping omitted due to version constraints)
    xgb_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.035, objective='reg:squarederror')
    xgb_model.fit(X_train, y_train)
    preds_all.append(xgb_model.predict(X_t))
    val_preds_all.append(xgb_model.predict(X_val))

    # CatBoost
    cb_model = cb.CatBoostRegressor(iterations=1000, learning_rate=0.035, verbose=False)
    cb_model.fit(X_train, y_train, eval_set=(X_val, y_val))
    preds_all.append(cb_model.predict(X_t))
    val_preds_all.append(cb_model.predict(X_val))

    # TABDPT: Using TabNet as a proxy (since HuggingFace models need transformer setup)
    from pytorch_tabnet.tab_model import TabNetRegressor
    import torch
    clf = TabNetRegressor(verbose=0)
    clf.fit(
        X_train.values, y_train.values.reshape(-1,1),
        eval_set=[(X_val.values, y_val.values.reshape(-1,1))],
        max_epochs=100,
        patience=20
    )
    preds_all.append(clf.predict(X_t.values).reshape(-1))
    val_preds_all.append(clf.predict(X_val.values).reshape(-1))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16802
[LightGBM] [Info] Number of data points in the train set: 24500, number of used features: 119
[LightGBM] [Info] Start training from score 1770.934408
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[184]	valid_0's rmse: 804.375	valid_0's l2: 647020

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_mse = 649461.38137
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16802
[LightGBM] [Info] Number of data points in the train set: 24500, number of used features:

In [15]:
# Final Predictions
final_preds = np.mean(preds_all, axis=0)
val_preds = np.mean(val_preds_all, axis=0)
val_score = np.sqrt(mean_squared_error(y_full[val_mask], val_preds))
print(f"Validation RMSE: {val_score:.4f}")

# Save Submission
submission = pd.DataFrame({
    'route_key': test_df['route_key'],
    'final_seatcount': final_preds
})
submission.to_csv('Prediction File.csv', index=False)
print("Saved as Prediction File.csv")


Validation RMSE: 791.0773
Saved as Prediction File.csv


In [16]:
from google.colab import files
files.download('Prediction File.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>