In [1]:
# Install necessary libraries
!pip install xgboost lightgbm catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRanker
from lightgbm import LGBMRanker
from catboost import CatBoostRanker, Pool

# Generate synthetic dataset suitable for ranking
X, y = make_classification(n_samples=500, n_features=10, random_state=42)

# Define groups (queries) for ranking
groups = np.array([50] * 10)  # 10 groups with 50 samples each

# Split dataset
group_train_size = sum(groups[:8])  # Use 8 groups for training
group_test_size = sum(groups[8:])  # Use 2 groups for testing

X_train, X_test = X[:group_train_size], X[group_train_size:]
y_train, y_test = y[:group_train_size], y[group_train_size:]
groups_train = groups[:8]
groups_test = groups[8:]

# CatBoost requires data to be passed as a Pool with group_id
group_ids_train = np.repeat(range(len(groups_train)), groups_train)
group_ids_test = np.repeat(range(len(groups_test)), groups_test)

train_pool = Pool(X_train, y_train, group_id=group_ids_train)
test_pool = Pool(X_test, y_test, group_id=group_ids_test)

# Initialize models
models = {
    "XGBoost": XGBRanker(objective="rank:pairwise"),
    "LightGBM": LGBMRanker(),
    "CatBoost": CatBoostRanker(verbose=0),
}

# Train and evaluate models
results = {}
for name, model in models.items():
    try:
        if name == "XGBoost":
            model.fit(X_train, y_train, group=groups_train.tolist())
        elif name == "LightGBM":
            model.fit(X_train, y_train, group=groups_train)
        elif name == "CatBoost":
            model.fit(train_pool)
        y_pred = model.predict(X_test if name != "CatBoost" else test_pool)
        mse = mean_squared_error(y_test, y_pred)
        results[name] = mse
        print(f"{name} MSE: {mse:.4f}")
    except Exception as e:
        print(f"{name} encountered an error: {e}")

# Display results
print("\nFinal Results:")
for model, mse in results.items():
    print(f"{model}: {mse:.4f}" if model in results else f"{model}: Error")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



XGBoost MSE: 13.3440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1336
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 10
LightGBM MSE: 22.1059
CatBoost MSE: 65.0668

Final Results:
XGBoost: 13.3440
LightGBM: 22.1059
CatBoost: 65.0668
