In [None]:
pip install --upgrade xgboost scikit-learn

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the competition train/test splits.
df = pd.read_csv("/kaggle/input/playground-series-s5e10/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s5e10/test.csv")

# The label is taken to be the last column of the training frame.
target = df.columns[-1]

print(df.shape)
df.head()

(517754, 14)


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [4]:
def create_frequency_features(train_df, test_df, cols, num, cat, target_col=None):
    """
    Add frequency and quantile-bin features to copies of the train/test frames.

    - For each column in `cols`, create <col>_freq = relative frequency of each
      value in the TRAIN data. Test values never seen in train receive the mean
      of the train <col>_freq column.
    - For each column that is also in `num`, create <col>_bin5, <col>_bin10 and
      <col>_bin15: integer bin labels from 5/10/15-quantile cuts fitted on train
      and re-applied to test.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Input frames; they are copied, never modified.
    cols : list of str
        Columns to frequency-encode.
    num : list of str
        Subset of `cols` that should additionally be quantile-binned.
    cat : list of str
        Categorical columns, excluded from the returned `new_num` list.
    target_col : str, optional
        Label column to exclude from `new_num`. When omitted, the module-level
        name `target` is used if it exists (legacy notebook behaviour).

    Returns
    -------
    (train, test, new_num) : (pd.DataFrame, pd.DataFrame, list of str)
        The augmented copies and the train column names left after dropping
        `cat` and the target column.
    """
    train, test = train_df.copy(), test_df.copy()

    for col in cols:
        # Frequency encoding: how common each value is (fitted on train only)
        freq = train[col].value_counts(normalize=True)
        train[f"{col}_freq"] = train[col].map(freq)
        test[f"{col}_freq"] = test[col].map(freq).fillna(train[f"{col}_freq"].mean())

        # Binning: group numeric values into quantiles
        if col in num:
            for q in [5, 10, 15]:
                bin_col = f"{col}_bin{q}"
                try:
                    train[bin_col], bins = pd.qcut(
                        train[col], q=q, labels=False, retbins=True, duplicates="drop"
                    )
                    # Open the outer edges so test values below the train minimum
                    # or above the train maximum land in the first/last bin
                    # instead of silently becoming NaN.
                    edges = np.array(bins, dtype=float)
                    edges[0], edges[-1] = -np.inf, np.inf
                    test[bin_col] = pd.cut(test[col], bins=edges, labels=False, include_lowest=True)
                except Exception:
                    # Deliberate best-effort fallback: a column that cannot be
                    # binned gets a constant placeholder in both frames.
                    train[bin_col] = test[bin_col] = 0

    if target_col is None:
        # Backward-compatible fallback to the notebook-level `target` variable.
        target_col = globals().get("target")
    excluded = list(cat) + ([target_col] if target_col is not None else [])
    new_num = train.drop(columns=excluded).columns.tolist()
    return train, test, new_num

# Data Processing

In [5]:
# Identify features: every column except the target
cols = df.drop(columns=target).columns.tolist()

# Categorical features
cat = [col for col in cols if df[col].dtype in ["object","category"] and col != target]

# Numerical features (booleans and the row id are excluded)
num = [col for col in cols if df[col].dtype not in ["object","category","bool"] and col not in ["id", target]]

# Add <col>_freq for every feature and quantile-bin columns for the numeric ones
df, df_test, new_num = create_frequency_features(df, df_test.copy(), cols, num, cat)

# Preparing categorical features.
# The test columns reuse the TRAIN categorical dtype so both frames share one
# category set (and therefore one category -> code mapping). Casting each frame
# independently can produce different codes when the observed values differ;
# with the shared dtype, test values unseen in train become NaN instead.
df[cat] = df[cat].astype("category")
for col in cat:
    df_test[col] = df_test[col].astype(df[col].dtype)

# Re-code the accident count. Note that any value missing from the dict
# (i.e. outside 0-7) is turned into NaN by `.map`.
map_col = "num_reported_accidents"
map_num_reported = {0:0, 1:0, 2:0, 3:2, 4:4, 5:3, 6:1, 7:0}
df[map_col] = df[map_col].map(map_num_reported)
df_test[map_col] = df_test[map_col].map(map_num_reported)

# Dropping unnecessary columns
remove = ["time_of_day", "num_lanes", "road_type", "road_signs_present", "id_freq"]
df = df.drop(columns=remove)
df_test = df_test.drop(columns=remove)

# Dropping ID and duplicates from the training frame only
# (df_test keeps "id" because the submission file needs it).
df = df.drop(columns="id").drop_duplicates()

# Hand-crafted interaction / indicator / risk features
def _add_handcrafted_features(frame):
    """Add speed_curvature_interact, high_speed and risk_score to `frame` in place."""
    # 1. Interaction Feature: Speed * Curvature
    frame['speed_curvature_interact'] = frame['speed_limit'] * frame['curvature']

    # 2. Binary Indicator: High Speed
    frame['high_speed'] = (frame['speed_limit'] >= 60).astype(int)

    # 3. Risk Score: fixed-weight sum of curvature and four binary risk flags
    frame['risk_score'] = (
        0.3 * frame['curvature'] +
        0.2 * (frame['lighting'] == 'night').astype(int) +
        0.1 * (frame['weather'] != 'clear').astype(int) +
        0.2 * frame['high_speed'] +
        0.1 * (frame['num_reported_accidents'] > 2).astype(int)
    )


def _apply_encoding(values, encoding, fallback):
    """Map `values` through the `encoding` Series; unmatched entries get `fallback`."""
    # Mapping a categorical column can yield a categorical result, on which
    # fillna() with a value outside the categories fails; map plain objects instead.
    if isinstance(values.dtype, pd.CategoricalDtype):
        values = values.astype(object)
    return values.map(encoding).astype(float).fillna(fallback)


def add_new_features(train_df, test_df, orig_df=None):
    """
    Add engineered features to copies of the train and test DataFrames.

    Always added: 'speed_curvature_interact', 'high_speed', 'risk_score'.

    If orig_df is provided, also add 'mean_<col>' and 'std_<col>' for each
    encodable feature: the per-value mean / std of orig_df['accident_risk'],
    with values not present in orig_df filled by the average of the per-group
    statistics. Features that are missing from orig_df, train or test (for
    example columns dropped earlier in the pipeline) are skipped instead of
    raising KeyError.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
        Augmented copies; the inputs are not modified.
    """
    # Copy to avoid modifying originals
    train, test = train_df.copy(), test_df.copy()

    for frame in (train, test):
        _add_handcrafted_features(frame)

    # Optional: Mean/Std Encoding from Original Data (if orig_df provided)
    if orig_df is not None:
        features_to_encode = ['road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather']
        for col in features_to_encode:
            if col not in orig_df.columns or col not in train.columns or col not in test.columns:
                continue

            grouped_risk = orig_df.groupby(col)['accident_risk']
            for stat_name, encoding in (('mean', grouped_risk.mean()), ('std', grouped_risk.std())):
                fallback = encoding.mean()
                train[f'{stat_name}_{col}'] = _apply_encoding(train[col], encoding, fallback)
                test[f'{stat_name}_{col}'] = _apply_encoding(test[col], encoding, fallback)

    return train, test

# Append the hand-crafted features to both frames. No original dataset is
# passed here, so the optional mean/std encodings are not created.
df, df_test = add_new_features(df, df_test)

# %%
# List every column now present in the training frame
print(list(df.columns))

# %%
df.head()

['curvature', 'speed_limit', 'lighting', 'weather', 'public_road', 'holiday', 'school_season', 'num_reported_accidents', 'accident_risk', 'road_type_freq', 'num_lanes_freq', 'num_lanes_bin5', 'num_lanes_bin10', 'num_lanes_bin15', 'curvature_freq', 'curvature_bin5', 'curvature_bin10', 'curvature_bin15', 'speed_limit_freq', 'speed_limit_bin5', 'speed_limit_bin10', 'speed_limit_bin15', 'lighting_freq', 'weather_freq', 'road_signs_present_freq', 'public_road_freq', 'time_of_day_freq', 'holiday_freq', 'school_season_freq', 'num_reported_accidents_freq', 'num_reported_accidents_bin5', 'num_reported_accidents_bin10', 'num_reported_accidents_bin15', 'speed_curvature_interact', 'high_speed', 'risk_score']


Unnamed: 0,curvature,speed_limit,lighting,weather,public_road,holiday,school_season,num_reported_accidents,accident_risk,road_type_freq,...,time_of_day_freq,holiday_freq,school_season_freq,num_reported_accidents_freq,num_reported_accidents_bin5,num_reported_accidents_bin10,num_reported_accidents_bin15,speed_curvature_interact,high_speed,risk_score
0,0.06,35,daylight,rainy,True,False,True,0,0.13,0.330974,...,0.331252,0.496502,0.497514,0.404968,0,0,0,2.1,0,0.118
1,0.99,35,daylight,clear,False,True,True,0,0.35,0.330974,...,0.333821,0.503498,0.497514,0.241947,0,0,0,34.65,0,0.297
2,0.63,70,dim,clear,True,True,False,0,0.3,0.333593,...,0.334927,0.503498,0.502486,0.28192,1,1,1,44.1,1,0.389
3,0.07,35,dim,rainy,True,False,False,0,0.21,0.335433,...,0.334927,0.496502,0.502486,0.404968,0,0,0,2.45,0,0.121
4,0.58,60,daylight,foggy,False,True,False,0,0.56,0.333593,...,0.333821,0.503498,0.502486,0.404968,0,0,0,34.8,1,0.474


# CV score of the model

In [6]:
# Wrap the training features and labels in a DMatrix (categorical columns enabled)
dtrain = xgb.DMatrix(
    df.drop(columns=target),
    label=df[target],
    enable_categorical=True,
)

# XGBoost hyper-parameters, shared by the CV run and the final fit
xgb_params = {
    'max_depth': 11,
    'learning_rate': 0.011,
    'subsample': 0.82,
    'colsample_bytree': 0.81,
    'min_child_weight': 3,
    'gamma': 0.011,
    'reg_alpha': 0.12,
    'reg_lambda': 0.4,
    'max_delta_step': 1,
    'colsample_bylevel': 0.86,
    'colsample_bynode': 0.88,
    'scale_pos_weight': 0.36,
    'max_bin': 512,
    'tree_method': 'hist',
    "device": "cuda",
    'eval_metric': 'rmse',
    'random_state': 42,
}

# 5-fold cross-validation, stopping once the test RMSE has not improved for 50 rounds
cv_results = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=2000,
    nfold=5,
    metrics='rmse',
    early_stopping_rounds=50,
    verbose_eval=100,
)

# Show the tail of the CV history
print(cv_results.tail())

# Locate the boosting round with the lowest mean test RMSE
best_round = cv_results['test-rmse-mean'].idxmin()
best_rmse = cv_results.loc[best_round, 'test-rmse-mean']
print(f"Best round: {best_round}, Best CV RMSE: {best_rmse:.7f}")

[0]	train-rmse:0.16480+0.00005	test-rmse:0.16480+0.00019
[100]	train-rmse:0.07592+0.00009	test-rmse:0.07612+0.00023
[200]	train-rmse:0.05814+0.00005	test-rmse:0.05858+0.00026
[300]	train-rmse:0.05569+0.00005	test-rmse:0.05628+0.00024
[400]	train-rmse:0.05532+0.00005	test-rmse:0.05600+0.00023
[500]	train-rmse:0.05524+0.00005	test-rmse:0.05597+0.00023
[600]	train-rmse:0.05520+0.00005	test-rmse:0.05596+0.00023
[700]	train-rmse:0.05519+0.00005	test-rmse:0.05596+0.00023
[800]	train-rmse:0.05518+0.00005	test-rmse:0.05596+0.00023
[900]	train-rmse:0.05518+0.00005	test-rmse:0.05596+0.00023
[1000]	train-rmse:0.05518+0.00005	test-rmse:0.05596+0.00023
[1100]	train-rmse:0.05518+0.00005	test-rmse:0.05596+0.00023
[1116]	train-rmse:0.05518+0.00005	test-rmse:0.05596+0.00023
      train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
1062         0.055179        0.000051        0.055957       0.000227
1063         0.055179        0.000051        0.055957       0.000227
1064         0.055179    

In [7]:
# Size the final model from the CV history: index of the last recorded round
# plus a margin of 10 extra trees.
last_round = cv_results.shape[0] - 1
xgb_params.update(n_estimators=last_round + 10)

# Final training and submitting

In [8]:
# Prepare training data
X_train = df.drop(columns=target)
y_train = df[target]

# Train XGBoost model
model = XGBRegressor(**xgb_params, enable_categorical=True)
model.fit(X_train, y_train)

# Predict on test set.
# Select the test features by the training column list rather than only
# dropping "id": this guarantees the same columns in the same order as the
# model was fitted on, and fails loudly if a feature is missing from df_test.
X_test = df_test[X_train.columns.tolist()]
pred = model.predict(X_test)

# Prepare submission
sub = pd.DataFrame({
    "id": df_test["id"],
    target: pred
})

# Save submission file
sub.to_csv("submission.csv", index=False)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [10]:
# Requires the fitted `model` (XGBRegressor) from the training cell to still be
# defined in this kernel session.

# Step 1: persist the model. The .json extension selects XGBoost's JSON format,
# which the author chose because the model uses categorical splits.
model.save_model("model.json")

# Step 2: re-read the raw, unprocessed training CSV and write a copy next to
# the model, for use by the app's processing pipeline.
df_original = pd.read_csv("/kaggle/input/playground-series-s5e10/train.csv")
df_original.to_csv("original_train_for_app.csv", index=False)

print("Files saved! Now download 'model.json' and 'original_train_for_app.csv'.")

Files saved! Now download 'model.json' and 'original_train_for_app.csv'.


Credit: some of the feature engineering in this notebook is adapted from the Meta Models notebook, [Single simple XGB with CV 0.05595](http://www.kaggle.com/code/metamodels/single-simple-xgb-with-cv-0-05595).