In [1]:
# EDA and FE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Regression model
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Random Forest (tree-based) model
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, make_scorer
import joblib

# LightGBM
from lightgbm import LGBMRegressor
from sklearn.metrics import make_scorer, mean_squared_error

In [3]:
# 0. Custom RMSLE scorer
rmse_scorer = make_scorer(
    mean_squared_error,
    greater_is_better=False,
    squared=False    # squared=False makes it return the root MSE
)

# 1. Load & copy
df = pd.read_csv('train.csv')
df_LightGBM = df.copy()

# 2. Rebuild your k-means bins (with explicit n_init to silence warnings)
hr_km = KMeans(n_clusters=4, n_init=10, random_state=42).fit(df_LightGBM[['Heart_Rate']])
age_km = KMeans(n_clusters=5, n_init=10, random_state=42).fit(df_LightGBM[['Age']])

# 2a. Map labels so they ascend by centroid value
def map_clusters(km, arr):
    cents   = km.cluster_centers_.flatten()
    order   = np.argsort(cents)
    mapping = {old: new for new, old in enumerate(order)}
    # apply mapping in pure Python → returns NumPy array
    return np.array([mapping[x] for x in arr])

df_LightGBM['hr_zone_km'] = map_clusters(
    hr_km, hr_km.predict(df_LightGBM[['Heart_Rate']])
)
df_LightGBM['age_km_bin'] = map_clusters(
    age_km, age_km.predict(df_LightGBM[['Age']])
)


# 3. Feature engineering
df_LightGBM['dur_temp']       = df_LightGBM['Duration'] * df_LightGBM['Body_Temp']
df_LightGBM['delta_temp']     = df_LightGBM['Body_Temp'] - 37
df_LightGBM['dur_over_delta'] = df_LightGBM['Duration'] / df_LightGBM['delta_temp'].replace(0, np.nan)

# 4. Split X/y
X = df_LightGBM[['dur_temp','dur_over_delta','delta_temp','hr_zone_km','age_km_bin']]
y = df_LightGBM['Calories']

# 5. Preprocessor: one-hot your cluster bins
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first', sparse_output=False), ['hr_zone_km','age_km_bin']),
], remainder='passthrough')

# 6. GPU-powered LightGBM regressor
lgbm = LGBMRegressor(
    device='gpu',            
    gpu_platform_id=0,       
    gpu_device_id=0,         
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    random_state=42
)

pipeline = Pipeline([
    ('prep', preprocessor),
    ('lgb',  lgbm),
])

# 7. Cross-validate with RMSLE
rmsle_scorer = make_scorer(
    lambda yt, yp: np.sqrt(mean_squared_log_error(yt, np.maximum(0, yp))),
    greater_is_better=False
)

lgb_rmsle = -cross_val_score(
    pipeline, X, y,
    cv=5,
    scoring=rmsle_scorer,
    n_jobs=-1
).mean()
print(f"LGBM RMSLE: {lgb_rmsle:.4f}")

# 8. Fit on all data
pipeline.fit(X, y)

# 9. Save for later
joblib.dump(pipeline, 'lgbm_calorie_gpu.pkl')

[LightGBM] [Info] This is the GPU trainer!![LightGBM] [Info] This is the GPU trainer!!

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Total Bins 551[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Total Bins 550
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 10

[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 10
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 10
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 10
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 548
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 10


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.9/site-packages/lightgbm/sklearn.py", line 1398, in fit
    super().fit(
  File "/usr/local/lib/python3.9/site-packages/lightgbm/sklearn.py", line 1049, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.9/site-packages/lightgbm/engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
  File "/usr/local/lib/python3.9/site-packages/lightgbm/basic.py", line 3660, in __init__
    _safe_call(
  File "/usr/local/lib/python3.9/site-packages/lightgbm/basic.py", line 313, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: No OpenCL device found


In [None]:
# Load saved LightGBM pipeline
pipeline = joblib.load('lgbm_calorie_gpu.pkl')

# Reload training data to refit K-means (for consistent bins)
df_train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
hr_km = KMeans(n_clusters=4, n_init=10, random_state=42).fit(df_train[['Heart_Rate']])
age_km = KMeans(n_clusters=5, n_init=10, random_state=42).fit(df_train[['Age']])
hr_centroids = hr_km.cluster_centers_.flatten()
hr_order = hr_centroids.argsort()
hr_map = {old:new for new,old in enumerate(hr_order)}
age_centroids = age_km.cluster_centers_.flatten()
age_order = age_centroids.argsort()
age_map = {old:new for new,old in enumerate(age_order)}

# Load & feature-engineer test set
df_test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')
df_test['dur_temp']       = df_test['Duration'] * df_test['Body_Temp']
df_test['delta_temp']     = df_test['Body_Temp'] - 37
df_test['dur_over_delta'] = df_test['Duration'] / df_test['delta_temp'].replace(0, pd.NA)

# Assign clusters
df_test['hr_zone_km']  = hr_km.predict(df_test[['Heart_Rate']])
df_test['hr_zone_km']  = df_test['hr_zone_km'].map(hr_map)
df_test['age_km_bin']  = age_km.predict(df_test[['Age']])
df_test['age_km_bin']  = df_test['age_km_bin'].map(age_map)

# Predict
features = ['dur_temp','dur_over_delta','delta_temp','hr_zone_km','age_km_bin']
preds = pipeline.predict(df_test[features])

# Build submission
submission = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
submission['Calories'] = preds
submission.to_csv('submission.csv', index=False)
print("submission.csv written")