<a href="https://www.kaggle.com/code/honeykaggle/heart-rate-forecasting?scriptVersionId=213227989" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import Pipeline

In [2]:
# Locations of the competition files inside the Kaggle input mount:
# training set, test set, and the sample submission template.
train_path, test_path, submission_path = (
    '/kaggle/input/hackmachine/Dataset/Train.csv',
    '/kaggle/input/hackmachine/Dataset/Test.csv',
    '/kaggle/input/hackmachine/Dataset/Submission.csv',
)

In [3]:
# Read the three CSVs (train, test, sample submission) in that order,
# each into its own DataFrame.
train_data, test_data, submission_sample = [
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
]

In [4]:
# Remove the 'Unnamed: 0' column from both frames when it is present.
# (Presumably a stray index column written by an earlier to_csv call --
# TODO confirm against the raw files.)
for frame in (train_data, test_data):
    if 'Unnamed: 0' in frame.columns:
        frame.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
# Integer-encode the categorical "Patient's condition" column.
# The encoder learns its classes from the training frame only and the same
# mapping is then applied to both frames.
# NOTE(review): confirm the test set contains no condition labels that are
# absent from training, since the mapping is learned from train alone.
label_encoder = LabelEncoder().fit(train_data["Patient's condition"])
train_data["Patient's condition"] = label_encoder.transform(
    train_data["Patient's condition"]
)
test_data["Patient's condition"] = label_encoder.transform(
    test_data["Patient's condition"]
)


In [6]:
# Tukey-style outlier removal on the target: keep only the training rows
# whose Heart_Rate lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
q1, q3 = train_data['Heart_Rate'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Series.between is inclusive on both ends by default, matching >= / <=.
within_fences = train_data['Heart_Rate'].between(lower_bound, upper_bound)
train_data = train_data[within_fences]

In [7]:
# Separate the regression target from the predictors.
y = train_data['Heart_Rate']
X = train_data.drop('Heart_Rate', axis=1)

In [8]:
# Row-wise summary statistics appended to the training features.
# Each statistic is evaluated over *all columns present at that moment*, so
# std_feature sees mean_feature, max_feature sees mean/std, and min_feature
# sees all three. The order below therefore matters and is kept as-is.
# NOTE(review): confirm that folding earlier derived columns into later
# statistics is intended; the test-side features are built the same way.
for column, reducer in (
    ('mean_feature', pd.DataFrame.mean),
    ('std_feature', pd.DataFrame.std),
    ('max_feature', pd.DataFrame.max),
    ('min_feature', pd.DataFrame.min),
):
    X[column] = reducer(X, axis=1)

In [9]:
# Three lag features: lag_k holds the target value from k rows earlier in the
# training frame. The test frame has no target, so its lag columns are
# created as all-NaN placeholders to keep the column sets identical.
# NOTE(review): the training lags are derived from the target while the test
# lags are entirely missing -- the model is trained on information it will
# never see at prediction time. Confirm this is intended before trusting the
# cross-validation score.
for lag in (1, 2, 3):
    lag_col = f'lag_{lag}'
    X[lag_col] = y.shift(periods=lag)
    test_data[lag_col] = np.nan

In [10]:
# Back-fill missing values column by column (each NaN takes the next valid
# value below it). This fills the leading NaNs that shift() left in the
# training lag columns; columns that are entirely NaN (the test lag
# placeholders) have nothing to fill from and stay NaN.
#
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated and emitted a FutureWarning in this cell's output.
X = X.bfill()
test_data = test_data.bfill()

  X.fillna(method='bfill', inplace=True)
  test_data.fillna(method='bfill', inplace=True)


In [11]:
# Spread between the row-wise max and min for the training features.
X['range_feature'] = X['max_feature'].sub(X['min_feature'])

# Mirror the training-side row statistics on the test frame. As on the
# training side, every statistic is taken over all columns present at that
# point, so the order is significant. The lag_* columns already exist here
# but were created as NaN, and pandas row reductions skip NaN by default.
for column, reducer in (
    ('mean_feature', pd.DataFrame.mean),
    ('std_feature', pd.DataFrame.std),
    ('max_feature', pd.DataFrame.max),
    ('min_feature', pd.DataFrame.min),
):
    test_data[column] = reducer(test_data, axis=1)

test_data['range_feature'] = test_data['max_feature'].sub(test_data['min_feature'])

In [12]:
# Put the test columns in exactly the same order as the training matrix;
# a column missing from test_data raises a KeyError here.
test_data_aligned = test_data.loc[:, X.columns]

# Standardise features: statistics are learned from the training matrix only
# and then applied unchanged to both train and test.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(test_data_aligned)


In [13]:
# Base learners for the ensemble, both seeded for reproducibility.
# XGBoost is given an explicit squared-error objective; LightGBM is otherwise
# left at its defaults.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)
lgb_model = LGBMRegressor(
    random_state=42,
)

In [14]:
# Discrete candidate values for each XGBoost hyper-parameter.
xgb_param_grid = dict(
    n_estimators=[500, 1000],
    max_depth=[5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0.1, 0.5],
    reg_lambda=[1.0, 1.5],
    min_child_weight=[1, 5, 10],
)

# Randomised search: 50 sampled configurations, each scored by 5-fold CV on
# negative MSE (higher is better), using all available cores.
random_search = RandomizedSearchCV(
    xgb_model,
    xgb_param_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

In [15]:
# Run the hyper-parameter search on the scaled training data, keep the
# winning estimator for the ensemble, and report its parameters.
fitted_search = random_search.fit(X_scaled, y)
best_xgb = fitted_search.best_estimator_
print("Best XGBoost Parameters:", fitted_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best XGBoost Parameters: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 1000, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.8}


In [16]:
# Level-0 learners: the tuned XGBoost model and the LightGBM model.
base_learners = [
    ('xgb', best_xgb),
    ('lgb', lgb_model),
]

# Level-1 learner that combines the base predictions: a 100-tree XGBoost
# regressor.
meta_learner = XGBRegressor(random_state=42, n_estimators=100)

stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
)

In [17]:
# Estimate the ensemble's error with shuffled, seeded 5-fold CV.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
neg_mse_per_fold = cross_val_score(
    stacking_model,
    X_scaled,
    y,
    scoring='neg_mean_squared_error',
    cv=kf,
)

# The scorer returns negated MSE, so flip the sign; the reported RMSE is the
# square root of the mean fold MSE (not the mean of per-fold RMSEs).
cv_scores = -neg_mse_per_fold
cv_rmse = np.sqrt(np.mean(cv_scores))
print(f"Stacking Model Cross-Validation RMSE: {cv_rmse}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9520
[LightGBM] [Info] Number of data points in the train set: 145469, number of used features: 39
[LightGBM] [Info] Start training from score 73.343832
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9520
[LightGBM] [Info] Number of data points in the train set: 116375, number of used features: 39
[LightGBM] [Info] Start training from score 73.339887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034778 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9520
[LightGBM] [Info] Number of data points in the train set: 116375, number of used features: 39
[LightGBM] [Info] Star

In [18]:
# Fit the stacking ensemble on the full scaled training set.
# This is the cell's last expression, so the value returned by fit() is also
# what the cell displays.
stacking_model.fit(X_scaled, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9521
[LightGBM] [Info] Number of data points in the train set: 181837, number of used features: 39
[LightGBM] [Info] Start training from score 73.353730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9521
[LightGBM] [Info] Number of data points in the train set: 145469, number of used features: 39
[LightGBM] [Info] Start training from score 73.352397
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9520
[LightGBM] [Info] Number of data points in the train set: 145469, number of used features: 39
[LightGBM] [Info] Star

In [19]:
# Predict heart rate for the scaled, column-aligned test features.
# NOTE(review): the lag_* columns of test_data were created as NaN before
# scaling, so these predictions are made with those features missing --
# confirm this is intended.
test_predictions = stacking_model.predict(X_test_scaled)

In [20]:
# Place the predictions in the sample-submission frame's Heart_Rate column
# and write the result without the index column.
submission_sample = submission_sample.assign(Heart_Rate=test_predictions)
submission_sample.to_csv('Final_Submission.csv', index=False)

print("Submission file saved as 'Final_Submission.csv'.")

Submission file saved as 'Final_Submission.csv'.
