In [1]:
import pandas as pd
from pathlib import Path
import holidays
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from time import time

# Function to encode dates
def _encode_dates(X):
    X = X.copy()
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    return X

# Function to preprocess data
def preprocess_data(data):
    data = data.copy()
    data = _encode_dates(data)
    data["weekend"] = (data["weekday"] > 4).astype(int)
    FR_holidays = holidays.FR(years=range(2019, 2022))
    data["FR_holidays"] = data["date"].dt.date.isin(FR_holidays).astype(int)
    return data

In [2]:

#%% Load and preprocess train data
#data = pd.read_parquet("/kaggle/input/msdb-2024/train.parquet") # to load on Kaggle 
data = pd.read_parquet(Path("data") / "train.parquet") # to load locally
train_data = preprocess_data(data)

weather_data = pd.read_csv(Path("data") / "external_data.csv")
weather_data["date"] = pd.to_datetime(weather_data["date"], errors="coerce")
weather_data = _encode_dates(weather_data)
weather_data = weather_data.drop_duplicates(subset="date")
weather_data.set_index("date", inplace=True)
weather_data = weather_data.resample("H").interpolate(method="linear")
weather_data.reset_index(inplace=True)

merged_data = pd.merge(train_data, weather_data, on="date", how="left")
merged_data = merged_data.loc[:, ~merged_data.columns.str.endswith(("_x", "_y"))]

def get_train_data(data = merged_data, target_column="log_bike_count"):
    data = data.sort_values(["date", "counter_name"])
    y_array = data[target_column].values
    X_df = data.drop([target_column, "bike_count"], axis=1)
    return X_df, y_array

X, y = get_train_data(merged_data)

# Split train and validation data
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    return X.loc[mask], y[mask], X.loc[~mask], y[~mask]

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)


  weather_data = weather_data.resample("H").interpolate(method="linear")


In [30]:

#%% Define preprocessing pipeline
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
date_cols = [col for col in date_cols if col != "day"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

numerical_encoder = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)
numerical_corr_cols = ["u", "t", "tx12", "tn12", "rafper", "td", "raf10", "ff", "nnuage3", "vv"]

binary_cols = ["weekend", "FR_holidays"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)



In [5]:
# Prepare test data

#test_data = pd.read_parquet("/kaggle/input/msdb-2024/final_test.parquet") # to load on Kaggle
test_data = pd.read_parquet(Path("data") / "final_test.parquet") # to load locally
test_data = preprocess_data(test_data)

merged_data = pd.merge(test_data, weather_data, on="date", how="left") # merge test and weather data
merged_data = merged_data.loc[:, ~merged_data.columns.str.endswith(("_x", "_y"))] # Drop redundant date columns 


# 1. GradientBoostingRegressor

In [13]:
# 1. GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

regressor = GradientBoostingRegressor(random_state=42, max_depth=6, n_estimators=100)

# Without weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting without weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_gradient_without_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


Training time for GradientBoosting without weather: 130.51 seconds




Train set, RMSE=0.66
Valid set, RMSE=0.63
Predictions saved to submission_gradient_without_weather.csv


In [None]:


# With weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_gradient_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 270.77 seconds




Train set, RMSE=0.65




Valid set, RMSE=0.57
Predictions saved to submission_gradient_with_weather.csv


# 2. XGB Regressor

In [6]:
# 2. XGB Regressor
# With weather data

regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=100)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 14.90 seconds




Train set, RMSE=0.41




Valid set, RMSE=0.51
Predictions saved to submission_XGB_with_weather.csv


## Best model so far

In [13]:
# 2. XGB Regressor
# With weather data 
# best so far with 0.7669

regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=200, learning_rate = 0.5)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 9.72 seconds




Train set, RMSE=0.40
Valid set, RMSE=0.49
Predictions saved to submission_XGB_with_weather.csv


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Define the parameter grid for XGBRegressor
param_grid = {
    'xgbregressor__learning_rate': [0.3, 0.5, 0.7],
    'xgbregressor__max_depth': [8, 10, 12],
    'xgbregressor__n_estimators': [100, 200, 300],
}

# Create the pipeline again for GridSearch
regressor = XGBRegressor(random_state=42)

pipe = make_pipeline(date_encoder, preprocessor, regressor)

# Define a custom scorer for neg RMSE (lower is better)
scorer = make_scorer(mean_squared_error, squared=False, greater_is_better=False)

# Perform GridSearchCV
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,  # Use 3-fold CV
    n_jobs=-1,  # Parallelize across CPUs
    verbose=2  # Show progress
)

# Fit the grid search
print("Starting Grid Search...")
grid_search.fit(X_train, y_train)

# Display the best parameters and their performance
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Train RMSE: {-grid_search.best_score_:.4f}")

# Evaluate each combination with train and valid RMSE
print("\nGrid Search Results:")
for mean_score, params in zip(grid_search.cv_results_["mean_test_score"], grid_search.cv_results_["params"]):
    train_rmse = -mean_score
    print(f"Params: {params}, Train RMSE: {train_rmse:.4f}")


Starting Grid Search...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}
Best Train RMSE: 0.8887

Grid Search Results:
Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.9285
Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.9316
Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.9305
Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.8887
Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.8964
Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_esti

In [15]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
from time import time

# Define the parameter grid for XGBRegressor
param_grid = {
    'xgbregressor__learning_rate': [0.3, 0.4, 0.5, 0.6, 0.7],
    'xgbregressor__max_depth': [8, 10, 12],
    'xgbregressor__n_estimators': [100, 200, 300, 350],
}

# Create the pipeline
regressor = XGBRegressor(random_state=42)
pipe = make_pipeline(date_encoder, preprocessor, regressor)

# Loop over all parameter combinations
best_params = None
best_valid_rmse = float("inf")
results = []

print("Starting Grid Search...")
start_time = time()

for params in ParameterGrid(param_grid):
    # Update the regressor with current params
    pipe.set_params(**params)
    
    # Fit the pipeline on the training data
    pipe.fit(X_train, y_train)
    
    # Compute RMSE on train and validation sets
    train_rmse = mean_squared_error(y_train, pipe.predict(X_train), squared=False)
    valid_rmse = mean_squared_error(y_valid, pipe.predict(X_valid), squared=False)
    
    # Store results
    results.append((params, train_rmse, valid_rmse))
    
    # Update best parameters if current valid RMSE is lower
    if valid_rmse < best_valid_rmse:
        best_valid_rmse = valid_rmse
        best_params = params
    
    print(f"Params: {params}, Train RMSE: {train_rmse:.4f}, Valid RMSE: {valid_rmse:.4f}")

elapsed_time = time() - start_time
print(f"\nGrid Search completed in {elapsed_time / 60:.2f} minutes")
print(f"Best Parameters: {best_params}")
print(f"Best Validation RMSE: {best_valid_rmse:.4f}")

# Save all results to a DataFrame for further inspection
results_df = pd.DataFrame(results, columns=["Params", "Train RMSE", "Valid RMSE"])
results_df.to_csv("grid_search_results.csv", index=False)
print("Grid search results saved to 'grid_search_results.csv'")


Starting Grid Search...




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4641, Valid RMSE: 0.4871




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4438, Valid RMSE: 0.4834




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.4303, Valid RMSE: 0.4839




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.4243, Valid RMSE: 0.4838




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4147, Valid RMSE: 0.5055




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3991, Valid RMSE: 0.5021




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3899, Valid RMSE: 0.5011




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3820, Valid RMSE: 0.4995




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.3723, Valid RMSE: 0.4934




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3518, Valid RMSE: 0.4900




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3418, Valid RMSE: 0.4887




Params: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3384, Valid RMSE: 0.4887




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4590, Valid RMSE: 0.5093




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4372, Valid RMSE: 0.5060




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.4263, Valid RMSE: 0.5068




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.4168, Valid RMSE: 0.5044




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4233, Valid RMSE: 0.5054




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3977, Valid RMSE: 0.4962




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3843, Valid RMSE: 0.4917




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3789, Valid RMSE: 0.4913




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.3836, Valid RMSE: 0.5362




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3555, Valid RMSE: 0.5321




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3404, Valid RMSE: 0.5303




Params: {'xgbregressor__learning_rate': 0.4, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3360, Valid RMSE: 0.5304




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4645, Valid RMSE: 0.5291




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4400, Valid RMSE: 0.5245




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.4284, Valid RMSE: 0.5227




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.4228, Valid RMSE: 0.5220




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4214, Valid RMSE: 0.4961




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4024, Valid RMSE: 0.4914




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3850, Valid RMSE: 0.4911




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3775, Valid RMSE: 0.4891




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.3888, Valid RMSE: 0.5147




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3572, Valid RMSE: 0.5099




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3386, Valid RMSE: 0.5106




Params: {'xgbregressor__learning_rate': 0.5, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3340, Valid RMSE: 0.5102




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4590, Valid RMSE: 0.5214




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4311, Valid RMSE: 0.5174




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.4181, Valid RMSE: 0.5162




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.4129, Valid RMSE: 0.5161




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4232, Valid RMSE: 0.5143




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4035, Valid RMSE: 0.5116




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3835, Valid RMSE: 0.5078




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3676, Valid RMSE: 0.5052




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.3817, Valid RMSE: 0.5252




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3516, Valid RMSE: 0.5218




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3371, Valid RMSE: 0.5209




Params: {'xgbregressor__learning_rate': 0.6, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3300, Valid RMSE: 0.5215




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4619, Valid RMSE: 0.5298




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.4282, Valid RMSE: 0.5266




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.4114, Valid RMSE: 0.5237




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.4065, Valid RMSE: 0.5239




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.4282, Valid RMSE: 0.5183




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3916, Valid RMSE: 0.5108




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3743, Valid RMSE: 0.5090




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3682, Valid RMSE: 0.5086




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 100}, Train RMSE: 0.3849, Valid RMSE: 0.5523




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 200}, Train RMSE: 0.3579, Valid RMSE: 0.5516




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 300}, Train RMSE: 0.3405, Valid RMSE: 0.5511




Params: {'xgbregressor__learning_rate': 0.7, 'xgbregressor__max_depth': 12, 'xgbregressor__n_estimators': 350}, Train RMSE: 0.3295, Valid RMSE: 0.5491

Grid Search completed in 20.82 minutes
Best Parameters: {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 200}
Best Validation RMSE: 0.4834
Grid search results saved to 'grid_search_results.csv'




In [16]:
# 2. XGB Regressor
# With weather data 

regressor = XGBRegressor(random_state=42, max_depth=8, n_estimators=200, learning_rate = 0.3)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_with_weather_d=8,n=200,l=0.3.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 7.30 seconds




Train set, RMSE=0.44
Valid set, RMSE=0.48
Predictions saved to submission_XGB_with_weather_d=8,n=200,l=0.3.csv


## Using all weather data, did not help

In [None]:
# 2. XGB Regressor
# With ALL weather data (did not imporve)


#%% Define preprocessing pipeline
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
date_cols = [col for col in date_cols if col != "day"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

numerical_encoder = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

numerical_corr_cols = weather_data.drop(columns=["date", "year", "month", "day", "weekday", "hour"]).columns

binary_cols = ["weekend", "FR_holidays"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)



regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=200, learning_rate = 0.5)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


 'phenspe3' 'phenspe4']. At least one non-missing value is needed for imputation with strategy='mean'.


Training time for GradientBoosting with weather: 24.34 seconds


 'phenspe3' 'phenspe4']. At least one non-missing value is needed for imputation with strategy='mean'.


Train set, RMSE=0.36


 'phenspe3' 'phenspe4']. At least one non-missing value is needed for imputation with strategy='mean'.


Valid set, RMSE=0.56


 'phenspe3' 'phenspe4']. At least one non-missing value is needed for imputation with strategy='mean'.


Predictions saved to submission_XGB_with_weather.csv


## Completely without weather, did not help

In [31]:

#%% Define preprocessing pipeline
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
date_cols = [col for col in date_cols if col != "day"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

numerical_encoder = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)
numerical_corr_cols = ["u", "t", "tx12", "tn12", "rafper", "td", "raf10", "ff", "nnuage3", "vv"]

binary_cols = ["weekend", "FR_holidays"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)


regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=200, learning_rate = 0.5)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_without_weather,d=10,n=200,l=0.5.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 12.44 seconds




Train set, RMSE=0.45




Valid set, RMSE=0.51
Predictions saved to submission_XGB_without_weather,d=10,n=200,l=0.5.csv


## XGB, dropping zero-valued rows for prediction

In [None]:
(data["log_bike_count"] == 0).sum

41401

In [45]:
print((y_train == 0).sum())
print((y_valid == 0). sum())

40164
1237


There are 40k zero bike counts!

In [99]:
# Slightly different data loading

#%% Load and preprocess train data
#data = pd.read_parquet("/kaggle/input/msdb-2024/train.parquet") # to load on Kaggle 
data = pd.read_parquet(Path("data") / "train.parquet") # to load locally
train_data = preprocess_data(data)

weather_data = pd.read_csv(Path("data") / "external_data.csv")
weather_data["date"] = pd.to_datetime(weather_data["date"], errors="coerce")
weather_data = _encode_dates(weather_data)
weather_data = weather_data.drop_duplicates(subset="date")
weather_data.set_index("date", inplace=True)
weather_data = weather_data.resample("H").interpolate(method="linear")
weather_data.reset_index(inplace=True)

merged_data = pd.merge(train_data, weather_data, on="date", how="left")
merged_data = merged_data.loc[:, ~merged_data.columns.str.endswith(("_x", "_y"))]

def get_train_data(data=merged_data, target_column="log_bike_count"):
    data = data.sort_values(["date", "counter_name"])
    y_array = data[target_column].values
    # Keep the `bike_count` column for filtering in train-test split
    X_df = data.drop([target_column], axis=1)
    return X_df, y_array

X, y = get_train_data(merged_data)

# Split train and validation data
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = X["date"] <= cutoff_date

    # Split train and validation data
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    # Drop rows where bike_count == 0 in the training data
    train_mask = X_train["bike_count"] != 0
    X_train = X_train.loc[train_mask]
    y_train = y_train[train_mask]

    # Drop `bike_count` from training and validation features after filtering
    X_train = X_train.drop(["bike_count"], axis=1)
    X_valid = X_valid.drop(["bike_count"], axis=1)

    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)


  weather_data = weather_data.resample("H").interpolate(method="linear")


In [100]:
print((y_train == 0).sum())
print((y_valid == 0). sum())

0
1237


In [3]:
# %% [code]
#%%

import pandas as pd
from pathlib import Path
import holidays
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from time import time

# Function to encode dates
def _encode_dates(X):
    X = X.copy()
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    return X

# Function to preprocess data
def preprocess_data(data):
    data = data.copy()
    data = _encode_dates(data)
    data["weekend"] = (data["weekday"] > 4).astype(int)
    FR_holidays = holidays.FR(years=range(2019, 2022))
    data["FR_holidays"] = data["date"].dt.date.isin(FR_holidays).astype(int)
    return data

#%% Load and preprocess train data
#data = pd.read_parquet("/kaggle/input/msdb-2024/train.parquet") # to load on Kaggle 
data = pd.read_parquet(Path("data") / "train.parquet") # to load locally
train_data = preprocess_data(data)

weather_data = pd.read_csv("/kaggle/input/msdb-2024/external_data.csv") # to load on Kaggle
#weather_data = pd.read_csv(Path("data") / "external_data.csv")
weather_data["date"] = pd.to_datetime(weather_data["date"], errors="coerce")
weather_data = _encode_dates(weather_data)
weather_data = weather_data.drop_duplicates(subset="date")
weather_data.set_index("date", inplace=True)
weather_data = weather_data.resample("H").interpolate(method="linear")
weather_data.reset_index(inplace=True)

merged_data = pd.merge(train_data, weather_data, on="date", how="left")
merged_data = merged_data.loc[:, ~merged_data.columns.str.endswith(("_x", "_y"))]

def get_train_data(data=merged_data, target_column="log_bike_count"):
    data = data.sort_values(["date", "counter_name"])
    y_array = data[target_column].values
    # Keep the `bike_count` column for filtering in train-test split
    X_df = data.drop([target_column], axis=1)
    return X_df, y_array

X, y = get_train_data(merged_data)

# Split train and validation data
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = X["date"] <= cutoff_date

    # Split train and validation data
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    # Drop rows where bike_count == 0 in the training data
    train_mask = X_train["bike_count"] != 0
    X_train = X_train.loc[train_mask]
    y_train = y_train[train_mask]

    # Drop `bike_count` from training and validation features after filtering
    X_train = X_train.drop(["bike_count"], axis=1)
    X_valid = X_valid.drop(["bike_count"], axis=1)

    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)


#%% Define preprocessing pipeline
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
#date_cols = [col for col in date_cols if col != "day"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

numerical_encoder = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)
numerical_corr_cols = ["u", "t", "tx12", "tn12", "rafper", "td", "raf10", "ff", "nnuage3", "vv"]

binary_cols = ["weekend", "FR_holidays"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=200, learning_rate=0.3)


#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

test_data = pd.read_parquet("/kaggle/input/msdb-2024/final_test.parquet") # to load on Kaggle
#test_data = pd.read_parquet(Path("data") / "final_test.parquet") # to load locally
test_data = preprocess_data(test_data)

merged_data = pd.merge(test_data, weather_data, on="date", how="left") # merge test and weather data
merged_data = merged_data.loc[:, ~merged_data.columns.str.endswith(("_x", "_y"))] # Drop redundant date columns 


X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_final.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%

pipe

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/msdb-2024/train.parquet'

## Dropping rows only if the whole day was defect

In [90]:
print((y_train == 0).sum())
print((y_valid == 0). sum())

40164
1237


In [76]:
# Best model without weather data
# 0.72 for  depth = 10, n = 100, lr = 0.3


#%% Define preprocessing pipeline
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
date_cols = [col for col in date_cols if col != "day"]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

numerical_encoder = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)
numerical_corr_cols = ["u", "t", "tx12", "tn12", "rafper", "td", "raf10", "ff", "nnuage3", "vv"]

binary_cols = ["weekend", "FR_holidays"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)


# 2. XGB Regressor
# Without weather data

regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=100, learning_rate=0.3)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions

#test_data = pd.read_parquet("/kaggle/input/msdb-2024/final_test.parquet") # to load on Kaggle
test_data = pd.read_parquet(Path("data") / "final_test.parquet") # to load locally
test_data = preprocess_data(test_data)

merged_data = pd.merge(test_data, weather_data, on="date", how="left") # merge test and weather data
merged_data = merged_data.loc[:, ~merged_data.columns.str.endswith(("_x", "_y"))] # Drop redundant date columns 


X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_without_weather_day_zeros_dropped,d=10,n=100,l=0.3.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%



Training time for GradientBoosting with weather: 6.30 seconds




Train set, RMSE=0.40
Valid set, RMSE=0.48
Predictions saved to submission_XGB_without_weather_day_zeros_dropped,d=10,n=100,l=0.3.csv


## Below, too many n_estimators and higher learning rate did not help

In [11]:
# 2. XGB Regressor
# With weather data

regressor = XGBRegressor(random_state=42, max_depth=10, n_estimators=500, learning_rate = 0.5)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_with_weather_d=10,n=500,l=0.5.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 20.27 seconds




Train set, RMSE=0.37




Valid set, RMSE=0.49
Predictions saved to submission_XGB_with_weather_d=10,n=500,l=0.5.csv


In [12]:
# 2. XGB Regressor
# With weather data

regressor = XGBRegressor(random_state=42, max_depth=12, n_estimators=500, learning_rate = 0.6)

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for GradientBoosting with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_XGB_with_weather_d=12,n=500,l=0.6.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%


Training time for GradientBoosting with weather: 36.26 seconds




Train set, RMSE=0.31




Valid set, RMSE=0.52
Predictions saved to submission_XGB_with_weather_d=12,n=500,l=0.6.csv


# 2. SVR

In [None]:
## NOT FINISHED AFTER 2 HOURS

# # 2. SVR
# from sklearn.svm import SVR

# regressor = SVR(kernel='rbf', C=1.0, epsilon=0.2)

# # Without weather data
# preprocessor = ColumnTransformer(
#     [
#         ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
#         ("cat", categorical_encoder, categorical_cols),
#         #("num", numerical_encoder, numerical_corr_cols),
#         ("binary", "passthrough", binary_cols)
#     ]
# )

# #%%
# start = time()
# pipe = make_pipeline(date_encoder, preprocessor, regressor)
# pipe.fit(X_train, y_train)
# elapsed_time = time() - start
# print(f"Training time for SVR without weather: {elapsed_time:.2f} seconds")
# print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
# print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# # Save predictions
# X_test = merged_data
# y_pred = pipe.predict(X_test)
# results = pd.DataFrame(
#     dict(
#         Id=np.arange(y_pred.shape[0]),
#         log_bike_count=y_pred,
#     )
# )
# output_path = "submission_svr_without_weather.csv"
# results.to_csv(output_path, index=False)
# print(f"Predictions saved to {output_path}")

# # With weather data
# preprocessor = ColumnTransformer(
#     [
#         ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
#         ("cat", categorical_encoder, categorical_cols),
#         ("num", numerical_encoder, numerical_corr_cols),
#         ("binary", "passthrough", binary_cols)
#     ]
# )

# #%%
# start = time()
# pipe = make_pipeline(date_encoder, preprocessor, regressor)
# pipe.fit(X_train, y_train)
# elapsed_time = time() - start
# print(f"Training time for SVR with weather: {elapsed_time:.2f} seconds")
# print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
# print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# # Save predictions
# X_test = merged_data
# y_pred = pipe.predict(X_test)
# results = pd.DataFrame(
#     dict(
#         Id=np.arange(y_pred.shape[0]),
#         log_bike_count=y_pred,
#     )
# )
# output_path = "submission_svr_with_weather.csv"
# results.to_csv(output_path, index=False)
# print(f"Predictions saved to {output_path}")

# #%%


# 3. KNeighborsRegressor

In [None]:

# 3. KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor

regressor = KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=-1)

# Without weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for KNN without weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_knn_without_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")



In [None]:

# With weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for KNN with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_knn_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%

# 4. ElasticNet

In [None]:

# 4. ElasticNet
from sklearn.linear_model import ElasticNet

regressor = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)

# Without weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for ElasticNet without weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_elasticnet_without_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")



In [None]:

# With weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for ElasticNet with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_elasticnet_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

#%%

# 5. AdaBoost

In [None]:

# 5. AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor

regressor = AdaBoostRegressor(random_state=42, n_estimators=50)

# Without weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        #("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for AdaBoost without weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_adaboost_without_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


In [None]:

# With weather data
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", numerical_encoder, numerical_corr_cols),
        ("binary", "passthrough", binary_cols)
    ]
)

#%%
start = time()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
elapsed_time = time() - start
print(f"Training time for AdaBoost with weather: {elapsed_time:.2f} seconds")
print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# Save predictions
X_test = merged_data
y_pred = pipe.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
output_path = "submission_adaboost_with_weather.csv"
results.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

# 6. Decision Tree Regressor

In [None]:

# # 6. DecisionTreeRegressor
# from sklearn.tree import DecisionTreeRegressor

# regressor = DecisionTreeRegressor(random_state=42, max_depth=10)

# # Without weather data
# preprocessor = ColumnTransformer(
#     [
#         ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
#         ("cat", categorical_encoder, categorical_cols),
#         #("num", numerical_encoder, numerical_corr_cols),
#         ("binary", "passthrough", binary_cols)
#     ]
# )

# #%%
# start = time()
# pipe = make_pipeline(date_encoder, preprocessor, regressor)
# pipe.fit(X_train, y_train)
# elapsed_time = time() - start
# print(f"Training time for DecisionTree without weather: {elapsed_time:.2f} seconds")
# print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
# print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# # Save predictions
# X_test = merged_data
# y_pred = pipe.predict(X_test)
# results = pd.DataFrame(
#     dict(
#         Id=np.arange(y_pred.shape[0]),
#         log_bike_count=y_pred,
#     )
# )
# output_path = "submission_decisiontree_without_weather.csv"
# results.to_csv(output_path, index=False)
# print(f"Predictions saved to {output_path}")

# # With weather data
# preprocessor = ColumnTransformer(
#     [
#         ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
#         ("cat", categorical_encoder, categorical_cols),
#         ("num", numerical_encoder, numerical_corr_cols),
#         ("binary", "passthrough", binary_cols)
#     ]
# )

# #%%
# start = time()
# pipe = make_pipeline(date_encoder, preprocessor, regressor)
# pipe.fit(X_train, y_train)
# elapsed_time = time() - start
# print(f"Training time for DecisionTree with weather: {elapsed_time:.2f} seconds")
# print(f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}")
# print(f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}")

# # Save predictions
# X_test = merged_data
# y_pred = pipe.predict(X_test)
# results = pd.DataFrame(
#     dict(
#         Id=np.arange(y_pred.shape[0]),
#         log_bike_count=y_pred,
#     )
# )
# output_path = "submission_decisiontree_with_weather.csv"
# results.to_csv(output_path, index=False)
# print(f"Predictions saved to {output_path}")

# #%%
