In [26]:
import pandas as pd
import numpy as np

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

In [27]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

TARGET = train.columns[-1]

X = train.drop(columns=[TARGET])
y = train[TARGET]

In [28]:
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=np.number).columns

In [29]:
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

preprocessor = ColumnTransformer([
    ("cat", cat_pipe, cat_cols),
    ("num", num_pipe, num_cols),
])

model = HistGradientBoostingRegressor(
    random_state=42,
    max_depth=6,
    learning_rate=0.05,
    max_iter=500
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
val_preds = pipeline.predict(X_val)

rmse = root_mean_squared_error(y_val, val_preds)
rmse

8.775674182574766

In [30]:
pipeline.fit(X, y)
test_preds = pipeline.predict(test)

submission = pd.DataFrame({
    "id": test["id"],
    TARGET: test_preds
})
submission.to_csv("../submissions/submission_v2_hgb.csv", index=False)

In [34]:
# Try Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(random_state=42)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
val_preds = pipeline.predict(X_val)

rmse = root_mean_squared_error(y_val, val_preds)
rmse

8.853389182802118

In [35]:
pipeline.fit(X, y)
test_preds = pipeline.predict(test)

submission = pd.DataFrame({
    "id": test["id"],
    TARGET: test_preds
})
submission.to_csv("../submissions/submission_v3_gb.csv", index=False)