In [1]:
from __future__ import annotations

import catboost
import lightgbm
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import xgboost
from autogluon.tabular import TabularPredictor
from category_encoders.target_encoder import TargetEncoder
from optuna.samplers import TPESampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from ydata_profiling import ProfileReport

In [2]:
# Read the train/test CSV files and tag each row with the split it came from.
train_df = pd.read_csv("./data/train.csv").assign(dataset="train")
test_df = pd.read_csv("./data/test.csv").assign(dataset="test")

# Independent copy of the test-set ids, used later as the "id" column of submission files.
kaggle_id = test_df.loc[:, "id"].copy()


def extract_trasnmission(x):
    """Collapse a free-text transmission description into a coarse label.

    Parameters
    ----------
    x : object
        A single transmission value. Strings are matched case-sensitively
        against a few known substrings; anything that is not a string
        (NaN, None, numbers) is returned unchanged instead of raising
        ``TypeError`` on the substring checks.

    Returns
    -------
    object
        ``"Automatic"`` or ``"Manual"`` when a known substring is found,
        otherwise the input value itself.
    """
    # Missing values arrive as float NaN; `"A/T" in nan` would raise TypeError.
    if not isinstance(x, str):
        return x

    # Automatic markers are tested first, so a value carrying both kinds of
    # marker is labelled "Automatic".
    if any(marker in x for marker in ("A/T", "AT", "Automatic")):
        return "Automatic"
    if any(marker in x for marker in ("Mt", "M/T", "Manual", "Transmission w/Dual Shift Mode")):
        return "Manual"
    return x


def preprocess_rawdata(df, reference_year=2024):
    """Return a copy of ``df`` with the engineered columns used by the models.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. The columns read here are ``brand``, ``fuel_type``,
        ``model_year``, ``milage`` and ``transmission``.
    reference_year : int, default 2024
        Year that ``model_year`` is subtracted from to obtain ``age``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``age``, ``milage_1``
        and ``transmission_1`` added, ``fuel_type`` forced to
        ``"electricity"`` for Tesla rows, and every ``"-"`` cell replaced
        by NaN.
    """
    df = df.copy()

    df.loc[df["brand"] == "Tesla", "fuel_type"] = "electricity"

    df["age"] = reference_year - df["model_year"]

    # Mileage per year of age. Division by an age of 0 yields +/-inf (or NaN for
    # 0 // 0); all of those rows are set to 0 so no non-finite value leaks into
    # estimators that cannot handle it.
    df["milage_1"] = df["milage"] // df["age"]
    df.loc[df["age"] == 0, "milage_1"] = 0.0

    # NOTE(review): this matches an ASCII hyphen only - confirm that is the
    # placeholder actually used in the raw files.
    df = df.replace("-", np.nan)

    # The replace above can put NaN into "transmission"; skip those cells
    # instead of handing them to the string-matching helper.
    df["transmission_1"] = df["transmission"].map(extract_trasnmission, na_action="ignore")
    return df


# Column groups fed to the models. "transmission" and "clean_title" are
# currently excluded from the categorical set.
categorical_features = ["brand", "model", "fuel_type", "accident", "transmission_1"]
numeric_features = ["age", "milage", "milage_1"]
y = "price"

# Apply the same feature engineering to both splits.
train_df, test_df = preprocess_rawdata(train_df), preprocess_rawdata(test_df)

# Model inputs / target as independent copies of the selected columns.
train_x = train_df.loc[:, categorical_features + numeric_features].copy()
train_y = train_df.loc[:, y].copy()
test_x = test_df.loc[:, categorical_features + numeric_features].copy()

In [None]:
def MixedPipeline(train_x, train_y, test_x, test_y=None):
    """Fit three boosted regressors plus a voting and a stacking ensemble.

    The categorical columns are target-encoded, every column is standardised,
    and XGBoost, LightGBM and CatBoost regressors are fitted on the result.
    Two ensembles are then built from their predictions:

    * ``voting`` - weighted average of the three test predictions;
    * ``meataLearner`` - a Lasso fitted on the scaled features concatenated
      with the base models' predictions.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames; must contain the columns in ``categorical_features``.
    train_y : array-like
        Training target.
    test_y : optional
        Unused; kept so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        One column of test predictions per model, in the order
        xgboost, lightgbm, catboost, ``meataLearner``, ``voting``
        (the misspelt ``meataLearner`` key is kept for compatibility).
    """
    encoder = TargetEncoder(cols=categorical_features).set_output(transform="pandas")
    encoder.fit(train_x, train_y)
    train_x = encoder.transform(train_x)
    test_x = encoder.transform(test_x)

    scaler = StandardScaler().set_output(transform="pandas")
    scaler.fit(train_x, train_y)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    assert isinstance(train_x, pd.DataFrame)
    assert isinstance(test_x, pd.DataFrame)

    # Base learners paired with the score recorded next to each of them in the
    # notebook. NOTE(review): the scores are assumed to be cross-validated RMSE
    # (lower is better) - confirm.
    base_models = {
        "xgboost.XGBRegressor": (xgboost.XGBRegressor(random_state=9999), 75287),
        "lightgbm.LGBMRegressor": (lightgbm.LGBMRegressor(random_state=9999, verbose=0), 73042),
        "catboost.CatBoostRegressor": (catboost.CatBoostRegressor(random_state=9999, verbose=0), 74042),
    }

    # Fit each model once and keep its predictions so that the voting and
    # stacking steps reuse them instead of calling predict() again.
    train_preds = {}
    test_preds = {}
    for name, (model, _) in base_models.items():
        model.fit(train_x, train_y)
        train_preds[name] = model.predict(train_x)
        test_preds[name] = model.predict(test_x)

    # Voting: weight every model by the inverse of its error, so a lower RMSE
    # means a larger share. Weighting by the raw error would favour the worst
    # model.
    voting_weights = {name: 1.0 / rmse for name, (_, rmse) in base_models.items()}
    total_weight = sum(voting_weights.values())
    voting_test_yhat = sum(test_preds[name] * weight for name, weight in voting_weights.items())
    voting_test_yhat = voting_test_yhat / total_weight

    # Stacking: scaled features + base-model predictions -> Lasso.
    # NOTE(review): the meta-learner is trained on in-sample base predictions
    # (the base models have already seen train_y); out-of-fold predictions
    # would be the usual way to avoid that optimism.
    meta_train_x = pd.concat([train_x.reset_index(drop=True), pd.DataFrame(train_preds)], axis=1)
    meta_learner = Lasso(random_state=9999)
    meta_learner.fit(meta_train_x, train_y)

    meta_test_x = pd.concat([test_x.reset_index(drop=True), pd.DataFrame(test_preds)], axis=1)
    meta_test_yhat = meta_learner.predict(meta_test_x)

    return pd.DataFrame(
        {
            **test_preds,
            "meataLearner": meta_test_yhat,
            "voting": voting_test_yhat,
        }
    )


# score_list = []
# cv = KFold(n_splits=20)
# for train_idx, test_idx in tqdm(list(cv.split(range(len(train_df))))):
#     cv_train_x = train_x.iloc[train_idx].copy()
#     cv_train_y = train_y.iloc[train_idx].copy()
#     cv_test_x = train_x.iloc[test_idx].copy()
#     cv_test_y = train_y.iloc[test_idx].copy()
#     model_pred_df = MixedPipeline(cv_train_x, cv_train_y, cv_test_x)
#     score_list.append(
#         {col: root_mean_squared_error(cv_test_y, model_pred_df[col]) for col in model_pred_df.columns}
#     )
# pd.DataFrame(score_list).mean(axis=0)

# Fit every model on the full training set and predict the test rows.
model_pred_df = MixedPipeline(train_x, train_y, test_x)

# Only the LightGBM predictions are written to this submission file.
preds = model_pred_df.loc[:, "lightgbm.LGBMRegressor"]
submission_df = pd.DataFrame(dict(id=kaggle_id, price=preds))
submission_df.to_csv("./data/lightgbm.csv", index=False)

## CatBoostRegressor

In [None]:
def CBRPipeline(train_x, train_y, test_x):
    """Fit a CatBoost regressor with native categorical handling and predict.

    Missing values in the ``categorical_features`` columns are replaced by the
    string ``"unknown"`` before fitting. The replacement is done on new frames,
    so the caller's ``train_x`` / ``test_x`` are left untouched.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames containing the ``categorical_features`` columns.
    train_y : array-like
        Training target.

    Returns
    -------
    Predictions of the fitted model for ``test_x``.
    """
    # 72732  (NOTE(review): presumably the CV RMSE of this pipeline - confirm)
    fill_values = {col: "unknown" for col in categorical_features}
    # fillna returns new frames; assigning into the passed-in frames would
    # silently modify the caller's data.
    train_x = train_x.fillna(fill_values)
    test_x = test_x.fillna(fill_values)

    reg = catboost.CatBoostRegressor(cat_features=categorical_features, random_state=9999, verbose=0)
    reg.fit(train_x, train_y)
    test_yhat = reg.predict(test_x)
    return test_yhat


def LGBMRPipeline(train_x, train_y, test_x):
    """Target-encode the categorical columns, fit LightGBM, predict ``test_x``.

    The encoder is fitted on the training rows only and then applied to both
    frames; the regressor is trained on the encoded training frame.
    """
    # 73006
    target_encoder = TargetEncoder(cols=categorical_features).set_output(transform="pandas")
    target_encoder.fit(train_x, train_y)
    encoded_train = target_encoder.transform(train_x)
    encoded_test = target_encoder.transform(test_x)

    # Missing categorical values are not filled with a placeholder here.
    model = lightgbm.LGBMRegressor(random_state=9999, verbose=0)
    return model.fit(encoded_train, train_y).predict(encoded_test)


# 20-fold cross-validation of the CatBoost pipeline; one RMSE per fold.
score_list = []
cv = KFold(n_splits=20)
for train_idx, test_idx in tqdm(cv.split(train_x), total=cv.get_n_splits()):
    # Copies, so that nothing done inside the pipeline can touch train_x / train_y.
    cv_train_x, cv_test_x = train_x.iloc[train_idx].copy(), train_x.iloc[test_idx].copy()
    cv_train_y, cv_test_y = train_y.iloc[train_idx].copy(), train_y.iloc[test_idx].copy()

    cv_test_yhat = CBRPipeline(cv_train_x, cv_train_y, cv_test_x)
    # cv_test_yhat = LGBMRPipeline(cv_train_x, cv_train_y, cv_test_x)
    score_list.append(root_mean_squared_error(cv_test_y, cv_test_yhat))
# Mean RMSE over the folds (displayed as the cell output).
np.mean(score_list)

# test_yhat = CBRPipeline(train_x, train_y, test_x)
# submission_df = pd.DataFrame(
#     {
#         "id": kaggle_id,
#         "price": test_yhat,
#     },
# )
# submission_df.to_csv("./data/CBRPipeline.csv", index=False)

## AutoGluon

In [None]:
# AutoGluon is given the features and the "price" target as a single frame.
predictor = TabularPredictor(
    label="price",
    eval_metric="rmse",
).fit(
    pd.concat([train_x, train_y], axis=1),
)
y_pred = predictor.predict(test_x)