In [1]:
import catboost
import lightgbm
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import xgboost
from category_encoders.target_encoder import TargetEncoder
from optuna.samplers import TPESampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RepeatedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from ydata_profiling import ProfileReport

In [12]:
# Load the raw train/test CSVs and tag every row with the split it came from.
train_df = pd.read_csv("./data/train.csv").assign(dataset="train")
test_df = pd.read_csv("./data/test.csv").assign(dataset="test")

# Keep an independent copy of the test ids for building the submission file later.
kaggle_id = test_df.loc[:, "id"].copy()

def extract_trasnmission(x):
    """Collapse a free-text transmission description into a coarse label.

    Parameters
    ----------
    x : str or any missing-value marker
        Raw transmission text.

    Returns
    -------
    object
        ``"Automatic"`` when ``x`` contains ``"A/T"``, ``"AT"`` or
        ``"Automatic"``; otherwise ``"Manual"`` when it contains ``"Mt"``,
        ``"M/T"``, ``"Manual"`` or ``"Transmission w/Dual Shift Mode"``;
        otherwise ``x`` itself. Non-string inputs (e.g. ``NaN``/``None``) are
        returned unchanged.
    """
    # Missing values reach this function as float NaN / None; the substring
    # tests below would raise TypeError on them, so pass them straight through.
    if not isinstance(x, str):
        return x

    # The automatic markers are checked first, so they win when both kinds match.
    if any(marker in x for marker in ("A/T", "AT", "Automatic")):
        return "Automatic"
    if any(marker in x for marker in ("Mt", "M/T", "Manual", "Transmission w/Dual Shift Mode")):
        return "Manual"
    return x

def preprocess_rawdata(df, reference_year=2024):
    """Return a feature-engineered copy of a raw listings frame.

    The input frame is not modified. The copy gains three columns:

    * ``age``            -- ``reference_year - model_year``
    * ``milage_1``       -- ``milage`` floor-divided by ``age``; ``0.0`` for
      rows whose ``age`` is zero
    * ``transmission_1`` -- ``transmission`` mapped through
      ``extract_trasnmission``

    In addition, ``fuel_type`` is set to ``"electricity"`` for rows whose
    ``brand`` is ``"Tesla"``, and every cell equal to ``"-"`` is replaced
    with ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``brand``, ``fuel_type``, ``model_year``, ``milage`` and
        ``transmission`` columns.
    reference_year : int, default 2024
        Year that vehicle age is measured from.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()

    df.loc[df["brand"] == "Tesla", "fuel_type"] = "electricity"

    df["age"] = reference_year - df["model_year"]

    # Zero-age rows would divide by zero (+inf, -inf or NaN depending on the
    # numerator). Divide by a placeholder of 1 for those rows, then overwrite
    # them with 0.0 so every zero-age row gets the same value.
    age_is_zero = df["age"] == 0
    safe_age = df["age"].mask(age_is_zero, 1)
    df["milage_1"] = (df["milage"] // safe_age).astype(float)
    df.loc[age_is_zero, "milage_1"] = 0.0

    df = df.replace("-", np.nan)
    # na_action="ignore" keeps NaN (including the "-" cells replaced above)
    # away from the string-based mapper.
    df["transmission_1"] = df["transmission"].map(extract_trasnmission, na_action="ignore")
    return df


# Columns fed to the models, grouped by how they are treated downstream.
categorical_features = [
    "brand",
    "model",
    "fuel_type",
    "accident",
    "transmission_1",
]  # , "transmission" "clean_title"
numeric_features = ["age", "milage", "milage_1"]
y = "price"

# Apply the same feature engineering to both splits.
train_df, test_df = preprocess_rawdata(train_df), preprocess_rawdata(test_df)

# Independent copies of the design matrices and of the training target.
feature_columns = [*categorical_features, *numeric_features]
train_x = train_df.loc[:, feature_columns].copy()
train_y = train_df.loc[:, y].copy()
test_x = test_df.loc[:, feature_columns].copy()

def MyPipeline(train_x, train_y, test_x, test_y=None):
    """Fit preprocessing plus three boosted regressors and predict ``test_x``.

    Steps:

    1. Target-encode ``categorical_features`` and standard-scale every column;
       both transformers are fitted on ``train_x`` only.
    2. Fit XGBoost, LightGBM and CatBoost regressors on the transformed
       training data.
    3. Build a weighted-average ("voting") prediction of the three models.
    4. Fit a Lasso meta-learner on the transformed features concatenated with
       the base models' predictions ("stacking").

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames containing the columns named in ``categorical_features``.
    train_y : array-like
        Training target.
    test_y : optional
        Accepted for signature compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        One row per ``test_x`` row (fresh 0..n-1 index) with columns
        ``"xgboost.XGBRegressor"``, ``"lightgbm.LGBMRegressor"``,
        ``"catboost.CatBoostRegressor"``, ``"meataLearner"`` and ``"voting"``.
    """
    encoder = TargetEncoder(cols=categorical_features).set_output(transform="pandas")
    encoder.fit(train_x, train_y)
    train_x = encoder.transform(train_x)
    test_x = encoder.transform(test_x)

    scaler = StandardScaler().set_output(transform="pandas")
    scaler.fit(train_x, train_y)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    assert isinstance(train_x, pd.DataFrame)
    assert isinstance(test_x, pd.DataFrame)

    base_models = {
        "xgboost.XGBRegressor": xgboost.XGBRegressor(random_state=9999),
        "lightgbm.LGBMRegressor": lightgbm.LGBMRegressor(random_state=9999, verbose=0),
        "catboost.CatBoostRegressor": catboost.CatBoostRegressor(random_state=9999, verbose=0),
    }
    # Presumably each model's RMSE from an earlier evaluation run -- TODO confirm.
    reference_rmse = {
        "xgboost.XGBRegressor": 75287,
        "lightgbm.LGBMRegressor": 73042,
        "catboost.CatBoostRegressor": 74042,
    }

    # Fit each base model once and keep its predictions so they can be reused
    # by both the voting and the stacking steps without re-predicting.
    train_preds = {}
    test_preds = {}
    for model_name, model in base_models.items():
        model.fit(train_x, train_y)
        test_preds[model_name] = model.predict(test_x)
        train_preds[model_name] = model.predict(train_x)

    # Voting: weight each model by the inverse of its reference error so that a
    # lower error earns a larger weight. Weighting by the error itself would
    # favour the worst model.
    inverse_error = {name: 1.0 / reference_rmse[name] for name in base_models}
    voting_test_yhat = sum(test_preds[name] * inverse_error[name] for name in base_models) / sum(inverse_error.values())

    # Stacking.
    # NOTE(review): the meta features are predictions on the same rows the base
    # models were fitted on, so they are optimistically accurate compared with
    # the test-time predictions. Out-of-fold predictions would avoid this, at
    # the cost of refitting every base model per fold.
    meta_train_x = pd.concat(
        [train_x.reset_index(drop=True), pd.DataFrame(train_preds)],
        axis=1,
    )
    # meta_learner = catboost.CatBoostRegressor(random_state=9999, verbose=0)
    # meta_learner = xgboost.XGBRegressor(random_state=9999)
    meta_learner = Lasso(random_state=9999)
    meta_learner.fit(meta_train_x, train_y)

    meta_test_x = pd.concat(
        [test_x.reset_index(drop=True), pd.DataFrame(test_preds)],
        axis=1,
    )
    meta_test_yhat = meta_learner.predict(meta_test_x)

    # Column order matches the insertion order: three base models, stack, vote.
    return pd.DataFrame(
        {
            **test_preds,
            "meataLearner": meta_test_yhat,
            "voting": voting_test_yhat,
        }
    )

# 20-fold cross-validation of every predictor returned by MyPipeline.
# Each fold contributes one {predictor name: RMSE} record; the cell's displayed
# value is the per-predictor mean over the folds.
cv = KFold(n_splits=20)
score_list = []
fold_indices = list(cv.split(range(len(train_df))))
for train_idx, test_idx in tqdm(fold_indices):
    cv_train_x, cv_test_x = train_x.iloc[train_idx], train_x.iloc[test_idx]
    cv_train_y, cv_test_y = train_y.iloc[train_idx], train_y.iloc[test_idx]
    model_pred_df = MyPipeline(cv_train_x, cv_train_y, cv_test_x)
    fold_rmse = {}
    for col in model_pred_df.columns:
        fold_rmse[col] = root_mean_squared_error(cv_test_y, model_pred_df[col])
    score_list.append(fold_rmse)
pd.DataFrame(score_list).mean(axis=0)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [03:20<00:00, 10.01s/it]


NameError: name 'model_score_df' is not defined

In [18]:
# Refit on the full training set, predict the Kaggle test rows, and write the
# LightGBM predictions out as the submission file.
model_pred_df = MyPipeline(train_x, train_y, test_x)
preds = model_pred_df.loc[:, "lightgbm.LGBMRegressor"]
submission_columns = {"id": kaggle_id, "price": preds}
submission_df = pd.DataFrame(data=submission_columns)
submission_df.to_csv(path_or_buf="./data/submission.csv", index=False)

In [None]:
def pipeline(train_x, train_y, test_x, params):
    """Encode, scale, fit a GradientBoostingRegressor and predict ``test_x``.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing the ``categorical_features`` and ``numeric_features``
        columns.
    train_y : array-like
        Training target.
    params : dict
        Keyword arguments forwarded to ``GradientBoostingRegressor``.

    Returns
    -------
    The fitted model's predictions for ``test_x``.
    """
    feature_order = categorical_features + numeric_features

    # Target encoder is fitted on the training rows only.
    encoder = TargetEncoder(verbose=True, cols=categorical_features)
    encoder.fit(train_x, train_y)

    def _encode(frame):
        # Encoded categorical columns side by side with the numeric columns
        # taken from the un-encoded frame.
        encoded_part = pd.DataFrame(encoder.transform(frame), columns=categorical_features)
        return pd.concat([encoded_part, frame[numeric_features]], axis=1)

    train_x = _encode(train_x)
    test_x = _encode(test_x)

    # Scaler statistics likewise come from the training rows only.
    scalar = StandardScaler()
    scalar.fit(train_x, train_y)

    def _scale(frame):
        # Re-wrap the scaled array so the model sees named columns.
        return pd.DataFrame(scalar.transform(frame), columns=feature_order)

    train_x = _scale(train_x)
    test_x = _scale(test_x)

    model = GradientBoostingRegressor(**params)
    model.fit(train_x, train_y)
    return model.predict(test_x)


def objective(trial):
    """Optuna objective: mean 10-fold RMSE of ``pipeline`` for one trial.

    Samples ``learning_rate``, ``n_estimators`` and ``max_depth`` from
    ``trial`` (``random_state`` is fixed at 9999), evaluates ``pipeline`` on
    each fold of the module-level ``train_x`` / ``train_y``, and returns the
    mean of the per-fold RMSE values.
    """
    params = dict(
        random_state=9999,
        learning_rate=trial.suggest_float("learning_rate", 0.1, 0.9),
        n_estimators=trial.suggest_int("n_estimators", 10, 200),
        max_depth=trial.suggest_int("max_depth", 3, 20),
    )

    splitter = RepeatedKFold(n_splits=10, n_repeats=1, random_state=9999)
    folds = list(splitter.split(train_df.index))

    def _fold_rmse(fit_rows, holdout_rows):
        # Fit on the fold's training rows and score on its held-out rows.
        predicted = pipeline(
            train_x=train_x.iloc[fit_rows],
            train_y=train_y.iloc[fit_rows],
            test_x=train_x.iloc[holdout_rows],
            params=params,
        )
        return root_mean_squared_error(train_y.iloc[holdout_rows], predicted)

    return np.mean([_fold_rmse(fit_rows, holdout_rows) for fit_rows, holdout_rows in tqdm(folds)])


# Seeded TPE search over 100 trials, minimising the value returned by `objective`.
sampler = TPESampler(seed=10)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(func=objective, n_trials=100)

In [None]:
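# NOTE: stale draft. `MyPipeline` is defined in this notebook as a function that
# returns a DataFrame of predictions; it has no `.run()` method or `.final_preds`
# attribute, so the lines below cannot run as written.
# mypipeline = MyPipeline(train_x, train_y, test_x)
# mypipeline.run(target_encoder_options={"target_stats": "median"})
# test_y_hat = mypipeline.final_preds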

In [None]:
# `test_y_hat` was previously never assigned by any executable cell, so this
# cell raised NameError on a fresh kernel. Refit the gradient-boosting pipeline
# on the full training set with the best hyper-parameters found by the Optuna
# study and predict the Kaggle test rows.
# `random_state` is re-added explicitly: `objective` fixes it, so the study
# never samples it and it is absent from `study.best_params`.
best_params = {"random_state": 9999, **study.best_params}
test_y_hat = pipeline(train_x=train_x, train_y=train_y, test_x=test_x, params=best_params)

submission_df = pd.DataFrame(
    {
        "id": test_df["id"],
        "price": test_y_hat,
    },
)
submission_df.to_csv("./data/submission.csv", index=False)