In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install catboost
!pip3 install category_encoders

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2
Collecting category_encoders
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.1


In [3]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.0


In [4]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.1 cmaes-0.10.0 colorlog-6.7.0 optuna-3.2.0


In [1]:
import unicodedata
from pathlib import Path

import catboost as cb
from optuna.integration import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import CountEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder
import os

In [2]:

exp_name = 'exp00012'

outputs_path = f'/content/drive/MyDrive/Colab Notebooks/signate2023/{exp_name}/'

submission_dir = outputs_path + 'submissions/'
submission_path = submission_dir + f'submission_{exp_name}.csv'

model_dir = outputs_path + f'signate-models/'



for dir in [model_dir]:
    os.makedirs(dir, exist_ok=True)

In [3]:
INPUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/signate2023/")

In [4]:
train_df = pd.read_csv(INPUT_DIR / "train.csv")
test_df = pd.read_csv(INPUT_DIR / "test.csv")
sub_df = pd.read_csv(INPUT_DIR / "submit_sample.csv", names=["id", "price"])

In [5]:
import unicodedata

# Define a function to normalize manufacturer names
def normalize_manufacturer_name(name):
    # Convert full-width characters to half-width
    name = unicodedata.normalize('NFKC', name)
    # Convert to lowercase
    name = name.lower()
    # Remove non-ASCII characters
    name = ''.join(char for char in name if ord(char) < 128)
    return name


class PreProcessTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # cylinderから数値を取り出す
        X["cylinders"] = X["cylinders"].astype(str).str.extract("(\d+)").astype("float32")

        # sizeの表記揺れを修正
        X["size"] = X["size"].str.replace("ー", "-").astype(str)
        X["size"] = X["size"].str.replace("−", "-").astype(str)

        # Apply the function to the 'manufacturer' column
        X['manufacturer'] = X['manufacturer'].apply(normalize_manufacturer_name)


        # Define a dictionary for manufacturer name mapping with special characters
        manufacturer_name_mapping = {
            'nisan': 'nissan',
            'toyot': 'toyota',
            'nisѕan': 'nissan',
            'subαru': 'subaru',
            'subru': 'subaru',
            'sαturn': 'saturn',
            'lexu':'lexus',
            'аcura': 'acura',
            'vоlkswagen': 'volkswagen',
            'vlkswagen': 'volkswagen',
            'lexuѕ': 'lexus',
            'ᴄhrysler': 'chrysler',
            'sturn': 'saturn'
        }

        # Replace the manufacturer names using the mapping
        X['manufacturer'] = X['manufacturer'].replace(manufacturer_name_mapping)


        # yearが3000年以降のものはおかしいので2000年に置換
        err_idx = X.query("year >= 3000").index
        X.loc[err_idx, "year"] = X.loc[err_idx, "year"].apply(lambda x: x - 1000)

        # 走行距離 / 製造年
        X["odometer_per_year"] = X["odometer"] / (2023 - X["year"])

        return X


class RankTransformer(TransformerMixin, BaseEstimator):
    """keyの中でvalueが何番目のものか"""

    def __init__(self, key: str, value: str):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_new = X.copy()
        X_new[self.key] = X_new.groupby(self.key)[self.value].rank(method="dense")
        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self, input_features=None):
        return [self.key]


class OriginalTransformer(TransformerMixin, BaseEstimator):
    """数値特徴はそのまま、カテゴリ特徴はcategory型に変換"""

    def __init__(self, numeric_cols, categorical_cols):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_new = X.copy()

        # 数値変数
        X_new[self.numeric_cols] = X_new[self.numeric_cols].astype("float32")

        # カテゴリ変数
        #X_new[self.categorical_cols] = X_new[self.categorical_cols].astype("category")

        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self):
        return self.numeric_cols + self.categorical_cols


class CountTransformer(TransformerMixin, BaseEstimator):
    """CountEncoder"""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.ce = CountEncoder(cols=X.columns.tolist(), handle_unknown=0)
        self.ce.fit(X)
        return self

    def transform(self, X):
        return self.ce.transform(X)

    def get_feature_names_out(self, input_features=None):
        return input_features


class AggTransformer(TransformerMixin, BaseEstimator):
    """集約特徴量"""

    def __init__(self, key, numeric_cols, agg_func: dict):
        self.key = key
        self.numeric_cols = numeric_cols
        self.agg_func = agg_func

    def fit(self, X, y=None):
        X = X.copy()
        X[self.key] = X[self.key].astype("category")
        self.agg_df = X.groupby(self.key)[self.numeric_cols].agg(self.agg_func)
        self.agg_df.columns = [f"{col}_{func}" for col, func in self.agg_df.columns.tolist()]

        return self

    def transform(self, X):
        X_new = pd.merge(X, self.agg_df, on=self.key, how="left")
        return X_new[self.get_feature_names_out()]

    def get_feature_names_out(self, input_features=None):
        return self.agg_df.columns.tolist()

In [6]:
numeric_cols = ["year", "odometer", "odometer_per_year"]
categorical_cols = [
    'region',
    "cylinders",
    "manufacturer",
    "condition",
    "fuel",
    "title_status",
    "transmission",
    "drive",
    "size",
    "type",
    "paint_color",
    "state",
]

ct = ColumnTransformer(
    transformers=[
        (
            "ori",
            OriginalTransformer(numeric_cols, categorical_cols),
            categorical_cols + numeric_cols,
        ),
        (
            "tgt",
            TargetEncoder(target_type="continuous", random_state=88),
            [
                'region',
                "cylinders",
                "manufacturer",
                "condition",
                "fuel",
                "title_status",
                "transmission",
                "drive",
                "size",
                "type",
                "paint_color",
                "state",
            ],
        ),
        # *[
        #     (f"agg_{key}", AggTransformer(key, ["odometer"], {"mean"}), [key] + ["odometer"])
        #     for key in [
        #         "manufacturer",
        #     ]
        # ],
    ],
    verbose=True,
)
ct.set_output(transform="pandas")
pipe = Pipeline(
    steps=[
        ("preprocess", PreProcessTransformer()),
        ("ct", ct),
    ]
)
train_feat_df = pipe.fit_transform(train_df, train_df["price"])
test_feat_df = pipe.transform(test_df)


[ColumnTransformer] ........... (1 of 2) Processing ori, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing tgt, total=   0.4s


In [7]:
state_dic = {}

# データ数的に、重複は気にせず上書きしていっても大した時間にはならない
for i in train_feat_df.index:
    # ただし、欠損値で上書きされると困るので、そこだけケア
    if pd.isna(train_feat_df.at[i, 'ori__state'])==False:
        state_dic[train_feat_df.at[i, 'ori__region']] = train_feat_df.at[i, 'ori__state']

train_feat_df.loc[pd.isna(train_feat_df['ori__state']).index, 'ori__state'] = train_feat_df.loc[pd.isna(train_feat_df['ori__state']).index, 'ori__region'].map(state_dic)
test_feat_df.loc[pd.isna(test_feat_df['ori__state']).index, 'ori__state'] = test_feat_df.loc[pd.isna(test_feat_df['ori__state']).index, 'ori__region'].map(state_dic)

In [8]:
pd.set_option('display.max_columns', None)
train_feat_df.sample(5)

Unnamed: 0,ori__year,ori__odometer,ori__odometer_per_year,ori__region,ori__cylinders,ori__manufacturer,ori__condition,ori__fuel,ori__title_status,ori__transmission,ori__drive,ori__size,ori__type,ori__paint_color,ori__state,tgt__region,tgt__cylinders,tgt__manufacturer,tgt__condition,tgt__fuel,tgt__title_status,tgt__transmission,tgt__drive,tgt__size,tgt__type,tgt__paint_color,tgt__state
4312,2008.0,140936.0,9395.733398,south jersey,6.0,toyota,excellent,gas,clean,manual,fwd,mid-size,sedan,black,nj,10553.687146,14503.401593,9169.754528,14680.774136,12240.361505,13276.786034,12117.939502,9767.130897,12229.25152,9860.993469,14828.422588,12044.987594
11379,2000.0,144303.0,6274.043457,klamath falls,6.0,ford,fair,gas,clean,automatic,rwd,compact,coupe,red,or,6584.192545,14443.22462,14715.468636,7122.922775,12225.230115,13247.724494,13930.228325,15799.259522,8823.639157,11574.41218,11737.003067,13840.385034
7628,2006.0,156831.0,9225.352539,san antonio,6.0,bmw,excellent,gas,clean,manual,rwd,mid-size,convertible,blue,tx,13359.436408,14503.401593,13969.331878,14680.774136,12240.361505,13276.786034,12117.939502,15947.532658,12229.25152,13191.795845,13518.030498,12767.385565
7811,1988.0,84321.0,2409.171387,albany,8.0,mercedes-benz,excellent,gas,rebuilt,manual,rwd,full-size,sedan,black,ny,15133.458998,16300.143568,13850.516897,14585.322864,12214.504336,14725.907632,11895.073699,15681.495329,15310.042973,9956.111283,14685.067166,14676.046059
25041,2007.0,199476.0,12467.25,new hampshire,8.0,ford,excellent,gas,clean,automatic,4wd,full-size,SUV,black,nh,12008.54425,16555.010161,14822.248384,14745.981461,12282.452422,13371.433337,14025.651486,18729.655054,15504.86121,13837.0677,14906.634062,11864.739234


In [9]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to apply label encoding
categorical_cols = ['ori__region', 'ori__manufacturer', 'ori__condition', 'ori__fuel', 'ori__title_status',
                    'ori__transmission', 'ori__drive', 'ori__size', 'ori__type', 'ori__paint_color', 'ori__state']

# Create a copy of the dataframes
train_data_encoded = train_feat_df.copy()
test_data_encoded = test_feat_df.copy()

# Instantiate the LabelEncoder
le = LabelEncoder()

# Apply label encoding and missing value imputation
for col in categorical_cols:
    # Fill missing values with 'unknown'
    #train_data_encoded[col].fillna('unknown', inplace=True)
    #test_data_encoded[col].fillna('unknown', inplace=True)

    # Apply label encoding
    le.fit(pd.concat([train_data_encoded[col], test_data_encoded[col]], axis=0))
    train_data_encoded[col] = le.transform(train_data_encoded[col])
    test_data_encoded[col] = le.transform(test_data_encoded[col])

In [10]:
train_feat_df = train_data_encoded
test_feat_df = test_data_encoded

In [11]:
test_feat_df

Unnamed: 0,ori__year,ori__odometer,ori__odometer_per_year,ori__region,ori__cylinders,ori__manufacturer,ori__condition,ori__fuel,ori__title_status,ori__transmission,ori__drive,ori__size,ori__type,ori__paint_color,ori__state,tgt__region,tgt__cylinders,tgt__manufacturer,tgt__condition,tgt__fuel,tgt__title_status,tgt__transmission,tgt__drive,tgt__size,tgt__type,tgt__paint_color,tgt__state
0,2015.0,92553.0,11569.125000,357,4.0,7,0,2,0,0,1,1,0,8,5,12364.873861,10553.046813,14994.074493,14662.314415,12251.735036,13276.956914,13944.41638,9810.374435,15394.953240,13821.072917,11812.789531,13726.870953
1,2013.0,134385.0,13438.500000,271,4.0,29,3,2,5,0,1,2,9,0,37,11298.195115,10553.046813,10005.818945,16510.276020,12251.735036,12352.004779,13944.41638,9810.374435,12158.496396,9939.803859,14798.576380,13692.255137
2,2011.0,102489.0,8540.750000,2,4.0,37,2,2,0,0,1,1,9,0,35,12559.078133,10553.046813,8002.268829,11022.863826,12251.735036,13276.956914,13944.41638,9810.374435,15394.953240,9939.803859,14798.576380,11788.498453
3,2016.0,64310.0,9187.142578,74,6.0,19,0,0,0,0,0,2,0,8,5,12608.590272,14480.395637,15060.984391,14662.314415,24795.796686,13276.956914,13944.41638,18646.321078,12158.496396,13821.072917,11812.789531,12496.747029
4,1999.0,180839.0,7534.958496,134,8.0,14,0,2,4,0,0,2,0,9,27,12725.177128,16443.200833,8995.586312,14662.314415,12251.735036,14822.843241,13944.41638,18646.321078,12158.496396,13821.072917,11324.760677,12982.801388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27532,2016.0,90902.0,12986.000000,121,6.0,13,0,2,4,0,1,1,10,0,26,11398.227793,14480.395637,16426.590358,14662.314415,12251.735036,14822.843241,13944.41638,9810.374435,15394.953240,20264.292408,14798.576380,15315.070234
27533,2012.0,27234.0,2475.818115,174,4.0,29,0,2,4,0,1,2,9,10,33,14288.625891,10553.046813,10005.818945,14662.314415,12251.735036,14822.843241,13944.41638,9810.374435,12158.496396,9939.803859,16883.629075,14329.225195
27534,2002.0,99761.0,4750.523926,251,6.0,4,0,2,0,0,2,1,3,1,3,15374.316389,14480.395637,13968.168771,14662.314415,12251.735036,13276.956914,13944.41638,15846.195755,15394.953240,11655.045752,13542.523811,13346.664027
27535,2006.0,162279.0,9545.823242,60,6.0,4,0,2,0,0,0,2,9,9,5,13390.821242,14480.395637,13968.168771,14662.314415,12251.735036,13276.956914,13944.41638,18646.321078,12158.496396,9939.803859,11324.760677,12496.747029


In [12]:
def get_cv(df, n_fold=5):
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=71)
    return list(kf.split(df))

# ref: https://www.guruguru.science/competitions/16/discussions/185c7dc6-5e3a-49c6-9c30-41bf007cc694/
def fit_lgbm(X, y, cv, categorical_cols: list = None, params: dict = None, verbose: int = 50):
    # パラメータがないときは、空の dict で置き換える
    if params is None:
        params = {}

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records,), dtype=np.float32)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        train_data = lgb.Dataset(data=x_train, label=y_train)
        test_data = lgb.Dataset(data=x_valid, label=y_valid)

        model = lgb.train(train_set=train_data,
            params=params,
            valid_sets=[train_data, test_data],
            categorical_feature=[],
            valid_names=['Train', 'Test'],
            early_stopping_rounds=5,
            verbose_eval=verbose,
        )

        pred_i = model.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(model)
        score = mean_absolute_percentage_error(y_valid, pred_i)
        print(f" - fold{i + 1} - {score:.4f}")

    score = mean_absolute_percentage_error(y, oof_pred)

    print("=" * 50)
    print(f"FINISHI: Whole Score: {score:.4f}")
    return score, oof_pred, models


def fit_cat(X, y, cv, categorical_cols: list = None, params: dict = None, verbose: int = 50):
    if params is None:
        params = {}

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records,), dtype=np.float32)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = cb.CatBoostRegressor(**params)
        model.fit(
            x_train,
            y_train,
            cat_features=categorical_cols,
            eval_set=[(x_valid, y_valid)],
            use_best_model=True,
            early_stopping_rounds=100,
            verbose=verbose,
        )
        pred_i = model.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(model)
        score = mean_absolute_percentage_error(y_valid, pred_i)
        print(f" - fold{i + 1} - {score:.4f}")

    score = mean_absolute_percentage_error(y, oof_pred)

    print("=" * 50)
    print(f"FINISHI: Whole Score: {score:.4f}")
    return score, oof_pred, models


# optunaを使ったパラメータチューニング
# def tuning(train_feat_df, train_df, cv):
#     def objective(trial):
#         max_depth = trial.suggest_int("max_depth", 1, 10)
#         num_leaves = trial.suggest_int("num_leaves", 2, 2**max_depth)
#         colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.1, 1.0)
#         subsample = trial.suggest_uniform("subsample", 0.1, 1.0)

#         params = {
#             "objective": "mape",
#             "n_estimators": 10000,
#             "learning_rate": 0.05,
#             "max_depth": max_depth,
#             "num_leaves": num_leaves,
#             "colsample_bytree": colsample_bytree,
#             "subsample": subsample,
#             "metric": "mape",
#             "importance_type": "gain",
#             "random_state": 88,
#         }

#         score, _, _ = fit_lgbm(
#             train_feat_df, train_df["price"], cv=cv, categorical_cols=[], params=params, verbose=-1
#         )
#         return score

#     study = optuna.create_study(direction="minimize")
#     study.optimize(objective, n_trials=100)
#     print("Number of finished trials:", len(study.trials))
#     print("Best trial:", study.best_trial.params)
#     return study.best_trial.params


def visualize_importance(models, feat_train_df):
    feature_importance_df = pd.DataFrame()
    for i, model in enumerate(models):
        _df = pd.DataFrame()
        _df["feature_importance"] = model.feature_importances_
        _df["column"] = feat_train_df.columns
        _df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, _df], axis=0, ignore_index=True)

    order = (
        feature_importance_df.groupby("column")
        .sum()[["feature_importance"]]
        .sort_values("feature_importance", ascending=False)
        .index[:50]
    )

    fig, ax = plt.subplots(figsize=(12, max(6, len(order) * 0.25)))
    sns.boxenplot(
        data=feature_importance_df,
        x="feature_importance",
        y="column",
        order=order,
        ax=ax,
        palette="viridis",
        orient="h",
    )
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax


def visualize_oof_gt(oof, gt):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(oof, gt, alpha=0.5)
    gt_max = gt.max()
    ax.plot(np.arange(0, gt_max), np.arange(0, gt_max), color="red", alpha=0.5, linestyle="--")
    ax.set_xlabel("Out Of Fold")
    ax.set_ylabel("Ground Truth")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    fig, ax


def visualize_oof_pred(oof, pred):
    fig, ax = plt.subplots(figsize=(8, 6))

    bins = 100
    ax.hist(pred, bins=bins, density=True, alpha=0.5, label="Test")
    ax.hist(oof, bins=bins, density=True, alpha=0.5, label="OutOfFold")
    ax.grid()
    ax.legend()
    fig.tight_layout()

    fig, ax

In [13]:
n_fold = 5
lgbm_params = {
    "objective": "mape",  # または "regression", "regression_l1", "regression_l2" など
    "metric": "mape",  # または "l1", "l2", "rmse" など
    'deterministic':True,  # Ensure reproducibility
    'force_row_wise':True,  # Ensure reproducibility
    "random_state": 42,
    "verbosity": -1,
}

feat_cat_cols = train_feat_df.select_dtypes(include="category").columns.tolist()

cv = get_cv(train_feat_df, n_fold=5)

# training
score, oof, models = fit_lgbm(
    train_feat_df,
    y=train_df["price"],
    categorical_cols=[],
    params=lgbm_params,
    cv=cv,
    verbose=False,
)

[I 2023-08-01 14:29:36,328] A new study created in memory with name: no-name-0942fff9-2809-43f3-ba7f-d885130773c4
New categorical_feature is []
feature_fraction, val_score: 0.457018:  14%|#4        | 1/7 [00:07<00:43,  7.26s/it][I 2023-08-01 14:29:43,622] Trial 0 finished with value: 0.45701761322180884 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.45701761322180884.
New categorical_feature is []
feature_fraction, val_score: 0.455955:  29%|##8       | 2/7 [00:11<00:27,  5.54s/it][I 2023-08-01 14:29:47,960] Trial 1 finished with value: 0.45595512287053386 and parameters: {'feature_fraction': 0.4}. Best is trial 1 with value: 0.45595512287053386.
New categorical_feature is []
feature_fraction, val_score: 0.455955:  43%|####2     | 3/7 [00:15<00:19,  4.96s/it][I 2023-08-01 14:29:52,229] Trial 2 finished with value: 0.4560864443049583 and parameters: {'feature_fraction': 0.5}. Best is trial 1 with value: 0.45595512287053386.
New categorical_feature is []
feature_

 - fold1 - 0.4512


New categorical_feature is []
feature_fraction, val_score: 0.445176:  14%|#4        | 1/7 [00:00<00:05,  1.03it/s][I 2023-08-01 14:31:41,097] Trial 0 finished with value: 0.4451760425023963 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.4451760425023963.
New categorical_feature is []
feature_fraction, val_score: 0.445137:  29%|##8       | 2/7 [00:01<00:04,  1.10it/s][I 2023-08-01 14:31:41,966] Trial 1 finished with value: 0.4451370255358589 and parameters: {'feature_fraction': 0.8}. Best is trial 1 with value: 0.4451370255358589.
New categorical_feature is []
feature_fraction, val_score: 0.445137:  43%|####2     | 3/7 [00:02<00:03,  1.19it/s][I 2023-08-01 14:31:42,758] Trial 2 finished with value: 0.446668663734149 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 1 with value: 0.4451370255358589.
New categorical_feature is []
feature_fraction, val_score: 0.444466:  57%|#####7    | 4/7 [00:04<00:03,  1.19s/it][I 2023-08-01 14:31:44,438] T

 - fold2 - 0.4407


New categorical_feature is []
feature_fraction, val_score: 0.448991:  14%|#4        | 1/7 [00:00<00:05,  1.06it/s][I 2023-08-01 14:33:22,587] Trial 0 finished with value: 0.44899077592511294 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.44899077592511294.
New categorical_feature is []
feature_fraction, val_score: 0.448991:  29%|##8       | 2/7 [00:01<00:04,  1.18it/s][I 2023-08-01 14:33:23,374] Trial 1 finished with value: 0.4512426839561626 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.44899077592511294.
New categorical_feature is []
feature_fraction, val_score: 0.448991:  43%|####2     | 3/7 [00:02<00:03,  1.08it/s][I 2023-08-01 14:33:24,386] Trial 2 finished with value: 0.44979278138308926 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.44899077592511294.
New categorical_feature is []
feature_fraction, val_score: 0.448384:  57%|#####7    | 4/7 [00:03<00:02,  1.06it/s][I 2023-08-01 14:33:25,368] Trial 3 fi

 - fold3 - 0.4467


New categorical_feature is []
feature_fraction, val_score: 0.449511:  14%|#4        | 1/7 [00:00<00:05,  1.09it/s][I 2023-08-01 14:34:49,995] Trial 0 finished with value: 0.44951137130861946 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.44951137130861946.
New categorical_feature is []
feature_fraction, val_score: 0.449511:  29%|##8       | 2/7 [00:01<00:04,  1.19it/s][I 2023-08-01 14:34:50,786] Trial 1 finished with value: 0.4529672777194361 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.44951137130861946.
New categorical_feature is []
feature_fraction, val_score: 0.448664:  43%|####2     | 3/7 [00:02<00:03,  1.15it/s][I 2023-08-01 14:34:51,693] Trial 2 finished with value: 0.4486636790808909 and parameters: {'feature_fraction': 0.5}. Best is trial 2 with value: 0.4486636790808909.
New categorical_feature is []
feature_fraction, val_score: 0.448664:  57%|#####7    | 4/7 [00:03<00:02,  1.15it/s][I 2023-08-01 14:34:52,551] Trial 3 fini

 - fold4 - 0.4480


New categorical_feature is []
feature_fraction, val_score: 0.449232:  14%|#4        | 1/7 [00:00<00:04,  1.35it/s][I 2023-08-01 14:36:19,038] Trial 0 finished with value: 0.4492315630861865 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.4492315630861865.
New categorical_feature is []
feature_fraction, val_score: 0.446011:  29%|##8       | 2/7 [00:05<00:15,  3.04s/it][I 2023-08-01 14:36:23,696] Trial 1 finished with value: 0.446011075168148 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 1 with value: 0.446011075168148.
New categorical_feature is []
feature_fraction, val_score: 0.446011:  43%|####2     | 3/7 [00:06<00:08,  2.07s/it][I 2023-08-01 14:36:24,617] Trial 2 finished with value: 0.4460676179500385 and parameters: {'feature_fraction': 0.6}. Best is trial 1 with value: 0.446011075168148.
New categorical_feature is []
feature_fraction, val_score: 0.445331:  57%|#####7    | 4/7 [00:07<00:05,  1.70s/it][I 2023-08-01 14:36:25,732] Tri

 - fold5 - 0.4419
FINISHI: Whole Score: 0.4457


In [15]:
# inference
pred = np.array([model.predict(test_feat_df) for model in models])
pred = np.mean(pred, axis=0)

# submission
sub_df["price"] = pred
sub_df.to_csv(f'{model_dir}/submission.csv', index=False, header=False)

In [14]:
visualize_importance(models, train_feat_df)
visualize_oof_gt(oof, train_df['price'])
visualize_oof_pred(oof, pred)

AttributeError: ignored