In [1]:
# Mount Google Drive at /content/drive (Colab-only API); the CSV paths read
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.3-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.3 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [1]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import time


from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_percentage_error
from tqdm.auto import tqdm

import warnings
warnings.simplefilter("ignore")

import unicodedata
import lightgbm as lgb

import optuna
import tensorflow as tf
from sklearn.metrics import mean_absolute_percentage_error
from tensorflow.keras.layers import Dense, PReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Notebook-wide settings for the stacking experiment."""

    # Seed value for the random number generators.
    seed = 42

    # Not referenced in the visible cells; presumably an Optuna trial
    # budget -- TODO confirm.
    n_trials = 3000

    # Number of base-model predictions fed to the stacker (exp38 and exp39 here).
    num_pred = 2

    # Base layer width; the stacker uses 3x, 2x and 1x this value.
    hidden_size = 4

    # Dropout rate; only used by the disabled second model.
    dropout = 0.2

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """Return the MAPE of ``y_pred`` against ``y_true``, scaled by 100 (percent)."""
    return mean_absolute_percentage_error(y_true, y_pred) * 100

In [4]:
# Competition training table; its `id` and `price` columns are combined with
# the base-model out-of-fold predictions in the data-loading cell.
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/train.csv')

In [5]:
# ===================================================================
#  DataLoading
# ===================================================================
# Out-of-fold predictions of the two base experiments; each becomes one
# stacking feature column.
df_1 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/signate2023/exp00038/signate-models/exp38_oof_pred.csv'
).rename(columns={"oof_pred":"pred_1"})
df_2 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/signate2023/exp00039/signate-models/exp39_oof_pred.csv'
).rename(columns={"oof_pred":"pred_2"})

# Column-wise concat aligns on the row index.
# NOTE(review): assumes the OOF files are in the same row order as train.csv -- confirm.
df = pd.concat(
    [df_train['id'], df_1, df_2, df_train['price']],
    axis=1,
)

# The base-model submission files are read without a header row: column 0 is
# the id, column 1 the prediction.
test_1 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/signate2023/exp00038/signate-models/submission.csv',
    header=None,
).rename(columns={0:"id", 1:"pred_1"})
test_2 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/signate2023/exp00039/signate-models/submission.csv',
    header=None,
).rename(columns={0:"id", 1:"pred_2"})

# Join the two prediction columns on the shared id.
test = pd.merge(test_1, test_2, on='id')

In [6]:
def mape_loss(y_true, y_pred):
    """Mean absolute percentage error as a TensorFlow loss, scaled by 100.

    ``y_true`` is cast to float32; ``y_pred`` is used as passed.
    """
    target = tf.cast(y_true, tf.float32)
    # Floor the denominator at 1e-9 so a zero target does not divide by zero.
    denominator = tf.clip_by_value(tf.abs(target), 1e-9, float("inf"))
    relative_error = tf.abs((target - y_pred) / denominator)
    return 100. * tf.reduce_mean(relative_error)

In [7]:
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy's global RNG and TensorFlow's global RNG.

    Parameters
    ----------
    seed : int, default 42
        Value passed unchanged to all three seeding functions.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Seed from the shared config rather than repeating the literal, so changing
# CFG.seed actually changes the run.
set_seed(CFG.seed)

In [8]:
# Cross-validation splitter
def get_custom_cv(df, n_splits):
    """Yield ``(train_idx, valid_idx)`` pairs for a price-stratified K-fold split.

    Rows are ranked by ``price`` and dealt round-robin into ``n_splits`` folds,
    so every fold covers the whole price range.

    The yielded indices are zero-based row positions in the caller's row order
    of ``df``, so they can index arrays derived from it (``df[cols].values``).
    The previous implementation returned positions in an id-sorted copy, which
    only matched the caller's arrays when ``df`` was already sorted by ``id``;
    for such input the result is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price`` column. The frame is not modified.
    n_splits : int
        Number of folds.

    Yields
    ------
    tuple of (pandas.Index, pandas.Index)
        Row positions of the training rows and of the validation rows.
    """
    n_rows = df.shape[0]

    # Row positions in ascending price order. The index is dropped first so
    # that the labels returned by sort_values are plain positions.
    price_order = df.reset_index(drop=True).sort_values(by="price").index.to_numpy()

    # The k-th cheapest row goes to fold k % n_splits.
    fold_of_row = np.empty(n_rows, dtype=np.int64)
    fold_of_row[price_order] = np.arange(n_rows) % n_splits

    for fold in range(n_splits):
        in_valid = fold_of_row == fold
        train_idx = pd.Index(np.flatnonzero(~in_valid))
        valid_idx = pd.Index(np.flatnonzero(in_valid))
        yield train_idx, valid_idx

# CV configuration: materialise the generator into a list of
# (train_idx, valid_idx) pairs.
n_splits = 8
cv = [*get_custom_cv(df, n_splits)]

In [9]:
# 1. Build the stacking dataset: the target is the price, the features are the
#    two base-model predictions.
y = df['price'].values
X = df[['pred_1', 'pred_2']].values

In [10]:
#一つ目のモデル

In [11]:
# Test-set feature matrix (same columns, same order as the training features)
test_features = test[['pred_1', 'pred_2']].values

# Test predictions of each fold's model, collected for later averaging
test_preds = []

# Out-of-fold predictions. Allocated as float64 explicitly: np.zeros_like(y)
# inherits y's dtype, so an integer target would silently truncate every
# prediction stored here before scoring.
oof_preds = np.zeros(y.shape, dtype=np.float64)

for fold, (train_idx, valid_idx) in enumerate(cv):
    print(f"Fold {fold + 1}")

    # Slice this fold's training and validation rows
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # Fresh model per fold: three PReLU-activated hidden layers of width
    # 3x / 2x / 1x CFG.hidden_size, followed by a single linear output.
    model = Sequential([
        Dense(CFG.hidden_size*3, input_dim=CFG.num_pred),
        PReLU(),
        Dense(CFG.hidden_size*2),
        PReLU(),
        Dense(CFG.hidden_size),
        PReLU(),
        Dense(1)
    ])

    model.compile(optimizer=Adam(), loss=mape_loss)

    # Early-stopping callback.
    # NOTE(review): patience=100 exceeds epochs=50, so this can never stop
    # training early; whether the best weights are still restored at the end
    # depends on the Keras version -- confirm the intended values.
    early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1, restore_best_weights=True)

    model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=50, batch_size=32, verbose=1, callbacks=[early_stopping])

    # Store this fold's validation predictions (flattened to 1-D)
    oof_preds[valid_idx] = model.predict(X_valid).reshape(-1)

    print(f"FINISHI: fold{fold} Score: {mean_absolute_percentage_error(y_valid, oof_preds[valid_idx]):.4f}")

    # Predict the test set with this fold's model
    test_pred = model.predict(test_features).reshape(-1)
    test_preds.append(test_pred)

# Overall OOF score across all folds (a fraction, not multiplied by 100)
oof_score = mean_absolute_percentage_error(y, oof_preds)
print("=" * 50)
print(f"FINISHI: Whole OOF Score: {oof_score:.4f}")


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
FINISHI: fold0 Score: 0.4366
Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epo

In [None]:
hidden_size = 8
FINISHI: Whole OOF Score: 0.4404

hidden_size = 4
FINISHI: Whole OOF Score: 0.4404

In [None]:
#二つ目のモデル、こっちはあんまり期待できなさそうかも？

In [None]:
# Disabled experiment kept as a bare string literal, so none of it runs as code:
# a variant stacker with BatchNormalization and Dropout layers, driven by the
# same per-fold train / predict loop.
# NOTE(review): `oof_preds = np.zeros_like(y)` inside inherits y's dtype; if this
# is re-enabled with an integer target the stored predictions would be truncated.
'''
from keras.models import Sequential
from keras.layers import Dense, PReLU, BatchNormalization, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model(cfg):
    model = Sequential([
        Dense(cfg.hidden_size, input_dim=CFG.num_pred),
        BatchNormalization(),
        Dropout(cfg.dropout),
        PReLU(),
        Dense(cfg.hidden_size),
        BatchNormalization(),
        Dropout(cfg.dropout),
        PReLU(),
        Dense(1)
    ])
    return model

# テストデータの特徴量の取得
test_features = test[['pred_1', 'pred_2']].values

# 各フォールドでのテストデータの予測を格納するリスト
test_preds = []

# OOF (Out Of Fold) predictions
oof_preds = np.zeros_like(y)

for fold, (train_idx, valid_idx) in enumerate(cv):
    print(f"Fold {fold + 1}")

    # データの取得
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # モデルの作成
    model = build_model(CFG)

    model.compile(optimizer=Adam(), loss=mape_loss)

    # EarlyStoppingコールバックの定義
    early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1, restore_best_weights=True)

    model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=50, batch_size=32, verbose=1, callbacks=[early_stopping])

    # OOF predictions
    oof_preds[valid_idx] = model.predict(X_valid).reshape(-1)

    print(f"FINISHI: fold{fold} Score: {mean_absolute_percentage_error(y_valid, oof_preds[valid_idx]):.4f}")

    # テストデータの予測
    test_pred = model.predict(test_features).reshape(-1)
    test_preds.append(test_pred)

# Calculate overall OOF score
oof_score = mean_absolute_percentage_error(y, oof_preds)
print("=" * 50)
print(f"FINISHI: Whole OOF Score: {oof_score:.4f}")
'''

In [None]:
FINISHI: Whole OOF Score: 0.4452

In [None]:
#これは共通

In [13]:
# Ensemble: element-wise mean of the per-fold test predictions
test["pred"] = np.mean(test_preds, axis=0)

# Write the (id, pred) pairs with no index column; header=None is passed,
# presumably to produce a headerless file like the base-model submissions.
test[["id", "pred"]].to_csv(
    '/content/drive/MyDrive/Colab Notebooks/signate2023/exp38_39_submission_stacking_cv.csv',
    header=None,
    index=False,
)