reference : https://dacon.io/competitions/official/236093/codeshare/8182?page=1&dtype=recent

In [16]:
import os

import h2o
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [27]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [28]:
# Remove the 'ID' column from both frames before encoding and training.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [29]:
ordinal_features = ['브랜드', '차량모델명', '판매도시', '판매구역']

for feature in ordinal_features:
    # Fit the encoder on the training column only and encode it in one step.
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature])

    # The test data may contain values that never appeared in the training data.
    # Instead of transforming test directly (or refitting on it), collect the
    # test values the encoder has not seen and append them after the known
    # classes. The original author notes this is done to avoid data leakage.
    # np.unique / np.setdiff1d both return sorted unique values, so the unseen
    # labels are appended in sorted order.
    unseen = np.setdiff1d(np.unique(test[feature]), le.classes_)
    if unseen.size:
        le.classes_ = np.concatenate([le.classes_, unseen])
    test[feature] = le.transform(test[feature])

In [22]:
# Connect to (or start) the local H2O cluster.
h2o.init()

# Name of the response column.
y = "가격"

# Upload the encoded training data to H2O; note that `train` is rebound from
# the pandas DataFrame to the resulting H2OFrame.
train = h2o.H2OFrame(train)

# Predictors are every column of the frame except the response.
x = train.columns
x.remove(y)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 21 mins
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 17 days
H2O_cluster_name:,H2O_from_python_jjaegii_72nk7e
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.457 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [23]:
from h2o.automl import H2OAutoML

# AutoML run with max_models=10, a fixed seed, and the leaderboard sorted by MAE.
aml = H2OAutoML(max_models=10, seed=42, sort_metric='MAE')
aml.train(x=x, y=y, training_frame=train)

# Print the first rows of the resulting leaderboard.
leaderboard = aml.leaderboard
print(leaderboard.head())

AutoML progress: |

███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                    mae     rmse      mse       rmsle    mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_2_20230615_13302     5.80048  8.95418  80.1773    0.180526                   80.1773
StackedEnsemble_BestOfFamily_1_AutoML_2_20230615_13302  5.82179  8.99817  80.9671    0.181202                   80.9671
GBM_4_AutoML_2_20230615_13302                           5.901    9.09433  82.7068    0.184238                   82.7068
GBM_3_AutoML_2_20230615_13302                           5.95331  9.09871  82.7865    0.18552                    82.7865
GBM_2_AutoML_2_20230615_13302                           5.98755  9.14741  83.6752    0.187097                   83.6752
GBM_1_AutoML_2_20230615_13302                           6.02091  9.24814  85.528     0.188452                   85.528
DRF_1_AutoML_2_20230615_13302                           6.11065  9.43902  89.0952   

In [24]:
# Save the AutoML leader model to disk (overwriting any existing file) and keep
# the path returned by H2O so the model can be loaded again.
model_path = h2o.save_model(
    model=aml.leader,
    path="model/LabelEncoder",
    force=True,
)

In [25]:
# Load the model back from the path returned by h2o.save_model.
model = h2o.load_model(model_path)

### Leader model의 요약 정보와 파라미터 출력

In [31]:
# Upload the encoded test data to H2O (rebinding `test` to the H2OFrame),
# then score it with the loaded model.
test = h2o.H2OFrame(test)
pred = model.predict(test_data=test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [32]:
# Bring the H2O prediction frame into local memory as a pandas DataFrame.
pred_df = pd.DataFrame(pred.as_data_frame(use_pandas=True))

In [33]:
# Build the submission file: start from the provided template and overwrite
# the price column with the model's predictions.
# (pandas is already imported at the top of the notebook, so the duplicate
# import that used to live in this cell has been dropped.)
submission = pd.read_csv('sample_submission.csv')

# H2O regression predictions are returned in a single column named 'predict'.
# Assign its raw values rather than the whole DataFrame: this does not depend
# on the frame having exactly one column, and a row-count mismatch raises
# immediately instead of silently yielding NaN through index alignment.
submission['가격'] = pred_df['predict'].to_numpy()

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs('submit', exist_ok=True)
submission.to_csv('submit/H2O_submit_LabelEncoder.csv', index = False)