In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


from math import sqrt

import holidays

import warnings
warnings.filterwarnings(action='ignore') 

### Declare Global Variables

In [2]:
DATA_PATH  = './data/'
MODEL_PATH = './models/'
SUBMISSION_PATH = './submission/'

TRAIN_SET = 'train.csv'
TEST_SET  = 'test.csv'

### Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

### Load Data

In [4]:
train_df = pd.read_csv(DATA_PATH + TRAIN_SET)
test_df = pd.read_csv(DATA_PATH + TEST_SET)

### Data Pre-Processing

In [5]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

In [6]:
# 공휴일 / 일요일 / Supply = 0 / Price = 0 인 항목 제외
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
test_df['timestamp']  = pd.to_datetime(test_df['timestamp'])

kr_holidays = holidays.KR()
train_df['is_holiday'] = train_df.timestamp.apply(lambda x: 0 if x in kr_holidays else 1)
#train_df['day_of_week'] = train_df['timestamp'].dt.day_name()

test_df['is_holiday'] = test_df.timestamp.apply(lambda x: 0 if x in kr_holidays else 1)
#test_df['day_of_week'] = test_df['timestamp'].dt.day_name()

train_df = train_df[(train_df['supply(kg)']!=0) & (train_df['price(원/kg)']!=0) & (train_df['is_holiday'] != 0)]
test_df = test_df[(test_df['is_holiday'] != 0)]


In [7]:
#학습에 사용하지 않을 변수들을 제거합니다
train_x = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)', 'is_holiday'])
train_y = train_df['price(원/kg)']

test_x = test_df.drop(columns=['ID', 'timestamp'])

In [8]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']#, 'day_of_week']

for i in qual_col:
    le = LabelEncoder()
    train_x[i]=le.fit_transform(train_x[i])
    test_x[i]=le.transform(test_x[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

print('Done.')

Done.


### Train - Test Set Seperation

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [22]:
!pip3 install 'pycaret[full]'

Defaulting to user installation because normal site-packages is not writeable
Collecting pycaret[full]
  Using cached pycaret-3.1.0-py3-none-any.whl.metadata (16 kB)
Collecting numpy<1.24,>=1.21 (from pycaret[full])
  Using cached numpy-1.23.5-cp39-cp39-macosx_11_0_arm64.whl (13.4 MB)
Collecting pandas<2.0.0,>=1.3.0 (from pycaret[full])
  Using cached pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl (11.0 MB)
Collecting scipy~=1.10.1 (from pycaret[full])
  Using cached scipy-1.10.1-cp39-cp39-macosx_12_0_arm64.whl (28.9 MB)
Collecting scikit-learn<1.3.0,>=1.0 (from pycaret[full])
  Using cached scikit_learn-1.2.2-cp39-cp39-macosx_12_0_arm64.whl (8.5 MB)
Collecting pyod>=1.0.8 (from pycaret[full])
  Using cached pyod-1.1.1-py3-none-any.whl
Collecting imbalanced-learn>=0.8.1 (from pycaret[full])
  Using cached imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting category-encoders>=2.4.0 (from pycaret[full])
  Using cached category_encoders-2.6.3-py2.py3-none-any.whl.metadata 

In [18]:
!brew install libomp

Running `brew update --auto-update`...
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
abi3audit           dockly              libdicom            saf-cli
auditwheel          eatmemory           libnghttp3          scilla
badkeys             favirecon           mentat              sigstore
bashunit            ghc@9.4             netlistsvg          snyk-cli
bob                 git-mediate         node@20             ssh-mitm
cariddi             gitsign             nvimpager           sshportal
cf2tf               gossip              opentofu            three-body
chainloop-cli       gotpm               pan                 uvicorn
changie             gptline             patch-package       vulsio-gost
cloudsplaining      haiti               pciutils            whisper-cpp
crunchy-cli         incus               perl-xml-parser     wormhole-william
csprecon            jprq                python-argcomplete  x

In [21]:
!pip3 install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3
    Uninstalling pip-23.3:
      Successfully uninstalled pip-23.3
[0mSuccessfully installed pip-23.3.1
[0m

### Regression Model Fit

In [10]:
model = RandomForestRegressor()

# K-fold 교차 검증 적용 (K=5)
k = 10  # K-fold 교차 검증의 K 값 (예를 들어, 5-fold)
best_score = 99999999
best_model = 0
scores = cross_val_score(model, X_train, y_train, cv=k, scoring='neg_mean_squared_error')

# 각 폴드에서의 음수 MSE를 양수로 변환하고 RMSE 계산
rmse_scores = [sqrt(-score) for score in scores]

# 각 폴드에서의 RMSE 출력
for i, rmse in enumerate(rmse_scores):
    print(f'Fold {i+1} RMSE: {rmse}')
    if rmse < 

# 전체 폴드에서의 평균 RMSE 출력
mean_rmse = sum(rmse_scores) / len(rmse_scores)
print(f'Mean RMSE: {mean_rmse}')

Fold 1 RMSE: 667.5716034037439
Fold 2 RMSE: 706.5351358686964
Fold 3 RMSE: 685.881911970697
Fold 4 RMSE: 659.2746850221835
Fold 5 RMSE: 630.8269884041804
Fold 6 RMSE: 671.9678178251972
Fold 7 RMSE: 666.3557112312494
Fold 8 RMSE: 663.0323736728076
Fold 9 RMSE: 658.3479330751596
Fold 10 RMSE: 622.3743383226218
Mean RMSE: 663.2168498796538


In [11]:
train_pred = model.predict(X_train)
valid_pred = model.predict(X_test)

train_rmse = sqrt(mean_squared_error(y_train, train_pred))
valid_rmse = sqrt(mean_squared_error(y_test, valid_pred))

print(f"train rmse : {train_rmse}")
print(f"valid rmse : {valid_rmse}")

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### Inference

In [20]:
preds = []
for i in range(test_x.shape[0]):
    if test_x['is_holiday'][i] == 0:
        preds.append(0)
    else:
        pred = model.predict(test_x.iloc[i:i+1,:])
        preds.append(pred[0])

### Submission

In [21]:
submission = pd.read_csv(DATA_PATH+'./sample_submission.csv')
submission['answer'] = preds

submission.to_csv(SUBMISSION_PATH+'./my_submission.csv', index=False)