In [7]:
import os
import random
import numpy as np
import pandas as pd
from datetime import datetime
from holidays import country_holidays
from xgboost import XGBRegressor 
import xgboost as xgb 
from tqdm import tqdm

import optuna 
from optuna.samplers import TPESampler

from supervised.automl import AutoML

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn import base
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

def seed_everything(seed):
    """Seed every random number generator this notebook imports.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch's default generator so that stochastic steps are repeatable.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at startup, so setting
        it here does not change string hashing in the running kernel; it is
        only inherited by child processes started afterwards.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # torch is imported by this notebook, so its generator must be seeded too;
    # otherwise any torch-based sampling or initialisation stays non-reproducible.
    torch.manual_seed(seed)

seed_everything(42)  # Fix the seed for reproducibility

In [8]:
# Read the training and test tables; paths are relative to the notebook's directory.
train_df, test_df = map(
    pd.read_csv,
    ('../../DATA/train.csv', '../../DATA/test.csv'),
)

In [9]:
# Shorten the verbose column names and parse 'timestamp' as datetime.

new_column_names = {
    'corporation': 'corp',
    'location': 'loc',
    'supply(kg)': 'supply',
    'price(원/kg)': 'price',
}

# Rename first, then replace the string timestamps with parsed datetimes
# in the same chained expression so each frame is rebuilt exactly once.
train_df = train_df.rename(columns=new_column_names).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)
test_df = test_df.rename(columns=new_column_names).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [10]:
def get_date_info(data):
    """Add calendar feature columns derived from ``data['timestamp']``.

    The frame is modified in place and nothing is returned. Columns are
    appended in this order: year, month, day, weekday, weekofyear, dayofyear.

    Args:
        data: DataFrame with a datetime-typed ``timestamp`` column.
    """
    stamp = data['timestamp'].dt

    # Plain datetime attributes map one-to-one onto same-named columns.
    for field in ('year', 'month', 'day', 'weekday'):
        data[field] = getattr(stamp, field)

    # ISO week number comes from isocalendar() rather than a direct attribute.
    data['weekofyear'] = stamp.isocalendar().week
    # Ordinal day within the year (1-based).
    data['dayofyear'] = stamp.dayofyear

def month_to_season(month):
    """Map a month number to its season name.

    Args:
        month: Month number (1-12).

    Returns:
        'Winter' for 12/1/2, 'Spring' for 3-5, 'Summer' for 6-8, and 'Fall'
        for every other value (including values outside 1-12).
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above is treated as autumn.
    return 'Fall'

    
# Training frame: add calendar columns, then derive the season label
# from the month number cast to a plain integer.
get_date_info(train_df)
train_df['season'] = train_df['month'].astype(int).map(month_to_season)

# Test frame: identical feature extraction.
get_date_info(test_df)
test_df['season'] = test_df['month'].astype(int).map(month_to_season)


In [11]:
# Display the training frame to check the derived date and season columns.
train_df

Unnamed: 0,ID,timestamp,item,corp,loc,supply,price,year,month,day,weekday,weekofyear,dayofyear,season
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,1,Winter
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,2,1,2,Winter
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,3,1,3,Winter
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,4,1,4,Winter
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,5,1,5,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,0,9,58,Winter
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,1,9,59,Winter
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,2,9,60,Spring
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,3,9,61,Spring


In [6]:
def test_it(scaler, train_df, n_splits=5, random_state=43):
    """Cross-validate a default XGBRegressor on scaled features and report the mean MSE.

    For each shuffled K-fold split, a fresh copy of ``scaler`` is fitted on
    the training fold only and then applied to both folds, so no statistics
    from the validation fold leak into training. The ``scaler`` passed in is
    used as a template and is not fitted by this function.

    Args:
        scaler: Unfitted scikit-learn transformer (e.g. StandardScaler) used
            as a template; it is cloned per fold.
        train_df: DataFrame containing a 'price' target column; every other
            column is used as a feature.
            NOTE(review): assumes all non-price columns are numeric, since the
            scaler is fitted on all of them; confirm callers drop or encode
            ID/timestamp/categorical columns first.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffling.

    Returns:
        Mean of the per-fold validation mean squared errors (also printed).
    """
    train_x = train_df.drop('price', axis=1)
    train_y = train_df['price']

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    scores = []

    for train_idx, val_idx in tqdm(kf.split(train_x), total=n_splits, desc="Processing folds"):
        # KFold yields positional indices, so select with .iloc on both the
        # features and the target; label-based indexing breaks whenever the
        # frame does not carry a default RangeIndex.
        X_t, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_t, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        # Fit the scaler on the training fold only, then reuse it for validation.
        fold_scaler = base.clone(scaler)
        X_t = fold_scaler.fit_transform(X_t)
        X_val = fold_scaler.transform(X_val)

        # Train a fresh model for this fold.
        model = XGBRegressor()
        model.fit(X_t, y_t)

        # Score the validation fold with MSE.
        val_pred = model.predict(X_val)
        scores.append(mean_squared_error(y_val, val_pred))

    # Average the per-fold validation scores.
    mean_score = np.mean(scores)
    print("Validation : MSE:", mean_score)
    return mean_score
        