## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

from pycaret.regression import * 
from pycaret.utils import *
from pycaret import * 

import warnings
warnings.filterwarnings(action='ignore') 

## Fix the Random Seed

In [2]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and stored
        (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Returns
    -------
    None
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this would affect child processes only - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # pin the seed

## Load Data

In [3]:


# Read the train and test CSV files from the shared DATA directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../../DATA/train.csv', '../../DATA/test.csv')
)

## Data Pre-Processing

In [4]:
# Rename columns to short names and convert `timestamp` to a datetime dtype.

new_column_names = {'corporation': 'corp', 'location': 'loc',
                    'supply(kg)': 'supply', 'price(원/kg)': 'price'}

# rename() ignores mapping keys that are absent from a frame, so the same
# mapping is applied to both splits.
train_df = (
    train_df
    .rename(columns=new_column_names)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)
test_df = (
    test_df
    .rename(columns=new_column_names)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

In [5]:
def get_date_info(data):
    """Add calendar features derived from ``data['timestamp']`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``timestamp`` column has a datetime dtype (it is read
        through the ``.dt`` accessor). The frame is mutated: the columns
        ``year``, ``month``, ``day``, ``weekday``, ``quarter``,
        ``weekofyear`` and ``dayofyear`` are added, in that order.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last statement of a
        cell displays no output.
    """
    stamp = data['timestamp']

    data['year'] = stamp.dt.year
    data['month'] = stamp.dt.month
    data['day'] = stamp.dt.day
    data['weekday'] = stamp.dt.weekday
    data['quarter'] = stamp.dt.quarter

    # ISO week number within the year. `Series.dt.weekofyear` was deprecated
    # in pandas 1.1 and removed in pandas 2.0; `isocalendar().week` is the
    # supported replacement.
    iso_week = stamp.dt.isocalendar().week
    # isocalendar() yields a nullable UInt32 column. Cast back to a plain
    # NumPy dtype: int64 normally, float64 (NaN) when timestamps are missing.
    data['weekofyear'] = iso_week.astype(
        'float64' if iso_week.isna().any() else 'int64'
    )
    data['dayofyear'] = stamp.dt.dayofyear  # ordinal day within the year
    
    
# Add the calendar columns to both splits; get_date_info writes the new
# columns directly into the frame it is given.
get_date_info(train_df)
get_date_info(test_df)

In [6]:
def trans_fourier(data):
    """Add negated sine/cosine encodings of the date parts to ``data`` in place.

    Reads the ``month``, ``day`` and ``weekday`` columns and writes six new
    columns, in this order: ``sin_date``, ``cos_date``, ``sin_month``,
    ``cos_month``, ``sin_dayofweek``, ``cos_dayofweek``. Every value is the
    negative of the sine or cosine of the corresponding angle.

    Returns
    -------
    None
    """
    two_pi = 2 * np.pi

    # Date: month plus the day expressed as a fraction of 31, on a 12-unit cycle.
    date_angle = two_pi * (data['month'] + data['day'] / 31) / 12
    data['sin_date'] = -np.sin(date_angle)
    data['cos_date'] = -np.cos(date_angle)

    # Month: 12-unit cycle.
    month_angle = two_pi * data['month'] / 12.0
    data['sin_month'] = -np.sin(month_angle)
    data['cos_month'] = -np.cos(month_angle)

    # Day of week: `weekday` shifted by one, on a 7-unit cycle.
    weekday_angle = two_pi * (data['weekday'] + 1) / 7.0
    data['sin_dayofweek'] = -np.sin(weekday_angle)
    data['cos_dayofweek'] = -np.cos(weekday_angle)
    
    
# Add the sine/cosine columns to both splits; trans_fourier writes the new
# columns directly into the frame it is given.
trans_fourier(train_df)
trans_fourier(test_df)

In [7]:
# Remove the columns that are not used for training.
# The target is set aside first, then the identifier, the raw timestamp, the
# target itself, 'supply', and the raw day/month/weekday columns are dropped
# from the training features.
train_y = train_df['price']
train_x = train_df.drop(
    ['ID', 'timestamp', 'supply', 'price', 'day', 'month', 'weekday'],
    axis='columns',
)

test_x = test_df.drop(
    ['ID', 'timestamp', 'day', 'month', 'weekday'],
    axis='columns',
)

In [8]:
# Convert the qualitative (categorical) columns to integer codes.
qual_col = ['item', 'corp', 'loc']

for i in qual_col:
    # Fit on the training column only: fitting on test data would be data
    # leakage. The same fitted encoder is then applied to both splits.
    le = LabelEncoder().fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    test_x[i] = le.transform(test_x[i])

print('Done.')

Done.


In [9]:
# Build the single frame passed to PyCaret: the feature columns followed by
# the target column, placed side by side.
pycaret_train = pd.concat(objs=[train_x, train_y], axis='columns')


## Run PyCaret

In [10]:
# Initialise the PyCaret regression experiment on the combined frame.
# NOTE(review): with use_gpu=True, the recorded output of this cell shows
# LightGBM reporting that its GPU/CUDA tree learner was not enabled in this
# build - confirm whether GPU training is actually available here.
reg = setup(
    data=pycaret_train,
    target='price',
    session_id=123,
    train_size=0.9,
    use_gpu=True,
)

[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recomp

Unnamed: 0,Description,Value
0,Session id,123
1,Target,price
2,Target type,Regression
3,Original data shape,"(59397, 14)"
4,Transformed data shape,"(59397, 14)"
5,Transformed train set shape,"(53457, 14)"
6,Transformed test set shape,"(5940, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


In [12]:
# Cross-validate the available regressors with 5 folds, rank them by RMSE
# and keep the top five.
bestmodels = compare_models(
    fold=5,
    n_select=5,
    sort='RMSE',
)

# xgb, rf, lgbm (1157, 1196, 1206): no preprocessing
# NOTE(review): presumably scores from an earlier run - confirm the metric.

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,306.4638,738128.2995,859.0649,0.8211,2.1376,0.19,0.512
lightgbm,Light Gradient Boosting Machine,401.4894,778643.9202,882.2397,0.8114,3.5058,0.2578,0.124
et,Extra Trees Regressor,312.885,798179.8892,893.2888,0.8065,2.1041,0.1961,0.346
gbr,Gradient Boosting Regressor,569.4849,1143603.2981,1069.3209,0.7229,4.133,0.3186,2.192
dt,Decision Tree Regressor,366.7949,1341888.6857,1158.2217,0.675,2.0605,0.2519,0.098
knn,K Neighbors Regressor,704.0942,1555871.15,1247.2623,0.6229,3.5399,0.5628,0.116
ada,AdaBoost Regressor,991.5272,1922075.5691,1384.8203,0.5337,5.0685,0.4613,0.894
ridge,Ridge Regression,1329.9146,3390260.745,1841.0887,0.1789,5.1749,0.964,0.016
lr,Linear Regression,1329.9163,3390261.1215,1841.0888,0.1789,5.1749,0.964,0.026
br,Bayesian Ridge,1329.8078,3390276.8533,1841.093,0.1789,5.1748,0.9635,0.026
