## Import

In [77]:
import random
import pandas as pd
import numpy as np
from holidays import country_holidays
import os
from xgboost import XGBRegressor 
import xgboost as xgb 
import optuna 
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [78]:
train_df = pd.read_csv('../../DATA/train.csv')
test_df  = pd.read_csv('../../DATA/test.csv')

## Data Pre-Processing

In [79]:
# 데이터 타입 변경, 열 이름 변경 

new_column_names = {
    'corporation': 'corp',
    'location': 'loc',
    'supply(kg)': 'supply',
    'price(원/kg)': 'price',
}
train_df = train_df.rename(columns=new_column_names)
test_df = test_df.rename(columns=new_column_names)
train_df['timestamp']  = pd.to_datetime(train_df['timestamp'])
test_df['timestamp']  = pd.to_datetime(test_df['timestamp'])


In [80]:
def get_date_info(data) :
    # data의 날짜 정보 추출하기 
    
    data['year'] = data['timestamp'].dt.year
    data['month'] = data['timestamp'].dt.month
    data['day'] = data['timestamp'].dt.day
    data['weekday'] = data['timestamp'].dt.weekday
    data['quarter'] = data['timestamp'].dt.quarter
    data['weekofyear'] = data['timestamp'].dt.isocalendar().week
    data['dayofyear'] = data['timestamp'].dt.dayofyear #해당 년도의 몇 일째 
    data['holi'] = 0 
    data.loc[(data['holi'] == 0) & (data['weekday'] >= 6), 'holi'] = 1
    
get_date_info(train_df)
get_date_info(test_df)

In [82]:
def trans_fourier(data) :
    #날짜
    data['sin_date'] = -np.sin(2 * np.pi * (data['month']+data['day']/31)/12)
    data['cos_date'] = -np.cos(2 * np.pi * (data['month']+data['day']/31)/12)

    #월
    data['sin_month'] = -np.sin(2 * np.pi * data['month']/12.0)
    data['cos_month'] = -np.cos(2 * np.pi * data['month']/12.0)

    #요일
    data['sin_dayofweek'] = -np.sin(2 * np.pi * (data['weekday']+1)/7.0)
    data['cos_dayofweek'] = -np.cos(2 * np.pi * (data['weekday']+1)/7.0)
    
    data['sin_week'] = -np.sin(2 * np.pi * (data['weekofyear'])/52.0)
    data['cos_week'] = -np.cos(2 * np.pi * (data['weekofyear'])/52.0)
    
    data['sin_quat'] = -np.sin(2 * np.pi * (data['quarter'])/4.0)
    data['cos_quat'] = -np.cos(2 * np.pi * (data['quarter'])/4.0)
    
trans_fourier(train_df)
trans_fourier(test_df)

# Imputaion Of Data 

In [83]:
# 보간할 날, 가격 데이터 생성 
average_price = train_df.groupby(['year', 'month', 'item', 'corp','loc'])['price'].mean().reset_index()

emptydays  = ['2019-01-01',  '2019-02-05', '2019-02-06', '2019-09-13', '2019-09-14', '2020-01-01', '2020-01-25', '2020-01-27', '2020-10-01', '2020-10-02', '2020-10-03', '2021-01-01', '2021-02-12', '2021-02-13', 
'2021-09-20', '2021-09-21', '2021-09-22', '2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-09-10', '2022-09-12', '2023-01-23', '2023-01-24']
# 25일에 해당하는 값들을 보간해야해 
emptydays = [datetime.strptime(day, "%Y-%m-%d") for day in emptydays]


In [84]:
# 공휴일 가격 데이터 보간하기 
for i in range(len(train_df)) :
    if train_df.loc[i, 'timestamp'] in emptydays :
        year,month,item, corp, loc = train_df.loc[i,'year'],train_df.loc[i,'month'],train_df.loc[i,'item'],train_df.loc[i,'corp'],train_df.loc[i,'loc']
        newprice = average_price[(average_price['year']==year) & (average_price['month']==month) & (average_price['item']==item) & (average_price['corp']==corp) & (average_price['loc']==loc)]['price']
        train_df.loc[i,'price'] = newprice.values[0]

# Encoding Categorical Features

In [86]:
train_df['item_loc'] = train_df['item'].str.cat(train_df['loc'],sep='')

In [71]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corp', 'loc','item_loc']

def encode_cols_v1(data,data2, cat_name , target , weight) :
    mean = data[target].mean()
    agg = data.groupby(cat_name)[target].agg(['count','mean'])
    counts =agg['count']
    means  =agg['mean']
    
    smooth = (counts*means + weight*mean)/ (counts+weight)
    data[cat_name] = data[cat_name].map(smooth) 
    data2[cat_name] = data2[cat_name].map(smooth)
    
    return data, data2 
    


for i in qual_col:
    train_df, test_df = encode_cols_v1(train_df,test_df, i, 'price', 5)


In [76]:
train_df

Unnamed: 0,ID,timestamp,item,corp,loc,supply,price,year,month,day,...,sin_date,cos_date,sin_month,cos_month,sin_dayofweek,cos_dayofweek,sin_week,cos_week,sin_quat,cos_quat
0,TG_A_J_20190101,2019-01-01,3212.974445,1157.495017,1077.1376,0.0,1513.516129,2019,1,1,...,-0.514555,-0.857457,-0.500000,-8.660254e-01,-0.974928,0.222521,-0.120537,-0.992709,-1.0,-6.123234e-17
1,TG_A_J_20190102,2019-01-02,3212.974445,1157.495017,1077.1376,0.0,0.000000,2019,1,2,...,-0.528964,-0.848644,-0.500000,-8.660254e-01,-0.433884,0.900969,-0.120537,-0.992709,-1.0,-6.123234e-17
2,TG_A_J_20190103,2019-01-03,3212.974445,1157.495017,1077.1376,60601.0,1728.000000,2019,1,3,...,-0.543222,-0.839589,-0.500000,-8.660254e-01,0.433884,0.900969,-0.120537,-0.992709,-1.0,-6.123234e-17
3,TG_A_J_20190104,2019-01-04,3212.974445,1157.495017,1077.1376,25000.0,1408.000000,2019,1,4,...,-0.557324,-0.830295,-0.500000,-8.660254e-01,0.974928,0.222521,-0.120537,-0.992709,-1.0,-6.123234e-17
4,TG_A_J_20190105,2019-01-05,3212.974445,1157.495017,1077.1376,32352.0,1250.000000,2019,1,5,...,-0.571268,-0.820763,-0.500000,-8.660254e-01,0.781831,-0.623490,-0.120537,-0.992709,-1.0,-6.123234e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,190.968198,264.841281,1077.1376,452440.0,468.000000,2023,2,27,...,-0.997719,-0.067510,-0.866025,-5.000000e-01,-0.781831,-0.623490,-0.885456,-0.464723,-1.0,-6.123234e-17
59393,RD_F_J_20230228,2023-02-28,190.968198,264.841281,1077.1376,421980.0,531.000000,2023,2,28,...,-0.998717,-0.050649,-0.866025,-5.000000e-01,-0.974928,0.222521,-0.885456,-0.464723,-1.0,-6.123234e-17
59394,RD_F_J_20230301,2023-03-01,190.968198,264.841281,1077.1376,382980.0,574.000000,2023,3,1,...,-0.999857,0.016889,-1.000000,-6.123234e-17,-0.433884,0.900969,-0.885456,-0.464723,-1.0,-6.123234e-17
59395,RD_F_J_20230302,2023-03-02,190.968198,264.841281,1077.1376,477220.0,523.000000,2023,3,2,...,-0.999429,0.033774,-1.000000,-6.123234e-17,0.433884,0.900969,-0.885456,-0.464723,-1.0,-6.123234e-17


In [73]:
#학습에 사용하지 않을 변수들을 제거합니다
train_x = train_df.drop(columns=['ID', 'timestamp', 'supply', 'price',  'month', 'weekday'])
train_y = train_df['price']
test_x = test_df.drop(columns=['ID', 'timestamp' ,   'month', 'weekday'])

# Optuna setting (XGboost, RandomForest)

In [74]:
# 데이터 스플릿 
X_train, X_valid , y_train , y_valid  = train_test_split(train_x,train_y, shuffle=True,test_size=0.2) 

In [75]:
#XGB optuna 하기

def RMSE(true, pred):
    mse = np.mean((true - pred) ** 2)
    rmse = np.sqrt(mse)
    
    return rmse


def objective(trial, train_x, train_y, val_x, val_y):
    
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'learning_rate': trial.suggest_float('learning_rate',0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 4,8),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 50),
    }
    model =XGBRegressor(**param)  
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    score = RMSE(val_y, np.round(preds,0))
    
    return score
          
# study_xgb.optimize(objective, n_trials=5)
study =  optuna.create_study(study_name='Xgb', direction='minimize',sampler=TPESampler(seed=42) )
study.optimize(lambda trial: objective(trial,X_train,  y_train ,X_valid , y_valid ),n_trials=400, timeout = 3000 )
print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)


[I 2023-11-02 20:05:41,990] A new study created in memory with name: Xgb
[I 2023-11-02 20:05:42,318] Trial 0 finished with value: 1039.005137666728 and parameters: {'lambda': 0.03807947176588889, 'alpha': 0.9507635921035062, 'colsample_bytree': 0.839196365086843, 'subsample': 0.759195090518222, 'learning_rate': 0.01568626218019941, 'n_estimators': 240, 'max_depth': 4, 'min_child_weight': 44}. Best is trial 0 with value: 1039.005137666728.
[I 2023-11-02 20:05:42,578] Trial 1 finished with value: 994.0541410635966 and parameters: {'lambda': 0.06051038616257767, 'alpha': 0.7083645052182495, 'colsample_bytree': 0.41235069657748147, 'subsample': 0.9819459112971965, 'learning_rate': 0.08326101981596214, 'n_estimators': 291, 'max_depth': 4, 'min_child_weight': 10}. Best is trial 1 with value: 994.0541410635966.
[I 2023-11-02 20:05:42,750] Trial 2 finished with value: 948.568397773566 and parameters: {'lambda': 0.03111998205299424, 'alpha': 0.5252316752006057, 'colsample_bytree': 0.65916701118

Best trial: {'lambda': 0.04198505255710267, 'alpha': 0.9195173756052971, 'colsample_bytree': 0.8969320070014385, 'subsample': 0.9509196360850879, 'learning_rate': 0.06829396484792498, 'n_estimators': 109, 'max_depth': 8, 'min_child_weight': 14}
Best score: 857.0062180709282


In [None]:
# RandomForest optuna 하기 
def RMSE(true, pred):
    mse = np.mean((true - pred) ** 2)
    rmse = np.sqrt(mse)
    return rmse
def objective_RF(trial, train_x, train_y, val_x, val_y):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4500,10),
        'criterion': 'squared_error',
        'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 3, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 3, 10),
        'min_weight_fraction_leaf': 0.0,
        'max_features': 1.0,
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'bootstrap': True,
        'n_jobs': None,
        'random_state': 42
    }
    model = RandomForestRegressor(**param)
    model.fit(train_x, train_y)
    pred_y = model.predict(val_x)
    return RMSE(val_y, pred_y)

study = optuna.create_study(study_name='RF', direction='minimize', sampler=TPESampler(seed=42))

study.optimize(lambda trial: objective_RF(trial, X_train,  y_train ,X_valid , y_valid),   n_trials=100, timeout=2000 )

print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)

[I 2023-10-29 23:04:09,415] A new study created in memory with name: RF
[I 2023-10-29 23:05:46,831] Trial 0 finished with value: 907.8383983112482 and parameters: {'n_estimators': 2310, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 7}. Best is trial 0 with value: 907.8383983112482.
[I 2023-10-29 23:06:18,784] Trial 1 finished with value: 1150.7132267593058 and parameters: {'n_estimators': 1540, 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 907.8383983112482.
[I 2023-10-29 23:08:10,733] Trial 2 finished with value: 976.3321464733516 and parameters: {'n_estimators': 3100, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 10}. Best is trial 0 with value: 907.8383983112482.
[I 2023-10-29 23:09:32,228] Trial 3 finished with value: 1150.816029265101 and parameters: {'n_estimators': 3920, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 0 with value: 907.8383983112482.
[I 2023-10-29 23:10:39,66

Best trial: {'n_estimators': 1260, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4}
Best score: 906.330978760372


## XGB Model Fit

In [90]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [52]:

#Validation : MAE: 729715.7680863414,  Best score: 856.2573364295463,  Best trial: {'lambda': 0.06858310598122817, 'alpha': 0.7713413681986561, 'colsample_bytree': 0.9442346449671033, 'subsample': 0.9750961748574901, 'learning_rate': 0.076433115411846, 'n_estimators': 446, 'max_depth': 8, 'min_child_weight': 16}





model_xgb = RandomForestRegressor(**study.best_trial.params)
kf = KFold(n_splits=5, shuffle=True , random_state=42)

ensemble_predicts= []
scores =[]


for train_idx, val_idx in tqdm(kf.split(train_x), total=5, desc="Processing folds"):
    X_t, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_t, y_val = train_y[train_idx], train_y[val_idx]
    
    # 두 모델 모두 학습
    model_xgb.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = model_xgb.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_squared_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    model_xgb_pred = model_xgb.predict(test_x)
    model_xgb_pred = np.where(model_xgb_pred < 0, 0, model_xgb_pred)
    
    ensemble_predicts.append(model_xgb_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predicts, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))


TypeError: RandomForestRegressor.__init__() got an unexpected keyword argument 'lambda'

In [91]:


#Validation : MAE: 729715.7680863414,  Best score: 856.2573364295463,  Best trial: {'lambda': 0.06858310598122817, 'alpha': 0.7713413681986561, 'colsample_bytree': 0.9442346449671033, 'subsample': 0.9750961748574901, 'learning_rate': 0.076433115411846, 'n_estimators': 446, 'max_depth': 8, 'min_child_weight': 16}

#encod_col_v1 : Validation : MAE: 722972.6923, Best score: 864.4005832556281, Best trial: {'lambda': 0.07246551885960752, 'alpha': 0.5233846374744797, 'colsample_bytree': 0.9008060223667523, 'subsample': 0.9996254959269878, 'learning_rate': 0.061161155234371184, 'n_estimators': 753, 'max_depth': 8, 'min_child_weight': 23}


model_xgb = XGBRegressor(**study.best_trial.params)
kf = KFold(n_splits=5, shuffle=True , random_state=42)

ensemble_predicts= []
scores =[]


for train_idx, val_idx in tqdm(kf.split(train_x), total=5, desc="Processing folds"):
    X_t, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_t, y_val = train_y[train_idx], train_y[val_idx]
    
    # 두 모델 모두 학습
    model_xgb.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = model_xgb.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_squared_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    model_xgb_pred = model_xgb.predict(test_x)
    model_xgb_pred = np.where(model_xgb_pred < 0, 0, model_xgb_pred)
    
    ensemble_predicts.append(model_xgb_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predicts, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))


Processing folds: 100%|██████████| 5/5 [00:01<00:00,  3.78it/s]

Validation : MAE scores for each fold: [745535.6648649025, 697863.7128490201, 686552.4108977464, 794330.6018247355, 644345.6000407968]
Validation : MAE: 713725.5980954402





## Inference

In [92]:
final_predictions = np.round(final_predictions, 0)
final_predictions

array([3565.,   14., 3718., ...,  508.,  504.,  498.], dtype=float32)

## Submission

In [93]:
submission = pd.read_csv('../../DATA/sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [94]:
submission['answer'] = final_predictions
submission.loc[submission.index % 7 == 1, 'answer'] = 0

In [95]:
submission.head(20)

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3565.0
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,3718.0
3,TG_A_J_20230307,3630.0
4,TG_A_J_20230308,3674.0
5,TG_A_J_20230309,3775.0
6,TG_A_J_20230310,3791.0
7,TG_A_J_20230311,3435.0
8,TG_A_J_20230312,0.0
9,TG_A_J_20230313,3623.0


In [96]:
submission.to_csv('../../DATA/SUBMIT/opt_xgb_imput2_tarenc_sub.csv', index=False)
