In [90]:
import os
import random
import numpy as np
import pandas as pd
from datetime import datetime
from holidays import country_holidays
from xgboost import XGBRegressor 
import xgboost as xgb 
from tqdm import tqdm

import optuna 
from optuna.samplers import TPESampler

from supervised.automl import AutoML

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn import base
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정
## Load Data
train_df = pd.read_csv('../../DATA/train.csv')
test_df  = pd.read_csv('../../DATA/test.csv')

In [91]:

## Make Date Data
# 데이터 타입 변경, 열 이름 변경 

new_column_names = {
    'corporation': 'corp',
    'location': 'loc',
    'supply(kg)': 'supply',
    'price(원/kg)': 'price',
}
train_df = train_df.rename(columns=new_column_names)
test_df = test_df.rename(columns=new_column_names)
train_df['timestamp']  = pd.to_datetime(train_df['timestamp'])
test_df['timestamp']  = pd.to_datetime(test_df['timestamp'])

def get_date_info(data) :
    # data의 날짜 정보 추출하기 
    data['year'] = data['timestamp'].dt.year
    data['month'] = data['timestamp'].dt.month
    data['day'] = data['timestamp'].dt.day
    data['weekday'] = data['timestamp'].dt.weekday
    data['weekofyear'] = data['timestamp'].dt.isocalendar().week
    data['dayofyear'] = data['timestamp'].dt.dayofyear #해당 년도의 몇 일째 

def month_to_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

    
get_date_info(train_df)
get_date_info(test_df)

# 'month' 칼럼을 정수형으로 변환
train_df['season'] = train_df['month'].astype(int).apply(month_to_season)
test_df['season'] = test_df['month'].astype(int).apply(month_to_season)

In [92]:
# 휴일 정보 추가하니 성능 더 하락 

years  = [2019,2020,2021,2022,2023 ]
holi_dates=[]
for year in years :
    holidayarr = country_holidays('KR', years = year).items()
    for day in holidayarr:
        holi_dates.append(day[0])
    
print(holi_dates[0])

train_df['holi'] = train_df['timestamp'].apply(lambda x: int(x in holi_dates))
test_df['holi'] = test_df['timestamp'].apply(lambda x: int(x in holi_dates))
train_df.loc[(train_df['holi'] == 0) & (train_df['weekday'] >= 6), 'holi'] = 1
test_df.loc[(train_df['holi'] == 0) & (test_df['weekday'] >= 6), 'holi'] = 1

2019-01-01


In [93]:
def trans_fourier(data) :
    #날짜
    data['sin_date'] = -np.sin(2 * np.pi * (data['month']+data['day']/31)/12)
    data['cos_date'] = -np.cos(2 * np.pi * (data['month']+data['day']/31)/12)

    #월
    data['sin_month'] = -np.sin(2 * np.pi * data['month']/12.0)
    data['cos_month'] = -np.cos(2 * np.pi * data['month']/12.0)
    
    data['sin_week'] = -np.sin(2 * np.pi * (data['weekofyear'])/52.0)
    data['cos_week'] = -np.cos(2 * np.pi * (data['weekofyear'])/52.0)

    #요일
    data['sin_dayofweek'] = -np.sin(2 * np.pi * (data['weekday']+1)/7.0)
    data['cos_dayofweek'] = -np.cos(2 * np.pi * (data['weekday']+1)/7.0)
    
    
trans_fourier(train_df)
trans_fourier(test_df)

In [94]:
# Supply Data , New feature
# 아이템, 유통회사, 지역 묶기 
def new_columns(data):
    data['item_loc']= data['item'].astype(str).str.cat([ data['loc'].astype(str)], sep='')
    # data['item_corp']= data['item'].astype(str).str.cat([data['corp'].astype(str)], sep='')
    # data['item_loc']= data['item'].astype(str).str.cat([data['loc'].astype(str)], sep='')
    # data['item_loc'] =data['item_loc'].astype(str) 
    # data['item_corp'] = data['item_corp'].astype(str)
    # data['item_corp_loc']= data['item_corp_loc'].astype(str)
    
    # data['y_it_loc_m'] = data['item'].str.cat([data['loc'], data['year'].astype(str),data['month'].astype(str)], sep = '')

new_columns(train_df)
new_columns(test_df)

In [95]:

def fill_supply(train, test, start, end):
    train['timestamp'] = pd.to_datetime(train['timestamp'])  
    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)
    mask = (train['timestamp'] >= start_date) & (train['timestamp'] <= end_date)
    filtered_train_df = train.loc[mask]

    result = filtered_train_df.groupby('item_loc')['supply'].median()
    result = result.reset_index().rename(columns={'supply': 'supply'})

    test['supply'] = 0 
    test = test.set_index('item_loc')
    test.update(result.set_index('item_loc'))
    test = test.reset_index()

    return test 

test_df = fill_supply(train_df, test_df, "2023-01-11","2023-03-03")


In [79]:
# Imputaion Of Data 
# Encoding Categorical Features

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
   
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col
        
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)       
        
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = True)        
        col_mean_name = self.colnames + '_' + 'encod'
        X[col_mean_name] = np.nan       
        
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
            X[col_mean_name].fillna(mean_of_target, inplace = True)  
            
        if self.verbosity:            
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName,                                                                                    np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))

        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    def __init__(self,train,colNames,encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        mean = self.train[[self.colNames,
        self.encodedName]].groupby(
        self.colNames).mean().reset_index()
        dd = {}
        for row in (mean.itertuples(index=False)):
            dd[row[0]] = row[1]
            X[self.encodedName] = X[self.colNames]
            X[self.encodedName] = X[self.encodedName].map(dd.get)
        return X

In [96]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corp', 'loc']

for i in qual_col:
    le = LabelEncoder()
    train_df[i]=le.fit_transform(train_df[i])
    test_df[i]=le.transform(test_df[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

In [81]:

# # ,'item' ,'corp', 'loc' 

# for col in ['item_loc' ,'season']:
#     t_encoder = KFoldTargetEncoderTrain(colnames=col , targetName='price',n_fold=5)
#     train_df  = t_encoder.fit_transform(train_df)
    
#     newcolname = f'{col}_encod'
    
#     test_t_encoder = KFoldTargetEncoderTest(train_df, colNames=col , encodedName=newcolname)
#     test_df = test_t_encoder.transform(test_df)

Correlation between the new feature, item_loc_encod and, price is 0.6139104258203091.
Correlation between the new feature, season_encod and, price is 0.1430650204757649.


In [82]:
train_df.columns

Index(['ID', 'timestamp', 'item', 'corp', 'loc', 'supply', 'price', 'year',
       'month', 'day', 'weekday', 'weekofyear', 'dayofyear', 'season', 'holi',
       'sin_date', 'cos_date', 'sin_month', 'cos_month', 'sin_week',
       'cos_week', 'sin_dayofweek', 'cos_dayofweek', 'item_loc',
       'item_loc_encod', 'season_encod'],
      dtype='object')

In [97]:


# 질적 변수들을 수치화합니다
qual_col = ['item_loc' ,'season']

def encode_cols_v1(data,data2, cat_name , target , weight) :
    mean = data[target].mean()
    agg = data.groupby(cat_name)[target].agg(['count','mean'])
    counts =agg['count']
    means  =agg['mean']
    
    smooth = (counts*means + weight*mean)/ (counts+weight)
    data[cat_name] = data[cat_name].map(smooth) 
    data2[cat_name] = data2[cat_name].map(smooth)
    
    return data, data2 
    


for i in qual_col:
    train_df, test_df = encode_cols_v1(train_df,test_df, i, 'price', 5)

# scaler로 스케일링 해주기 


In [100]:
train_df

Unnamed: 0,ID,timestamp,item,corp,loc,supply,price,year,month,day,...,holi,sin_date,cos_date,sin_month,cos_month,sin_week,cos_week,sin_dayofweek,cos_dayofweek,item_loc
0,TG_A_J_20190101,2019-01-01,4,0,0,0.0,0.0,2019,1,1,...,0,-0.514555,-0.857457,-0.500000,-8.660254e-01,-0.120537,-0.992709,-0.974928,0.222521,3005.308058
1,TG_A_J_20190102,2019-01-02,4,0,0,0.0,0.0,2019,1,2,...,0,-0.528964,-0.848644,-0.500000,-8.660254e-01,-0.120537,-0.992709,-0.433884,0.900969,3005.308058
2,TG_A_J_20190103,2019-01-03,4,0,0,60601.0,1728.0,2019,1,3,...,0,-0.543222,-0.839589,-0.500000,-8.660254e-01,-0.120537,-0.992709,0.433884,0.900969,3005.308058
3,TG_A_J_20190104,2019-01-04,4,0,0,25000.0,1408.0,2019,1,4,...,0,-0.557324,-0.830295,-0.500000,-8.660254e-01,-0.120537,-0.992709,0.974928,0.222521,3005.308058
4,TG_A_J_20190105,2019-01-05,4,0,0,32352.0,1250.0,2019,1,5,...,0,-0.571268,-0.820763,-0.500000,-8.660254e-01,-0.120537,-0.992709,0.781831,-0.623490,3005.308058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,3,5,0,452440.0,468.0,2023,2,27,...,0,-0.997719,-0.067510,-0.866025,-5.000000e-01,-0.885456,-0.464723,-0.781831,-0.623490,252.715500
59393,RD_F_J_20230228,2023-02-28,3,5,0,421980.0,531.0,2023,2,28,...,0,-0.998717,-0.050649,-0.866025,-5.000000e-01,-0.885456,-0.464723,-0.974928,0.222521,252.715500
59394,RD_F_J_20230301,2023-03-01,3,5,0,382980.0,574.0,2023,3,1,...,0,-0.999857,0.016889,-1.000000,-6.123234e-17,-0.885456,-0.464723,-0.433884,0.900969,252.715500
59395,RD_F_J_20230302,2023-03-02,3,5,0,477220.0,523.0,2023,3,2,...,0,-0.999429,0.033774,-1.000000,-6.123234e-17,-0.885456,-0.464723,0.433884,0.900969,252.715500


In [101]:

#학습에 사용하지 않을 변수들을 제거합니다
# train_x = train_df.drop(columns=['ID','season', 'timestamp', 'price', 'day', 'month', 'weekday','item_loc','dayofyear','season','loc','item'])
# train_y = train_df['price']
# test_x = test_df.drop(columns=['ID', 'season','timestamp' ,  'day', 'month', 'weekday', 'item_loc','dayofyear','season', 'loc', 'item'])

train_x = train_df.drop(columns=['ID', 'timestamp', 'price', 'day', 'month', 'weekday','dayofyear','loc','item'])
train_y = train_df['price']
test_x = test_df.drop(columns=['ID','timestamp' ,  'day', 'month', 'weekday','dayofyear', 'loc', 'item'])

In [102]:
train_x

Unnamed: 0,corp,supply,year,weekofyear,season,holi,sin_date,cos_date,sin_month,cos_month,sin_week,cos_week,sin_dayofweek,cos_dayofweek,item_loc
0,0,0.0,2019,1,993.653610,0,-0.514555,-0.857457,-0.500000,-8.660254e-01,-0.120537,-0.992709,-0.974928,0.222521,3005.308058
1,0,0.0,2019,1,993.653610,0,-0.528964,-0.848644,-0.500000,-8.660254e-01,-0.120537,-0.992709,-0.433884,0.900969,3005.308058
2,0,60601.0,2019,1,993.653610,0,-0.543222,-0.839589,-0.500000,-8.660254e-01,-0.120537,-0.992709,0.433884,0.900969,3005.308058
3,0,25000.0,2019,1,993.653610,0,-0.557324,-0.830295,-0.500000,-8.660254e-01,-0.120537,-0.992709,0.974928,0.222521,3005.308058
4,0,32352.0,2019,1,993.653610,0,-0.571268,-0.820763,-0.500000,-8.660254e-01,-0.120537,-0.992709,0.781831,-0.623490,3005.308058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,5,452440.0,2023,9,993.653610,0,-0.997719,-0.067510,-0.866025,-5.000000e-01,-0.885456,-0.464723,-0.781831,-0.623490,252.715500
59393,5,421980.0,2023,9,993.653610,0,-0.998717,-0.050649,-0.866025,-5.000000e-01,-0.885456,-0.464723,-0.974928,0.222521,252.715500
59394,5,382980.0,2023,9,1531.028493,0,-0.999857,0.016889,-1.000000,-6.123234e-17,-0.885456,-0.464723,-0.433884,0.900969,252.715500
59395,5,477220.0,2023,9,1531.028493,0,-0.999429,0.033774,-1.000000,-6.123234e-17,-0.885456,-0.464723,0.433884,0.900969,252.715500


In [103]:
# 데이터 스플릿 
X_train, X_valid , y_train , y_valid  = train_test_split(train_x,train_y, shuffle=True,test_size=0.2) 

In [104]:
#XGB optuna 하기

def RMSE(true, pred):
    mse = np.mean((true - pred) ** 2)
    rmse = np.sqrt(mse)
    
    return rmse


def objective(trial, train_x, train_y, val_x, val_y):
    
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'learning_rate': trial.suggest_float('learning_rate',0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 4,8),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 50),
    }
    model =XGBRegressor(**param)  
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    score = RMSE(val_y, np.round(preds,0))
    
    return score
          
# study_xgb.optimize(objective, n_trials=5)
study =  optuna.create_study(study_name='Xgb', direction='minimize',sampler=TPESampler(seed=42) )
study.optimize(lambda trial: objective(trial,X_train,  y_train ,X_valid , y_valid ),n_trials=200, timeout = 3000 )
print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)


[I 2023-11-03 23:48:30,641] A new study created in memory with name: Xgb
[I 2023-11-03 23:48:30,871] Trial 0 finished with value: 612.8815699391228 and parameters: {'lambda': 0.03807947176588889, 'alpha': 0.9507635921035062, 'colsample_bytree': 0.839196365086843, 'subsample': 0.759195090518222, 'learning_rate': 0.01568626218019941, 'n_estimators': 240, 'max_depth': 4, 'min_child_weight': 44}. Best is trial 0 with value: 612.8815699391228.
[I 2023-11-03 23:48:31,017] Trial 1 finished with value: 562.7933449839729 and parameters: {'lambda': 0.06051038616257767, 'alpha': 0.7083645052182495, 'colsample_bytree': 0.41235069657748147, 'subsample': 0.9819459112971965, 'learning_rate': 0.08326101981596214, 'n_estimators': 291, 'max_depth': 4, 'min_child_weight': 10}. Best is trial 1 with value: 562.7933449839729.
[I 2023-11-03 23:48:31,165] Trial 2 finished with value: 527.8309198657558 and parameters: {'lambda': 0.03111998205299424, 'alpha': 0.5252316752006057, 'colsample_bytree': 0.6591670111

Best trial: {'lambda': 0.06711062040812309, 'alpha': 0.05428032704100793, 'colsample_bytree': 0.8762574562285771, 'subsample': 0.898655889433224, 'learning_rate': 0.0787141774346471, 'n_estimators': 267, 'max_depth': 8, 'min_child_weight': 15}
Best score: 442.96497684023586


In [105]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [106]:
test_x =test_x.reindex(columns=train_x.columns)


In [119]:


#Validation : MAE: 729715.7680863414,  Best score: 856.2573364295463,  Best trial: {'lambda': 0.06858310598122817, 'alpha': 0.7713413681986561, 'colsample_bytree': 0.9442346449671033, 'subsample': 0.9750961748574901, 'learning_rate': 0.076433115411846, 'n_estimators': 446, 'max_depth': 8, 'min_child_weight': 16}

#encod_col_v1 : Validation : MAE: 722972.6923, Best score: 864.4005832556281, Best trial: {'lambda': 0.07246551885960752, 'alpha': 0.5233846374744797, 'colsample_bytree': 0.9008060223667523, 'subsample': 0.9996254959269878, 'learning_rate': 0.061161155234371184, 'n_estimators': 753, 'max_depth': 8, 'min_child_weight': 23}


model_xgb = XGBRegressor(**study.best_trial.params)
kf = KFold(n_splits=5, shuffle=True , random_state=20331892)

ensemble_predicts= []
scores =[]


for train_idx, val_idx in tqdm(kf.split(train_x), total=5, desc="Processing folds"):
    X_t, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_t, y_val = train_y[train_idx], train_y[val_idx]
    
    # 두 모델 모두 학습
    model_xgb.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = model_xgb.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_squared_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    model_xgb_pred = model_xgb.predict(test_x)
    model_xgb_pred = np.where(model_xgb_pred < 0, 0, model_xgb_pred)
    
    ensemble_predicts.append(model_xgb_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predicts, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))


Processing folds: 100%|██████████| 5/5 [00:02<00:00,  2.16it/s]

Validation : MAE scores for each fold: [197201.0902877689, 182232.52566671604, 183098.41183332299, 211919.18835872054, 186713.9918179762]
Validation : MAE: 192233.04159290093





In [120]:
final_predictions = np.round(final_predictions, 1)
final_predictions

array([3689.1, 2958.2, 3499.3, ...,  668.1,  630.3,  586. ], dtype=float32)

In [121]:
submission = pd.read_csv('../../DATA/sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [122]:
submission['answer'] = final_predictions
# submission.loc[submission.index % 7 == 1, 'answer'] = 0

In [123]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3689.100098
1,TG_A_J_20230305,2958.199951
2,TG_A_J_20230306,3499.300049
3,TG_A_J_20230307,3392.199951
4,TG_A_J_20230308,3388.300049
...,...,...
1087,RD_F_J_20230327,679.200012
1088,RD_F_J_20230328,685.900024
1089,RD_F_J_20230329,668.099976
1090,RD_F_J_20230330,630.299988


In [124]:
submission.to_csv('../../DATA/SUBMIT/new3_opt_xgb_tarenc_sub.csv', index=False)


In [125]:
sub1 = pd.read_csv('../../DATA/SUBMIT/new_opt_xgb_tarenc_sub.csv')
sub2 = pd.read_csv('../../DATA/SUBMIT/new2_opt_xgb_tarenc_sub.csv')
sub3 = pd.read_csv('../../DATA/SUBMIT/new3_opt_xgb_tarenc_sub.csv')

In [127]:
sub1['answer'] = sub2['answer']+sub3['answer'] +sub1['answer']

In [129]:
sub1['answer'] = sub1['answer']/3


In [131]:
sub1['answer'] =np.round(sub1['answer'],0)

In [135]:
sub1.to_csv('../../DATA/SUBMIT/new_ens_opt_xgb_tarenc_sub2.csv', index=False)

In [134]:
sub1
sub1.loc[submission.index % 7 == 1, 'answer'] = 0