In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
import holidays

from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *

warnings.filterwarnings(action='ignore') 

In [2]:
# 필요 정보
# item_lst : item 리스트 항목
# qual_col : one-hot-encoder 변환이 필요한 항목
item_lst = ['TG', 'CR', 'CB', 'RD', 'BC']
qual_col = ['day_name']

# 한국 2019~2023 공휴일 정보
kr_holidays = holidays.KR(years=[2019,2020,2021,2022,2023])
kr_holidays = {str(i):kr_holidays[i] for i in kr_holidays}

In [3]:
path = './data/train.csv'
test_path = './data/test.csv'

train = pd.read_csv(path)
test = pd.read_csv(test_path)

time = pd.to_datetime(train['timestamp'].copy())

train['Date'] = pd.to_datetime(train['timestamp'])
train['week'] = train['Date'].apply(lambda x: x.isocalendar()[1]) # 일요일 제거를 위함
train['day_name'] = train['Date'].dt.day_name()

train['year'] = train['timestamp'].apply(lambda x : int(x[0:4]))
train['month'] = train['timestamp'].apply(lambda x : int(x[5:7]))
train['day'] = train['timestamp'].apply(lambda x : int(x[8:10]))
train['holiday'] = [kr_holidays[i] if i in list(kr_holidays.keys()) else 'None' for i in train['timestamp']]

test['year'] = test['timestamp'].apply(lambda x : int(x[0:4]))
test['month'] = test['timestamp'].apply(lambda x : int(x[5:7]))
test['day'] = test['timestamp'].apply(lambda x : int(x[8:10]))
test['Date'] = pd.to_datetime(test['timestamp'])
test['week'] = test['Date'].apply(lambda x: x.isocalendar()[1]) # 일요일 제거를 위함
test['day_name'] = test['Date'].dt.day_name()
test['holiday'] = [kr_holidays[i] if i in list(kr_holidays.keys()) else 'None' for i in test['timestamp']]

train = pd.get_dummies(train, columns=qual_col, prefix=qual_col)
test = pd.get_dummies(test, columns=qual_col, prefix=qual_col)

le = LabelEncoder()
train['holiday']=le.fit_transform(train['holiday'])
test['holiday']=le.transform(test['holiday']) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

In [4]:
predictions = []

for i in train['item'].unique():
    print(f"============== {i} Start =============")

    train_sample = train[(train['item']==i)]#.groupby('timestamp').mean()
    test_sample = test[(test['item']==i)]
    
    if len(test_sample) == 0: continue

    train_x = train_sample.drop(columns=['supply(kg)'])
    s = setup(train_x, target = 'price(원/kg)', 
        session_id = 123, 
        normalize = True, 
        fold=5,
        normalize_method = 'minmax',
        train_size=0.8
        )

    reg = create_model('rf')
    tuned_reg = tune_model(reg, optimize = 'RMSE')
    prediction = predict_model(tuned_reg, data = pd.DataFrame(test_sample))
    predictions.extend(prediction['prediction_label'])
        
    print(f"============== {i} END =====-========")




Unnamed: 0,Description,Value
0,Session id,123
1,Target,price(원/kg)
2,Target type,Regression
3,Original data shape,"(15230, 19)"
4,Transformed data shape,"(15230, 25)"
5,Transformed train set shape,"(12184, 25)"
6,Transformed test set shape,"(3046, 25)"
7,Ordinal features,1
8,Numeric features,12
9,Date features,1


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2466.0634,8180826.5966,2860.2144,-0.0001,4.0027,0.6522
1,2479.5459,8166414.1318,2857.6938,-0.0,3.9846,0.6794
2,2429.9851,7767998.8462,2787.113,-0.0003,3.9383,0.6921
3,2379.9931,7545970.1045,2746.9929,-0.0011,3.7337,0.649
4,2437.152,8099266.7995,2845.9211,0.0,4.0512,0.6556
5,2407.4377,7659448.2878,2767.5708,-0.0003,3.9461,0.6813
6,2410.9127,7845253.9937,2800.9381,-0.0003,3.9594,0.676
7,2410.1231,7955268.4631,2820.5085,-0.0018,3.7642,0.6601
8,2482.3277,8135341.2734,2852.252,0.0001,4.0446,0.6625
9,2383.8982,7758453.7903,2785.4001,-0.0012,4.0692,0.6314


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


KeyboardInterrupt: 

In [None]:
# predictions = []

# for i in train['item'].unique():
#     print(f"============== {i} Start =============")
    
#     for j in train[train['item']==i]['location'].unique():

#             train_sample = train[(train['item']==i)&(train['location']==j)].groupby('timestamp').median()
#             test_sample = test[(test['item']==i)&(test['location']==j)]
            
#             if len(test_sample) == 0: continue

#             train_x = train_sample.drop(columns=['supply(kg)'])
#             s = setup(train_x, target = 'price(원/kg)', 
#                 session_id = 123, 
#                 normalize = True, 
#                 normalize_method = 'minmax',
#                 train_size=0.6
#                 )

#             reg = compare_models()
#             test_sample_index = test_sample.index
#             prediction = predict_model(reg, data = pd.DataFrame(test_sample))
#             predictions.extend(prediction['prediction_label'])
        
#     print(f"============== {i} END =====-========")


In [None]:
# predictions = []

# for i in train['item'].unique():
#     print(f"============== {i} Start =============")
    
#     for j in train[train['item']==i]['location'].unique():

#         for k in train[(train['item']==i)&(train['location']==j)]['corporation'].unique():

#             train_sample = train[(train['item']==i)&(train['location']==j)&(train['corporation']==k)]
#             test_sample = test[(test['item']==i)&(test['location']==j)&(test['corporation']==k)]
            
#             if len(test_sample) == 0: continue

#             train_x = train_sample.drop(columns=['supply(kg)'])
#             s = setup(train_x, target = 'price(원/kg)', 
#                 session_id = 123, 
#                 fold = 5, 
#                 normalize = True, 
#                 normalize_method = 'minmax',
#                 train_size=0.8)

#             reg = compare_models()
#             test_sample_index = test_sample.index
#             prediction = predict_model(reg, data = pd.DataFrame(test_sample))
#             predictions.extend(prediction['prediction_label'])
        
#     print(f"============== {i} END =====-========")


In [None]:
# predictions = pd.DataFrame()

# for i in train['item'].unique():
#     print(f"============== {i} Start =============")
    
#     for j in train[train['item']==i]['location'].unique():
#         for k in train[(train['item']==i)&(train['location']==j)]['corporation'].unique():
#             train_sample = train[(train['item']==i)&(train['location']==j)&(train['corporation']==k)].groupby('timestamp').mean()
#             test_sample = test[(test['item']==i)&(test['location']==j)&(test['corporation']==k)].groupby('timestamp').mean()
            
#             if len(test_sample) == 0: continue

#             train_x = train_sample.drop(columns=['supply(kg)'])
#             s = setup(train_x, target = 'price(원/kg)', 
#                 session_id = 123, 
#                 fold = 5, 
#                 normalize = True, 
#                 normalize_method = 'minmax',
#                 train_size=0.9)

#             reg = compare_models()
#             test_sample_index = test_sample.index
#             prediction = predict_model(reg, data = pd.DataFrame(test_sample))
#             predictions = pd.concat([predictions, prediction])
        
#     print(f"============== {i} END =====-========")


In [None]:
result = pd.read_csv(test_path)
result['answer'] = predictions

del result['timestamp']
del result['item']
del result['corporation']
del result['location']

for i in range(len(result)):
    if (i-1) % 7 == 0: # 1, 8, 15, 22
        result.iloc[i,-1] = 0


In [None]:
result.to_csv('result4_case_by_case1.csv', index = False)
result

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3170.68
1,TG_A_J_20230305,0.00
2,TG_A_J_20230306,3170.72
3,TG_A_J_20230307,3170.77
4,TG_A_J_20230308,3170.77
...,...,...
1087,RD_F_J_20230327,890.87
1088,RD_F_J_20230328,890.16
1089,RD_F_J_20230329,893.16
1090,RD_F_J_20230330,890.18
