In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
import holidays

from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *

warnings.filterwarnings(action='ignore') 

In [13]:
# 필요 정보
# item_lst : item 리스트 항목
# qual_col : one-hot-encoder 변환이 필요한 항목
item_lst = ['TG', 'CR', 'CB', 'RD', 'BC']
qual_col = ['day_name']
columns_dict_train = {
    'TG' : ['month', 'week', 'day', 'supply(kg)_y', '수출 금액', '무역수지', '전체 거래량', '수출 중량', 'corporation_C','price(원/kg)'],
    'CR' : ['location_S', 'week', 'month', 'year', 'corporation_A', 'location_J', 'supply(kg)_y','price(원/kg)'],
    'CB' : ['month', 'week', 'location_S', '무역수지', '수출 금액', '수출 중량', 'corporation_F', 'location_J', 'supply(kg)_y','price(원/kg)'],
    'RD' : ['week', 'month', 'corporation_C', 'location_S', 'corporation_A', 'location_J', 'supply(kg)_y','price(원/kg)'],
    'BC' : ['location_S', 'corporation_C', 'week', 'corporation_B', 'corporation_A', 'location_J', 'supply(kg)_y','price(원/kg)']
}

columns_dict_test= {
    'TG' : ['month', 'week', 'day', 'supply(kg)', '수출 금액', '무역수지', '전체 거래량', '수출 중량', 'corporation_C','price(원/kg)'],
    'CR' : ['location_S', 'week', 'month', 'year', 'corporation_A', 'location_J', 'supply(kg)','price(원/kg)'],
    'CB' : ['month', 'week', 'location_S', '무역수지', '수출 금액', '수출 중량', 'corporation_F', 'location_J', 'supply(kg)','price(원/kg)'],
    'RD' : ['week', 'month', 'corporation_C', 'location_S', 'corporation_A', 'location_J', 'supply(kg)','price(원/kg)'],
    'BC' : ['location_S', 'corporation_C', 'week', 'corporation_B', 'corporation_A', 'location_J', 'supply(kg)','price(원/kg)']
}

# 한국 2019~2023 공휴일 정보
kr_holidays = holidays.KR(years=[2019,2020,2021,2022,2023])
kr_holidays = {str(i):kr_holidays[i] for i in kr_holidays}

In [10]:
path = './processed_data/train_merge.csv'
test_path = './processed_data/test_merge.csv'

train = pd.read_csv(path)
test = pd.read_csv(test_path)

time = pd.to_datetime(train['timestamp'].copy())

train['Date'] = pd.to_datetime(train['timestamp'])
train['week'] = train['Date'].apply(lambda x: x.isocalendar()[1]) # 일요일 제거를 위함
train['day_name'] = train['Date'].dt.day_name()

train['year'] = train['timestamp'].apply(lambda x : int(x[0:4]))
train['month'] = train['timestamp'].apply(lambda x : int(x[5:7]))
train['day'] = train['timestamp'].apply(lambda x : int(x[8:10]))
train['holiday'] = [kr_holidays[i] if i in list(kr_holidays.keys()) else 'None' for i in train['timestamp']]

test['year'] = test['timestamp'].apply(lambda x : int(x[0:4]))
test['month'] = test['timestamp'].apply(lambda x : int(x[5:7]))
test['day'] = test['timestamp'].apply(lambda x : int(x[8:10]))
test['Date'] = pd.to_datetime(test['timestamp'])
test['week'] = test['Date'].apply(lambda x: x.isocalendar()[1]) # 일요일 제거를 위함
test['day_name'] = test['Date'].dt.day_name()
test['holiday'] = [kr_holidays[i] if i in list(kr_holidays.keys()) else 'None' for i in test['timestamp']]

train = pd.get_dummies(train, columns=qual_col, prefix=qual_col)
test = pd.get_dummies(test, columns=qual_col, prefix=qual_col)

le = LabelEncoder()
train['holiday']=le.fit_transform(train['holiday'])
test['holiday']=le.transform(test['holiday']) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

In [11]:
predictions = []

for i in train['item'].unique():
    print(f"============== {i} Start =============")

    train_sample = train[(train['item']==i)]#.groupby('timestamp').mean()
    test_sample = test[(test['item']==i)]
    
    if len(test_sample) == 0: continue

    train_x = train_sample.drop(columns=['supply(kg)'])
    s = setup(train_x, target = 'price(원/kg)', 
        session_id = 123, 
        normalize = True, 
        fold=5,
        normalize_method = 'minmax',
        train_size=0.8
        )

    reg = create_model('rf')
    tuned_reg = tune_model(reg, optimize = 'RMSE')
    prediction = predict_model(tuned_reg, data = pd.DataFrame(test_sample))
    predictions.extend(prediction['prediction_label'])
        
    print(f"============== {i} END =====-========")




KeyError: "['supply(kg)'] not found in axis"

In [5]:
# predictions = []

# for i in train['item'].unique():
#     print(f"============== {i} Start =============")
    
#     for j in train[train['item']==i]['location'].unique():

#             train_sample = train[(train['item']==i)&(train['location']==j)].groupby('timestamp').median()
#             test_sample = test[(test['item']==i)&(test['location']==j)]
            
#             if len(test_sample) == 0: continue

#             train_x = train_sample.drop(columns=['supply(kg)'])
#             s = setup(train_x, target = 'price(원/kg)', 
#                 session_id = 123, 
#                 normalize = True, 
#                 normalize_method = 'minmax',
#                 train_size=0.6
#                 )

#             reg = compare_models()
#             test_sample_index = test_sample.index
#             prediction = predict_model(reg, data = pd.DataFrame(test_sample))
#             predictions.extend(prediction['prediction_label'])
        
#     print(f"============== {i} END =====-========")


In [6]:
# predictions = []

# for i in train['item'].unique():
#     print(f"============== {i} Start =============")
    
#     for j in train[train['item']==i]['location'].unique():

#         for k in train[(train['item']==i)&(train['location']==j)]['corporation'].unique():

#             train_sample = train[(train['item']==i)&(train['location']==j)&(train['corporation']==k)]
#             test_sample = test[(test['item']==i)&(test['location']==j)&(test['corporation']==k)]
            
#             if len(test_sample) == 0: continue

#             train_x = train_sample.drop(columns=['supply(kg)'])
#             s = setup(train_x, target = 'price(원/kg)', 
#                 session_id = 123, 
#                 fold = 5, 
#                 normalize = True, 
#                 normalize_method = 'minmax',
#                 train_size=0.8)

#             reg = compare_models()
#             test_sample_index = test_sample.index
#             prediction = predict_model(reg, data = pd.DataFrame(test_sample))
#             predictions.extend(prediction['prediction_label'])
        
#     print(f"============== {i} END =====-========")


In [7]:
# predictions = pd.DataFrame()

# for i in train['item'].unique():
#     print(f"============== {i} Start =============")
    
#     for j in train[train['item']==i]['location'].unique():
#         for k in train[(train['item']==i)&(train['location']==j)]['corporation'].unique():
#             train_sample = train[(train['item']==i)&(train['location']==j)&(train['corporation']==k)].groupby('timestamp').mean()
#             test_sample = test[(test['item']==i)&(test['location']==j)&(test['corporation']==k)].groupby('timestamp').mean()
            
#             if len(test_sample) == 0: continue

#             train_x = train_sample.drop(columns=['supply(kg)'])
#             s = setup(train_x, target = 'price(원/kg)', 
#                 session_id = 123, 
#                 fold = 5, 
#                 normalize = True, 
#                 normalize_method = 'minmax',
#                 train_size=0.9)

#             reg = compare_models()
#             test_sample_index = test_sample.index
#             prediction = predict_model(reg, data = pd.DataFrame(test_sample))
#             predictions = pd.concat([predictions, prediction])
        
#     print(f"============== {i} END =====-========")


In [8]:
result = pd.read_csv(test_path)
result['answer'] = predictions

del result['timestamp']
del result['item']
del result['corporation']
del result['location']

for i in range(len(result)):
    if (i-1) % 7 == 0: # 1, 8, 15, 22
        result.iloc[i,-1] = 0


In [9]:
result.to_csv('result4_case_by_case1.csv', index = False)
result

Unnamed: 0,ID,answer
0,TG_A_J_20230304,2664.613636
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,2666.686364
3,TG_A_J_20230307,2667.336364
4,TG_A_J_20230308,2674.990909
...,...,...
1087,RD_F_J_20230327,908.094107
1088,RD_F_J_20230328,953.246005
1089,RD_F_J_20230329,927.319589
1090,RD_F_J_20230330,929.900798
