In [None]:
# pip install -U pip
# C:\ProgramData\anaconda3\envs\asac\python.exe -m pip install -U pip
# pip install -U setuptools wheel
# pip install torch==1.13.1+cpu torchvision==0.14.1+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
# pip install autogluon

In [None]:
import autogluon.core as ag
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [None]:
import os
import time
import random
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Record the wall-clock time at which this notebook run started.
now = time  # kept as an alias of the `time` module, as in the original cell

# Take ONE snapshot of the local time so that every field below describes the
# same instant; calling localtime() once per field can straddle a
# second/minute boundary and yield an inconsistent tuple.
_started = now.localtime()
year = _started.tm_year
mon = _started.tm_mon
day = _started.tm_mday
hour = _started.tm_hour
# Named `minute` (not `min`) so the built-in min() is not shadowed for the
# rest of the notebook.
minute = _started.tm_min
sec = _started.tm_sec

# (year, month, day, hour, minute, second) of the run start
start = year, mon, day, hour, minute, sec
print(start)

## Fix the Random Seed

In [None]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    random state with ``seed``.

    NOTE(review): assigning PYTHONHASHSEED from inside a running kernel
    presumably only matters for processes started afterwards - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

seed_everything(42)  # fix the seed for the whole run

# Load Data

In [None]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Show the raw training frame as this cell's output.
train_df

In [None]:
# Show the raw test frame as this cell's output.
test_df

In [None]:
# Rename the target column so its unit is written as 'KRW' instead of the
# Korean character; later cells refer to it as 'price(KRW/kg)'.
train_df = train_df.rename({'price(원/kg)': 'price(KRW/kg)'}, axis='columns')

train_df.head()

In [None]:
# Wrap both frames in AutoGluon's TabularDataset before feature engineering.
train_df, test_df = TabularDataset(train_df), TabularDataset(test_df)

## Data Pre-Processing

In [None]:
def _add_calendar_features(df):
    """Return a copy of ``df`` with date-derived feature columns added.

    The same function is applied to the training and the test frame so the
    two can never end up with differently defined features.

    Columns added (in this order):
      year, month, day -- integers sliced out of the ``timestamp`` string
      dayofweek        -- 0 = Sunday, 1 = Monday, ..., 6 = Saturday
      'sun day'        -- 1.0 on Sundays (treated as the day off), else 0.0
      half             -- 1.0 for days 1-15 of the month, 2.0 for days 16+
      week_count       -- 1.0 / 2.0 / 3.0 for days 1-7 / 8-14 / 15-21,
                          4.0 for every later day ("last week")

    Assumes ``timestamp`` holds 'YYYY-MM-DD...' strings - TODO confirm against
    the CSV files.
    """
    out = df.copy()

    # Split the timestamp into year / month / day so the model can pick up
    # calendar effects.
    stamp = out['timestamp']
    out['year'] = stamp.str[0:4].astype(int)
    out['month'] = stamp.str[5:7].astype(int)
    out['day'] = stamp.str[8:10].astype(int)

    # pandas numbers Monday as 0; shift so that Sunday becomes 0.
    date = pd.to_datetime(out[['year', 'month', 'day']])
    out['dayofweek'] = (date.dt.dayofweek + 1) % 7

    # Day-off (Sunday) flag. Computed for every row, so the column exists even
    # when a frame happens to contain no Sunday at all.
    out['sun day'] = (out['dayofweek'] == 0).astype(float)

    # First / second half of the month.
    out['half'] = np.where(out['day'] <= 15, 1.0, 2.0)

    # Week of the month, with everything after day 21 counted as week 4.
    # (The previous training code wrote this 4 into 'half' by mistake, which
    # left week_count at 0 and half at 4 for those rows in train only.)
    out['week_count'] = np.minimum((out['day'] - 1) // 7 + 1, 4).astype(float)

    return out


train_df = _add_calendar_features(train_df)
test_df = _add_calendar_features(test_df)


# Fill any remaining missing values with 0
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)


# Drop the columns that are not used for training
train_df = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)'])
test_df = test_df.drop(columns=['ID', 'timestamp'])


# Label-encode the categorical columns; each encoder is fitted on the training
# values and then applied unchanged to the test values.
qual_col = ['item', 'corporation', 'location']


for col in qual_col:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])


print('Done.')

In [None]:
# Inspect the first 40 preprocessed training rows.
train_df.head(40)

In [None]:
# Inspect the first 40 preprocessed test rows.
test_df.head(40)

In [None]:
# Build an AutoGluon tabular regressor for 'price(KRW/kg)', scored by MAE.
# Model artifacts are written under the path given below.
#
# Earlier variants of this cell (kept here as notes only):
#   - a single chained TabularPredictor(...).fit(...) call with
#     num_stack_levels = 5, num_gpus = 1 and time_limit = 6
#     (alternatively 3600 * 6)
#   - num_cpus = 24 instead of 4
predictor = TabularPredictor(
    label='price(KRW/kg)',
    problem_type='regression',
    eval_metric='mae',
    path='./AutogluonModels/fold10_bag_30_stack_3',
)

# Bagging / stacking and resource settings handed to fit().
# NOTE(review): no time_limit is passed; 10 folds x 30 bag sets x 3 stack
# levels is presumably very expensive - confirm this is intended.
fit_options = dict(
    presets='best_quality',
    auto_stack=True,
    fit_weighted_ensemble=True,
    num_bag_folds=10,
    num_bag_sets=30,
    num_stack_levels=3,
    num_gpus=1,
    num_cpus=4,
    verbosity=2,
)

predictor.fit(train_data=train_df, **fit_options)

In [None]:
# Check the leaderboard: score the trained models on train_df and request
# 'rmse' and 'mape' as extra metrics, then show the top ten rows.
# NOTE(review): the scores are computed on the training data itself, so they
# are presumably optimistic - confirm whether a hold-out set should be used.
leaderboard = predictor.leaderboard(
    train_df,
    extra_metrics=['rmse', 'mape'],
    silent=True,
)
leaderboard.head(10)

In [None]:
# Check feature importance, computed by the predictor on the training frame
feature_importance = predictor.feature_importance(train_df)
feature_importance

In [None]:
# Load the submission template and predict the target for every test row.
submission = pd.read_csv('sample_submission.csv')
pred = predictor.predict(test_df)

In [None]:
# Store the predictions, rounded to two decimals, in the 'answer' column,
# write the submission file without the index, and show the result.
submission['answer'] = pred.round(decimals=2)
submission.to_csv('autogluon-final.csv', index=False)
display(submission)