# Load libraries & data

## Import 

In [1]:
import pandas as pd
import numpy as np
import os
import random
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

## set seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load data 

In [3]:
path = "/mnt/d/data/jeju"

train_df = pd.read_csv(os.path.join(path, './train.csv'))
test_df = pd.read_csv(os.path.join(path, './test.csv'))

# Preprocess

In [4]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

In [7]:
new_train = train_df[['ID','timestamp','supply(kg)', 'price(원/kg)', "year", "month", "day"]]
new_test = test_df[['ID','timestamp', "year", "month", "day"]]

new_train['item_id'] = new_train.ID.str[0:6]
new_test['item_id'] = new_test.ID.str[0:6]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['item_id'] = new_train.ID.str[0:6]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test['item_id'] = new_test.ID.str[0:6]


# Train autogluon

In [140]:
data = TimeSeriesDataFrame(new_train.drop(columns=['ID']))
predictor = TimeSeriesPredictor( 
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
     known_covariates_names = ["year", "month", "day"],
    # ignore_time_index=True,
)
# seed 고정
predictor.fit(data, num_val_windows=3)

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 3,
 'prediction_length': 28,
 'random_seed': None,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
AutoGluon will save models to AutogluonModels/ag-20231117_142708
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	known covariates: ['year', 'month', 'day']
	past covariates:  ['supply(kg)']

Starting training. Start time is 2023-11-17 23:27:08
Models that wi

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f3de41bc7f0>

In [141]:
predictor.leaderboard()

              model   score_val  pred_time_val  fit_time_marginal  fit_order
0  WeightedEnsemble -752.286948       4.218859           1.529638          7
1           AutoETS -786.208400       1.550855           3.240808          4
2  RecursiveTabular -838.438581       0.677331           6.014990          5
3             Theta -839.097795       1.910348           3.513313          3
4     SeasonalNaive -872.164353       0.038748           0.164965          2
5             Naive -975.103009       0.041576           0.180290          1
6            DeepAR -993.864172       0.916366         879.382754          6


Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-752.286948,4.218859,1.529638,7
1,AutoETS,-786.2084,1.550855,3.240808,4
2,RecursiveTabular,-838.438581,0.677331,6.01499,5
3,Theta,-839.097795,1.910348,3.513313,3
4,SeasonalNaive,-872.164353,0.038748,0.164965,2
5,Naive,-975.103009,0.041576,0.18029,1
6,DeepAR,-993.864172,0.916366,879.382754,6


In [8]:
predictor.refit_full(model="all")

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	12.38   s     = Training runtime
Fitting model: DeepAR_FULL | Skipping fit via cloning parent ...
Fitting model: Naive_FULL_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FUL

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'DeepAR': 'DeepAR_FULL',
 'Naive_FULL': 'Naive_FULL_FULL',
 'SeasonalNaive_FULL': 'SeasonalNaive_FULL_FULL',
 'Theta_FULL': 'Theta_FULL_FULL',
 'AutoETS_FULL': 'AutoETS_FULL_FULL',
 'RecursiveTabular_FULL': 'RecursiveTabular_FULL_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [142]:
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe

future_index = get_forecast_horizon_index_ts_dataframe(data, prediction_length=28)

future_timestamps = future_index.get_level_values("timestamp")
future_index.names = ["item_ids", "item_ids"]

known_covariates = pd.DataFrame(index=future_index)
known_covariates['year'] = np.array(future_timestamps.to_series().dt.year)
known_covariates['month'] = np.array(future_timestamps.to_series().dt.month)
known_covariates['day'] = np.array(future_timestamps.to_series().dt.day)


In [143]:
test_data = TimeSeriesDataFrame(new_test)

In [144]:
pred = predictor.predict(data, known_covariates= test_data)

Global seed set to 123
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [145]:
submission = pd.read_csv(os.path.join(path, './sample_submission.csv'))
submission['answer'] = pred.reset_index()['mean']
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0
submission.to_csv('./dacon_submission.csv', index=False)
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3094.495123
1,TG_A_J_20230305,843.988308
2,TG_A_J_20230306,3076.601860
3,TG_A_J_20230307,3381.481404
4,TG_A_J_20230308,3199.946484
...,...,...
1087,RD_F_J_20230327,518.467955
1088,RD_F_J_20230328,538.119704
1089,RD_F_J_20230329,552.207454
1090,RD_F_J_20230330,524.074828
