# Load libraries & data

## Import 

In [1]:
import pandas as pd
import numpy as np
import os
import random
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

## set seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load data 

In [3]:
path = "/mnt/d/data/jeju"

train_df = pd.read_csv(os.path.join(path, './train.csv'))
test_df = pd.read_csv(os.path.join(path, './test.csv'))

# Preprocess

In [4]:
new_train = train_df[['ID','timestamp','supply(kg)', 'price(원/kg)']]
new_test = test_df[['ID','timestamp']]

new_train['item_id'] = new_train.ID.str[0:6]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['item_id'] = new_train.ID.str[0:6]


# Train autogluon

In [7]:
new_train

Unnamed: 0,ID,timestamp,supply(kg),price(원/kg),item_id
0,TG_A_J_20190101,2019-01-01,0.0,0.0,TG_A_J
1,TG_A_J_20190102,2019-01-02,0.0,0.0,TG_A_J
2,TG_A_J_20190103,2019-01-03,60601.0,1728.0,TG_A_J
3,TG_A_J_20190104,2019-01-04,25000.0,1408.0,TG_A_J
4,TG_A_J_20190105,2019-01-05,32352.0,1250.0,TG_A_J
...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,452440.0,468.0,RD_F_J
59393,RD_F_J_20230228,2023-02-28,421980.0,531.0,RD_F_J
59394,RD_F_J_20230301,2023-03-01,382980.0,574.0,RD_F_J
59395,RD_F_J_20230302,2023-03-02,477220.0,523.0,RD_F_J


In [9]:
predictor.leaderboard(new_train, silent=True)

Additional data provided, testing on additional data. Resulting leaderboard will be sorted according to test score (`score_test`).


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-705.743037,-709.940939,1.555315,5.403187,1.097517,7
1,DeepAR,-720.781044,-728.319343,0.844896,0.764221,155.688102,6
2,AutoETS_FULL,-797.059173,,1.770588,,0.000163,11
3,AutoETS,-797.059173,-797.059173,19.861295,10.688277,0.029049,4
4,RecursiveTabular,-810.212523,-810.212523,0.658655,1.068097,11.392555,5
5,RecursiveTabular_FULL,-825.170948,,0.575578,,2.275115,12
6,Theta_FULL,-838.834688,,1.861221,,0.000122,10
7,Theta,-838.834688,-838.834688,25.85018,20.429361,0.027956,3
8,SeasonalNaive_FULL,-843.925825,,0.054764,,0.000228,9
9,SeasonalNaive,-843.925825,-843.925825,0.054855,0.042894,0.032684,2


In [5]:
data = TimeSeriesDataFrame(new_train.drop(columns=['ID']))
predictor = TimeSeriesPredictor( 
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)
# seed 고정
predictor.fit(data)

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
Global seed set to 42
AutoGluon will save models to AutogluonModels/ag-20231114_133632
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['supply(kg)']

Starting training. Start time is 2023-11-14 22:36:32
Models that will be trained: ['Naive',

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f6db6aa34c0>

In [6]:
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	2.28    s     = Training runtime


In [10]:
# seed 고정
pred = predictor.predict(data)

Global seed set to 123
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [11]:
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3244.468938,1636.733881,2257.250796,2667.087394,3000.819809,3294.884832,3579.818507,3887.724872,4240.696968,4728.825943
TG_A_J,2023-03-05,489.230323,-1034.231322,-510.992127,-134.131960,187.419483,488.070502,789.419198,1111.829071,1489.190057,2013.669971
TG_A_J,2023-03-06,3210.904281,606.836084,1611.494988,2268.948102,2795.663643,3277.361800,3734.604147,4245.924059,4879.736791,5854.006658
TG_A_J,2023-03-07,3494.387941,664.325787,1742.157711,2418.036963,2990.243613,3514.855593,4013.353803,4565.900419,5260.805794,6291.212425
TG_A_J,2023-03-08,3401.077983,399.305990,1442.999626,2237.743145,2845.768088,3413.253154,3999.851256,4635.811624,5322.209073,6414.553955
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,517.827167,-41.095563,207.051938,336.468585,444.253454,533.713062,621.092043,717.667318,839.469310,1047.080754
RD_F_J,2023-03-28,530.396600,-47.472243,224.495850,361.098689,459.692649,551.558953,647.959133,753.055656,886.747392,1061.512284
RD_F_J,2023-03-29,553.831353,-44.464600,211.281755,362.149569,468.921237,570.368951,668.279556,770.641617,897.742616,1113.990778
RD_F_J,2023-03-30,542.249102,-157.934639,193.465837,343.008614,456.346733,557.264422,658.351088,762.994322,894.684040,1091.728241


In [12]:
submission = pd.read_csv(os.path.join(path, './sample_submission.csv'))
submission['answer'] = pred.reset_index()['mean']
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0
submission.to_csv('./dacon_submission.csv', index=False)
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3244.468938
1,TG_A_J_20230305,489.230323
2,TG_A_J_20230306,3210.904281
3,TG_A_J_20230307,3494.387941
4,TG_A_J_20230308,3401.077983
...,...,...
1087,RD_F_J_20230327,517.827167
1088,RD_F_J_20230328,530.396600
1089,RD_F_J_20230329,553.831353
1090,RD_F_J_20230330,542.249102
