In [1]:
# !pip install neuralforecast pandas==2.2.1

In [2]:
from sqlalchemy import create_engine
import pandas as pd
import torch

# Create the database engine.
# The URL points at a SQLite file addressed relative to the working directory (../data/data.sqlite).
database_url = "sqlite:///../data/data.sqlite"
engine = create_engine(database_url)

# Notebook-level settings: run metadata first, then behaviour flags, then the compute device.
model_name, model_version = 'Autoformer', '1.0.0'
saved_table = 'sh_customs_daily_pred'
notebook_mode = 'train'
use_standard_scaler = True
save_imputed_data = False

# Device preference order: first CUDA GPU, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [3]:
# Read the selected columns from `sh_customs_daily_imputed`, exposing `date` under the name `ds`.
query = "select date as ds,global_entry,hkmo_entry,tw_entry from sh_customs_daily_imputed"
data = pd.read_sql_query(sql=query, con=engine)
# Keep an independent snapshot of the frame exactly as it was loaded.
original_data = data.copy(deep=True)

In [4]:
# Reshape the wide frame to long format: every non-`ds` column becomes a series whose
# name goes to `unique_id` and whose values go to `y`; `ds` is then parsed as datetimes.
df_long = data.melt(id_vars=['ds'], var_name='unique_id', value_name='y')
df_long = df_long.assign(ds=pd.to_datetime(df_long['ds']))

## 使用cross validation选择模型

使用RTX 4070Ti GPU 训练3小时

In [5]:
# from neuralforecast.utils import augment_calendar_df
# 
# df_augmented, calendar_cols = augment_calendar_df(df=df_long, freq='D')
# df_augmented.head()

In [6]:
import os

# Export the PyTorch CUDA allocator setting into this process's environment.
# NOTE(review): torch is already imported by an earlier cell — confirm the setting is still picked up.
os.environ.update({'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [7]:
import pandas as pd

from neuralforecast import NeuralForecast
from neuralforecast.models.autoformer import Autoformer

horizon = 30  # forecast length: 30 daily steps (about a month)

# Autoformer settings collected in one mapping so they can be reviewed at a glance.
autoformer_config = dict(
    h=horizon,
    input_size=5 * horizon,  # look-back window of 150 daily steps (about five months)
    hidden_size=256,
    conv_hidden_size=128,
    n_head=8,
    scaler_type='minmax1',
    learning_rate=1e-3,
    max_steps=1000,
    val_check_steps=100,
    windows_batch_size=128,
    early_stop_patience_steps=3,
    random_seed=42,
)
model = Autoformer(**autoformer_config)

# Wrap the single model for daily-frequency data.
nf = NeuralForecast(models=[model], freq='D')

# Train on the long-format frame; val_size=90 is the validation size handed to `fit`.
nf.fit(df=df_long, val_size=90)


Seed set to 42


Epoch 99: 100%|██████████| 1/1 [00:00<00:00,  6.21it/s, v_num=5, train_loss_step=0.369, train_loss_epoch=0.412]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  6.77it/s][A
Epoch 199: 100%|██████████| 1/1 [00:00<00:00,  6.19it/s, v_num=5, train_loss_step=0.343, train_loss_epoch=0.404, valid_loss=930.0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  6.73it/s][A
Epoch 299: 100%|██████████| 1/1 [00:00<00:00,  6.25it/s, v_num=5, train_loss_step=0.383, train_loss_epoch=0.384, valid_loss=869.0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          

In [None]:
import matplotlib.pyplot as plt

# Pull the recorded loss trajectories off the first (only) model.
train = nf.models[0].train_trajectories
valid = nf.models[0].valid_trajectories

# Two-column frames make the trajectories easy to plot.
# NOTE(review): the first column is labelled 'Epoch' here; confirm whether the library records epochs or steps.
df_train_loss = pd.DataFrame(train, columns=['Epoch', 'Train Loss'])
df_valid_loss = pd.DataFrame(valid, columns=['Epoch', 'Validation Loss'])

# Draw both curves on the same axes, training first, each labelled by its loss column.
for loss_frame, loss_column in (
    (df_train_loss, 'Train Loss'),
    (df_valid_loss, 'Validation Loss'),
):
    plt.plot(loss_frame['Epoch'], loss_frame[loss_column], label=loss_column)

plt.title("Training and Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [9]:
# Compute in-sample predictions from the fitted NeuralForecast object and display the resulting frame.
# NOTE(review): the captured output shows `-inf` in the Autoformer column for the earliest rows
# (cutoff 2010-12-31) — confirm whether those rows should be filtered before plotting or scoring.
df_insample = nf.predict_insample(step_size=1)
df_insample

Predicting DataLoader 0: 100%|██████████| 1/1 [00:11<00:00,  0.09it/s]


Unnamed: 0,unique_id,ds,cutoff,Autoformer,y
0,global_entry,2011-01-01,2010-12-31,-inf,11430.0
1,global_entry,2011-01-02,2010-12-31,-inf,11460.0
2,global_entry,2011-01-03,2010-12-31,-inf,12266.0
3,global_entry,2011-01-04,2010-12-31,-inf,12951.0
4,global_entry,2011-01-05,2010-12-31,-inf,12751.0
...,...,...,...,...,...
427495,tw_entry,2024-01-27,2024-01-01,2416.991211,2426.0
427496,tw_entry,2024-01-28,2024-01-01,2275.520752,2015.0
427497,tw_entry,2024-01-29,2024-01-01,2425.713379,1876.0
427498,tw_entry,2024-01-30,2024-01-01,2594.325439,1599.0


In [None]:
# Pivot the in-sample frame to one row per date and one column per series,
# once for the observed values (`y`) and once for the model output (`Autoformer`).
# NOTE(review): pivot_table aggregates duplicate (ds, unique_id) pairs (mean by default) —
# confirm that averaging across cutoffs is what is wanted here.
pivot_axes = {'index': 'ds', 'columns': 'unique_id'}
df_wide = df_insample.pivot_table(values='y', **pivot_axes).reset_index().set_index('ds')
df_wide_autoformer = (
    df_insample
    .pivot_table(values='Autoformer', **pivot_axes)
    .reset_index()
    .set_index('ds')
)

fig, ax1 = plt.subplots()
ax1.set_xlabel('Date')

# Left axis: observed values for rows 550-599, drawn in red.
color = 'tab:red'
ax1.plot(df_wide.iloc[550:600], color=color)
ax1.set_ylabel('Value1', color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Right axis (sharing the x axis with ax1): model output for the same rows, drawn in blue.
# NOTE(review): the two y axes scale independently, so the curves are not directly comparable by height.
ax2 = ax1.twinx()
color = 'tab:blue'
ax2.plot(df_wide_autoformer.iloc[550:600], color=color)
ax2.set_ylabel('Value2', color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # keep labels from overlapping the figure edges
plt.show()

In [None]:
import numpy as np
import pandas as pd

# Build the frame of future timestamps (`preds`) that is later passed to `nf.predict` as `futr_df`.

# Imported locally because the import of this helper in the earlier cell is commented out,
# which would otherwise leave the name undefined on a fresh kernel.
from neuralforecast.utils import augment_calendar_df

last_date = pd.to_datetime(data['ds']).max()

# Daily timestamps for the 90 days that follow the last observed date.
# NOTE(review): the model horizon defined earlier is 30; confirm that the extra rows are intended.
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=90, freq='D')

# Empty wide frame indexed by the future dates with the same columns as `data`;
# the value columns stay NaN and only act as placeholders for the melt below.
future_df = pd.DataFrame(index=future_dates, columns=data.columns)
# Expose the future dates as the `ds` column.
future_df['ds'] = future_df.index
future_df_long = pd.melt(future_df, id_vars=['ds'], var_name='unique_id', value_name='y')
# Parse the FUTURE dates. The previous code parsed `df_long.ds` here, which overwrote the
# future timestamps with index-aligned historical dates from the training frame.
future_df_long['ds'] = pd.to_datetime(future_df_long['ds'])

# Add calendar feature columns, then drop the placeholder target so only ids, dates and features remain.
preds, futr_calendar_cols = augment_calendar_df(df=future_df_long, freq='D')
preds = preds.drop(columns=['y'])
preds

In [None]:
# Forecast from the history in `df_long`, supplying `preds` as the future frame; the result is displayed, not stored.
nf.predict(df_long, futr_df=preds, step_size=1, random_seed=42)