In [3]:
import pandas as pd

# Load the sales table; read_csv is told to decode the file as GBK.
sales_csv = './sales.csv'
df = pd.read_csv(sales_csv, encoding='gbk')
df

Unnamed: 0,时间,销售额（万元）
0,2004年1月,740
1,2004年2月,745
2,2004年3月,746
3,2004年4月,780
4,2004年5月,784
5,2004年6月,785
6,2004年7月,793
7,2004年8月,792
8,2004年9月,785
9,2004年10月,782


In [4]:
# Relabel the two columns positionally with short ASCII names.
column_names = ['time', 'amount']
df.columns = column_names

In [6]:
# Parse the "<year>年<month>月" labels into timestamps, then use them both as
# the "time" column and as the row index (the column is kept, not dropped).
parsed_time = pd.to_datetime(df['time'], format='%Y年%m月')
df['time'] = parsed_time
df.index = parsed_time

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
import warnings
from itertools import product
from datetime import datetime, timedelta
import calendar

# Silence library warnings so the grid-search output stays readable.
warnings.filterwarnings('ignore')

# Candidate orders: p and q each range over 0..4, d is fixed at 1.
ps = range(0, 5)
qs = range(0, 5)
ds = range(1, 2)  # [1, 2) -> only d = 1
parameters = product(ps, ds, qs)
parameters_list = list(parameters)

# Grid-search for the (p, d, q) order whose fitted model has the smallest AIC.
results = []
best_aic = float("inf")
# Pre-initialise the winners so the summary step below reports a clear error
# (instead of a NameError) when no candidate could be fitted.
best_model = None
best_param = None
for param in parameters_list:
    try:
        # SARIMAX is used here as a plain ARIMA(p, d, q): the seasonal_order
        # argument is intentionally left disabled.
        model = sm.tsa.statespace.SARIMAX(df.amount,
                                order=(param[0], param[1], param[2]),
                                #seasonal_order=(4, 1, 2, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False).fit()

    except (ValueError, np.linalg.LinAlgError):
        # Skip orders that cannot be fitted; a linear-algebra failure in one
        # candidate should not abort the whole search.
        print('参数错误:', param)
        continue
    aic = model.aic
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, aic])

# Report the winning model.
if best_model is None:
    raise RuntimeError('No SARIMAX candidate could be fitted; check the input series.')
print('最优模型: ', best_model.summary())

最优模型:                                 SARIMAX Results                                
Dep. Variable:                 amount   No. Observations:                   42
Model:               SARIMAX(1, 1, 4)   Log Likelihood                -125.697
Date:                Sun, 17 Apr 2022   AIC                            263.393
Time:                        19:43:17   BIC                            272.894
Sample:                    01-01-2004   HQIC                           266.709
                         - 06-01-2007                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9973      0.043     23.431      0.000       0.914       1.081
ma.L1         -0.7513   1293.221     -0.001      1.000   -2535.419    2533.916
ma.L2         -0.3617    321.492     -0.001  

In [13]:
# Number of future months to forecast.
future_month = 4

# Build the future timestamps by stepping whole calendar months from the last
# observed index entry. DateOffset(months=k) handles month lengths and year
# roll-over itself, so no manual day counting is needed and the result does
# not depend on the timestamps falling on the first day of a month.
last_observed = pd.to_datetime(df.index[-1])
date_list = [last_observed + pd.DateOffset(months=step)
             for step in range(1, future_month + 1)]
print('date_list=', date_list)

# Append one empty row per month to be forecast; their values stay missing
# until a later cell fills in the forecast.
future = pd.DataFrame(index=date_list, columns=df.columns)
df2 = pd.concat([df, future])

df2

date_list= [Timestamp('2007-07-01 00:00:00'), Timestamp('2007-08-01 00:00:00'), Timestamp('2007-09-01 00:00:00'), Timestamp('2007-10-01 00:00:00')]


Unnamed: 0,time,amount
2004-01-01,2004-01-01,740.0
2004-02-01,2004-02-01,745.0
2004-03-01,2004-03-01,746.0
2004-04-01,2004-04-01,780.0
2004-05-01,2004-05-01,784.0
2004-06-01,2004-06-01,785.0
2004-07-01,2004-07-01,793.0
2004-08-01,2004-08-01,792.0
2004-09-01,2004-09-01,785.0
2004-10-01,2004-10-01,782.0


In [11]:
# Row count of the extended frame (observed months plus the appended future months).
df2.shape[0]

46

In [14]:
# get_prediction returns a prediction-results object; predicted_mean is its
# point forecast. `end` is an inclusive zero-based position, so the last row
# of df2 is len(df2) - 1 (using len(df2) would request one extra step that is
# then discarded when the Series is aligned to df2's index).
# NOTE(review): in the recorded output the first two in-sample values
# (0.0 and ~1109) look like start-up artefacts of the differenced model —
# confirm before plotting or scoring them.
df2['forecast'] = best_model.get_prediction(start=0, end=len(df2) - 1).predicted_mean
df2

Unnamed: 0,time,amount,forecast
2004-01-01,2004-01-01,740.0,0.0
2004-02-01,2004-02-01,745.0,1109.018905
2004-03-01,2004-03-01,746.0,749.9974
2004-04-01,2004-04-01,780.0,747.002591
2004-05-01,2004-05-01,784.0,813.905908
2004-06-01,2004-06-01,785.0,787.990785
2004-07-01,2004-07-01,793.0,786.984179
2004-08-01,2004-08-01,792.0,798.96732
2004-09-01,2004-09-01,785.0,792.758312
2004-10-01,2004-10-01,782.0,785.759196


In [None]:
方法1：时间序列模型（如 ARIMA）=> 建模快，适合作为 baseline
方法2：回归分析 => 使用多个变量（time、活动次数、商品个数、促销预算、居民购买力）
     => 可选模型：树模型（XGBoost、LightGBM）；神经网络（LSTM、MLP）