In [1]:
# -*- coding: utf-8 -*-
"""
Production example.

Recommended installs: pip install pytrends fredapi yfinance
Builds an example production case from many live public data sources.

Although stock price forecasting is shown here, time series forecasting alone
is not a recommended basis for managing investments!

This is a very opinionated approach.
evolve = True allows the time series to adapt automatically to changes.

However, it carries a slight risk of getting stuck in a suboptimal position.
It should probably be combined with some basic data sanity checks.

cd ./AutoTS
conda activate py38
nohup python production_example.py > /dev/null &
"""
try:  # needs to go first
    from sklearnex import patch_sklearn

    patch_sklearn()
except Exception as e:
    print(repr(e))
import json
import datetime
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # required only for graphs 
from autots import AutoTS, load_live_daily, create_regressor

# API credentials are read from the environment so that no secret is stored in
# the notebook. An unset or empty variable yields None.
# Set FRED_API_KEY / GSA_API_KEY before starting the kernel to enable the
# corresponding downloads.
fred_key = os.environ.get("FRED_API_KEY") or None  # https://fred.stlouisfed.org/docs/api/api_key.html
gsa_key = os.environ.get("GSA_API_KEY") or None  # https://open.gsa.gov/api/dap/

forecast_name = "example"
graph = True  # whether to draw plots
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
frequency = (
    "D"  # "infer" aligns automatically, but a specific offset is most reliable; "D" is daily
)
forecast_length = 60  # number of periods to forecast into the future
drop_most_recent = 1  # number of most recent records to drop (treated as incomplete)
num_validations = (
    2  # number of cross validation runs; more is usually better but slower
)
validation_method = "backwards"  # "similarity", "backwards", "seasonal 364"
n_jobs = "auto"  # or set to the number of CPU cores
prediction_interval = (
    0.9  # probability range for the upper and lower forecast bounds. Bigger = wider
)
initial_training = "auto"  # set to True on the first run or when resetting; 'auto' looks for an existing template and sets this to False if one is found
evolve = True  # let the time series evolve progressively on each run; if False, a fixed template is used
archive_templates = True  # save a timestamped copy of the model template
save_location = None  # "C:/Users/Colin/Downloads"  # directory for saved templates; defaults to the working directory
template_filename = f"autots_forecast_template_{forecast_name}.csv"
forecast_csv_name = None  # f"autots_forecast_{forecast_name}.csv" or None; writes point forecasts only
model_list = "scalable"
transformer_list = "fast"  # 'superfast'
transformer_max_depth = 5
models_mode = "default"  # "deep", "regressor"
initial_template = 'random'  # 'random' 'general+random'
preclean = None
# Example preclean option (kept as a comment; assign it to `preclean` to enable):
# {
#     "fillna": 'ffill',
#     "transformations": {"0": "EWMAFilter"},
#     "transformation_params": {
#         "0": {"span": 14},
#     },
# }
back_forecast = False
csv_load = False
start_time = datetime.datetime.now()  # naive local wall-clock time of this run

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Prefix the output file names with the save directory, when one is configured.
if save_location is not None:
    template_filename = os.path.join(save_location, template_filename)
    forecast_csv_name = (
        None
        if forecast_csv_name is None
        else os.path.join(save_location, forecast_csv_name)
    )

# "auto" resolves to a boolean: train from scratch only when no template file
# exists at `template_filename`.
if initial_training == "auto":
    template_found = os.path.exists(template_filename)
    initial_training = not template_found
    print(
        "Existing template found." if template_found else "No existing template found."
    )

# Choose the search budget from the run mode: more generations are slower but
# give a better chance of reaching the highest accuracy.
# if include_ensemble is specified in import_templates, ensembles can progressively nest over generations
if initial_training:
    gens = 100
    generation_timeout = 10000  # minutes
    models_to_validate = 0.15
    # Three separate ensemble names. Without the comma between "dist" and
    # "simple", Python would join the adjacent literals into "distsimple".
    ensemble = ["horizontal-max", "dist", "simple"]  #  "mosaic" "mosaic-window", 'mlensemble'
elif evolve:
    gens = 500
    generation_timeout = 300  # minutes
    models_to_validate = 0.15
    ensemble = ["horizontal-max"]  # "mosaic", "mosaic-window", "subsample"
else:
    gens = 0
    generation_timeout = 60  # minutes
    models_to_validate = 0.99
    ensemble = ["horizontal-max", "dist", "simple"]  # "mosaic", "mosaic-window",

# When not evolving, export only the single best model; a value > 1 is not a
# bad idea either, as it leaves room for some future adaptability.
n_export = 50 if evolve else 1

No existing template found.


In [3]:
# Options for the live-data download, one assignment per option.

# --- general ---
long: bool = False
observation_start: str = None
observation_end: str = None
timeout: float = 300.05
sleep_seconds: int = 15

# --- FRED / stock tickers / Google Trends ---
# fred_key: str = None
fred_series = ["DGS10", "T5YIE", "SP500", "DCOILWTICO", "DEXUSEU", "WPU0911"]
tickers: list = ["MSFT"]
trends_list: list = ["forecasting", "cycling", "microsoft"]
trends_geo: str = "US"

# --- weather ---
weather_data_types: list = ["AWND", "WSF2", "TAVG"]
weather_stations: list = ["USW00013960", "USW00014925"]
weather_years: int = 3
weather_event_types = ["%28Z%29+Winter+Weather", "%28Z%29+Winter+Storm"]

# --- London air quality ---
london_air_stations: list = ['CT3', 'SK8']
london_air_species: str = "PM25"
london_air_days: int = 700

# --- earthquakes ---
earthquake_days: int = 700
earthquake_min_magnitude: int = 5

# --- US government site analytics ---
# gsa_key: str = None  # https://open.gsa.gov/api/dap/
gov_domain_list = None
gov_domain_limit: int = 700

# --- Wikipedia page views ---
wikipedia_pages: list = ['Microsoft_Office', "List_of_highest-grossing_films"]
wiki_language: str = "en"

# --- CAISO ---
caiso_query: str = "ENE_SLRS"

In [4]:
"""
Begin dataset retrieval
"""
import os

# Route outgoing HTTP(S) requests through a proxy listening on localhost.
# NOTE(review): this address is specific to the author's machine; adjust or
# remove it when running elsewhere.
os.environ['HTTP_PROXY'] = "http://127.0.0.1:10809"
os.environ['HTTPS_PROXY'] = "http://127.0.0.1:10809"

# When data is downloaded live (not loaded from CSV), choose the series to fetch.
if not csv_load:
    fred_series = [
        "DGS10",
        "T5YIE",
        "SP500",
        "DCOILWTICO",
        "DEXUSUK",
        "DEXUSEU",
        "BAMLH0A0HYM2",
        "DAAA",
        "T10Y2Y",
    ]
    tickers = ["MSFT", "PG"]
    trend_list = ["forecasting", "msft", "p&g"]
    # The search-term option defined with the other data-source settings is
    # named `trends_list`; alias it so these terms are the ones actually used.
    # `trend_list` is kept for any code that still reads the old name.
    trends_list = trend_list
    weather_event_types = ["%28Z%29+Winter+Weather", "%28Z%29+Winter+Storm"]
    wikipedia_pages = ['all', 'Microsoft', "Procter_%26_Gamble", "YouTube", "United_States"]


## 下载数据（可跳过：后续单元格会从 dataset_lists.pkl 直接加载已缓存的数据集）

In [None]:
# Refuse to run with a pause shorter than half a second between requests.
assert sleep_seconds >= 0.5, "sleep_seconds must be >=0.5"

dataset_lists = []

# `current_date` is the current naive UTC time unless an end date was configured.
current_date = (
    datetime.datetime.utcnow() if observation_end is None else observation_end
)

# Default the start of the observation window to roughly six years ago.
if observation_start is None:
    # should take from observation_end but that's expected as a string
    observation_start = (
        datetime.datetime.utcnow() - datetime.timedelta(days=365 * 6)
    ).strftime("%Y-%m-%d")

# Open an HTTP session; a failure is reported but does not stop the cell.
try:
    import requests

    s = requests.Session()
except Exception as e:
    print(f"requests Session creation failed {repr(e)}")

# Download the FRED series only when both a key and a series list are available.
if fred_key is not None and fred_series is not None:
    from autots.datasets.fred2 import Fred  # noqa
    from autots.datasets.fred import get_fred_data

    fred_df = get_fred_data(
        fred_key,
        fred_series,
        long=False,
        observation_start=observation_start,
        sleep_seconds=sleep_seconds,
    )
    # The result is kept in `fred_df` only; appending it is left disabled:
    # fred_df.index = fred_df.index.tz_localize(None)
    # dataset_lists.append(fred_df)

# 定义 current_date 和 observation_start，并创建 requests 会话

In [5]:
dataset_lists = []

# `current_date` is the current naive UTC time unless an end date was configured.
current_date = (
    datetime.datetime.utcnow() if observation_end is None else observation_end
)

# Default the start of the observation window to roughly six years ago.
if observation_start is None:
    # should take from observation_end but that's expected as a string
    observation_start = (
        datetime.datetime.utcnow() - datetime.timedelta(days=365 * 6)
    ).strftime("%Y-%m-%d")

# Open an HTTP session; a failure is reported but does not stop the cell.
try:
    import requests

    s = requests.Session()
except Exception as e:
    print(f"requests Session creation failed {repr(e)}")

In [6]:
# Alternative (disabled): read the FRED data from fred_data.csv instead.
# fred_df = pd.read_csv('fred_data.csv', index_col=0)
import pickle

# Restore the list of datasets from the local cache file.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('dataset_lists.pkl', 'rb') as cache_file:
    dataset_lists = pickle.load(cache_file)

In [8]:
# dataset_lists = []
# dataset_lists.append(fred_df)  # add to the dataset list
first_dataset = dataset_lists[0]
# First five rows of the first dataset
print(first_dataset.head())
# Shape of the first dataset
print(first_dataset.shape)
# Number of datasets in the list
print(len(dataset_lists))

            DGS10  T5YIE    SP500  DCOILWTICO  DEXUSUK  DEXUSEU  BAMLH0A0HYM2  \
2018-01-25   2.63   1.91  2839.25       65.62   1.4264   1.2488          3.28   
2018-01-26   2.66   1.93  2872.87       66.27   1.4179   1.2422          3.23   
2018-01-29   2.70   1.92  2853.53       65.71   1.4042   1.2352          3.26   
2018-01-30   2.73   1.97  2822.43       64.64   1.4124   1.2390          3.33   
2018-01-31   2.72   1.98  2823.81       64.82   1.4190   1.2428          3.29   

            DAAA  T10Y2Y  
2018-01-25  3.55    0.55  
2018-01-26  3.58    0.53  
2018-01-29  3.59    0.59  
2018-01-30  3.63    0.60  
2018-01-31  3.59    0.58  
(1584, 9)
4


In [8]:
from functools import reduce


def _outer_join_on_index(left, right):
    """Outer-join two frames on their indexes."""
    return pd.merge(left, right, left_index=True, right_index=True, how="outer")


# Give every dataset a datetime index so the joins line up on the same type.
dataset_lists = [
    frame.set_index(pd.to_datetime(frame.index)) for frame in dataset_lists
]

# Fold all datasets into one wide frame, left to right.
df = reduce(_outer_join_on_index, dataset_lists)
print(f"{df.shape[1]} series downloaded.")
s.close()
df.index.name = "datetime"

26 series downloaded.


In [9]:
# Long-format copy of the data: one row per (datetime, series_id, value).
df_with_datetime_column = df.reset_index(drop=False)
df_long = pd.melt(
    df_with_datetime_column,
    id_vars=['datetime'],
    var_name='series_id',
    value_name='value',
)

In [10]:
# Akima-interpolate the high/low/open/close columns of each ticker, where present.
if tickers is not None:
    price_columns = [
        (ticker + suffix).lower()
        for ticker in tickers
        for suffix in ["_high", "_low", "_open", "_close"]
    ]
    for column_name in price_columns:
        if column_name in df.columns:
            df[column_name] = df[column_name].interpolate('akima')


In [11]:
# Akima-interpolate every requested FRED series that is present in the frame.
if fred_series is not None:
    available_series = [name for name in fred_series if name in df.columns]
    for series_name in available_series:
        df[series_name] = df[series_name].interpolate('akima')

In [12]:
df = df.ffill(limit=3)  # forward-fill missing values, at most 3 consecutive rows per gap

In [13]:
# Keep only rows dated at or before the moment this run started.
not_in_future = df.index <= start_time
df = df.loc[not_in_future]

In [14]:
# Drop volume columns, and also dividends and stock splits because they
# distort the metrics.
excluded_markers = ("_volume", "_dividends", "stock_splits")
df = df[
    [
        name
        for name in df.columns
        if not any(marker in name for marker in excluded_markers)
    ]
]

In [15]:
# Write the cleaned training data to a CSV named after the forecast, in the working directory.
df.to_csv(f"training_data_{forecast_name}.csv")

In [16]:
# Parameters used below to build the regressor features.
# NOTE(review): `frequency`, `drop_most_recent` and `n_jobs` are re-assigned
# here, replacing the values configured at the top of the notebook ("D", 1,
# "auto") -- confirm that this override is intended.
frequency: str = "infer"
holiday_countries: list = ["CN"]
datepart_method: str = "simple_binarized"
drop_most_recent: int = 0
scale: bool = True
summarize: str = "auto"
backfill: str = "bfill"
n_jobs: str = "auto"
fill_na: str = 'ffill'
aggfunc: str = "first"
encode_holiday_type = False
# A plain dict. The assignment must not end with a trailing comma, which would
# wrap the dict in a 1-element tuple.
holiday_detector_params: dict = {
    "threshold": 0.8,
    "splash_threshold": None,  # presumably a cutoff for significant holiday effects -- verify against the AutoTS docs
    "use_dayofmonth_holidays": True,  # holidays on a fixed day of the month
    "use_wkdom_holidays": True,  # presumably weekday-of-month holidays -- verify against the AutoTS docs
    "use_wkdeom_holidays": False,  # presumably weekday counted from the end of the month -- verify
    "use_lunar_holidays": True,  # lunar-calendar holidays, such as Spring Festival
    "use_lunar_weekday": False,  # lunar-calendar weekdays
    "use_islamic_holidays": False,  # Islamic-calendar holidays
    "use_hebrew_holidays": False,  # Hebrew-calendar holidays (e.g. Rosh Hashanah)
    "output": 'univariate',
    "anomaly_detector_params": {  # settings for the anomaly detector
        "method": "mad",  # detection method; "mad" stands for median absolute deviation
        "transform_dict": {
            "fillna": None,
            "transformations": {"0": "DifferencedTransformer"},
            "transformation_params": {"0": {}},
        },
        "forecast_params": None,
        "method_params": {"distribution": "gamma", "alpha": 0.05},
    },
}
holiday_regr_style: str = "flag"
preprocessing_params: dict = None

In [17]:
from autots.tools.shaping import infer_frequency

# Replace the "infer" placeholder with the frequency detected from the data;
# any other value is kept as configured.
frequency = infer_frequency(df) if frequency == "infer" else frequency

In [18]:
df = df.resample(frequency).first()  # resample to `frequency`, keeping the first value in each bin

In [19]:
# Best-effort conversion of every column to a numeric dtype; columns that
# cannot be parsed are left untouched.
for column_name in df.columns:
    try:
        converted = pd.to_numeric(df[column_name])
    except ValueError:
        pass
    else:
        df[column_name] = converted

In [20]:
# Keep numeric columns only, and remember the index and column labels.
df = df.select_dtypes(include=np.number)
dates, df_cols = df.index, df.columns
df_inner = df.copy()

# Optionally standardise every feature with StandardScaler.
if scale:
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df_inner)
    df_inner = pd.DataFrame(scaled_values, index=dates, columns=df_cols)


from autots.tools.impute import FillNA

# Fill the remaining missing values using the configured `fill_na` method.
df_inner = FillNA(df_inner, method=fill_na)
ag_flag = False

In [21]:
# Reduce df_inner to at most 15 features when it has more than that.
from sklearn.cluster import FeatureAgglomeration

n_clusters = 10 if ag_flag else 15
if df_inner.shape[1] > 15:
    agglomerator = FeatureAgglomeration(n_clusters=n_clusters)
    # No column labels are passed, so the reduced frame gets default integer labels.
    df_inner = pd.DataFrame(agglomerator.fit_transform(df_inner), index=dates)

In [22]:
# Dates of the forecast horizon: `forecast_length` periods following the last
# observed date (the first generated date is the last observation, so drop it).
future_index = pd.date_range(
    dates[-1], periods=(forecast_length + 1), freq=frequency
)[1:]

# The last `forecast_length` rows become the future regressor, re-dated onto
# the forecast horizon.
regressor_forecast = df_inner.tail(forecast_length)
regressor_forecast.index = future_index

# The training regressor is the same data shifted down by `forecast_length`
# rows; gaps are filled backward first, then forward.
regressor_train = df_inner.shift(forecast_length).bfill().ffill()

regr_train = regressor_train.copy()
regr_fcst = regressor_forecast.copy()

In [23]:
from autots.tools.seasonal import date_part


def _with_date_features(frame):
    """Return `frame` with date-part columns for its own index appended."""
    return pd.concat(
        [frame, date_part(frame.index, method=datepart_method)],
        axis=1,
    )


# Append date-part features to both regressors unless the method is disabled.
if datepart_method is not None:
    regr_train = _with_date_features(regr_train)
    regr_fcst = _with_date_features(regr_fcst)

In [28]:
# Drop the first `forecast_length` rows of the data.
# Not idempotent: re-running this cell drops another `forecast_length` rows.
df = df.iloc[forecast_length:]

In [None]:
# Drop the first `forecast_length` rows of the training regressor.
# Not idempotent: re-running this cell drops another `forecast_length` rows.
regr_train = regr_train.iloc[forecast_length:]