In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import *
from statsmodels.graphics.tsaplots import *
from catboost import *
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.datasets import load_diabetes
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import *
from sklearn.ensemble import *
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.metrics import *
import pickle
import tensorflow as tf
import shap
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import *
import os

# tf.compat.v1.disable_v2_behavior()
# Device selection: set forceCpu to True to restrict TensorFlow to CPU devices.
forceCpu = False
if forceCpu:
    cpu = tf.config.list_physical_devices("CPU")
    tf.config.set_visible_devices(cpu)
    print(tf.config.list_logical_devices())
else:
    # Enable on-demand GPU memory growth for the first GPU, if there is one.
    physical_devices = tf.config.list_physical_devices('GPU')
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except (IndexError, ValueError, RuntimeError):
        # IndexError: no GPU was found (empty device list).
        # ValueError / RuntimeError: raised by TensorFlow for an invalid device
        # or when the runtime has already been initialised.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        print('Invalid device or cannot modify virtual devices once initialized')

# Plot styling: SimHei is used so that Chinese labels render in matplotlib/seaborn.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign renderable when a non-default font is active.
plt.rcParams['axes.unicode_minus'] = False
palette = 'deep'
sns.set(font='SimHei', font_scale=1.2, style='whitegrid', palette=palette)

# Load the wide table (read here with one row per 场地1/场地2 pair and the remaining
# columns treated as dates) and melt it into long format:
# one row per (场地1, 场地2, 日期) with the volume in 货量.
rawDataPiovt = pd.read_excel('../../../preprocess/均值填充-物流网络历史货量数据.xlsx')
rawData = pd.melt(rawDataPiovt, id_vars=['场地1', '场地2'], var_name='日期', value_name='货量')
rawData

Unnamed: 0,场地1,场地2,日期,货量
0,DC1,DC8,2021-01-01,3
1,DC10,DC12,2021-01-01,306
2,DC10,DC13,2021-01-01,3
3,DC10,DC14,2021-01-01,2613
4,DC10,DC17,2021-01-01,2
...,...,...,...,...
765765,DC9,DC58,2022-12-31,1
765766,DC9,DC62,2022-12-31,2
765767,DC9,DC67,2022-12-31,44
765768,DC9,DC79,2022-12-31,2


In [4]:
def getxy(frame=None):
    """Encode the long-format volume table into numeric features and a log target.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Long-format table with the columns '场地1', '场地2', '日期' and '货量'.
        When omitted, the module-level ``rawData`` is used. The input is
        copied and never modified.

    Returns
    -------
    tuple
        ``(features, target, full)`` where ``full`` is the encoded frame,
        ``target`` is its '货量' column (natural log of the volume) and
        ``features`` is ``full`` without that column.
    """
    targetKey = '货量'
    df = (rawData if frame is None else frame).copy()

    # Encode the sites as integers by stripping the literal 'DC' prefix.
    for siteKey in ('场地1', '场地2'):
        df[siteKey] = df[siteKey].str.replace('DC', '', regex=False).astype('int64')

    # Replace each date by the number of whole days since the earliest date.
    dates = pd.to_datetime(df['日期'])
    df['日期'] = (dates - dates.min()).dt.days

    # Natural log of the volume. NOTE: a volume of 0 would become -inf here.
    df[targetKey] = np.log(df[targetKey])
    return df.drop(targetKey, axis=1), df[targetKey], df

In [5]:
# Only the third return value of getxy() (the full encoded frame, target column
# included) is used from here on; the separate feature/target split is discarded.
_, _, data = getxy()


In [6]:
# Total window length in time steps: look_back is derived from it below as
# filterSize - 1 (the input steps), leaving one step as the prediction target.
filterSize = 5


# Convert one time series into the (window, next value) form the model consumes.
def create_dataset(dataset, look_back=1):
    """Slice a 2-D series into sliding windows and next-step targets.

    Parameters
    ----------
    dataset : array-like, shape (time steps, columns)
        One series ordered in time; the last column is the value to predict.
    look_back : int, default 1
        Number of consecutive time steps in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``dataX`` with shape (samples, look_back, columns) — auto-regressive:
        every column, including the target column, is part of the window —
        and ``dataY`` with shape (samples,), the last-column value of the row
        that follows each window. ``samples`` is ``len(dataset) - look_back``.
        A series that is too short yields correctly shaped empty arrays
        (0 samples), so the result can still be concatenated with the
        windows of other series.
    """
    dataset = np.asarray(dataset)
    sampleCount = len(dataset) - look_back
    if sampleCount <= 0:
        # Keep the trailing dimensions so np.concatenate with non-empty
        # results from other series does not fail on a rank mismatch.
        emptyX = np.empty((0, look_back) + dataset.shape[1:], dtype=dataset.dtype)
        emptyY = np.empty((0,), dtype=dataset.dtype)
        return emptyX, emptyY

    dataX = np.stack([dataset[start:start + look_back, :] for start in range(sampleCount)])
    # Targets are the last column of every row after the first look_back rows.
    dataY = dataset[look_back:, -1].copy()
    return dataX, dataY


# Sort chronologically so that every per-edge series extracted below is in time order.
data = data.sort_values('日期')

# Turn each directed edge (场地1 -> 场地2) into its own array of rows.
dataset = [edgeRows.values for _, edgeRows in data.groupby(['场地1', '场地2'])]

print(f'dataset数量: {len(dataset)}')

# Convert every series into model-ready samples.
#
# `look_back` is a hyper-parameter: the number of past time steps that make up
# one input window. Each sample is the window of the previous `look_back` steps
# of one edge, and the target is the volume at the following step.
#   look_back = 1 -> the previous day is used to predict the next day
#   look_back = 3 -> the previous 3 days are used to predict the next day
# Tuning `look_back` controls how much history the model sees per prediction.
look_back = filterSize - 1
# zip(*...) regroups the per-series (X, y) pairs into all X parts and all y parts.
trainX, trainY = (
    np.concatenate(parts)
    for parts in zip(*(create_dataset(series, look_back) for series in dataset))
)

print(f'trainX.shape: {trainX.shape}')
print(f'trainY.shape: {trainY.shape}')
# Every series longer than look_back contributes len(series) - look_back samples,
# so the sample count is smaller than the row count of `data` by look_back rows per edge.

dataset数量: 1049
trainX.shape: (761574, 4, 4)
trainY.shape: (761574,)


In [13]:
# File name of the saved model that is loaded below; it is relative to the
# notebook's working directory.
modelPath = 'LSTM.1.h5'


# Custom metric function
def r_square(y_true, y_pred):
    """Coefficient of determination, R^2 = 1 - SS_res / SS_tot.

    Both sums run over every element of the tensors passed in. The backend
    epsilon is added to the denominator so that a constant ``y_true``
    (SS_tot == 0) does not cause a division by zero.
    """
    backend = tf.keras.backend
    residualSum = backend.sum(backend.square(y_true - y_pred))
    deviation = y_true - backend.mean(y_true)
    totalSum = backend.sum(backend.square(deviation))
    return 1 - residualSum / (totalSum + backend.epsilon())


# Load the saved model. r_square is passed through custom_objects so that Keras
# can resolve it by name (presumably the model was compiled with this metric —
# TODO confirm against the training notebook).
model = tf.keras.models.load_model(modelPath, custom_objects={'r_square': r_square})

# Evaluate the model: predict for every window in trainX, 5000 windows per batch.
# NOTE(review): these are the arrays built above from the full data set, not a
# held-out split.
y_pred = model.predict(trainX, batch_size=5000)



In [34]:
print(y_pred.shape)
# Is the data complete?
# The first (filterSize - 1) rows of every route have no prediction, so the
# number of predictions plus those skipped rows (one route per row of the wide
# table) should equal the number of rows in the long table.
print('数据量是否完整', y_pred.shape[0] + len(rawDataPiovt) * (filterSize - 1) == rawData.shape[0])

(761574, 1)
数据量是否完整 True


In [16]:
# Shape check: (samples, look_back time steps, columns per time step).
trainX.shape

(761574, 4, 4)

In [23]:
# Inspect a single window: one row per time step, columns in frame order
# (场地1, 场地2, 日期 as a day offset, log of 货量).
trainX[1]

array([[1.        , 8.        , 1.        , 1.09861229],
       [1.        , 8.        , 2.        , 1.09861229],
       [1.        , 8.        , 3.        , 1.09861229],
       [1.        , 8.        , 4.        , 1.09861229]])