In [11]:
import pandas as pd
import numpy as np
from dateutil.parser import parse
import matplotlib as mpl
from matplotlib.pyplot import MultipleLocator
import matplotlib.pyplot as plt
import statsmodels.tsa.stattools as ts
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from datetime import datetime
import time
import random
from scipy.interpolate import interp1d
import statsmodels.api as sm
import seaborn as sns
import itertools
from statsmodels.tsa.stattools import adfuller
from numpy import mean
import math
import os
from statsmodels.graphics.api import qqplot
from statsmodels.stats.stattools import durbin_watson #DW检验
from pyramid import auto_arima

ModuleNotFoundError: No module named 'pyramid'

In [2]:
# Configure matplotlib so that Chinese text and the minus sign render correctly.
plt.rcParams.update({
    'font.sans-serif': 'Microsoft YaHei',  # font used for sans-serif text (Chinese labels)
    'axes.unicode_minus': False,           # do not use the Unicode minus glyph on axes
})

In [3]:
def get_data_dir(dataname):
    """Return the absolute path of ``dataname`` inside the sibling ``data`` folder.

    The result is ``<parent of the current working directory>/data/<dataname>``.
    The path is assembled with ``os.path.join`` so the correct separator is
    used on every platform (the previous string concatenation hard-coded
    Windows backslashes and relied on invalid escape sequences).

    Parameters
    ----------
    dataname : str
        File name (or relative path) below the ``data`` directory.

    Returns
    -------
    str
        Absolute path to the requested data file. The file is not checked
        for existence.
    """
    father_dir = os.path.abspath('..')
    # str() mirrors the old f-string formatting, so non-str names still work.
    return os.path.join(father_dir, 'data', str(dataname))

In [4]:
def read_file(filename):
    """Load a data file from the project's ``data`` directory into a DataFrame.

    The reader is chosen from the file extension, which is taken with
    ``os.path.splitext`` (so names containing several dots, such as
    ``"load.2021.csv"``, resolve to their real extension) and compared
    case-insensitively.

    Parameters
    ----------
    filename : str
        Name of the file; its location is resolved by ``get_data_dir``.

    Returns
    -------
    pandas.DataFrame or str
        The loaded table, using the first column as the index, for
        ``.xlsx``/``.xls``/``.csv`` files. For any other (or missing)
        extension the message string "文件不存在或格式不符"
        ("file does not exist or format not supported") is returned, as
        before. Note that existence of the file is not checked here; a
        missing file surfaces as the pandas reader's own exception.
    """
    extension = os.path.splitext(filename)[1].lower()
    if extension in (".xlsx", ".xls"):
        return pd.read_excel(get_data_dir(filename), index_col=0)
    if extension == ".csv":
        return pd.read_csv(get_data_dir(filename), index_col=0)
    return "文件不存在或格式不符"

In [5]:
def plot_df(df, x, y, title="", xlabel='Index', ylabel='Value', dpi=100):
    """Draw ``y`` against ``x`` as a single blue line on a 16x5 figure and show it.

    Parameters
    ----------
    df : any
        Not used by the function body; kept in the signature for callers.
    x, y : array-like
        Horizontal and vertical coordinates passed to ``plt.plot``.
    title, xlabel, ylabel : str
        Text applied to the current axes.
    dpi : int
        Resolution of the created figure.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    plt.plot(x, y, color='tab:blue')
    axes = plt.gca()
    labels = {'title': title, 'xlabel': xlabel, 'ylabel': ylabel}
    axes.set(**labels)
    plt.show()

In [6]:
def knn_mean(ts, n):
    """Fill NaN entries with the NaN-ignoring mean of nearby values.

    For every position ``i`` whose value is NaN, the replacement is
    ``np.nanmean`` over the slice ``[i - ceil(n/2), i + ceil(n/2))`` of the
    *original* sequence (clipped at the start). The window therefore
    contains ``ceil(n/2)`` positions before ``i`` but one fewer after it,
    because the upper bound is exclusive and the slice includes ``i`` itself.

    Parameters
    ----------
    ts : sequence of float
        Input values; it is not modified.
    n : int
        Window size parameter.

    Returns
    -------
    numpy.ndarray
        Copy of ``ts`` with NaN positions replaced. A position stays NaN
        when every value in its window is NaN.
    """
    filled = np.copy(ts)
    for pos, value in enumerate(ts):
        if not np.isnan(value):
            continue
        half_window = int(np.ceil(n / 2))
        start = max(0, pos - half_window)
        stop = min(len(ts) + 1, pos + half_window)
        # Neighbours are read from the untouched input, not from `filled`.
        neighbours = np.concatenate([ts[start:pos], ts[pos:stop]])
        filled[pos] = np.nanmean(neighbours)
    return filled

In [7]:
def plot_freq(df, KPI, num):
    """Overlay the daily profiles of one KPI, one line per day, by hour of day.

    The first ``num`` rows of ``df[KPI]`` are labelled with a day number
    (``int(index / 24) + 1``) and an hour (``index % 24 + 1``), then each
    day is drawn as a separate coloured line.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Its index values are used arithmetically
        (``d / 24``, ``d % 24``), so they must be numeric; presumably
        consecutive hour offsets - confirm against the caller.
    KPI : str
        Name of the column to plot.
    num : int
        Number of leading rows to include.

    Notes
    -----
    * The hour column used to be built from a hard-coded ``0:2207`` slice,
      which raised a length-mismatch error whenever ``num != 2207``; it now
      follows ``num`` like the other columns.
    * The last day label is not drawn (``[0:-1]``), presumably because it may
      be incomplete - confirm.
    * ``np.random.seed(100)`` resets NumPy's global random state so the
      colour choice is reproducible.
    * Axis limits and the y label are fixed to (0, 23), (70, 95) and
      'Memory Load'.
    """
    series = df[KPI][0:num]
    df_freq = pd.DataFrame(index=series.index)
    df_freq['天数'] = [f'第{int(d / 24) + 1}天' for d in series.index]
    df_freq['小时'] = [f'{(d) % 24 + 1}' for d in series.index]
    df_freq['values'] = series

    # Plot: one randomly chosen (but seeded) XKCD colour per day.
    day_labels = df_freq['天数'].unique()
    np.random.seed(100)
    mycolors = np.random.choice(list(mpl.colors.XKCD_COLORS.keys()), len(day_labels), replace=False)
    plt.figure(figsize=(16, 12), dpi=80)
    for i, y in enumerate(day_labels[0:-1]):
        plt.plot('小时', 'values', data=df_freq[df_freq["天数"] == y], color=mycolors[i], label=y)
    plt.gca().set(xlim=(0, 23), ylim=(70, 95), ylabel='$Memory Load$', xlabel='$Hour$')
    plt.yticks(fontsize=12, alpha=.7)
    plt.show()

In [8]:
def ADF_is_diff(adf):
    """Decide from an ADF result whether the series still needs differencing.

    Parameters
    ----------
    adf : sequence
        Result whose element 0 is compared against the values stored under
        the keys ``'1%'``, ``'5%'`` and ``'10%'`` of element 4 (the layout
        returned by ``statsmodels.tsa.stattools.adfuller``: test statistic
        first, critical-value dict fifth).

    Returns
    -------
    bool
        ``False`` when element 0 is strictly below all three critical
        values, ``True`` otherwise.
    """
    statistic = adf[0]
    critical = adf[4]
    thresholds = (critical['1%'], critical['5%'], critical['10%'])
    below_every_threshold = all(statistic < level for level in thresholds)
    return not below_every_threshold

In [9]:
def plot_BIC(timeSeries, p_min, p_max, q_min, q_max, d_min, d_max):
    """Fit ARIMA models over a grid of orders and tabulate their BIC values.

    Despite its name the function draws nothing; it returns the table.

    Parameters
    ----------
    timeSeries : array-like
        Series passed to ``sm.tsa.ARIMA``.
    p_min, p_max : int
        Inclusive range of AR orders (table rows ``AR<p>``).
    q_min, q_max : int
        Inclusive range of MA orders (table columns ``MA<q>``).
    d_min, d_max : int
        Inclusive range of differencing orders.

    Returns
    -------
    pandas.DataFrame
        Float table indexed by ``AR<p>`` x ``MA<q>`` holding the BIC of each
        successfully fitted model; cells stay NaN for the (0, 0, 0) order
        and for orders whose fit failed. The table has no ``d`` axis, so
        when several ``d`` values are searched each cell keeps the BIC of
        the last ``d`` that fitted successfully.

    Notes
    -----
    Only ``Exception`` subclasses are treated as a failed fit, so Ctrl-C
    (``KeyboardInterrupt``) now interrupts the search instead of being
    swallowed, and each failure is reported instead of silently skipped.
    The progress counter now starts at 1 for the first fitted model.
    """
    results_bic = pd.DataFrame(index=[f'AR{i}' for i in range(p_min, p_max + 1)],
                               columns=[f'MA{i}' for i in range(q_min, q_max + 1)])
    num = 0  # number of models fitted so far
    for p, d, q in itertools.product(range(p_min, p_max + 1),
                                     range(d_min, d_max + 1),
                                     range(q_min, q_max + 1)):
        row, col = f'AR{p}', f'MA{q}'
        if p == 0 and d == 0 and q == 0:
            # The (0, 0, 0) order is skipped and recorded as missing.
            results_bic.loc[row, col] = np.nan
            continue

        try:
            results = sm.tsa.ARIMA(timeSeries, order=(p, d, q)).fit()
        except Exception as exc:
            # Best-effort grid search: report the failed order and move on.
            print(f"ARIMA({p}, {d}, {q}) 拟合失败: {exc!r}")
            continue
        num += 1
        print(f"循环{num}次")
        results_bic.loc[row, col] = results.bic
    return results_bic.astype(float)

In [10]:
def evaluate(truth, predict, n=None, p=None):
    """Compute regression error metrics between observed and predicted values.

    Parameters
    ----------
    truth : array-like of float
        Observed values. Any 1-D sequence (list, ndarray, pandas Series) is
        accepted; values are read positionally.
    predict : array-like of float
        Predicted values, same length as ``truth``.
    n : int, optional
        Sample size used for the adjusted R^2. Defaults to ``len(truth)``.
    p : int, optional
        Number of explanatory variables used for the adjusted R^2. When it
        is omitted, or when ``n - p - 1 <= 0``, the adjusted R^2 is NaN.

    Returns
    -------
    tuple
        ``(MSE, RMSE, MAE, MAPE, R2, R2_adj)`` as floats.

        * MSE  - mean squared error
        * RMSE - root mean squared error
        * MAE  - mean absolute error
        * MAPE - mean of ``|truth - predict| / |truth|`` (inf/NaN when
          ``truth`` contains zeros)
        * R2   - coefficient of determination (NaN when ``truth`` is constant)
        * R2_adj - ``1 - (1 - R2) * (n - 1) / (n - p - 1)``

    Raises
    ------
    ValueError
        If ``truth`` is empty or the two inputs differ in length.

    Notes
    -----
    The adjusted R^2 used to be written ``(1-R2)(n-1)``, which calls a float
    and made every invocation fail with ``TypeError``; the missing
    multiplication is restored and the ``None`` defaults are now handled.
    """
    truth_arr = np.asarray(truth, dtype=float).ravel()
    predict_arr = np.asarray(predict, dtype=float).ravel()
    if truth_arr.size == 0:
        raise ValueError("truth must contain at least one value")
    if truth_arr.size != predict_arr.size:
        raise ValueError(
            f"truth and predict must have the same length "
            f"({truth_arr.size} != {predict_arr.size})"
        )

    error = truth_arr - predict_arr
    squared_error = error ** 2          # squared differences
    abs_error = np.abs(error)           # absolute errors

    MSE = float(squared_error.mean())   # mean squared error
    RMSE = math.sqrt(MSE)               # root mean squared error
    MAE = float(abs_error.mean())       # mean absolute error

    # Relative error is taken against |truth| so negative observations do
    # not produce negative "percentages" that cancel out in the mean.
    with np.errstate(divide='ignore', invalid='ignore'):
        MAPE = float(np.mean(abs_error / np.abs(truth_arr)))

    ss_res = float(squared_error.sum())
    ss_tot = float(((truth_arr - truth_arr.mean()) ** 2).sum())
    R2 = 1 - ss_res / ss_tot if ss_tot > 0 else math.nan  # coefficient of determination

    if n is None:
        n = truth_arr.size
    if p is None or n - p - 1 <= 0:
        R2_adj = math.nan
    else:
        R2_adj = 1 - ((1 - R2) * (n - 1)) / (n - p - 1)  # adjusted R^2
    return MSE, RMSE, MAE, MAPE, R2, R2_adj