**<font size=5>使用Keogh_Data数据集测试K均距异常检测方法</font>**

**<font size=4>1. 生成数据集</font>**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
%matplotlib

Using matplotlib backend: Qt5Agg


In [3]:
np.random.seed(0)

outliermode函数用于生成异常模式

In [4]:
def outliermode(N, tarray):
    array = np.zeros_like(tarray)
    earray = np.sin(75 * np.pi * tarray / N) - np.sin(50 * np.pi * tarray / N)
    array[400:433] = earray[400:433]
    return array

np.random.normal(loc=0, scale=0.1, size=(len(tarray)))用于生成一个均值为0，标准差为0.1的加性高斯噪声

In [5]:
def createData(N, tarray):
    noise = np.random.normal(loc=0, scale=0.1, size=(len(tarray)))
    y1 = np.sin(50 * np.pi * tarray / N) + noise
    y2 = np.sin(50 * np.pi * tarray / N) + noise + outliermode(N, tarray)
    return y1, y2

In [6]:
N = 800

In [7]:
t = np.arange(0, N)

In [8]:
y1, y2 = createData(N, t)

绘制无异常值的数据和有异常值的数据的图像

In [9]:
def drawInitialData(t, y1, y2):
    plt.plot(t, y1, color="black", label="without outliers", alpha=0.7)
    plt.plot(t, y2, "b--", label="with outliers")
    plt.legend()

In [10]:
drawInitialData(t, y1, y2)

可以发现，中间[400， 432]部分的数据是含有异常值的

**<font size=4>K均距异常因子检测</font>**

**<font size=4>1. 计算边缘权重因子</font>**

**<font size=3 color="blue">用于确定子时间序列的边界点</font>**

In [11]:
#假设将检测窗口的宽度设置为7（即一周）
w = 9
k = 10

定义一个ndarray，存储下该数据在滑动窗口中成为极值的次数

In [12]:
countnum = np.zeros_like(y2)

In [13]:
m = countnum.shape[0]
m

800

In [14]:
windownum = int(np.ceil(m / w))

In [15]:
windownum

89

<font color="red">通常，把收尾两个数据点，直接视为边界点，这里我们没有对首尾边界点进行处理</font>

In [16]:
for i in range(windownum-1):
    windowarray = y2[i*w:(i+1)*w]
    wmin = windowarray.min()
    wmax = windowarray.max()
    for j in range(w):
        valueindex = i * w + j
        value = y2[valueindex]
        num = 0
        if abs(wmin-value) < 1e-9: 
            num -= 1
        elif abs(wmax-value) < 1e-9:
            num += 1
        countnum[valueindex] = abs(num)
        

In [17]:
countnum[0] = 1

In [18]:
countnum[m-1] = 1

In [19]:
countnum

array([1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1.

找到w=9为一个单位，找出其中的最大值的下标

In [20]:
indexarray = []

In [21]:
for i in range(m):
    if countnum[i] == 1:
        indexarray.append(i)

In [22]:
indexarray

[0,
 6,
 9,
 17,
 18,
 25,
 27,
 35,
 43,
 44,
 45,
 53,
 56,
 62,
 63,
 70,
 72,
 80,
 81,
 88,
 90,
 98,
 99,
 105,
 108,
 116,
 121,
 125,
 126,
 134,
 136,
 143,
 144,
 152,
 153,
 161,
 162,
 168,
 171,
 179,
 180,
 183,
 190,
 197,
 198,
 206,
 207,
 215,
 216,
 223,
 225,
 233,
 235,
 242,
 243,
 249,
 252,
 259,
 265,
 269,
 270,
 278,
 280,
 287,
 288,
 295,
 297,
 305,
 306,
 312,
 315,
 323,
 324,
 327,
 333,
 341,
 343,
 350,
 351,
 359,
 360,
 368,
 369,
 377,
 378,
 386,
 387,
 390,
 396,
 404,
 405,
 413,
 418,
 422,
 424,
 428,
 432,
 440,
 441,
 449,
 450,
 455,
 459,
 467,
 470,
 476,
 477,
 484,
 486,
 494,
 495,
 503,
 506,
 511,
 513,
 519,
 522,
 530,
 531,
 534,
 540,
 548,
 552,
 556,
 558,
 566,
 571,
 575,
 576,
 583,
 585,
 593,
 594,
 600,
 604,
 611,
 618,
 619,
 621,
 629,
 633,
 638,
 639,
 646,
 649,
 656,
 657,
 665,
 666,
 674,
 675,
 681,
 684,
 692,
 697,
 701,
 702,
 710,
 714,
 719,
 720,
 726,
 730,
 737,
 738,
 742,
 747,
 755,
 759,
 764,
 765,


**<font size=4>2. 计算子序列的特征</font>**

子序列特征主要包括：序列长度，序列高度，序列均值，序列标准差

**序列长度**:in+1 - in + 1<br>
**序列高度**:Xn+1 - Xn两个边缘点的值之差<br>
**序列均值**:mean
**序列方差**:std

In [23]:
featurelist = []

In [24]:
for i in range(len(indexarray)-1):
    start = indexarray[i]
    end = indexarray[i+1]
    subsequence = y2[start:end+1]
    sequencelength = end - start + 1
    sequenceheight = abs(y2[end] -  y2[start])
    seqencemean = subsequence.mean()
    seqencestd = subsequence.std()
    featurelist.append([sequencelength, sequenceheight, seqencemean, seqencestd])

In [25]:
features = np.array(featurelist)

In [26]:
features.shape

(176, 4)

**<font size=4>3. 标准化特征</font>**

先尝试使用MinMaxScaler对数据进行标准化

In [27]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
scaler = MinMaxScaler()

In [29]:
features_scaled = scaler.fit_transform(features)

**<font size=4>4. 相关函数定义</font>**

计算任意两点之间的距离

In [30]:
def distance(p,q):
    dist = 0
    for i in range(4):
        dist += (p[i] - q[i]) ** 2
    np.sqrt(dist)
    return dist

In [31]:
def K_dist(data, k, seqindex):
    m = data.shape[0]
    
    p = data[seqindex]
    distanceslist = []
    
    for i in range(m):
        if i == seqindex:
            continue
        q = data[i]
        distanceslist.append(distance(p,q))

    distarray = np.array(distanceslist).reshape(-1,1)
    scaler = MinMaxScaler()
    distarrayscaled = scaler.fit_transform(distarray)
    distarrayscaled = distarrayscaled.reshape(1,-1).ravel()

    sortIndex = distarrayscaled.argsort()
    pmax = distarrayscaled[sortIndex[m-2]]
    sumkdist = 0 
    
    for j in range(k):
        jth = distarray[sortIndex[j]]
        sumkdist += jth
     
    meankdist = sumkdist / k
        
    return meankdist + pmax
    

ndarray.argsort()返回ndarray数组中从小到大排序的下标

查看每个序列的KMDOF，K均距异常因子

In [32]:
#seqnum为序列数
seqnum = features_scaled.shape[0]

In [33]:
seqnum

176

In [34]:
KMDOF = []
for i in range(seqnum):
    kmd = K_dist(features_scaled, k, i)
    KMDOF.append(kmd)

In [35]:
KMDOFarray = np.array(KMDOF).ravel()

In [36]:
KMDOFarray

array([1.03153495, 1.02672099, 1.0118729 , 1.01107739, 1.01903868,
       1.02153603, 1.01150338, 1.06603634, 1.04185611, 1.01496001,
       1.01101824, 1.0373586 , 1.01944619, 1.00879511, 1.01691098,
       1.01466771, 1.01413333, 1.02377861, 1.02580024, 1.01904792,
       1.00806507, 1.01184058, 1.02979259, 1.0186508 , 1.01524705,
       1.02069919, 1.02026691, 1.00744056, 1.01029532, 1.01513917,
       1.0314864 , 1.02485115, 1.0163576 , 1.03039103, 1.01639018,
       1.01313004, 1.02390775, 1.01577038, 1.01041816, 1.01377174,
       1.02227536, 1.02099339, 1.01644702, 1.0316598 , 1.0546195 ,
       1.01822663, 1.01247092, 1.02481938, 1.0200676 , 1.03158672,
       1.05082983, 1.01521114, 1.01707981, 1.00957075, 1.01984201,
       1.01962741, 1.02356746, 1.0346419 , 1.02442292, 1.01104329,
       1.01994857, 1.02910631, 1.02290517, 1.01683196, 1.02232553,
       1.01849905, 1.01003933, 1.01590976, 1.01878283, 1.01954572,
       1.00942494, 1.0126826 , 1.03253693, 1.02718763, 1.01127

In [37]:
KMDOFarray.argsort()

array([ 27,  20, 148,  84,  77, 117,  13, 102, 138,  70, 156,  53, 130,
        66, 152, 116,  28,  85,  38, 170, 166,  10,  59,   3, 105,  78,
        74, 149,   6, 120,  21,   2,  98,  46,  71, 127,  35, 141, 124,
        39, 155,  16, 129, 131,  15,   9,  89,  29,  51,  24, 143, 110,
        37,  99,  67,  81,  32,  34,  42, 171,  63, 147,  14, 159,  52,
       128, 169, 106,  45,  65,  23,  80,  68, 151,  76,   4,  19, 104,
       121,  12,  69,  55, 115, 144,  54,  60,  48, 154,  75,  26,  25,
       134, 133, 168,  41,  83, 145, 101,   5,  40,  64,  62, 137, 142,
       103, 174, 157,  56,  17,  97,  36, 163,  93, 162,  58,  47,  31,
       139, 132,  18, 123,   1, 135,  73,  61, 113,  82,  22, 172,  33,
        86, 150,  30,   0,  49,  43, 122, 146, 167,  72, 119, 140, 173,
       107, 153, 161,  88,  57, 111, 118,  11, 160, 164, 114, 125,   8,
        90, 136, 100,  79, 158, 165,  50, 109,  44,  87, 126, 108,   7,
       112, 175,  94,  95,  92,  91,  96], dtype=int64)

In [38]:
KMDOFarray[96]

1.184152837009642

In [39]:
KMDOFarray[91]

1.1521228545798017

In [40]:
KMDOFarray[175]

1.0723552706659585

In [41]:
startOutlier = indexarray[90]

In [42]:
endOutlier = indexarray[96]

In [43]:
outlierseq = y2[startOutlier:endOutlier+1]

In [44]:
plt.plot(t[startOutlier:endOutlier+1], outlierseq, color="red", linewidth=5, alpha=0.6, label="outlier sequence")

[<matplotlib.lines.Line2D at 0x11306cf8>]

In [45]:
plt.legend()

<matplotlib.legend.Legend at 0x15d999b0>

In [46]:
plt.xlabel("t")
plt.title("Outlier sequence detection")

Text(0.5,1,'Outlier sequence detection')