In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib

Using matplotlib backend: Qt5Agg


**<font size=5>对每天的日均值利用K均距异常因子检测方法找到一种序列分割的方式</font>**

**<font size=4>Ⅰ对每天的每小时的数据求均值，将每天的均值作为该日的O3</font>**

<font size=4>1. 读取数据</font>

In [3]:
filename = "2015O3data.csv"
data = pd.read_csv(filename, header = 0, encoding="utf-8")

<font size=4>2. 查看数据</font>

In [4]:
data.dtypes

date           int64
hour           int64
O3           float64
O3_24h       float64
O3_8h        float64
O3_8h_24h    float64
dtype: object

In [5]:
data.shape

(8212, 6)

In [6]:
data.count()

date         8212
hour         8212
O3           8212
O3_24h       8212
O3_8h        8212
O3_8h_24h    8212
dtype: int64

<font size=4>3. 计算均值</font>

In [7]:
group = data.groupby(data["date"])

利用groupby按照date每天的日期，计算均值

In [8]:
meanSeries = group.mean()["O3"]

通过group.mean()方法计算出来的均值是Series类型的，需要手动转化为DataFrame类型

In [9]:
meanData = pd.DataFrame(meanSeries, columns=["O3"])

In [10]:
meanData.head()

Unnamed: 0_level_0,O3
date,Unnamed: 1_level_1
20150102,20.916667
20150103,15.333333
20150104,21.833333
20150105,14.409091
20150106,31.090909


索引就是每天的日期数据，存储为datetime，此时是index索引类型，需要手动转换

In [11]:
datetime = meanData.index.values

In [12]:
datetimeDF = pd.DataFrame(datetime, columns=["datetime"])

In [13]:
datetimeDF

Unnamed: 0,datetime
0,20150102
1,20150103
2,20150104
3,20150105
4,20150106
5,20150107
6,20150108
7,20150109
8,20150110
9,20150111


重置原来的meanData索引

<font color="red">为了和刚刚生成的日期数据合并，重置均值数据的索引，重置后的索引从0开始</font>

In [14]:
meanData.reset_index(inplace=True,drop=True)

In [15]:
meanData.head()

Unnamed: 0,O3
0,20.916667
1,15.333333
2,21.833333
3,14.409091
4,31.090909


使用pd.concat()合并datetimeDF和meanData

In [16]:
newdata = pd.concat([datetimeDF, meanData], axis=1, join_axes=[datetimeDF.index])

In [17]:
newdata

Unnamed: 0,datetime,O3
0,20150102,20.916667
1,20150103,15.333333
2,20150104,21.833333
3,20150105,14.409091
4,20150106,31.090909
5,20150107,20.250000
6,20150108,24.428571
7,20150109,32.391304
8,20150110,24.458333
9,20150111,34.043478


<font size=4>4. 时间的处理</font>

In [18]:
newdata["datetime"] = pd.to_datetime(newdata["datetime"], format="%Y%m%d")

In [19]:
newdata["year"] = pd.DatetimeIndex(newdata.datetime).year
newdata["month"] = pd.DatetimeIndex(newdata.datetime).month
newdata["day"] = pd.DatetimeIndex(newdata.datetime).day

In [20]:
newdata = newdata.set_index("datetime")
#存储索引，以便后期绘图
dataIndex = newdata.index

<font size=4>5.调整一下数据的位置</font>

In [21]:
orderlist = ["year", "month", "day", "O3"]

In [22]:
newdata = newdata[orderlist]
newdata.head()

Unnamed: 0_level_0,year,month,day,O3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-02,2015,1,2,20.916667
2015-01-03,2015,1,3,15.333333
2015-01-04,2015,1,4,21.833333
2015-01-05,2015,1,5,14.409091
2015-01-06,2015,1,6,31.090909


<font size=4>6.查看以天为分隔的O3均值数据</font>

In [23]:
plt.figure(figsize=(25,10))
xticks = pd.date_range(start=dataIndex.min(), end = dataIndex.max(), freq="m")
plt.xticks(xticks, xticks.strftime("%Y/%m"), rotation=75, ha="right")
plt.plot(newdata["O3"], linewidth=1, alpha=0.9)
plt.xlabel("Time")
plt.ylabel("scaler quantity")
plt.title("the mean value trend of O3 in 2015")
plt.show()

**<font size=4>ⅡK均距异常因子检测</font>**

**<font size=4>1. 计算边缘权重因子</font>**

**<font size=3 color="blue">用于确定子时间序列的边界点</font>**

In [24]:
import numpy as np

**<font color="red">注意：滑动窗口的宽度和大小表示的含义有所不同</font>**

滑动窗口的宽度为7，表示向后一共滑动7次，每次一步

In [25]:
#假设将检测窗口的宽度设置为7（即一周）
w = 7

滑动窗口的大小为7，表示一个滑动窗口包含7条数据

In [26]:
#滑动窗口的大小为
windowsize=7

In [27]:
datatest = newdata["O3"]

In [28]:
datatest.head()

datetime
2015-01-02    20.916667
2015-01-03    15.333333
2015-01-04    21.833333
2015-01-05    14.409091
2015-01-06    31.090909
Name: O3, dtype: float64

In [29]:
testarray = datatest.values

In [30]:
testarray

array([ 20.91666667,  15.33333333,  21.83333333,  14.40909091,
        31.09090909,  20.25      ,  24.42857143,  32.39130435,
        24.45833333,  34.04347826,  24.55      ,  19.125     ,
        18.91304348,  14.47826087,  35.54545455,  37.29166667,
        46.4       ,  30.91304348,  22.875     ,  23.7826087 ,
        25.20833333,  17.65      ,  12.86956522,  11.41666667,
        13.95454545,  30.125     ,  30.13043478,  21.22727273,
        17.08695652,  21.875     ,  25.29166667,  17.33333333,
        19.34782609,  25.04347826,  20.95652174,  25.56521739,
        32.30434783,  33.91666667,  37.04166667,  41.08333333,
        49.82608696,  29.5       ,  43.2173913 ,  35.65217391,
        40.54166667,  37.16666667,  41.58333333,  49.54166667,
        49.95833333,  38.95833333,  36.08695652,  47.56521739,
        22.95454545,  18.36363636,  33.69565217,  48.5       ,
        37.66666667,  32.25      ,  37.625     ,  26.04347826,
        53.33333333,  46.08695652,  48.79166667,  47.12

定义一个ndarray，存储下该数据在滑动窗口中成为极值的次数

In [31]:
countnum = np.zeros_like(testarray)

In [32]:
countnum

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [33]:
m = countnum.shape[0]
m

364

通常，把收尾两个数据点，直接视为边界点

In [34]:
import math

In [35]:
testarray

array([ 20.91666667,  15.33333333,  21.83333333,  14.40909091,
        31.09090909,  20.25      ,  24.42857143,  32.39130435,
        24.45833333,  34.04347826,  24.55      ,  19.125     ,
        18.91304348,  14.47826087,  35.54545455,  37.29166667,
        46.4       ,  30.91304348,  22.875     ,  23.7826087 ,
        25.20833333,  17.65      ,  12.86956522,  11.41666667,
        13.95454545,  30.125     ,  30.13043478,  21.22727273,
        17.08695652,  21.875     ,  25.29166667,  17.33333333,
        19.34782609,  25.04347826,  20.95652174,  25.56521739,
        32.30434783,  33.91666667,  37.04166667,  41.08333333,
        49.82608696,  29.5       ,  43.2173913 ,  35.65217391,
        40.54166667,  37.16666667,  41.58333333,  49.54166667,
        49.95833333,  38.95833333,  36.08695652,  47.56521739,
        22.95454545,  18.36363636,  33.69565217,  48.5       ,
        37.66666667,  32.25      ,  37.625     ,  26.04347826,
        53.33333333,  46.08695652,  48.79166667,  47.12

In [36]:
for i in range(1,m-w):
    num = 0
    value = testarray[i]
    
    for j in range(w):
        windowarray = testarray[(i+j):(i+j+w)]
        wmin = windowarray.min()
        wmax = windowarray.max()
    
        if abs(wmin-value) < 1: 
            num -= 1
        elif abs(wmax-value) < 1:
            num += 1
    
    countnum[i] = abs(num)

In [37]:
countnum

array([0., 4., 0., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       6., 0., 0., 0., 3., 0., 1., 1., 2., 1., 1., 4., 2., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 0., 7., 1., 0., 3., 0., 0., 0., 2., 1., 0., 0.,
       3., 0., 1., 0., 1., 3., 0., 5., 1., 1., 0., 2., 0., 0., 1., 0., 0.,
       0., 4., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 1., 0., 2., 1., 1., 1., 1., 0., 0., 0., 2., 1., 1.,
       0., 1., 1., 5., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 7., 0., 1., 0., 0., 0.,
       1., 4., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 2., 0., 4., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
       1., 1., 0., 0., 3., 2., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

找到windowsize=7为一个单位，找出其中的最大值的下标

In [38]:
size = int(countnum.shape[0] / w)

In [39]:
indexarray = [0,m-1]

In [40]:
for i in range(size):
    windowcount = countnum[i*w:(i+1)*w]
    maxindex = windowcount.argmax()
    
    if maxindex == 0:
        continue
    
    indexarray.append(i * w + maxindex)

In [41]:
indexarray

[0,
 363,
 1,
 13,
 17,
 40,
 43,
 51,
 58,
 69,
 92,
 100,
 108,
 116,
 122,
 135,
 163,
 171,
 178,
 188,
 193,
 197,
 208,
 215,
 228,
 242,
 246,
 260,
 267,
 276,
 288,
 303,
 311,
 317,
 328,
 330,
 339,
 347,
 355]

In [42]:
indexarray.sort()

In [43]:
indexarray

[0,
 1,
 13,
 17,
 40,
 43,
 51,
 58,
 69,
 92,
 100,
 108,
 116,
 122,
 135,
 163,
 171,
 178,
 188,
 193,
 197,
 208,
 215,
 228,
 242,
 246,
 260,
 267,
 276,
 288,
 303,
 311,
 317,
 328,
 330,
 339,
 347,
 355,
 363]

In [44]:
piece = pd.DataFrame(newdata.iloc[indexarray]["O3"], columns=["O3"])

In [45]:
piece

Unnamed: 0_level_0,O3
datetime,Unnamed: 1_level_1
2015-01-02,20.916667
2015-01-03,15.333333
2015-01-15,14.478261
2015-01-19,30.913043
2015-02-11,49.826087
2015-02-14,35.652174
2015-02-22,47.565217
2015-03-01,37.625
2015-03-12,48.857143
2015-04-04,79.416667


In [46]:
plt.scatter(piece.index, piece.O3, color="red")

<matplotlib.collections.PathCollection at 0x1328c8d0>

其中，图中的红点就是我们寻找的边缘因子，后期会根据边缘因子对原始时间序列进行切分，划分为长度不一的子序列

**<font size=4>2. 计算子序列的特征</font>**

子序列特征主要包括：序列长度，序列高度，序列均值，序列标准差

**序列长度**:in+1 - in + 1<br>
**序列高度**:Xn+1 - Xn两个边缘点的值之差<br>
**序列均值**:mean
**序列方差**:std

In [47]:
featurelist = []

In [48]:
for i in range(len(indexarray)-1):
    start = indexarray[i]
    end = indexarray[i+1]
    subsequence = newdata.iloc[start:end+1]["O3"]
    sequencelength = end - start + 1
    sequenceheight = newdata.iloc[end]["O3"] -  newdata.iloc[start]["O3"]
    seqencemean = subsequence.mean()
    seqencestd = subsequence.std()
    featurelist.append([sequencelength, sequenceheight, seqencemean, seqencestd])

In [49]:
len(featurelist)

38

In [50]:
len(indexarray)

39

39个点将序列分隔成38个子序列

为了后续便于处理，将存储的序列特征的列表转换为ndarray数组

In [51]:
features = np.array(featurelist)

In [52]:
features.shape

(38, 4)

In [53]:
features

array([[  2.        ,  -5.58333333,  18.125     ,   3.94801286],
       [ 13.        ,  -0.85507246,  22.71574295,   6.62684835],
       [  5.        ,  16.43478261,  32.92568511,  11.74564106],
       [ 24.        ,  18.91304348,  25.28435716,   9.19900092],
       [  4.        , -14.17391304,  39.54891304,   8.8551918 ],
       [  9.        ,  11.91304348,  41.89492754,   5.71463321],
       [  8.        ,  -9.94021739,  34.82758976,  10.58991444],
       [ 12.        ,  11.23214286,  42.5075836 ,   7.17846974],
       [ 24.        ,  30.55952381,  47.14065735,  16.82554615],
       [  9.        ,  -8.59848485,  54.80595552,  16.28892186],
       [  9.        ,  -7.90909091,  57.51587049,  18.95789419],
       [  9.        ,  23.70995671,  76.04930255,  17.32939091],
       [  7.        ,   0.38095238,  88.67961335,  24.92628245],
       [ 14.        ,   0.80952381,  74.73149832,  18.25665975],
       [ 29.        ,  51.53830228,  96.39580599,  24.39118687],
       [  9.        , -10

**<font size=4>3. 标准化特征</font>**

先尝试使用MinMaxScaler对数据进行标准化

In [54]:
from sklearn.preprocessing import MinMaxScaler

In [55]:
scaler = MinMaxScaler()

In [56]:
features_scaled = scaler.fit_transform(features)

In [57]:
features_scaled

array([[0.        , 0.53378763, 0.04659817, 0.        ],
       [0.40740741, 0.56612563, 0.09173898, 0.11274632],
       [0.11111111, 0.68437616, 0.19213343, 0.3281851 ],
       [0.81481481, 0.70132573, 0.11699619, 0.22100259],
       [0.07407407, 0.47503407, 0.25725969, 0.20653242],
       [0.25925926, 0.65345063, 0.28032808, 0.07435318],
       [0.22222222, 0.50398959, 0.21083488, 0.2795431 ],
       [0.37037037, 0.64879374, 0.28635233, 0.13596286],
       [0.81481481, 0.78097952, 0.33190938, 0.54198719],
       [0.25925926, 0.51316611, 0.40728233, 0.51940184],
       [0.25925926, 0.51788108, 0.43392895, 0.63173305],
       [0.25925926, 0.73413327, 0.61616835, 0.56319291],
       [0.18518519, 0.57457918, 0.74036231, 0.88292944],
       [0.44444444, 0.57751031, 0.60321037, 0.60221962],
       [1.        , 0.92445971, 0.8162357 , 0.86040844],
       [0.25925926, 0.50209392, 1.        , 0.75143615],
       [0.22222222, 0.        , 0.58947556, 1.        ],
       [0.33333333, 1.        ,

In [58]:
features_scaled[2]

array([0.11111111, 0.68437616, 0.19213343, 0.3281851 ])

**<font size=4>4. 相关函数定义</font>**

定义距离计算函数，这里使用欧式距离

In [59]:
def distance(p,q):
    dist = 0
    for i in range(4):
        dist += (p[i] - q[i]) ** 2
    np.sqrt(dist)
    return dist

In [60]:
def K_dist(data, k, pindex):
    m = data.shape[0]
    p = data[pindex]
    distanceslist = []
    
    for i in range(m):
        if i == pindex:
            continue
        q = data[i]
        distanceslist.append(distance(p,q))
    
    distarray = np.array(distanceslist).reshape(-1,1)
    scaler = MinMaxScaler()
    distarrayscaled = scaler.fit_transform(distarray)
    distarrayscaled = distarrayscaled.reshape(1,-1).ravel()

    sortIndex = distarrayscaled.argsort()
    pmax = distarrayscaled[sortIndex[m-2]]
    sumkdist = 0 
    
    for j in range(k):
        jth = distarrayscaled[sortIndex[j]]
        sumkdist += jth
    
    meankdist = sumkdist / k
        
    return meankdist + pmax


ndarray.argsort()返回ndarray数组中从小到大排序的下标

查看每个序列的KMDOF，K均距异常因子

In [61]:
#seqnum为序列数
seqnum = features_scaled.shape[0]

In [62]:
seqnum

38

In [63]:
k = 7

In [64]:
KMDOF = []
for i in range(seqnum):
    kmd = K_dist(features_scaled, k, i)
    KMDOF.append(kmd)

In [65]:
KMDOFarray = np.array(KMDOF)

In [66]:
KMDOFarray

array([1.02593532, 1.01588708, 1.01564755, 1.06733205, 1.02238062,
       1.03502889, 1.02029904, 1.04897491, 1.07532057, 1.05017999,
       1.04476081, 1.03280899, 1.08295194, 1.05341153, 1.06776188,
       1.07095794, 1.04923122, 1.06329906, 1.06878382, 1.05098001,
       1.03782649, 1.04017839, 1.05510257, 1.02539483, 1.05161087,
       1.0349553 , 1.06908124, 1.04967967, 1.0147834 , 1.15104776,
       1.02105551, 1.01642719, 1.0507631 , 1.01811923, 1.01141321,
       1.07186373, 1.04354825, 1.01458616])

In [67]:
KMDOFarray.argsort()[seqnum-1]

29

## 发现第29个序列最异常

In [68]:
startOutlier = indexarray[28]

In [69]:
endOutlier = indexarray[29]

In [70]:
outlierseq = pd.DataFrame(newdata.iloc[startOutlier:endOutlier+1]["O3"], columns=["O3"])

In [71]:
plt.plot(outlierseq)

[<matplotlib.lines.Line2D at 0x16379f28>]