In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(1379)
import random

import pandas as pd
import os, time
import json

from sklearn.metrics.pairwise import paired_euclidean_distances

from src.clasp import *

In [None]:
# load datasets
def load_floss_dataset(n_change_points=2):
    desc_filename = "datasets/FLOSS/desc.txt"
    desc_file = np.genfromtxt(fname=desc_filename, delimiter=',', filling_values=[None], dtype=None, encoding='utf8')

    df = []

    for ts_name, window_size, floss_score, cp_1, cp_2 in desc_file:
        if n_change_points == 1 and cp_2 != -1: continue

        change_points = [cp_1]
        if cp_2 != -1: change_points.append(cp_2)

        ts = np.loadtxt(fname=os.path.join('datasets/FLOSS/', ts_name + '.txt'), dtype=np.float64)
        df.append((ts_name, window_size, np.array(change_points), ts))

    return pd.DataFrame.from_records(df, columns=["name", "window_size", "change points", "time_series"])


def load_ucrcp_dataset():
    desc_filename = "datasets/UCRCP/desc.txt"
    desc_file = []

    with open(desc_filename, 'r') as file:
        for line in file.readlines(): desc_file.append(line.split(","))

    df = []

    for row in desc_file:
        (ts_name, window_size), change_points = row[:2], row[2:]

        ts = np.loadtxt(fname=os.path.join('datasets/UCRCP/', ts_name + '.txt'), dtype=np.float64)
        df.append((ts_name, int(window_size), np.array([int(_) for _ in change_points]), ts))

    return pd.DataFrame.from_records(df, columns=["name", "window_size", "change points", "time_series"])


def load_combined_dataset():
    return pd.concat([load_floss_dataset(), load_ucrcp_dataset()])

In [None]:
def floss_score(cps_true, cps_pred, ts_len):
    assert len(cps_true) == len(cps_pred), "true/predicted cps must have the same length."
    differences = 0

    for cp_pred in cps_pred:
        distances = paired_euclidean_distances(
            np.array([cp_pred]*len(cps_true)).reshape(-1,1),
            cps_true.reshape(-1,1)
        )
        cp_true_idx = np.argmin(distances, axis=0)
        cp_true = cps_true[cp_true_idx]
        differences += np.abs(cp_pred-cp_true)

    return np.round(differences / ts_len, 6)

In [None]:
if __name__ == '__main__':
    df_comb = load_combined_dataset()

    for idx, (name, window_size, cps, ts) in df_comb.iterrows():
        if idx <1 :
            runtime = time.process_time()
            _, found_cps, _ = extract_clasp_cps(ts, window_size, len(cps))
            runtime = np.round(time.process_time() - runtime, 6)

            score = floss_score(cps, found_cps, ts.shape[0])
            print(f"Time Series: {name}: True Change Points: {cps}, Found Change Points: {found_cps}, Score: {score}, Runtime: {runtime}")

## Compute rolling distribution of distance for each window W_i to the last window W (to all left windows in TS)

In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(1379)
import random

import pandas as pd
import os, time
import json

from sklearn.metrics.pairwise import paired_euclidean_distances

from src.clasp import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def load_TCPD_dataset():
    """ Load a TCPDBench dataset """
    desc_filename = "datasets/TCPD/desc.txt"
    desc_file = []

    with open(desc_filename, 'r') as file:
        for line in file.readlines(): desc_file.append(line.split(","))

    df = []

    for row in desc_file:
        (ts_name, window_size), change_points = row[:2], row[2:]

        #ts = np.loadtxt(fname=os.path.join('datasets/UCRCP/', ts_name + '.txt'), dtype=np.float64)
        filename = os.path.join('datasets/TCPD/', ts_name + '.json')
        with open(filename, "r") as fp:
            data = json.load(fp)

        if data["time"]["index"] != list(range(0, data["n_obs"])):
            raise NotImplementedError(
                "Time series with non-consecutive time axis are not yet supported."
            )

        mat = np.zeros((data["n_obs"], data["n_dim"]))
        for j, series in enumerate(data["series"]):
            mat[:, j] = series["raw"]
        

        # We normalize to avoid numerical errors.
        mat = (mat - np.nanmean(mat)) / np.sqrt(np.nanvar(mat))
        flatten_mat = [e for sublist in mat for e in sublist]
        
        df.append((ts_name, int(window_size), np.array([int(_) for _ in change_points]), flatten_mat))

    return pd.DataFrame.from_records(df, columns=["name", "window_size", "change points", "time_series"])

df = load_TCPD_dataset()
df_tcpd = df.iloc[0]['time_series']

In [12]:
# the sliding windows for a time series and a window size
def sliding_window(ts, window_size):
    shape = ts.shape[:-1] + (ts.shape[-1] - window_size + 1, window_size)
    strides = ts.strides + (ts.strides[-1],)
    return np.lib.stride_tricks.as_strided(ts, shape=shape, strides=strides)
ts = np.array([5, 6, 10, 4, 7, 1, 19])
windows = sliding_window(ts, 2)
windows

array([[ 5,  6],
       [ 6, 10],
       [10,  4],
       [ 4,  7],
       [ 7,  1],
       [ 1, 19]])

In [4]:
def sliding_distance(W, Left):
    """
    W: current window in TS
    Left: Left part of W in TS
    return: distance between current window and each previous window
    """
    m = len(W)
    n = len(Left)
    prev_W = np.empty((len(Left) - m, m))
    # calculte rolling distrubution 
    for i in range(m):
        prev_W[:, i] = Left[i:i-m]
    return np.linalg.norm(W - prev_W, axis=1)


In [5]:
def compute_dist_to_all_window_to_the_left(TS, m):
    """
    TS: time series data
    m: sliding window size
    """
    # L - the number of time points in TS
    L = len(TS) - m + 1
    print('Length of TS: ', L)
    # The distance matrix with size LxL
    distance_arr = [] # array contatining distances
    for i in range(L):
        if i < m:
            continue
        # Current window (aka segment) - the last window maintaining
        W = TS[i:i+m]
        # Compute distance to every windows to its left, notice the index TS[:i]
        distance_arr.append(sliding_distance(W, TS[:i]))
    return distance_arr

In [7]:
def find_k_smallest(ts,k):
    res = sorted(ts)
    index = sorted(range(len(ts)), key = lambda sub: arr[sub])[:k]
    return res[:k],index
ts = [5, 6, 10, 4, 7, 1, 19]
k = 3
res,index = find_k_smallest(ts,k)
print('k smallest elements:', res)
print('index:', index)

k smallest elements: [1, 4, 5]
index: [5, 3, 0]


In [8]:
def k_nearest_neighbors(TS, i, m, k):
    """
    TS: the timeseries
    i:  current window index
    m: window size
    k: number of nearest windows
    return: an array consist of k nearest neighbors of window_i
    """
    W = TS[i:i+m]
    print('W:',W)
    Left = TS[:i]
    print('Left:',Left)
    distance_W = sliding_distance(W, Left)
    print('distance_W:', distance_W)
    dist, index = find_k_smallest(distance_W, 5)
    print('k smallest elements:', dist)
    print('index:', index)
    NNs = {i: j for i, j in zip(index, dist)}
    return NNs
TS = [0, 5, 1, 4, 4, 4, 6, 10, -2, 3]
NNs = k_nearest_neighbors(TS, 6, 1, 2)
NNs

W: [6]
Left: [0, 5, 1, 4, 4, 4]
distance_W: [6. 1. 5. 2. 2.]
k smallest elements: [1.0, 2.0, 2.0, 5.0, 6.0]
index: [1, 3, 4, 2, 0]


{1: 1.0, 3: 2.0, 4: 2.0, 2: 5.0, 0: 6.0}

In [9]:
def largest_gap(NNs):
    """
    NNs: a list of index of k nearest windows to w_i
    return: the largest consecutive gap
    """
    distances = list(NNs.values())
    gaps = []
    for i in range(len(distances)-1):
        gaps.append(abs(distances[i+1]-distances[i])) # try to avoid accessing by index in Python
        
    max_gap = max(gaps)
    return max_gap
print("max_gap:", largest_gap(NNs))

max_gap 3.0


In [None]:
def largest_gap_all_left_windows(TS, n_cps, k, window_size):
    """
    TS: time series data
    n_cps: the number of change points
    k: k nearst neighbors
    window_size: window size to slide windows
    return: an array of largest gap for each window in TS
    """
    # find KNNs for each window W based on distance_arr
    
    # calculate largest gap between KNNs and W
    # find the n_cps largest gap from all windows
    # Show the largest gap as a chart (like in clasp local maximum)




## Load real dataset

In [None]:
floss_data = load_floss_dataset()

In [None]:
floss_data.head()

In [None]:
Powerdemand = floss_data[floss_data.name == 'Powerdemand']

In [None]:
# test with Powerdemand dataset for pilot experiment
Powerdemand

In [None]:
TS = Powerdemand['time_series'].values[0]
print('Series data:', TS)
w = Powerdemand.iloc[0]['window_size']
print('Window size:', w)
dist_arr = compute_dist_to_all_window_to_the_left(TS, w)

## Visualize the distribution of distance

In [None]:
def visualize_as_histogram(ax, data, bins):
    # data: a list of number
    # bins: number of bins in histogram
    ax.hist(data, bins=bins)

In [None]:
def select_a_random_window(ts, cp):
    x = random.choice(range(len(dist_arr)))
    while x >= cp[0] and x <= cp[-1]:
        x = random.choice(range(len(dist_arr)))
    return ts[x]

In [None]:
# visulize windows where change points are located
change_point_window1 = dist_arr[4498]
change_point_window2 = dist_arr[4499]
change_point_window3 = dist_arr[4500]
change_point_window4 = dist_arr[4501]
change_point_window5 = dist_arr[4502]

In [None]:
fig, axes = plt.subplots(ncols=5, figsize=(30, 10))

visualize_as_histogram(axes[0], change_point_window1, 100)
visualize_as_histogram(axes[1], change_point_window2, 100)
visualize_as_histogram(axes[2], change_point_window3, 100)
visualize_as_histogram(axes[3], change_point_window4, 100)
visualize_as_histogram(axes[4], change_point_window5, 100)

plt.show()

In [None]:
cps = [4498, 4499, 4500, 4501, 4502]
fig, axes = plt.subplots(ncols=5, nrows=3, figsize=(30, 30))

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[0][0], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[0][1], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[0][2], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[0][3], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[0][4], w, 100)

########################################
w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[1][0], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[1][1], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[1][2], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[1][3], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[1][4], w, 100)

#################################3

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[2][0], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[2][1], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[2][2], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[2][3], w, 100)

w = select_a_random_window(dist_arr, cps)
visualize_as_histogram(axes[2][4], w, 100)

plt.show()