In [61]:
from sklearn.preprocessing import scale
from sklearn.metrics import pairwise
import numpy as np
import ot
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm


In [62]:
def subsequences(time_series, k):
    """Return every length-``k`` sliding window of a univariate time series.

    Parameters
    ----------
    time_series : array-like, 1-D
        The series to slice.
    k : int
        Window length; must satisfy ``1 <= k <= len(time_series)``.

    Returns
    -------
    numpy.ndarray, shape ``(n - k + 1, k)``
        Row ``i`` equals ``time_series[i:i + k]``. The result is a read-only
        strided view: rows overlap in memory, so writing through it would
        silently modify several windows at once.

    Raises
    ------
    ValueError
        If the input is not one-dimensional or ``k`` is out of range.
    """
    time_series = np.asarray(time_series)
    if time_series.ndim != 1:
        raise ValueError(
            f"subsequences expects a 1-D series, got {time_series.ndim} dimension(s)"
        )
    n = time_series.size
    if not 1 <= k <= n:
        raise ValueError(
            f"window length k={k} must be between 1 and the series length {n}"
        )

    # Moving one row down and moving one column right both advance the
    # underlying buffer by a single element.
    step = time_series.strides[0]
    return np.lib.stride_tricks.as_strided(
        time_series,
        shape=(n - k + 1, k),
        strides=(step, step),
        writeable=False,
    )

def subsequences_multivariate(time_series, k):
    """Return every length-``k`` sliding window along the first axis.

    Works for univariate input of shape ``(n,)``, multivariate input of shape
    ``(n, d)`` and, more generally, any array whose first axis is the one the
    window slides over.

    Parameters
    ----------
    time_series : array-like, shape ``(n, ...)``
        The series to slice; windows slide over axis 0.
    k : int
        Window length; must satisfy ``1 <= k <= n``.

    Returns
    -------
    numpy.ndarray, shape ``(n - k + 1, k, ...)``
        Window ``i`` equals ``time_series[i:i + k]``. The result is a
        read-only strided view because the windows overlap in memory.

    Raises
    ------
    ValueError
        If the input is 0-dimensional or ``k`` is out of range.
    """
    time_series = np.asarray(time_series)
    if time_series.ndim < 1:
        raise ValueError("subsequences_multivariate expects at least a 1-D series")
    n = len(time_series)
    if not 1 <= k <= n:
        raise ValueError(
            f"window length k={k} must be between 1 and the series length {n}"
        )

    shape = (n - k + 1, k) + time_series.shape[1:]
    # The window index and the position inside a window both walk along
    # axis 0, so its stride appears twice; trailing axes keep their strides.
    strides = (time_series.strides[0],) + time_series.strides
    return np.lib.stride_tricks.as_strided(
        time_series, shape=shape, strides=strides, writeable=False
    )

In [63]:
def wasserstein_kernel(subsequences_1, subsequences_2, metric='euclidean'):
    """Optimal-transport cost between two sets of subsequences.

    A pairwise ground-cost matrix between the two sets is built with
    ``ot.dist`` using ``metric``, and the resulting transport problem is
    solved with ``ot.emd2``.

    Parameters
    ----------
    subsequences_1, subsequences_2 : array-like
        The two sets of subsequences, one subsequence per row.
    metric : str, optional
        Ground metric forwarded to ``ot.dist`` (default ``'euclidean'``).

    Returns
    -------
    The value returned by ``ot.emd2`` for the cost matrix.
    """
    # Empty weight lists: POT documents these as "use uniform weights".
    source_weights, target_weights = [], []
    ground_cost = ot.dist(subsequences_1, subsequences_2, metric=metric)
    return ot.emd2(source_weights, target_weights, ground_cost)

def wasserstein_kernel_multivariate(subsequences_1, subsequences_2, metric='euclidean'):
    """Optimal-transport cost between two sets of multivariate subsequences.

    Each subsequence (window) is flattened into a single vector so that the
    two sets form 2-D matrices, the pairwise ground-cost matrix is built with
    ``ot.dist`` and the transport problem is solved with ``ot.emd2``.

    Parameters
    ----------
    subsequences_1, subsequences_2 : array-like, shape ``(m, k, ...)``
        Stacks of windows; the first axis indexes the windows.
    metric : str, optional
        Ground metric forwarded to ``ot.dist`` (default ``'euclidean'``).

    Returns
    -------
    The value returned by ``ot.emd2`` for the cost matrix.
    """
    # One row per window: (m, k, d) -> (m, k * d). 2-D input is unchanged.
    windows_1 = np.asarray(subsequences_1)
    windows_1 = windows_1.reshape(windows_1.shape[0], -1)
    windows_2 = np.asarray(subsequences_2)
    windows_2 = windows_2.reshape(windows_2.shape[0], -1)

    C = ot.dist(windows_1, windows_2, metric=metric)
    # Empty weight lists: POT documents these as "use uniform weights".
    return ot.emd2([], [], C)

In [146]:
def checkEven(x, y):
    """Zero-pad two 2-D arrays so that they end up with the same shape.

    Whichever array has fewer rows gets rows of zeros appended at the bottom,
    and whichever has fewer columns gets columns of zeros appended on the
    right. An array that needs no padding is returned unchanged.

    Parameters
    ----------
    x, y : numpy.ndarray, 2-D

    Returns
    -------
    tuple
        ``(x, y)`` padded to the common shape.
    """
    def _grow(arr, extra_rows, extra_cols):
        # Append zero rows first so the zero columns span the final height.
        if extra_rows > 0:
            filler = np.zeros((extra_rows, arr.shape[1]))
            arr = np.concatenate((arr, filler), axis=0)
        if extra_cols > 0:
            filler = np.zeros((arr.shape[0], extra_cols))
            arr = np.concatenate((arr, filler), axis=1)
        return arr

    surplus_rows = x.shape[0] - y.shape[0]
    surplus_cols = x.shape[1] - y.shape[1]
    # A positive surplus means y is the smaller one; negative means x is.
    padded_x = _grow(x, -surplus_rows, -surplus_cols)
    padded_y = _grow(y, surplus_rows, surplus_cols)
    return padded_x, padded_y
def cost_matrix_aw(x, y, subsequence=True):
    """Pairwise Euclidean cost matrix between the rows of ``x`` and ``y``.

    Every element along the first axis (for example one window of shape
    ``(k, d)``) is flattened into a single vector before the distances are
    computed with ``ot.dist``.

    Parameters
    ----------
    x, y : array-like, shape ``(m, ...)`` and ``(n, ...)``
        Inputs whose first axis indexes the items to compare. After
        flattening, both must have the same number of columns.
    subsequence : bool, optional
        Kept only for backward compatibility. Both settings have always
        produced the same result, so the flag has no effect.

    Returns
    -------
    The ``(m, n)`` cost matrix returned by ``ot.dist``.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # One row per item: (m, k, d) -> (m, k * d); 1-D input becomes (m, 1).
    x = x.reshape(x.shape[0], -1)
    y = y.reshape(y.shape[0], -1)
    return ot.dist(x, y, metric="euclidean", p=2)

In [147]:
def wtk(xs, xt, sub_length, normalized=False):
    """Wasserstein distance between two univariate series via their windows.

    Both series are cut into all sliding windows of length ``sub_length`` and
    the two window sets are compared with ``wasserstein_kernel``.

    Parameters
    ----------
    xs, xt : array-like, 1-D
        The two time series.
    sub_length : int
        Length of the sliding windows.
    normalized : bool, optional
        If true, each window (row) is standardised with
        ``sklearn.preprocessing.scale(..., axis=1)`` before comparison.

    Returns
    -------
    The value returned by ``wasserstein_kernel`` for the two window sets.
    """
    s_i = subsequences(xs, sub_length)
    s_j = subsequences(xt, sub_length)
    if normalized:
        s_i = scale(s_i, axis=1)
        # Each series must be scaled into its own variable; assigning the
        # scaled second series to s_i would discard the first series.
        s_j = scale(s_j, axis=1)
    return wasserstein_kernel(s_i, s_j)

def _standardize_windows(windows):
    """Z-normalise every window along its time axis (axis 1)."""
    windows = np.asarray(windows, dtype=float)
    mean = windows.mean(axis=1, keepdims=True)
    std = windows.std(axis=1, keepdims=True)
    # Constant windows have zero spread; leave them centred instead of
    # dividing by zero.
    std[std == 0] = 1.0
    return (windows - mean) / std


def wtk_multivariate(xs, xt, sub_ratio=0.1, normalized=False):
    """Wasserstein distance between two (possibly multivariate) series.

    The window length is ``sub_ratio`` times the length of the shorter
    series (at least one time step). Both series are cut into all sliding
    windows of that length, each window set gets uniform weights, and the
    optimal-transport cost under the Euclidean ground cost from
    ``cost_matrix_aw`` is returned.

    Parameters
    ----------
    xs, xt : array-like, shape ``(n,)`` or ``(n, d)``
        The two time series; they may have different lengths.
    sub_ratio : float, optional
        Window length as a fraction of the shorter series, in ``(0, 1]``.
    normalized : bool, optional
        If true, every window is z-normalised along its time axis. This is
        done with NumPy so that it also works for 3-D window stacks, which
        ``sklearn.preprocessing.scale`` does not accept.

    Returns
    -------
    float
        ``sum(T * C)`` for the optimal transport plan ``T`` and cost
        matrix ``C``.

    Raises
    ------
    ValueError
        If ``sub_ratio`` is outside ``(0, 1]`` or either series is empty.
    """
    if not 0 < sub_ratio <= 1:
        raise ValueError(f"sub_ratio must be in (0, 1], got {sub_ratio}")
    shortest = min(len(xs), len(xt))
    if shortest == 0:
        raise ValueError("both time series must contain at least one time step")

    # Short series combined with a small ratio would floor to 0; a window
    # needs at least one time step.
    sub_length = max(1, int(np.floor(shortest * sub_ratio)))

    subs_xs = subsequences_multivariate(xs, sub_length)
    subs_xt = subsequences_multivariate(xt, sub_length)
    if normalized:
        subs_xs = _standardize_windows(subs_xs)
        subs_xt = _standardize_windows(subs_xt)

    # Uniform mass on every window of each series.
    p = np.full(len(subs_xs), 1.0 / len(subs_xs))
    q = np.full(len(subs_xt), 1.0 / len(subs_xt))
    C = cost_matrix_aw(subs_xs, subs_xt)
    T = ot.emd(p, q, C)
    return np.sum(T * C)

In [148]:
def knn_wtk_short(X_train, X_test, y_train, y_test, sub_length=10, k=1, normalized=False):
    """Accuracy of a k-NN classifier that uses ``wtk`` as its distance.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test series, passed to ``KNeighborsClassifier`` as-is.
    y_train, y_test : array-like
        Corresponding labels.
    sub_length : int, optional
        Window length forwarded to ``wtk``.
    k : int, optional
        Number of neighbours.
    normalized : bool, optional
        Forwarded to ``wtk``.

    Returns
    -------
    The ``accuracy_score`` of the predictions on ``X_test``.
    """
    wtk_kwargs = {"sub_length": sub_length, "normalized": normalized}
    # n_jobs=-1 asks scikit-learn to parallelise the neighbour search.
    model = KNeighborsClassifier(
        n_neighbors=k, metric=wtk, metric_params=wtk_kwargs, n_jobs=-1
    )
    predictions = model.fit(X_train, y_train).predict(X_test)
    return accuracy_score(y_test, predictions)

In [149]:
def knn_classifier_from_distance_matrix(distance_matrix, k, labels):
    """Predict labels with k-NN from a precomputed test-to-train distance matrix.

    Parameters
    ----------
    distance_matrix : numpy.ndarray, shape ``(n_test, n_train)``
        ``distance_matrix[i, j]`` is the distance from test sample ``i`` to
        training sample ``j``.
    k : int
        Number of neighbours.
    labels : array-like, length ``n_train``
        Training labels, in the column order of ``distance_matrix``.

    Returns
    -------
    numpy.ndarray, length ``n_test``
        Predicted label for every row of ``distance_matrix``.
    """
    knn_clf = KNeighborsClassifier(
        n_neighbors=k, algorithm="brute", metric="precomputed"
    )
    n_train_samples = distance_matrix.shape[1]
    # With metric="precomputed" fit() only needs a square (n_train, n_train)
    # matrix to register the labels; prediction relies solely on the
    # distances passed to predict(). A zero placeholder keeps this
    # deterministic and leaves the global NumPy RNG state untouched.
    placeholder = np.zeros((n_train_samples, n_train_samples))
    knn_clf.fit(placeholder, labels)
    return knn_clf.predict(distance_matrix)
def knn_wtk(X_train, X_test, y_train, y_test, sub_ratio=0.1, normalized=False, k=1):
    """Accuracy of k-NN classification using the ``wtk_multivariate`` distance.

    The full test-by-train distance matrix is computed pair by pair and then
    handed to ``knn_classifier_from_distance_matrix``.

    Parameters
    ----------
    X_train, X_test : sequence of array-like
        Training and test series; series may differ in length.
    y_train, y_test : array-like
        Corresponding labels.
    sub_ratio : float, optional
        Forwarded to ``wtk_multivariate``.
    normalized : bool, optional
        Forwarded to ``wtk_multivariate``.
    k : int, optional
        Number of neighbours.

    Returns
    -------
    The ``accuracy_score`` of the predictions on the test set.
    """
    # Convert every series once, not once per (train, test) pair.
    train_series = [np.asarray(X_train[i]) for i in range(len(X_train))]
    test_series = [np.asarray(X_test[i]) for i in range(len(X_test))]

    # Rows are test samples and columns are training samples, as expected by
    # a k-NN classifier with a precomputed metric.
    result = np.zeros((len(test_series), len(train_series)))
    for train_idx, train_ts in enumerate(tqdm(train_series)):
        for test_idx, test_ts in enumerate(tqdm(test_series, leave=False)):
            result[test_idx, train_idx] = wtk_multivariate(
                train_ts, test_ts, sub_ratio=sub_ratio, normalized=normalized
            )

    y_pred = knn_classifier_from_distance_matrix(
        distance_matrix=result,
        k=k,
        labels=y_train,
    )
    return accuracy_score(y_test, y_pred)

In [152]:
from GetData.GetDataMultiDimensions import get_test_data, get_train_data, getDataMultiVariate
from GetData.GetDataOneDimension import getData
import os

# Load the univariate ItalyPowerDemand train/test split.
X_train, y_train, X_test, y_test = getData('ItalyPowerDemand', '../Data/OneDimension/')
# NOTE(review): X_train_arabic / X_test_arabic are only assigned in a later
# cell, so on a fresh kernel run top-to-bottom this line raises NameError.
len(X_train[0]), len(X_test), len(X_train_arabic), len(X_test_arabic)

(24, 1029, 6600, 2200)

In [159]:
# Load the multivariate ArabicCut train/test split.
X_train_arabic, X_test_arabic, y_train_arabic, y_test_arabic = getDataMultiVariate("ArabicCut", data_path="../Data/MultiDimensions/ArabicCut")

In [161]:
# ArabicCut: 1-NN accuracy with the WTK distance; for each pair the window
# length is 60% of the shorter series.
knn_wtk(X_train=X_train_arabic, X_test=X_test_arabic, y_train=y_train_arabic, y_test=y_test_arabic, sub_ratio=0.6)

100%|██████████| 330/330 [00:17<00:00, 18.86it/s]


0.4818181818181818

In [158]:
# ItalyPowerDemand (X_train / X_test come from getData, not from the
# ArabicCut loader): 1-NN WTK accuracy with sub_ratio=0.2.
knn_wtk(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, sub_ratio=0.2)

 64%|██████▍   | 43/67 [00:17<00:09,  2.44it/s]


KeyboardInterrupt: 

In [72]:
# ItalyPowerDemand: 1-NN accuracy using the univariate wtk metric with a
# fixed window length of 12.
knn_wtk_short(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, sub_length=12)

0.9523809523809523

In [105]:
# ItalyPowerDemand: 1-NN accuracy from the precomputed WTK distance matrix;
# the window length is 50% of the shorter series of each pair.
knn_wtk(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, sub_ratio=0.5)

100%|██████████| 67/67 [00:37<00:00,  1.81it/s]


0.9523809523809523

In [70]:
# ItalyPowerDemand, sub_ratio=0.5.
# NOTE(review): this call is identical to another cell in this notebook;
# one of the two can be removed.
knn_wtk(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, sub_ratio=0.5)

  0%|          | 0/67 [00:00<?, ?it/s]

100%|██████████| 67/67 [00:53<00:00,  1.25it/s]


0.9523809523809523

In [53]:
# Scratch list for experiments. NOTE(review): the name `a` is reassigned to
# unrelated objects by several later cells.
a = [1,2,3,4,5,6,7,8,9,10]

In [76]:
# Length along the first axis of the first ArabicCut training series.
len(X_train_arabic[0])

37

In [82]:
# All length-4 sliding windows of the first ArabicCut training series.
a = subsequences_multivariate(np.array(X_train_arabic[0]), 4)

In [83]:
# Number of windows, i.e. n - k + 1 for series length n and window length k.
len(a)

34

In [86]:
# Shape of the first ArabicCut training series; the sliding windows run
# over axis 0 (presumably time steps x features - confirm with the loader).
X_train_arabic[0].shape

(37, 13)

In [94]:
def check(a, b):
    """Debug helper: verify that two arrays can be compared with ``ot.dist``.

    Prints the shapes of ``a`` and ``b``, flattens everything after the first
    axis, runs ``ot.dist`` on the flattened arrays (which raises if they are
    incompatible) and prints the flattened shapes. Returns ``None``.
    """
    print(a.shape, b.shape)
    # Keep the first axis and collapse the rest into one feature axis.
    flat_a = a.reshape(a.shape[0], -1)
    flat_b = b.reshape(b.shape[0], -1)
    # Only the success of the call matters; the matrix itself is discarded.
    _ = ot.dist(flat_a, flat_b, metric="euclidean")
    print(flat_a.shape, flat_b.shape)

In [95]:
# Sanity check on one ArabicCut train/test pair.
check(X_train_arabic[0], X_test_arabic[0])

(37, 13) (27, 13)
(37, 13) (27, 13)


In [132]:
# Length-10 windows of a multivariate series: the result gains a window
# axis in front, giving shape (n - 10 + 1, 10, <trailing axes>).
m = subsequences_multivariate(X_train_arabic[0], 10)
m.shape, X_train_arabic[0].shape

((28, 10, 13), (37, 13))

In [131]:
# The same helper on a univariate (1-D) series gives a 2-D array of
# shape (n - 10 + 1, 10).
n = subsequences_multivariate(X_train[0], 10)
n.shape, X_train[0].shape

((15, 10), (24,))

In [134]:
# Flatten every window into one row, as cost_matrix_aw does.
# NOTE(review): x comes from the univariate series and y from the
# multivariate one, so their row widths differ and they cannot be passed
# to ot.dist together.
x = np.array(n).reshape(np.array(n).shape[0], -1)
y = np.array(m).reshape(np.array(m).shape[0], -1)
x.shape, y.shape

((15, 10), (28, 130))

In [130]:
# Toy example: pairwise Euclidean distances between 5 and 6 two-dimensional
# points; the result has one row per point of `a` and one column per point
# of `b`.
a = np.array([[1,2],[3,4],[5,6],[7,8],[9,10]])
b = np.array([[2,3],[4,5],[6,7],[8,9],[10,11], [11, 12]])
s = ot.dist(a,b, metric="euclidean", p=2)
s

array([[ 1.41421356,  4.24264069,  7.07106781,  9.89949494, 12.72792206,
        14.14213562],
       [ 1.41421356,  1.41421356,  4.24264069,  7.07106781,  9.89949494,
        11.3137085 ],
       [ 4.24264069,  1.41421356,  1.41421356,  4.24264069,  7.07106781,
         8.48528137],
       [ 7.07106781,  4.24264069,  1.41421356,  1.41421356,  4.24264069,
         5.65685425],
       [ 9.89949494,  7.07106781,  4.24264069,  1.41421356,  1.41421356,
         2.82842712]])

In [142]:
# Length-10 windows for one ItalyPowerDemand train/test pair.
# NOTE: xs and xt are rebound from the raw series to their window arrays.
xs, xt = X_train[0], X_test[0]
xs = subsequences_multivariate(xs, 10)
xt = subsequences_multivariate(xt, 10)
xs.shape, xt.shape, X_train[0].shape, X_test[0].shape

((15, 10), (15, 10), (24,), (24,))

In [138]:
# Window-to-window cost matrix for the current xs / xt.
# NOTE(review): this depends on which cell last assigned xs and xt, and it
# displays the full matrix; showing a.shape and a small slice would be enough.
a = cost_matrix_aw(xs, xt)
a.shape, a

((15, 15),
 array([[2.92422471, 1.52494548, 0.95793296, 2.05599471, 3.16647458,
         4.09092487, 4.69806488, 5.16141802, 5.31913661, 5.17047855,
         4.65077782, 3.73202258, 3.05482472, 3.04826275, 3.82320833],
        [3.87581379, 2.69234133, 1.4997384 , 1.18265142, 2.2762774 ,
         3.47152227, 4.38594529, 5.08351792, 5.50074669, 5.59075094,
         5.24091182, 4.36578664, 3.39706496, 2.84666483, 2.90868504],
        [4.50271157, 3.50408729, 2.57992109, 1.47632639, 1.40480571,
         2.56987967, 3.7050912 , 4.66047375, 5.35281573, 5.66771282,
         5.48981062, 4.78452261, 3.92895252, 3.06337465, 2.50865552],
        [4.98626135, 4.12192656, 3.40393492, 2.49675858, 1.51741481,
         1.63210173, 2.72231408, 3.87465934, 4.84768532, 5.43050367,
         5.46515397, 4.97230468, 4.40281623, 3.65744415, 2.71329723],
        [5.19368038, 4.63591082, 4.09756775, 3.33908446, 2.46537487,
         1.48916824, 1.5790048 , 2.68494058, 3.93449725, 4.80984382,
         5.106193  

In [144]:
# Length-10 windows for one ArabicCut train/test pair; the two series have
# different lengths, so the number of windows differs as well.
# NOTE: xs and xt are rebound from the raw series to their window arrays.
xs, xt = X_train_arabic[0], X_test_arabic[0]
xs = subsequences_multivariate(xs, 10)
xt = subsequences_multivariate(xt, 10)
xs.shape, xt.shape, X_train_arabic[0].shape, X_test_arabic[0].shape

((28, 10, 13), (18, 10, 13), (37, 13), (27, 13))

In [145]:
# Window-to-window cost matrix for the current xs / xt; cost_matrix_aw
# flattens each window into one row before calling ot.dist.
# NOTE(review): this displays the full matrix; a.shape and a small slice
# would be enough.
a = cost_matrix_aw(xs, xt)
a.shape, a

((28, 18),
 array([[16.74837578, 18.57405355, 20.62559658, 22.33089174, 23.41404617,
         23.88138955, 24.06304208, 24.43016653, 24.09362315, 22.72499319,
         21.94123684, 19.94600327, 19.08419337, 18.65372775, 18.6654895 ,
         19.2155159 , 20.22841408, 21.65655729],
        [13.8974213 , 16.64639955, 18.48103369, 20.10206654, 21.78723461,
         22.69442834, 23.34264336, 23.94986631, 23.82862566, 22.67816497,
         21.65823822, 20.30237708, 18.06313061, 17.42863998, 17.27486134,
         18.25869487, 19.48031352, 20.74954325],
        [11.98866702, 14.14225901, 16.80049554, 18.18481624, 19.77239939,
         21.5167779 , 22.71667503, 23.46373873, 23.41694589, 22.47098759,
         21.69579374, 20.17382834, 18.7754575 , 16.57430239, 16.21988041,
         16.95733762, 18.67226976, 20.24133792],
        [11.81402883, 12.46009308, 14.52405204, 16.8042646 , 18.20426532,
         19.83905634, 21.83484651, 23.25359354, 23.36902606, 22.47787741,
         21.94374492, 20.709