# UCR data

## SVM

In [1]:
import logging
logging.basicConfig(level=logging.DEBUG)
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

from wtk.utilities import get_ucr_dataset, krein_svm_grid_search
from wtk import transform_to_dist_matrix, get_kernel_matrix, wtk_distance

### Read UCR data

In [2]:
X_train, y_train, X_test, y_test = get_ucr_dataset('../data/UCR/raw_data/', 'ItalyPowerDemand')

In [4]:
len(X_train), len(X_test), X_train[0].shape

(67, 1029, (24,))

### Compute wasserstein distance matrices with subsequent length $k=10$


In [16]:
D_train, D_test = transform_to_dist_matrix(X_train, X_test, 10)

In [17]:
D_train

array([[0.        , 1.38326055, 3.07846106, ..., 1.45252306, 1.90465423,
        2.25061774],
       [1.38326055, 0.        , 3.29482441, ..., 0.56957769, 1.21435146,
        2.67014487],
       [3.07846106, 3.29482441, 0.        , ..., 3.41526723, 3.37167033,
        1.72739061],
       ...,
       [1.45252306, 0.56957769, 3.41526723, ..., 0.        , 1.12726491,
        2.84850107],
       [1.90465423, 1.21435146, 3.37167033, ..., 1.12726491, 0.        ,
        2.811911  ],
       [2.25061774, 2.67014487, 1.72739061, ..., 2.84850107, 2.811911  ,
        0.        ]])

In [None]:
y_train

array([1., 1., 2., 2., 1., 1., 2., 1., 1., 2., 2., 1., 1., 2., 1., 2., 1.,
       1., 2., 1., 1., 2., 1., 1., 1., 1., 1., 2., 2., 1., 1., 2., 2., 1.,
       2., 2., 1., 2., 1., 2., 1., 1., 2., 2., 1., 2., 2., 2., 2., 1., 1.,
       2., 2., 2., 1., 2., 2., 1., 1., 2., 2., 1., 1., 2., 1., 2., 2.])

### Run the grid search

In [5]:
svm_clf = krein_svm_grid_search(D_train, D_test, y_train, y_test)

INFO:root:Starting analysis


TypeError: unsupported operand type(s) for +: 'dict' and 'dict'

### Alternatively: Get the kernel matrices computed from the distance matrices ...

In [22]:
K_train = get_kernel_matrix(D_train, psd=True, gamma=0.2)
K_test = get_kernel_matrix(D_test, psd=False, gamma=0.2)
K_train, K_test

(array([[1.        , 0.75831826, 0.54026679, ..., 0.74788608, 0.68322514,
         0.63754938],
        [0.75831826, 1.        , 0.51738661, ..., 0.89233332, 0.78437325,
         0.58623827],
        [0.54026679, 0.51738661, 1.        , ..., 0.50507242, 0.5094956 ,
         0.70788182],
        ...,
        [0.74788608, 0.89233332, 0.50507242, ..., 1.        , 0.79815459,
         0.565695  ],
        [0.68322514, 0.78437325, 0.5094956 , ..., 0.79815459, 1.        ,
         0.56984995],
        [0.63754938, 0.58623827, 0.70788182, ..., 0.565695  , 0.56984995,
         1.        ]]),
 array([[0.69431375, 0.63741963, 0.68224908, ..., 0.60610564, 0.61017478,
         0.83873557],
        [0.71704914, 0.87580403, 0.51645618, ..., 0.84899254, 0.85177573,
         0.58106967],
        [0.68992538, 0.80073716, 0.49604896, ..., 0.79785268, 0.8568112 ,
         0.57101492],
        ...,
        [0.72431506, 0.89364112, 0.50613727, ..., 0.90441957, 0.81286601,
         0.56505763],
        [0.5

### ... and train your own classifier

In [19]:
clf = SVC(C=5, kernel='precomputed')
clf.fit(K_train, y_train)
y_pred = clf.predict(K_test)
accuracy_score(y_test, y_pred)

0.9640427599611273

## K-NN


In [5]:
subs_2 = wtk_distance(X_train[0], X_train[20], 10)
subs_2

1.338264444061254

In [3]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


def knn_classifier_from_distance_matrix(distance_matrix, k, labels):
    knn_clf = KNeighborsClassifier(
        n_neighbors=k, algorithm="brute", metric="precomputed"
    )
    n_train_samples = distance_matrix.shape[1]
    knn_clf.fit(np.random.rand(n_train_samples, n_train_samples), labels)
    predicted_labels = knn_clf.predict(distance_matrix)
    return predicted_labels


In [4]:
from tqdm import tqdm

train_size = len(X_train)
test_size = len(X_test)
result = np.zeros((test_size, train_size))
for train_idx in tqdm(range(train_size)):
    for test_idx in tqdm(range(test_size), leave=False):
        distance = wtk_distance(X_train[train_idx], X_test[test_idx], 10)
        result[test_idx, train_idx] = distance

  0%|          | 0/67 [00:00<?, ?it/s]

100%|██████████| 67/67 [00:25<00:00,  2.65it/s]


In [5]:
y_pred = knn_classifier_from_distance_matrix(
        distance_matrix=result,
        k=3,
        labels=y_train,
    )

In [6]:
accuracy_score(y_test, y_pred)

0.9572400388726919

In [13]:
y_pred_1 = knn_classifier_from_distance_matrix(
        distance_matrix=K_train,
        k=3,
        labels=y_train,
    )

In [15]:
y_pred_1

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

# Arabic

In [34]:
from wtk.arabicData import get_test_data, get_train_data
import os
import numpy as np

In [35]:
TRAIN_FILE = os.path.join('C:/Users/hoang/OneDrive/Documents/Machine Learning/LAB/WTK/data/UCR/raw_data/arabic/Train_Arabic_Digit.txt')
TEST_FILE = os.path.join('C:/Users/hoang/OneDrive/Documents/Machine Learning/LAB/WTK/data/UCR/raw_data/arabic/Test_Arabic_Digit.txt')

In [36]:
TRAIN_FILE, TEST_FILE

('C:/Users/hoang/OneDrive/Documents/Machine Learning/LAB/WTK/data/UCR/raw_data/arabic/Train_Arabic_Digit.txt',
 'C:/Users/hoang/OneDrive/Documents/Machine Learning/LAB/WTK/data/UCR/raw_data/arabic/Test_Arabic_Digit.txt')

In [37]:
X_train_arabic, y_train_arabic = get_train_data("C:/Users/hoang/OneDrive/Documents/Machine Learning/LAB/WTK/data/UCR/raw_data/arabic/Train_Arabic_Digit.txt")
X_test_arabic, y_test_arabic = get_test_data("C:/Users/hoang/OneDrive/Documents/Machine Learning/LAB/WTK/data/UCR/raw_data/arabic/Test_Arabic_Digit.txt")

In [38]:
def findTheShortestArray(list1, list2, min_subsequence):
    min1 = min(list1, key=len)
    min2 = min(list2, key=len)
    return min(len(min1), len(min2)) - min_subsequence + 1

In [40]:
kernel_size = findTheShortestArray(X_train_arabic, X_test_arabic, 10)
kernel_size

43

In [44]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def knn_classifier_from_distance_matrix(distance_matrix, k, labels):
    knn_clf = KNeighborsClassifier(
        n_neighbors=k, algorithm="brute", metric="precomputed"
    )
    n_train_samples = distance_matrix.shape[1]
    knn_clf.fit(np.random.rand(n_train_samples, n_train_samples), labels)
    predicted_labels = knn_clf.predict(distance_matrix)
    return predicted_labels


In [45]:
from tqdm import tqdm

train_size = len(X_train_arabic)
test_size = len(X_test_arabic)
kernel_size = findTheShortestArray(X_train_arabic, X_test_arabic, 10)
result = np.zeros((test_size, train_size))
for train_idx in tqdm(range(train_size)):
    for test_idx in tqdm(range(test_size), leave=False):
        distance = wtk_distance(X_train_arabic[train_idx], X_test_arabic[test_idx], kernel_size)
        result[test_idx, train_idx] = distance

  0%|          | 0/6600 [00:00<?, ?it/s]

  0%|          | 0/6600 [00:40<?, ?it/s]


KeyboardInterrupt: 

In [33]:
y_pred = knn_classifier_from_distance_matrix(
        distance_matrix=result,
        k=3,
        labels=y_train_arabic,
    )

In [35]:
accuracy_score(y_test_arabic, y_pred)

0.7777272727272727