In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from model import OursModel
import utils

In [2]:
# Set up the compute device through the project helper; 0, seed=0 and
# max_threads=8 are forwarded unchanged to utils.init_dl_program.
# NOTE(review): presumably 0 is the GPU index and seed=0 seeds the RNGs --
# confirm against utils.init_dl_program.
device = utils.init_dl_program(0, seed=0, max_threads=8)

In [3]:
def is_nan_dataset(dataset):
    """Tell whether ``dataset`` begins with one of the excluded name prefixes.

    Args:
        dataset: dataset (directory) name as a string.

    Returns:
        True if the name starts with any listed prefix, otherwise False.
        The comparison is case-sensitive.
    """
    excluded_prefixes = (
        'DodgerLoopDay',
        'DodgerLoopGame',
        'DodgerLoopWeekend',
        'MelbournePedestrian',
        'Missing_value_and_variable_length_datasets_adjusted',
    )
    # str.startswith accepts a tuple and succeeds when any entry matches.
    return dataset.startswith(excluded_prefixes)

In [4]:
# Root folder of the archive; every directory found below it is treated as a
# dataset name.
data_path = os.path.join('./data', 'UCRArchive_2018')
# os.walk yields the root first, so strip the root prefix from each directory
# path and drop that leading entry.
data_name_list = [dirpath[len(data_path) + 1:] for dirpath, _dirs, _files in os.walk(data_path)][1:]
train_list = []
for name in data_name_list:
    if is_nan_dataset(name):
        continue
    train, train_labels, test, test_labels = utils.load_UCR_dataset(data_path, name)
    # A NaN anywhere in the training array propagates into its sum; such
    # datasets are skipped as well.
    if bool(np.isnan(np.sum(train))):
        continue
    print(name, train.shape)
    train_list.append(name)
print(len(train_list))

ACSF1 (100, 1460)
Adiac (390, 176)
ArrowHead (36, 251)
Beef (30, 470)
BeetleFly (20, 512)
BirdChicken (20, 512)
BME (30, 128)
Car (60, 577)
CBF (30, 128)
Chinatown (20, 24)
ChlorineConcentration (467, 166)
CinCECGTorso (40, 1639)
Coffee (28, 286)
Computers (250, 720)
CricketX (390, 300)
CricketY (390, 300)
CricketZ (390, 300)
Crop (7200, 46)
DiatomSizeReduction (16, 345)
DistalPhalanxOutlineAgeGroup (400, 80)
DistalPhalanxOutlineCorrect (600, 80)
DistalPhalanxTW (400, 80)
Earthquakes (322, 512)
ECG200 (100, 96)
ECG5000 (500, 140)
ECGFiveDays (23, 136)
ElectricDevices (8926, 96)
EOGHorizontalSignal (362, 1250)
EOGVerticalSignal (362, 1250)
EthanolLevel (504, 1751)
FaceAll (560, 131)
FaceFour (24, 350)
FacesUCR (200, 131)
FiftyWords (450, 270)
Fish (175, 463)
FordA (3601, 500)
FordB (3636, 500)
FreezerRegularTrain (150, 301)
FreezerSmallTrain (28, 301)
Fungi (18, 201)
GunPoint (50, 150)
GunPointAgeSpan (135, 150)
GunPointMaleVersusFemale (135, 150)
GunPointOldVersusYoung (136, 150)
Ham (

In [18]:
# Dataset name of the run to inspect; it is used further down to build model_dir.
data_name = 'ucr_all'
# (disabled) alternative data loaders kept for reference:
# train_data, train_labels, test_data, test_labels = utils.load_UCR_dataset('data/processed', data_name)
# # train_data, train_labels, test_data, test_labels = utils.load_UCR_dataset('data/UCRArchive_2018', data_name)
# print(train_data.shape)

In [19]:
# Keyword arguments that are expanded into the OursModel constructor.
config = {
    'classes': 3,
    'sim_fun': 'cosine',
    'cate_fun': 'softmax',
}

In [20]:
# Instantiate the project model on the selected device, expanding the config
# dict into keyword arguments.
model = OursModel(
        device=device,
        **config
    )

In [21]:
# Directory of the training run to inspect, built from data_name plus a
# hard-coded 'valid' tag and timestamp.
model_dir = 'training/' + data_name + '__' + 'valid' + '_' + '20210708_092611/'
# (disabled) alternative run:
# model_dir = 'training/' + data_name + '__' + 'valid' + '_' + '20210712_062350/'

In [22]:
# Load the checkpoint file from model_dir through the model's own load() method.
# NOTE(review): 'model_i2000.pth' presumably is the iteration-2000 snapshot -- confirm.
model.load(model_dir + 'model_i2000.pth')

In [10]:
# NOTE(review): data_name is rebound here after model_dir was built from the
# earlier value ('ucr_all'); re-running the model_dir cell after this one
# would point it at a different directory.
data_name = 'ucr_all_label'
# (disabled) alternative source directory:
# train_data, train_labels, test_data, test_labels = utils.load_UCR_dataset('data/UCRArchive_2018', data_name)
train_data, train_labels, test_data, test_labels = utils.load_UCR_dataset('data/processed', data_name)
print(train_data.shape)
print(test_data.shape)

(35041, 2845)
(35041, 2845)


In [11]:
# Column 0 of the loaded array is split off as in_data_labels; the remaining
# columns are kept as the series values.
# NOTE(review): train_data is rebound in place, so running this cell twice
# strips a second column.
# NOTE(review): test_data is not sliced the same way here -- confirm whether it
# also carries a leading label column before it is encoded.
in_data_labels = train_data[:,0]
train_data = train_data[:,1:]
print(in_data_labels.shape)
print(train_data.shape)

(35041,)
(35041, 2844)


In [12]:
# Cast the split-off label column to integers (it is compared with == later on).
in_data_labels = in_data_labels.astype(int)
print(in_data_labels)

[9 9 9 ... 1 1 1]


In [None]:
# Override the network's cate_fun attribute (configured as 'softmax' above) with 'gumbel'.
# NOTE(review): presumably this switches to a Gumbel-softmax assignment -- confirm in model.py.
model.net.cate_fun = 'gumbel'

In [None]:
# Encode train_data with mode='class'; the result is later concatenated with
# the normalised model cores.
cluster_features = model.encode(train_data, mode='class')
print(cluster_features.shape)

In [None]:
# Encode test_data with mode='cate', then move the result to the CPU, detach
# it from the graph and convert it to a NumPy array.
cates = model.encode(test_data, mode='cate').cpu().detach().numpy()
print(cates)

In [None]:
# For every row of cates, take the column index of its largest entry.
cates_labels = np.argmax(cates, axis=1)
print(cates_labels)

In [23]:
# Encode train_data with the default mode; the result is kept as returned
# because later cells apply torch operations to `features`.
features = model.encode(train_data)

In [None]:
# Histogram (100 bins) of the vector norm of each feature row, taken along the
# last dimension and converted to NumPy for plotting.
norm = torch.linalg.vector_norm(features, dim=-1).cpu().detach().numpy()
print(norm.shape)
plt.hist(norm, bins=100)
plt.title(data_name)

In [None]:
# Fit a classifier with utils.fit_svm on the encoded training series and score
# it on both splits.
# The NumPy copy of the training encodings gets its own name so that the torch
# tensor bound to `features` (used by torch.linalg.vector_norm above and by
# nearest_neighbor below) is not overwritten when the notebook is run top to
# bottom.
# .detach() matches the other encode(...) conversions in this notebook and
# keeps .numpy() valid even if the encoder output tracks gradients.
train_features = model.encode(train_data).cpu().detach().numpy()
svm = utils.fit_svm(train_features, train_labels)
test_features = model.encode(test_data).cpu().detach().numpy()
pred_labels = svm.predict(test_features)

train_score = svm.score(train_features, train_labels)
score = svm.score(test_features, test_labels)
print(train_score, score)

In [None]:
# Predictions of the fitted classifier on the encoded test series.
pred = svm.predict(test_features)
# torch.as_tensor accepts an ndarray or an existing tensor, so this cell can be
# re-run; torch.from_numpy raises once test_features is already a tensor.
test_features = torch.as_tensor(test_features)

In [None]:
# Encode test_data with mode='list'; the result is split along dim 1 in the next cell.
features_list = model.encode(test_data, mode='list')

In [None]:
# Cut the encoding into chunks of width 320 along dim 1.
# NOTE(review): 320 presumably is the per-category feature width -- confirm.
# NOTE(review): features_list is rebound to the tuple returned by torch.split,
# so this cell cannot be re-run without re-running the previous one.
features_list = torch.split(features_list, 320, dim=1)

In [None]:
# For each of the three chunks, keep only the rows whose cates_labels value
# equals the chunk index and histogram the norms of those rows.
# NOTE(review): range(3) presumably mirrors classes=3 in config -- confirm.
# No new figure is created inside the loop, so the histograms share the
# current axes.
for i in range(3):
    features_k = features_list[i][cates_labels==i]
    norm = torch.linalg.vector_norm(features_k, dim=-1).cpu().detach().numpy()
    plt.hist(norm, bins=100)
    plt.title(data_name)
    print(features_k.shape)

In [24]:
def nearest_neighbor(features, i, k=10):
    """Return the ``k`` rows of ``features`` most cosine-similar to row ``i``.

    The query row itself is among the candidates, so it normally appears in
    the result as well.

    Args:
        features: torch tensor of feature rows; assumed 2-D (samples, dim) --
            TODO confirm.
        i: index of the query row.
        k: number of neighbours to return.

    Returns:
        dict with two flat NumPy arrays: 'value' (the top-k similarities as
        returned by torch.topk with largest=True) and 'index' (the matching
        row indices).
    """
    # Other scores tried previously: dot product (torch.mm) and Euclidean
    # distance (F.pairwise_distance).
    query = features[i].reshape(1, -1)
    similarity = torch.cosine_similarity(query, features)
    top_values, top_indices = torch.topk(similarity, k=k, dim=-1, largest=True)
    return {
        'value': top_values.cpu().numpy().reshape(-1,),
        'index': top_indices.cpu().numpy().reshape(-1,),
    }

In [25]:
# For every dataset, look up each sample's nearest neighbour and measure
#   * data_right_rate:  share of samples whose neighbour has the same
#                       train_labels value, and
#   * label_right_rate: among those, share whose neighbour also has the same
#                       in_data_labels value.
# NOTE(review): assumes train_labels == i selects the samples of train_list[i]
# -- confirm against the preprocessing that produced 'ucr_all_label'.
res_list = []
for dataset_id, dataset_name in enumerate(train_list):
    indexes = np.where(train_labels == dataset_id)[0]
    data_len = len(indexes)
    # np.int was removed in NumPy 1.24; the builtin int yields the same dtype.
    knn_matrix = np.zeros((data_len, 1), dtype=int)
    for j in range(data_len):
        # k=2 and the last hit: the top hit is expected to be the query row
        # itself (cosine similarity 1), so the second one is the neighbour.
        knn_matrix[j] = nearest_neighbor(features, indexes[j], k=2)['index'][-1]

    data_label = train_labels[indexes]
    in_data_label = in_data_labels[indexes]
    knn_data_labels = train_labels[knn_matrix].reshape(-1)
    knn_in_data_labels = in_data_labels[knn_matrix].reshape(-1)

    same_dataset = knn_data_labels == data_label
    data_right = int(np.sum(same_dataset))
    label_right = int(np.sum((knn_in_data_labels == in_data_label) & same_dataset))
    res = {
        'data_name': dataset_name,
        # Explicit NaN instead of a 0/0 division, which only produced NaN via
        # a NumPy RuntimeWarning before.
        'data_right_rate': data_right / data_len if data_len else np.nan,
        'label_right_rate': label_right / data_right if data_right else np.nan,
    }
    res_list.append(res)

output_res = pd.DataFrame(res_list)
print(output_res)

  'label_right_rate': label_right / data_right,


         data_name  data_right_rate  label_right_rate
0            ACSF1         1.000000          0.660000
1            Adiac         0.997436          0.629820
2        ArrowHead         1.000000          0.777778
3             Beef         0.900000          0.259259
4        BeetleFly         0.750000          0.933333
..             ...              ...               ...
108           Wine         1.000000          1.000000
109   WordSynonyms         0.831461          0.752252
110          Worms         1.000000          1.000000
111  WormsTwoClass         0.000000               NaN
112           Yoga         0.990000          0.818182

[113 rows x 3 columns]


In [26]:
# Write the per-dataset table to disk; missing values are written as the text "nan".
output_res.to_csv('res_ucr_all.csv', na_rep="nan")

In [65]:
# Pick up to 50 random samples whose train_labels value is 83 and collect the
# k most similar rows (indices and similarity values) for each of them.
indexes = np.random.permutation(np.where(train_labels == 83)[0])[:50]
# indexes = np.arange(train_data.shape[0])[:100]
# indexes = np.where(pred != test_labels)[0]
print(indexes.shape)
k = 10
# np.int was removed in NumPy 1.24; the builtin int yields the same dtype.
knn_matrix = np.zeros((len(indexes), k), dtype=int)
knn_value_matrix = np.zeros((len(indexes), k))
for row, sample_index in enumerate(indexes):
    arg = nearest_neighbor(features, sample_index, k=k)
    # arg = nearest_neighbor(test_features, sample_index, k=k)
    knn_matrix[row] = arg['index']
    knn_value_matrix[row] = arg['value']
# Look up both label arrays for every neighbour index.
knn_labels = train_labels[knn_matrix]
knn_in_data_labels = in_data_labels[knn_matrix]
# knn_labels = test_labels[knn_matrix]
print(knn_matrix)
print(knn_labels)
print(knn_in_data_labels)

(50,)
[[23407 23856 24246 24310 24005 24316 23999 23912 24309 23891]
 [23411 23858 24338 24357 24096 23462 24369 24104 23501 24241]
 [23602 24026 24452 23777 23629 24036 24079 24467 23609 24032]
 [23675 24131 24490 24457 23962 23617 23579 23943 24437 23691]
 [23583 24098 24529 23794 24528 23581 24545 23809 23616 23614]
 [23555 23819 24212 23479 23798 24187 23822 24215 24193 23804]
 [23575 24016 24525 24537 24030 23605 24057 23673 24569 24048]
 [23609 24032 24454 24467 24036 23629 24452 24026 24517 23565]
 [23567 23787 24433 24071 24597 23652 23825 24478 23853 23585]
 [23504 23956 24370 24408 23993 24396 23978 23924 24425 23973]
 [23514 23735 24375 23444 23718 24350 24368 23732 23498 23954]
 [23541 24045 24383 24064 24410 24066 24394 24411 24055 24053]
 [23698 23842 24583 24269 23959 23509 24499 24063 23694 23891]
 [23547 24118 24282 23930 23914 24504 23615 23881 24313 24346]
 [23605 24030 24537 23575 24016 24525 24057 23673 24569 24025]
 [23591 24100 24446 24169 24014 23440 23695 23907

In [54]:
def plot_ts(plt, ts, label, in_data_label, index, value, color=None):
    """Draw the series ``ts[index]`` on an axes-like object and title it.

    Despite its name, ``plt`` is the object that is drawn on: it needs
    ``plot`` and ``set_title`` methods (plot_knn passes a subplot axes).

    Args:
        plt: axes-like drawing target.
        ts: indexable collection of series; ``ts[index]`` must expose ``shape``.
        label: value shown after "data" in the title.
        in_data_label: value shown after "label" in the title.
        index: position of the series inside ``ts``.
        value: number shown in the title, rounded to 3 decimals.
        color: optional line colour; when falsy the default colour is used.
    """
    series = ts[index]
    x = range(series.shape[-1])
    line_kwargs = {'linewidth': 2}
    if color:
        line_kwargs['color'] = color
    plt.plot(x, series, **line_kwargs)
    plt.set_title(f"index {index} data {label} label {in_data_label} value {np.round(value,3)}")

def plot_knn(ts, knn_labels, knn_in_data_labels, knn_matrix, knn_value_matrix):
    """Plot a grid of series: one row per query, one column per neighbour.

    The grid has ``knn_matrix.shape[0]`` rows and ``knn_matrix.shape[1]``
    columns, and the figure is sized at 8 x 6 inches per cell. The series in
    column 0 of every row is drawn in red; the others use the default colour.

    Args:
        ts: collection of series, indexed by the entries of ``knn_matrix``.
        knn_labels: label shown after "data" for each grid cell.
        knn_in_data_labels: label shown after "label" for each grid cell.
        knn_matrix: 2-D integer array of series indices.
        knn_value_matrix: value shown in each cell's title.
    """
    ts_num, k = knn_matrix.shape[0], knn_matrix.shape[1]
    fig = plt.figure(figsize=(8.0*k, 6.0*ts_num))
    for row in range(ts_num):
        # Subplot positions are 1-based and run row by row.
        first_slot = row * k + 1
        ax = fig.add_subplot(ts_num, k, first_slot)
        plot_ts(ax, ts, knn_labels[row, 0], knn_in_data_labels[row, 0],
                knn_matrix[row, 0], knn_value_matrix[row, 0], color='red')
        for col in range(1, k):
            ax = fig.add_subplot(ts_num, k, first_slot + col)
            plot_ts(ax, ts, knn_labels[row, col], knn_in_data_labels[row, col],
                    knn_matrix[row, col], knn_value_matrix[row, col])

In [None]:
# Draw the neighbour grid for the samples selected above (training data).
# (disabled) earlier calls with older signatures:
# plot_ts(test_data, test_labels, pred_labels, np.random.permutation(np.where(cates_labels == 2)[0])[:10])
plot_knn(train_data, knn_labels, knn_in_data_labels, knn_matrix, knn_value_matrix)
# plot_knn(test_data, knn_labels, knn_matrix, knn_value_matrix)

In [None]:
# Normalise model.net.cores with torch's normalize defaults (p=2 along dim=1).
cores = torch.nn.functional.normalize(model.net.cores)

In [None]:
# Append the normalised cores as extra rows after the mode='class' encodings.
features_with_cores = torch.cat([cluster_features, cores], dim=0)
print(features_with_cores.shape)

In [None]:
# Project the chosen feature matrix to 2-D with t-SNE (PCA initialisation,
# fixed random_state). The commented lines are alternative inputs.
# NOTE(review): test_features is rebound to a torch tensor in an earlier cell
# -- confirm TSNE receives the type you expect.
# features_numpy = features_with_cores.cpu().detach().numpy()
# features_numpy = features.cpu().detach().numpy()
# features_numpy = features_list[2].cpu().detach().numpy()
features_numpy = test_features
# features_numpy = features
print('Computing t-SNE embedding')
tsne = TSNE(n_components=2, init='pca', random_state=0)
tsne_result = tsne.fit_transform(features_numpy)
print('t-SNE finished')

In [None]:
def plot_embedding(data, label, title, xlim=(-80,80), ylim=(-80,80), rgb=False, rgb_value=None):
    """Render a 2-D embedding by writing each point's label at its position.

    Args:
        data: array whose rows are points; columns 0 and 1 are used as x and y.
        label: per-point label; its string form is the text that gets drawn.
        title: figure title.
        xlim: value forwarded to ``plt.xlim``.
        ylim: value forwarded to ``plt.ylim``.
        rgb: when true, colour each text by ``rgb_value``; otherwise pick the
            colour from the Set1 colormap at ``label / number_of_unique_labels``.
        rgb_value: per-point sequence whose first three entries are used as the
            colour tuple; only read when ``rgb`` is true.

    Returns:
        The newly created matplotlib figure.
    """
    label_num = len(np.unique(label))

    fig = plt.figure()
    plt.xlim(xlim)
    plt.ylim(ylim)
    for row in range(data.shape[0]):
        if rgb:
            text_color = (rgb_value[row][0], rgb_value[row][1], rgb_value[row][2])
        else:
            text_color = plt.cm.Set1(label[row] / label_num)
        plt.text(data[row, 0], data[row, 1], str(label[row]), color=text_color, fontdict={'size': 8})
    plt.title(title)
    return fig

In [None]:
# Show the t-SNE embedding labelled with the classifier's predictions; the
# commented calls are the other label / cate_fun combinations.
# plot_embedding(tsne_result, test_labels, f'{data_name} gumbel', (-40,45), (-30,55))
# plot_embedding(tsne_result, pred_labels, f'{data_name} gumbel pred', (-40,45), (-30,55))
# plot_embedding(tsne_result, test_labels, f'{data_name} softmax', (-40,40), (-40,50))
plot_embedding(tsne_result, pred_labels, f'{data_name} softmax pred', (-40,40), (-40,50))
# The trailing print() makes the call above no longer the cell's last
# expression, so the returned figure's repr is not echoed.
print()

In [None]:
# Embedding with every point coloured by its row of `cates` (used as an RGB
# triple); the last three rows of tsne_result are left out.
# xlim has to be a (left, right) pair: a bare 80 would be forwarded to
# plt.xlim(80), which moves only the left limit.
# NOTE(review): the original passed `cates_max`, which is not defined in any
# visible cell; `cates_labels` (the argmax of `cates`) is used instead --
# confirm this is the intended label array.
plot_embedding(tsne_result[:-3], cates_labels, '6_data', (-80, 80), rgb=True, rgb_value=cates)
print()