In [1]:
%load_ext autoreload
%autoreload 2
%env ANYWIDGET_HMR=1
%env ANYWIDGET_DEV=1

env: ANYWIDGET_HMR=1
env: ANYWIDGET_DEV=1


In [2]:
from typing import Any

import networkx as nx
import numpy as np
from datasets import load_dataset
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import DistanceMetric

from seq import Widget


True


In [3]:
ds = load_dataset("ajaykarthick/imdb-movie-reviews")["test"]

In [4]:
w = Widget(
  sequences=ds["review"],
  max_length=32,
  n_features=10,
)
raw_sequences = w.sequences
w

Widget(labels=[{'id': 58, 'token': 'movie'}, {'id': 4, 'token': 'film'}, {'id': 29, 'token': 'one'}, {'id': 10…

In [5]:
n_sequence = 10000
w.sequences = raw_sequences[:n_sequence].copy()
label_ids = [label["id"] for label in w.labels]

In [6]:
def mask_non_featured_sequences(
  sequences: list[Any], label_ids: list[int]
) -> np.ndarray:
  sequences = np.array(sequences)
  mask = np.isin(sequences, label_ids)

  return sequences * mask


labeld_sequences = mask_non_featured_sequences(raw_sequences, label_ids)
w.sequences = labeld_sequences.tolist()

In [7]:
import numpy as np


def distance(x: np.ndarray) -> np.ndarray:
  mask = x > 0
  and_masks = mask[:, np.newaxis, :] & mask[np.newaxis, :, :]
  xor_matrix = x[:, np.newaxis, :] != x[np.newaxis, :, :]
  distances = np.sum(xor_matrix & and_masks, axis=2)

  return distances / 32


In [8]:
h_distance = DistanceMetric.get_metric("hamming").pairwise(labeld_sequences)
print(h_distance.shape)
print(h_distance)
print(squareform(h_distance).shape)

hh_distance = distance(labeld_sequences)
print(hh_distance.shape)
print(hh_distance)
print(squareform(hh_distance).shape)

(10000, 10000)
[[0.      0.15625 0.15625 ... 0.15625 0.125   0.15625]
 [0.15625 0.      0.125   ... 0.15625 0.15625 0.1875 ]
 [0.15625 0.125   0.      ... 0.1875  0.15625 0.1875 ]
 ...
 [0.15625 0.15625 0.1875  ... 0.      0.15625 0.1875 ]
 [0.125   0.15625 0.15625 ... 0.15625 0.      0.125  ]
 [0.15625 0.1875  0.1875  ... 0.1875  0.125   0.     ]]
(49995000,)
(10000, 10000)
[[0.      0.      0.      ... 0.      0.      0.     ]
 [0.      0.      0.      ... 0.03125 0.      0.     ]
 [0.      0.      0.      ... 0.      0.      0.     ]
 ...
 [0.      0.03125 0.      ... 0.      0.      0.     ]
 [0.      0.      0.      ... 0.      0.      0.03125]
 [0.      0.      0.      ... 0.      0.03125 0.     ]]
(49995000,)


In [9]:
def cluster_sequences(sequences: list[Any]) -> np.ndarray:
  sequences = np.array(sequences)
  # dist = distance(sequences)
  dist = DistanceMetric.get_metric("hamming").pairwise(sequences)
  dist = squareform(dist)
  linkage_matrix = linkage(dist, method="average")
  dendrogram_data = dendrogram(linkage_matrix, no_plot=True)
  order = dendrogram_data["leaves"]
  order = np.array(order)
  return np.array(sequences)[order]


clustered_sequences = cluster_sequences(raw_sequences)
w.sequences = clustered_sequences.tolist()

In [10]:
def mask_sequences(sequences: list[Any], window_length: int) -> list[Any]:
  unmasked = np.array(sequences.copy())
  # 윈도우 크기만큼의 마스크 생성
  masks = []

  for i in range(2 * window_length):
    left = unmasked[i : -(2 * window_length - i), :]
    right = unmasked[i + 1 : -(2 * window_length - i - 1) or None, :]
    masks.append(left != right)

  mask = np.logical_or.reduce(masks)

  # 마스킹 적용
  unmasked[window_length:-window_length, :][mask] = 0
  return unmasked


masked_sequences = mask_sequences(clustered_sequences, window_length=3)
w.sequences = masked_sequences.tolist()

In [11]:
def filter_sequences(sequences: list[Any], filter_length: int) -> list[Any]:
  sequences = np.array(sequences)
  mask = np.sum((sequences != 0) & (sequences != -1), axis=1) > filter_length

  sequences = sequences[mask]

  return sequences


filtered_sequences = filter_sequences(masked_sequences, filter_length=1)
w.sequences = filtered_sequences.tolist()

In [12]:
# check all zero in the sequence
print(filtered_sequences.shape)
np.sum(filtered_sequences, axis=1)

(236, 32)


array([347223, 657244, 374126,   3399,   3399,   1713,   1713,   1713,
         1713,    342,      8,   3329,   7501,   7501,   1026,   1026,
         1026,   1132,      8,    426,    426,    426,   1454,   1454,
         1454,      8,      8,      8,    341,    131,    133,    133,
          810,    915,    915,    915,    915,    915,    973,    236,
          236,    236,    236,    940,    940,    940,   1046,   1046,
          240,    240,    134,    211,    211,    211,    163,    163,
          163,    809,    809,    809,    809,    809,    809,    809,
          809,    809,    915,    915,    915,   1191,   1294,    116,
          116,    116,    116,   2384,   2384,   2418,   2418,   2418,
         2418,   2418,   2418,   2418,   2418,   2418,   2418,   2418,
         2418,   2418,   2418,   2418,   2418,   2418,   2418,   2418,
         2418,   2418,   2418,   2476,   2476,   2476,   1716,   1716,
         1716,    116,    116,    116,    396,    116,    169,    169,
      

In [13]:
def sort_sequences(sequences: list[Any]):
  sequences = np.array(sequences)
  unique_sequences, count = np.unique(sequences, axis=0, return_counts=True)
  dist_matrix = DistanceMetric.get_metric("hamming").pairwise(unique_sequences)

  G = nx.Graph()
  for i in range(len(dist_matrix)):
    for j in range(i + 1, len(dist_matrix)):
      G.add_edge(i, j, weight=dist_matrix[i, j])

  answer = nx.algorithms.approximation.christofides(G)
  # answer = nx.algorithms.approximation.traveling_salesman_problem(G)
  sorted_unique_sequences = unique_sequences[np.array(answer[:-1])]
  sorted_original_sequences = []

  for i, unique_sequence in enumerate(sorted_unique_sequences):
    sorted_original_sequences.extend([unique_sequence] * count[i])

  return np.array(sorted_original_sequences)


print(filtered_sequences.shape)
sorted_sequences = sort_sequences(filtered_sequences)
print(sorted_sequences.shape)
w.sequences = sorted_sequences.tolist()

(236, 32)
(236, 32)


In [14]:
unique_sequences1, count1 = np.unique(
  filtered_sequences, axis=0, return_counts=True
)
unique_sequences2, count2 = np.unique(
  sorted_sequences, axis=0, return_counts=True
)
print(count1)
print(count2)

[ 1  1  3  7  1  3  1  2  2  3  3  1  2  3  5  1  1  2  3  9  9  3  2  1
  2  7  2  1  1  1  2  1  1  2  1  5  1  1  1  1  2  1  4  2  3  2  5  2
  1  5  1  1  1  1  2  3  1  3 26  3  3  1  1  1  1  4  2 22  3  1  3 20
  1  2  1  1  1]
[ 1  1  1  2  3 20  1  2  1  1  4  1  9  1  1  2  1  1  1  3 22  3  2  3
  1  1  2  1  3  2  5  7  2  5  1  5  1  1  1  1  1  9  2  1  1  2  3  2
  1  3  3  3  1  1 26  2  1  4  2  3  1  3  2  1  3  1  3  2  1  7  1  2
  1  2  1  5  3]
