In [1]:
%load_ext autoreload
%autoreload 2
%env ANYWIDGET_HMR=1
%env ANYWIDGET_DEV=1

env: ANYWIDGET_HMR=1
env: ANYWIDGET_DEV=1


In [4]:
import numpy as np
from datasets import load_dataset

from seq import Widget
from seq.data import get_featured_ids, get_ids, get_tokenizer
from seq.utils import cluster_sequences, mask_small_clusters, sort_sequences

In [5]:
ds = load_dataset("neuralbioinfo/bacterial_promoters")

In [6]:
tokenizer = get_tokenizer(type="dna", k_mer=7)
ids, tokens = get_ids(
  ds["test_multispecies"]["segment"], tokenizer, max_tokens=32
)

100%|██████████| 22582/22582 [00:00<00:00, 65968.30it/s]


In [7]:
featured_ids = get_featured_ids(ids, tokenizer, n_features=100, method="count")
labels = [{"id": i, "label": tokenizer.id_to_token(i)} for i in featured_ids]

In [8]:
w = Widget(
  sequences=ids,
  labels=labels[0:10],
  width=800,
  height=1600,
  grid=False,
)
init_seq = w.sequences
w

Sequences:  (836, 32)
Rects:  742


Widget(height=1600, labels=[{'id': 10081, 'label': 'AAAATTT'}, {'id': 159, 'label': 'CGCCGCG'}, {'id': 438, 'l…

In [9]:
w.update_sequences(init_seq)

Sequences:  (836, 32)
Rects:  742


In [64]:
clustered_sequence = cluster_sequences(init_seq)
w.update_sequences(clustered_sequence)

Sequences:  (836, 32)
Rects:  133


In [67]:
masked_sequences = mask_small_clusters(clustered_sequence, min_cluster_size=10)
w.grid = True
w.update_sequences(masked_sequences)
w.height = 1600
count = np.unique(w.sequences, axis=0, return_counts=True)[1]
# unique count가 1인 행 반환
singletons = w.sequences[np.where(count == 1)]
singletons

Sequences:  (298, 32)
Rects:  21


array([[    0,     0,     0,     0, 10081,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])

In [59]:
sorted_sequences = sort_sequences(w.sequences)
w.update_sequences(sorted_sequences)

[0, 12, 11, 13, 10, 14, 9, 15, 8, 16, 7, 17, 6, 18, 4, 5, 20, 19, 3, 21, 2, 22, 1]
Sequences:  (298, 32)
Rects:  21


In [38]:
import numpy as np

# 2차원 행렬 생성
matrix = np.array([[1, 2], [3, 4], [1, 2], [5, 6]])

# 고유한 행 추출
unique_rows = np.unique(matrix, axis=0, return_counts=True)

print("고유한 행:")
print(unique_rows)


고유한 행:
(array([[1, 2],
       [3, 4],
       [5, 6]]), array([2, 1, 1]))
