In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
import numpy as np
from tqdm import tqdm

from seq.model import Chunk, ChunkDatabase, Cluster

np.random.seed(42)

In [137]:
data = np.random.randint(1, 5, (100_000, 32))

In [151]:
db = ChunkDatabase(data, threshold=0.001, max_chunk_length=5)
sorted(db.length_map[5], key=lambda x: len(x), reverse=True)[:10]

100%|██████████| 5/5 [00:06<00:00,  1.33s/it]


[Chunk([2 4 2 1 2], 0, 5, #130),
 Chunk([3 2 1 2 3], 0, 5, #130),
 Chunk([3 1 3 4 2], 26, 31, #129),
 Chunk([4 4 1 2 2], 5, 10, #127),
 Chunk([1 1 4 3 2], 26, 31, #127),
 Chunk([1 3 1 1 2], 26, 31, #127),
 Chunk([4 3 1 2 4], 5, 10, #126),
 Chunk([1 3 2 2 3], 26, 31, #126),
 Chunk([1 3 4 4 1], 10, 15, #125),
 Chunk([2 2 3 4 2], 26, 31, #125)]

In [29]:
data = np.random.randint(1, 5, (100_000, 32))

chunk_length = 10

chunk_view = np.lib.stride_tricks.sliding_window_view(
  data, (1, chunk_length)
).reshape(data.shape[0], data.shape[1] - chunk_length + 1, chunk_length)

In [33]:
print(chunk_view[:, 1].shape)
np.unique(chunk_view[:, 1], axis=0, return_counts=True)[0].shape

(100000, 10)


(95299, 10)

In [23]:
chunks = db.chunks
len(chunks)

7493

In [24]:
chunks

[Chunk([1 3 3 3 4], 14, 19, #135),
 Chunk([4 3 3 2 2], 23, 28, #134),
 Chunk([4 2 3 1 1], 25, 30, #134),
 Chunk([2 1 2 1 3], 27, 32, #133),
 Chunk([3 1 3 3 1], 10, 15, #132),
 Chunk([1 3 4 1 4], 6, 11, #131),
 Chunk([3 3 1 3 4], 25, 30, #131),
 Chunk([4 1 2 1 1], 27, 32, #131),
 Chunk([2 4 4 3 1], 27, 32, #131),
 Chunk([2 1 4 4 2], 13, 18, #130),
 Chunk([3 1 1 4 3], 16, 21, #129),
 Chunk([1 1 2 3 2], 23, 28, #129),
 Chunk([4 1 3 3 1], 5, 10, #128),
 Chunk([1 1 3 4 4], 19, 24, #128),
 Chunk([2 4 2 3 2], 21, 26, #128),
 Chunk([1 1 2 1 1], 24, 29, #128),
 Chunk([2 4 1 4 4], 24, 29, #128),
 Chunk([3 3 3 3 3], 26, 31, #128),
 Chunk([1 3 4 2 1], 27, 32, #128),
 Chunk([2 2 4 2 3], 5, 10, #127),
 Chunk([4 4 1 3 4], 14, 19, #127),
 Chunk([3 3 1 1 4], 15, 20, #127),
 Chunk([2 2 3 3 3], 9, 14, #126),
 Chunk([3 4 1 3 1], 14, 19, #126),
 Chunk([1 2 1 2 4], 16, 21, #126),
 Chunk([4 2 1 3 3], 25, 30, #126),
 Chunk([2 4 1 1 2], 26, 31, #126),
 Chunk([3 4 3 2 3], 0, 5, #125),
 Chunk([3 2 1 1 2], 4, 9, 

In [136]:
processed_chunks = set()
clusters: list[Cluster] = []

for chunk in chunks:
  if chunk in processed_chunks:
    continue

  cluster = Cluster(chunk)
  left = cluster.left
  right = cluster.right

  left_candidate = [
    c
    for c in db.get_candidate(cluster.left)
    if c not in processed_chunks and cluster.can_appendleft(c)
  ]

  right_candidate = [
    c
    for c in db.get_candidate(cluster.right)
    if c not in processed_chunks and cluster.can_append(c)
  ]

  while len(left_candidate) > 0 or len(right_candidate) > 0:
    if len(left_candidate):
      cluster.appendleft(left_candidate[0])
      processed_chunks.add(left_candidate[0])
    if len(right_candidate):
      cluster.append(right_candidate[0])
      processed_chunks.add(right_candidate[0])

    left_candidate = [
      c
      for c in db.get_candidate(cluster.left)
      if c not in processed_chunks and cluster.can_appendleft(c)
    ]

    right_candidate = [
      c
      for c in db.get_candidate(cluster.right)
      if c not in processed_chunks and cluster.can_append(c)
    ]

  clusters.append(cluster)

In [137]:
print(len(clusters))
clusters = sorted(clusters, key=lambda c: c.size, reverse=True)

16


In [138]:
[c.height for c in clusters[:10]]

[100000,
 100000,
 100000,
 100000,
 100000,
 100000,
 100000,
 100000,
 100000,
 100000]

In [18]:
[len(c) for c in clusters[:10]]

[33, 15, 11, 17, 20, 16, 21, 7, 11, 7]

In [19]:
clusters[0][0]

Chunk([4 4 1], 28, 31, #38541)

In [20]:
canvas = np.zeros((data.shape[0] * 100, data.shape[1]))
canvas.shape

(10000000, 32)

In [22]:
clusters[0][0].seq_indices

{np.int64(0),
 np.int64(1),
 np.int64(4),
 np.int64(8),
 np.int64(10),
 np.int64(11),
 np.int64(18),
 np.int64(19),
 np.int64(25),
 np.int64(27),
 np.int64(28),
 np.int64(30),
 np.int64(31),
 np.int64(36),
 np.int64(37),
 np.int64(40),
 np.int64(45),
 np.int64(48),
 np.int64(49),
 np.int64(55),
 np.int64(57),
 np.int64(60),
 np.int64(64),
 np.int64(68),
 np.int64(70),
 np.int64(71),
 np.int64(73),
 np.int64(76),
 np.int64(77),
 np.int64(78),
 np.int64(80),
 np.int64(83),
 np.int64(84),
 np.int64(86),
 np.int64(87),
 np.int64(95),
 np.int64(96),
 np.int64(97),
 np.int64(99),
 np.int64(101),
 np.int64(102),
 np.int64(106),
 np.int64(107),
 np.int64(108),
 np.int64(109),
 np.int64(111),
 np.int64(113),
 np.int64(117),
 np.int64(120),
 np.int64(124),
 np.int64(127),
 np.int64(129),
 np.int64(130),
 np.int64(132),
 np.int64(133),
 np.int64(134),
 np.int64(135),
 np.int64(136),
 np.int64(138),
 np.int64(139),
 np.int64(140),
 np.int64(143),
 np.int64(146),
 np.int64(149),
 np.int64(154),
 np

In [7]:
canvas = np.zeros((data.shape[0] * 100, data.shape[1]))
positions = []
for cluster in tqdm(clusters, desc="Processing Clusters..."):
  start = cluster.start
  end = cluster.end
  subcanvas = canvas[:, start:end]

  cluster.draw(subcanvas, db.min_n_chunks)

  positions.append(position)
  if position != -1:
    canvas[position : position + shape.shape[0]][shape != 0] = shape[shape != 0]

Processing paths: 100%|██████████| 340/340 [01:23<00:00,  4.09it/s]


In [8]:
filtered_canvas = canvas[~np.all(canvas == 0, axis=1)]
print(canvas.shape)
print(filtered_canvas.shape)

(10000000, 32)
(5023629, 32)


In [9]:
np.sum(filtered_canvas != 0) / np.prod(filtered_canvas.shape)

np.float64(0.4317462908785661)

In [10]:
def print_row(row: np.ndarray):
  print("".join([f"{int(x)}" if x != 0 else " " for x in row]))

In [13]:
rows = []
prev_row = np.zeros(canvas.shape[1])
for i in range(0, canvas.shape[0]):
  row = canvas[i]
  if not np.array_equal(row, prev_row) and not np.sum(row) == 0:
    rows.append(row)
    print_row(row)
    prev_row = row

3413  11 2 423 1134   344 313311
3413  11 2 423   3443 344 313311
3413  11 2 423 1434   344 313311
3413  11 2 423 14     344 313311
3413  11 2 423 14   21344 313311
3413  11 2 423 14 4321344 313311
3413  11 2 423 14 4321 442313311
3413  11 2     14 4321 442313311
3413  11 2     14 4321 442313   
3413  11 2     14 4321 44231    
3413  11 2333  14 4321 44231331 
3413  11 2333  14 4321244 31331 
3413  11 2     14 4321244 31331 
3413  11 2     14 4321244 31    
3413  11 2 134 14 4321244 31441 
3413  11 2 134 14 432124  31441 
3413  11 2 134 14 432124  31    
3413  11 2     14 432124  31    
3413  11 2114  14 432124  313311
3413  11 2114  14 432124  31    
3413  11 2114  14 432124  314411
3413  11 2114  14 432124  31    
3413  11 2114  14 432124  312131
3413  11 2114  14 432124  31    
3413     2114  14 432124  31    
34133    2114  14 432124  31    
34133 1132114  14 432124  311421
34133 1132     14 432124  311421
34133 11322322 14 432124  311421
34133 11322322 14 432124  31    
34133 1132

In [None]:
len(rows)

1040