In [3]:
%load_ext autoreload
%autoreload 2

In [192]:
import numpy as np
from tqdm import tqdm

from seq.model import Chunk, ChunkDatabase, Cluster

np.random.seed(42)

In [223]:
data = np.random.randint(1, 5, (10_000, 32))

In [280]:
db = ChunkDatabase(data, threshold=100, max_chunk_length=6)
db

  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [00:24<00:00,  4.08s/it]


ChunkDatabase Information
Data Shape: (100000, 32)
Threshold: 100
Max Chunk Length: 6
Number of Chunks: 12973
Number of Chunks by Length
5: 12077
4: 25
3: 387
2: 373
1: 111

In [379]:
processed_chunks = set()
clusters: list[Cluster] = []

for chunk in tqdm(db.chunks):
  if chunk in processed_chunks:
    continue

  cluster = Cluster([chunk, db.get_candidate(chunk)[0]])
  processed_chunks.add(cluster.left)
  processed_chunks.add(cluster.right)
  left_candidate = [
    c
    for c in db.get_candidate(cluster.left)
    if c not in processed_chunks and cluster.can_appendleft(c)
  ]

  right_candidate = [
    c
    for c in db.get_candidate(cluster.right)
    if c not in processed_chunks and cluster.can_append(c)
  ]

  while True:
    if len(left_candidate):
      cluster.appendleft(left_candidate[0])
      processed_chunks.add(left_candidate[0])
    if len(right_candidate):
      cluster.append(right_candidate[0])
      processed_chunks.add(right_candidate[0])

    left_candidate = [
      c
      for c in db.get_candidate(cluster.left)
      if c not in processed_chunks and cluster.can_appendleft(c)
    ]

    right_candidate = [
      c
      for c in db.get_candidate(cluster.right)
      if c not in processed_chunks and cluster.can_append(c)
    ]

    if len(left_candidate) == 0 and len(right_candidate) == 0:
      clusters.append(cluster)
      break

  0%|          | 0/12973 [00:00<?, ?it/s]

220 315 2
27 25
Chunk([2 3 1 3 3], 24, 29, #144) Chunk([1 2 3 1 3], 23, 28, #134)
189 313 4
27 25
Chunk([3 1 3 3 4], 25, 30, #112) Chunk([2 1 2 3 1], 22, 27, #123)
206 303 6
27 25
Chunk([1 3 1 3 3], 24, 29, #109) Chunk([1 2 3 1 1], 23, 28, #110)
191 196 8
27 25
Chunk([3 1 3 3 3], 25, 30, #112) Chunk([2 3 1 1 4], 24, 29, #117)
162 171 10
27 25
Chunk([1 3 3 3 4], 26, 31, #108) Chunk([3 1 1 4 1], 25, 30, #114)
146 157 12
27 26
Chunk([3 3 3 4 2], 27, 32, #113) Chunk([1 1 4 1 4], 26, 31, #107)
156 140 14
27 27
Chunk([2 3 3 3 4], 26, 31, #117) Chunk([1 4 1 4 4], 27, 32, #112)
75 143 16
27 27
Chunk([4 2 3 3 3], 25, 30, #117) Chunk([2 1 4 1 4], 26, 31, #119)
68 139 18
27 27
Chunk([1 4 2 3 3], 24, 29, #116) Chunk([1 4 1 4 2], 27, 32, #110)
76 161 20
27 27
Chunk([4 2 3 3 2], 25, 30, #117) Chunk([4 1 4 1 4], 26, 31, #115)
153 159 22
27 27
Chunk([2 3 3 2 3], 26, 31, #117) Chunk([1 4 1 4 1], 25, 30, #116)
146 162 24
27 27
Chunk([3 3 2 3 1], 27, 32, #117) Chunk([4 1 4 1 1], 26, 31, #115)
168 137 26


  0%|          | 0/12973 [00:11<?, ?it/s]


KeyboardInterrupt: 

In [368]:
cluster = Cluster(db.chunks[0])
print(cluster.right)
r = db.get_candidate(cluster.right)
print(cluster.max_start, cluster.min_end)
print(r[0])
print(cluster.can_append(r[0][1]))

cluster = Cluster(db.chunks[0])
print(cluster.left)
l = db.get_candidate(cluster.left)
print(cluster.max_start, cluster.min_end)
print(l[0])
print(cluster.can_appendleft(l[0][1]))

Chunk([2 3 1 3 3], 24, 29, #144)
28 25
(np.int64(4), Chunk([1 2 3 1 3], 23, 28, #134), 'e28q[2 3 1 3]')
True
Chunk([2 3 1 3 3], 24, 29, #144)
28 25
(np.int64(4), Chunk([1 2 3 1 3], 23, 28, #134), 'e28q[2 3 1 3]')
True


In [324]:
clusters[0].path

deque([Chunk([4 2 2 3 2], 20, 25, #126),
       Chunk([2 3 1 3 3], 24, 29, #144),
       Chunk([3 3 3 4], 28, 32, #115)])

In [295]:
clusters[0].min_end

np.int64(28)

In [285]:
clusters = sorted(clusters, key=lambda c: len(c), reverse=True)
len(clusters[0])

12228

In [286]:
print(len(clusters))
clusters = sorted(clusters, key=lambda c: c.size, reverse=True)

330


In [287]:
n_step = 1000
canvas = np.zeros(db.data.shape)
positions = []
for cluster in tqdm(clusters, desc="Processing Clusters..."):
  start, end, height = cluster.start, cluster.end, cluster.height
  canvas_height = canvas.shape[0]
  start_position = 0
  drew = False

  while start_position < canvas_height and not drew:
    if start_position >= canvas_height:
      canvas_height = start_position + height
      canvas = np.pad(canvas, ((height, 0), (0, 0)))

    if start_position + height > canvas_height:
      canvas_height = start_position + height
      canvas = np.pad(canvas, ((0, height), (0, 0)))

    subcanvas = canvas[start_position : start_position + height, start:end]
    if cluster.can_draw(subcanvas):
      cluster.draw(subcanvas)
      positions.append(start_position)
      drew = True

    start_position += n_step

Processing Clusters...: 100%|██████████| 330/330 [00:14<00:00, 22.96it/s]


In [288]:
print(canvas.shape)
print(np.sum(canvas != 0) / np.prod(canvas.shape))

(1467201, 32)
0.15193278732770765


In [289]:
r = sorted(clusters, key=lambda c: c.height, reverse=True)

In [290]:
r[-5]

Cluster(shape: (100, np.int64(3)), size: 300)

In [291]:
def print_row(row: np.ndarray):
  print("".join([f"{int(x)}" if x != 0 else " " for x in row]))

In [292]:
rows = []
prev_row = np.zeros(canvas.shape[1])
for i in range(0, canvas.shape[0]):
  row = canvas[i]
  if not np.array_equal(row, prev_row) and not np.sum(row) == 0:
    rows.append(row)
    print_row(row)
    prev_row = row

1 433 32 23241334 3234 4324 1343
1 433 32 23241334 3234 4324  343
1143  32 23241334 3234 4324  343
1143  32 232413 413234 4324  343
1143  323232413 413234 4324  343
1143  323232413 413234 4 24  343
1143  323232413 413234 4 24 334 
1143  323232413 4132   4 24 334 
1 431 323232413 4132   4 24 334 
1 431 323232413 413    4 24 334 
1 431 32323 413 413    4 24 334 
1 431 32323 413 413    4 24  341
1 431  2323 413 413    4 24  341
1 431  2323 413 413    4124  341
1 431 32 23 413 413    4124  341
1 431 32 23 41  413    4124  341
1 431 32 23 41  41     4124  341
1 431 32  3 41  41     4124  341
1 431 32  3141  41     4124  341
1 431 32  31 1  41     4124  341
1143  32  31 1  41     4124  341
1143  32  31 1 44      4124  341
1143   22 31 1 44      4124  341
1143   22 31 1344      4124  341
1143   22 31 1344      412   341
1143   22 31 1344      4 2   341
1143   22 31 1344      4 22  341
1 431  22 31 1344      4 22  341
1 431  2243  1344      4 22  341
1 431  2243  1344      4 22 234 
1 431 12 4

In [11]:
len(rows)

627