In [3]:
%load_ext autoreload
%autoreload 2

In [192]:
import numpy as np
from tqdm import tqdm

from seq.model import Chunk, ChunkDatabase, Cluster

np.random.seed(42)

In [384]:
data = np.random.randint(1, 5, (10_000, 32))

In [403]:
chunk_view = np.lib.stride_tricks.sliding_window_view(data, (1, 10)).squeeze(
  axis=2
)
org_chunk_view = chunk_view.copy()
hashes = np.apply_along_axis(lambda x: hash(tuple(x)), 2, chunk_view)

In [411]:
unique, counts = np.unique(hashes, return_counts=True)
print(unique.shape)
unique = unique[counts >= 100]
len(unique)

(206589,)


0

In [720]:
shape = (100_000, 32)
d = np.random.randint(1, 5, shape)

result = np.zeros((10, d.shape[0], d.shape[1]))

for window_size in range(1, 11):
  if window_size == 1:
    result[window_size - 1, :] = d
  else:
    view = np.lib.stride_tricks.sliding_window_view(
      d, (1, window_size)
    ).squeeze(2)
    hashes = np.apply_along_axis(lambda x: hash(tuple(x)), 2, view)
    result[window_size - 1, :, : -window_size + 1] = hashes
print(result.shape)

(10, 100000, 32)


In [721]:
min_support = 100
chunked = np.zeros(shape)
res = result.copy()
for window_size in range(10, 0, -1):
  uniques, counts = np.unique(res[window_size - 1], return_counts=True)
  print(
    f"window_size: {window_size}, Processed: {(1-np.count_nonzero(chunked == 0) / chunked.size) * 100:.2f}%"
  )
  while len(uniques) > 0:
    uniques, counts = np.unique(res[window_size - 1], return_counts=True)
    # sort by counts
    mask = (counts >= min_support) & (uniques != 0)
    uniques = uniques[mask]
    counts = counts[mask]
    order = np.argsort(counts)[::-1]
    uniques = uniques[order]
    counts = counts[order]

    if len(uniques) == 0:
      break
    mask_single = res[window_size - 1] == uniques[0]
    mask = np.zeros_like(res[window_size - 1], dtype=bool)

    for j in range(0, window_size):
      if j == window_size - 1:
        if window_size == 1:
          mask[:, j:] |= mask_single[:, :]
        else:
          mask[:, j:] |= mask_single[:, : -window_size + 1]
      else:
        mask[:, j : -window_size + j + 1] |= mask_single[:, : -window_size + 1]

    res[:, mask] = 0
    chunked[mask] = uniques[0]

window_size: 10, Processed: 100.00%
window_size: 9, Processed: 100.00%
window_size: 8, Processed: 100.00%
window_size: 7, Processed: 100.00%


KeyboardInterrupt: 

In [381]:
db = ChunkDatabase(data, threshold=100, max_chunk_length=6)
db

100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


ChunkDatabase Information
Data Shape: (10000, 32)
Threshold: 100
Max Chunk Length: 6
Number of Chunks: 1920
Number of Chunks by Length
3: 1920

In [382]:
processed_chunks = set()
clusters: list[Cluster] = []

for chunk in tqdm(db.chunks):
  if chunk in processed_chunks:
    continue

  cluster = Cluster([chunk, db.get_candidate(chunk)[0]])
  processed_chunks.add(cluster.left)
  processed_chunks.add(cluster.right)
  left_candidate = [
    c
    for c in db.get_candidate(cluster.left)
    if c not in processed_chunks and cluster.can_appendleft(c)
  ]

  right_candidate = [
    c
    for c in db.get_candidate(cluster.right)
    if c not in processed_chunks and cluster.can_append(c)
  ]

  while True:
    if len(left_candidate):
      cluster.appendleft(left_candidate[0])
      processed_chunks.add(left_candidate[0])
    if len(right_candidate):
      cluster.append(right_candidate[0])
      processed_chunks.add(right_candidate[0])

    left_candidate = [
      c
      for c in db.get_candidate(cluster.left)
      if c not in processed_chunks and cluster.can_appendleft(c)
    ]

    right_candidate = [
      c
      for c in db.get_candidate(cluster.right)
      if c not in processed_chunks and cluster.can_append(c)
    ]

    if len(left_candidate) == 0 and len(right_candidate) == 0:
      clusters.append(cluster)
      break

100%|██████████| 1920/1920 [00:01<00:00, 1510.33it/s]


In [383]:
clusters

[Cluster(shape: (250385, np.int64(32)), size: 751155),
 Cluster(shape: (19164, np.int64(16)), size: 57492),
 Cluster(shape: (3335, np.int64(11)), size: 10005),
 Cluster(shape: (1977, np.int64(9)), size: 5931),
 Cluster(shape: (10034, np.int64(18)), size: 30102),
 Cluster(shape: (2883, np.int64(12)), size: 8649),
 Cluster(shape: (459, np.int64(4)), size: 1377),
 Cluster(shape: (2129, np.int64(9)), size: 6387),
 Cluster(shape: (330, np.int64(4)), size: 990),
 Cluster(shape: (1008, np.int64(6)), size: 3024),
 Cluster(shape: (729, np.int64(6)), size: 2187),
 Cluster(shape: (316, np.int64(4)), size: 948),
 Cluster(shape: (311, np.int64(4)), size: 933),
 Cluster(shape: (457, np.int64(4)), size: 1371),
 Cluster(shape: (730, np.int64(6)), size: 2190),
 Cluster(shape: (882, np.int64(5)), size: 2646),
 Cluster(shape: (569, np.int64(5)), size: 1707),
 Cluster(shape: (454, np.int64(6)), size: 1362),
 Cluster(shape: (307, np.int64(4)), size: 921),
 Cluster(shape: (442, np.int64(4)), size: 1326),
 C

In [324]:
clusters[0].path

deque([Chunk([4 2 2 3 2], 20, 25, #126),
       Chunk([2 3 1 3 3], 24, 29, #144),
       Chunk([3 3 3 4], 28, 32, #115)])

In [295]:
clusters[0].min_end

np.int64(28)

In [285]:
clusters = sorted(clusters, key=lambda c: len(c), reverse=True)
len(clusters[0])

12228

In [286]:
print(len(clusters))
clusters = sorted(clusters, key=lambda c: c.size, reverse=True)

330


In [287]:
n_step = 1000
canvas = np.zeros(db.data.shape)
positions = []
for cluster in tqdm(clusters, desc="Processing Clusters..."):
  start, end, height = cluster.start, cluster.end, cluster.height
  canvas_height = canvas.shape[0]
  start_position = 0
  drew = False

  while start_position < canvas_height and not drew:
    if start_position >= canvas_height:
      canvas_height = start_position + height
      canvas = np.pad(canvas, ((height, 0), (0, 0)))

    if start_position + height > canvas_height:
      canvas_height = start_position + height
      canvas = np.pad(canvas, ((0, height), (0, 0)))

    subcanvas = canvas[start_position : start_position + height, start:end]
    if cluster.can_draw(subcanvas):
      cluster.draw(subcanvas)
      positions.append(start_position)
      drew = True

    start_position += n_step

Processing Clusters...: 100%|██████████| 330/330 [00:14<00:00, 22.96it/s]


In [288]:
print(canvas.shape)
print(np.sum(canvas != 0) / np.prod(canvas.shape))

(1467201, 32)
0.15193278732770765


In [289]:
r = sorted(clusters, key=lambda c: c.height, reverse=True)

In [290]:
r[-5]

Cluster(shape: (100, np.int64(3)), size: 300)

In [291]:
def print_row(row: np.ndarray):
  print("".join([f"{int(x)}" if x != 0 else " " for x in row]))

In [292]:
rows = []
prev_row = np.zeros(canvas.shape[1])
for i in range(0, canvas.shape[0]):
  row = canvas[i]
  if not np.array_equal(row, prev_row) and not np.sum(row) == 0:
    rows.append(row)
    print_row(row)
    prev_row = row

1 433 32 23241334 3234 4324 1343
1 433 32 23241334 3234 4324  343
1143  32 23241334 3234 4324  343
1143  32 232413 413234 4324  343
1143  323232413 413234 4324  343
1143  323232413 413234 4 24  343
1143  323232413 413234 4 24 334 
1143  323232413 4132   4 24 334 
1 431 323232413 4132   4 24 334 
1 431 323232413 413    4 24 334 
1 431 32323 413 413    4 24 334 
1 431 32323 413 413    4 24  341
1 431  2323 413 413    4 24  341
1 431  2323 413 413    4124  341
1 431 32 23 413 413    4124  341
1 431 32 23 41  413    4124  341
1 431 32 23 41  41     4124  341
1 431 32  3 41  41     4124  341
1 431 32  3141  41     4124  341
1 431 32  31 1  41     4124  341
1143  32  31 1  41     4124  341
1143  32  31 1 44      4124  341
1143   22 31 1 44      4124  341
1143   22 31 1344      4124  341
1143   22 31 1344      412   341
1143   22 31 1344      4 2   341
1143   22 31 1344      4 22  341
1 431  22 31 1344      4 22  341
1 431  2243  1344      4 22  341
1 431  2243  1344      4 22 234 
1 431 12 4

In [11]:
len(rows)

627