In [1]:
# Development setup for this notebook.
# Load IPython's autoreload extension and use mode 2, so edited modules
# are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
# Set ANYWIDGET_DEV=1 in the kernel's environment.
# NOTE(review): presumably switches anywidget-based widgets into their
# development mode -- confirm against the anywidget version in use.
%env ANYWIDGET_DEV=1

env: ANYWIDGET_DEV=1


In [2]:
import numpy as np
from sequilt import Sequilt
from sequilt.model import EventGraph, EventCanvas, SequletModel, LabelModel

In [3]:
from datasets import load_dataset
from sequilt.data import get_ids, get_tokenizer

# DNA variant of the input data: the "segment" column of the
# "test_multispecies" split, tokenized with a "dna" tokenizer (k=1) and
# max_tokens=32.
# NOTE(review): the IMDB cell further down rebinds ds, tokenizer, ids,
# tokens and labels, so on a top-to-bottom run these values are replaced.
ds = load_dataset("neuralbioinfo/bacterial_promoters")
tokenizer = get_tokenizer(type="dna", k=1)
ids, tokens = get_ids(ds["test_multispecies"]["segment"], tokenizer, max_tokens=32)

# One label per entry of the tokenizer's id -> token mapping, skipping id 0.
labels = [
  LabelModel(value=token_id, name=token_name)
  for token_id, token_name in tokenizer._id_to_token.items()
  if token_id != 0
]
# Reorder: swap the first and third labels, keep the second and fourth.
labels = [labels[position] for position in (2, 1, 0, 3)]

100%|██████████| 22582/22582 [00:01<00:00, 17228.21it/s]


In [4]:
from datasets import load_dataset
from sequilt.data import get_ids, get_tokenizer, get_featured_ids

# Language variant of the input data: the "review" column of the "test"
# split, tokenized with a "language" tokenizer and max_tokens=32.
ds = load_dataset("ajaykarthick/imdb-movie-reviews")
tokenizer = get_tokenizer(type="language")
ids, tokens = get_ids(ds["test"]["review"], tokenizer, max_tokens=32)

# Select the featured token ids ("count" criterion, n_features=10), then
# zero out every position of `ids` that does not hold one of them.
featured_ids = get_featured_ids(ids, tokenizer, "count", n_features=10)
feature_mask = np.isin(ids, featured_ids)
ids_lang = np.where(feature_mask, ids, 0)

# One label per featured id, named by the tokenizer's own lookup.
labels = [
  LabelModel(value=token_id, name=tokenizer.id_to_token(token_id))
  for token_id in featured_ids
]

100%|██████████| 10000/10000 [00:05<00:00, 1967.99it/s]


In [8]:
# Build the event graph from the masked id matrix (non-featured ids are 0).
G = EventGraph(ids_lang)

# Canvas is as wide as the number of columns in `ids` and starts at twice
# its row count. `ids_lang` comes from np.where over `ids`, so both arrays
# have the same shape.
canvas = EventCanvas(width=ids.shape[1], initial_height=ids.shape[0] * 2)

# Walk the edges in the order `sorted_edges` yields them (the printed
# output suggests non-increasing co-occurrence -- TODO confirm). Each edge
# is drawn as a two-event sequlet, then both events are removed from G.
# NOTE(review): G is mutated while G.sorted_edges is being iterated; verify
# whether sorted_edges is a snapshot or a live view of the graph, since
# that decides whether edges touching already-removed events still appear.
for event1, event2, cooccurence in G.sorted_edges:
  sequlet = SequletModel([event1, event2])
  # Debug trace of the pair being drawn and its co-occurrence value.
  print(event1, event2, cooccurence)
  canvas.draw_sequlet(sequlet)
  G.remove_events_from([event1, event2])

Event(Position=0, Value=59, # Occurences=522) Event(Position=1, Value=112, # Occurences=67) 16
Event(Position=0, Value=964, # Occurences=193) Event(Position=1, Value=131, # Occurences=57) 16
Event(Position=1, Value=59, # Occurences=545) Event(Position=2, Value=7, # Occurences=82) 16
Event(Position=2, Value=964, # Occurences=67) Event(Position=3, Value=131, # Occurences=52) 13
Event(Position=1, Value=339, # Occurences=72) Event(Position=2, Value=59, # Occurences=468) 12
Event(Position=4, Value=964, # Occurences=56) Event(Position=5, Value=131, # Occurences=63) 10
Event(Position=3, Value=964, # Occurences=60) Event(Position=4, Value=131, # Occurences=51) 9
Event(Position=1, Value=5, # Occurences=308) Event(Position=2, Value=30, # Occurences=175) 8
Event(Position=2, Value=339, # Occurences=37) Event(Position=3, Value=59, # Occurences=334) 8
Event(Position=5, Value=964, # Occurences=59) Event(Position=6, Value=131, # Occurences=59) 8
Event(Position=0, Value=5, # Occurences=251) Event(Posit

In [10]:
# Report the current size of the graph.
print(len(G))

# Draw every node still in the graph as a single-event sequlet on the
# same canvas.
for node in G.nodes:
  canvas.draw_sequlet(SequletModel([node]))

32


In [11]:
# Build the widget from the rectangles accumulated on the canvas and the
# labels defined earlier, at a fixed 800x800 size.
w = Sequilt(
  sequlets=canvas.sequlet_rects,
  labels=labels,
  width=800,
  height=800,
)
# Bare last expression so the notebook displays the widget.
w

Sequilt(height=800, labels=[{'value': 59, 'name': 'movie'}, {'value': 5, 'name': 'film'}, {'value': 30, 'name'…