In [4]:
%load_ext autoreload
%autoreload 2

from sequilt import Sequilt
from sequilt.model import EventGraph, Sequlet, LabelModel
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# from datasets import load_dataset
# from sequilt.data import get_ids, get_tokenizer

# ds = load_dataset("neuralbioinfo/bacterial_promoters")
# tokenizer = get_tokenizer(type="dna", k=1)
# ids, tokens = get_ids(
#   ds["test_multispecies"]["segment"], tokenizer, max_tokens=32
# )
# labels = [
#   LabelModel(value=value, name=name)
#   for value, name in tokenizer._id_to_token.items()
#   if value != 0
# ]

# labels = sorted(labels, key=lambda x: x.name)

In [2]:
import re
from typing import List
from dataclasses import dataclass
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import sent_tokenize
import nltk

# Download the NLTK 'punkt' data; sent_tokenize is imported from nltk above and
# used below to split paragraphs into sentences.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

@dataclass
class SentenceTopicModel:
    """A sentence paired with the index of the topic assigned to it."""
    # Sentence text (perform_topic_modeling stores the original, un-preprocessed sentence).
    sentence: str
    # Index of the topic chosen for this sentence.
    topic: int

def preprocess_text(text: str) -> str:
    """Normalize raw text before it is vectorized.

    Steps, in order:
      1. Replace every HTML tag with a single space.
      2. Remove every character that is not an ASCII letter or whitespace.
      3. Lowercase the result.

    Args:
        text: Raw input text, possibly containing HTML markup.

    Returns:
        The cleaned, lowercased text. Whitespace is not collapsed, so the
        result may contain leading, trailing, or repeated spaces.
    """
    # Replace tags with a space (not the empty string) so that words separated
    # only by markup, e.g. "good.<br />bad", are not fused into one token.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()

def remove_stopwords(text: str, stop_words: List[str]) -> str:
    """Drop every whitespace-separated word of ``text`` that is a stop word.

    Matching is exact and case-sensitive.

    Args:
        text: Input text; it is split on arbitrary whitespace.
        stop_words: Words to remove. Any iterable of strings is accepted.

    Returns:
        The remaining words joined by single spaces ("" if nothing remains).
    """
    # Materialize once: constant-time lookups, and a one-shot iterable is not
    # exhausted by the first membership test.
    excluded = set(stop_words)
    return ' '.join(word for word in text.split() if word not in excluded)

def split_paragraphs_to_sentences(paragraphs: List[str]) -> List[List[str]]:
    """Split each paragraph into sentences using ``sent_tokenize``.

    Args:
        paragraphs: Paragraph strings.

    Returns:
        One list of sentences per paragraph, in the same order as the input.
    """
    sentences_by_paragraph = []
    for paragraph in paragraphs:
        sentences_by_paragraph.append(sent_tokenize(paragraph))
    return sentences_by_paragraph

def perform_topic_modeling(sentences: List[List[str]], num_topics: int, stop_words: List[str]) -> List[List[SentenceTopicModel]]:
    """Assign an LDA topic to every sentence while keeping the paragraph grouping.

    All sentences are flattened, cleaned with ``preprocess_text`` and
    ``remove_stopwords``, vectorized with ``CountVectorizer`` and fed to a
    ``LatentDirichletAllocation`` model with a fixed ``random_state`` of 42.
    Each sentence gets the index of the highest value in its row of the LDA output.

    Args:
        sentences: One list of sentences per paragraph.
        num_topics: Number of LDA components to fit.
        stop_words: Extra stop words removed before vectorization (the
            vectorizer additionally applies its built-in 'english' list).

    Returns:
        A list shaped like ``sentences`` in which each sentence is replaced by
        a ``SentenceTopicModel`` holding the original (un-preprocessed)
        sentence and its topic index as a plain ``int``. If there are no
        sentences at all, the empty paragraph structure is returned without
        fitting anything.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when no term survives
            the ``min_df=2`` / ``max_df=0.95`` pruning on a very small corpus.
    """
    # Flatten all sentences into a single list.
    all_sentences = [sentence for paragraph in sentences for sentence in paragraph]

    # Nothing to vectorize: keep the (empty) paragraph structure and skip fitting.
    if not all_sentences:
        return [[] for _ in sentences]

    # Clean the text and drop the caller-supplied stop words.
    preprocessed_sentences = [
        remove_stopwords(preprocess_text(sentence), stop_words)
        for sentence in all_sentences
    ]

    # Vectorize the text into a document-term count matrix.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(preprocessed_sentences)

    # Fit the LDA model; each output row corresponds to one sentence.
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_output = lda_model.fit_transform(doc_term_matrix)

    # argmax() yields a NumPy integer; convert so ``topic`` really is an int
    # as the dataclass declares.
    sentence_topics = [
        SentenceTopicModel(sentence=sentence, topic=int(topic_weights.argmax()))
        for sentence, topic_weights in zip(all_sentences, lda_output)
    ]

    # Rebuild the original paragraph structure by slicing the flat list.
    result = []
    start = 0
    for paragraph in sentences:
        end = start + len(paragraph)
        result.append(sentence_topics[start:end])
        start = end

    return result

def main(paragraphs: List[str], num_topics: int, stop_words: List[str]) -> List[List[SentenceTopicModel]]:
    """Run the whole pipeline: sentence splitting followed by topic modeling.

    Args:
        paragraphs: Paragraph strings to analyze.
        num_topics: Number of topics passed on to ``perform_topic_modeling``.
        stop_words: Stop words passed on to ``perform_topic_modeling``.

    Returns:
        Whatever ``perform_topic_modeling`` returns for the split paragraphs.
    """
    return perform_topic_modeling(
        split_paragraphs_to_sentences(paragraphs),
        num_topics,
        stop_words,
    )

# Usage example: run the pipeline on three small paragraphs and print the
# topic assigned to every sentence, grouped by paragraph.
if __name__ == "__main__":
    paragraphs = [
        "This is the first paragraph. It contains two sentences.",
        "This is the second paragraph. It has three sentences. The topic might be different.",
        "This is the third paragraph. It's about a new topic. It has multiple sentences too."
    ]
    num_topics = 2
    # Custom stop words removed in addition to the vectorizer's 'english' list.
    stop_words = ['is', 'the', 'a', 'an', 'in', 'on', 'at', 'for', 'to', 'of']
    
    result = main(paragraphs, num_topics, stop_words)
    
    for i, paragraph in enumerate(result):
        # Paragraph numbers are printed 1-based.
        print(f"Paragraph {i + 1}:")
        for sentence in paragraph:
            print(f"  Sentence: '{sentence.sentence}', Topic: {sentence.topic}")
        # Blank line between paragraphs.
        print()

Paragraph 1:
  Sentence: 'This is the first paragraph.', Topic: 0
  Sentence: 'It contains two sentences.', Topic: 1

Paragraph 2:
  Sentence: 'This is the second paragraph.', Topic: 0
  Sentence: 'It has three sentences.', Topic: 1
  Sentence: 'The topic might be different.', Topic: 1

Paragraph 3:
  Sentence: 'This is the third paragraph.', Topic: 0
  Sentence: 'It's about a new topic.', Topic: 1
  Sentence: 'It has multiple sentences too.', Topic: 1



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasonchoi3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


input: 텍스트 문단 데이터세트 list[str]
process:

1. 문단을 문장 단위로 나눔 (list[str] -> list[list[str]])
2. 문장 단위로 LDA Topic Modeling을 수행함 (Scikit-learn 이용)
2-1. 이때, HTML 태그와 Stopwords를 삭제함
3. 각 문장에 대해 Topic을 할당함 (list[list[str]] -> list[list[int]])

return: 
아래와 같은 Schema로 결과를 반환함. (list[list[SentenceTopicModel]])

```python
@dataclass
class SentenceTopicModel:
  sentence: str
  topic: int
```

In [11]:
# Language Dataset
from datasets import load_dataset
from sequilt.data import get_ids, get_tokenizer, get_featured_ids

# Load the IMDB reviews and tokenize the test split (max_tokens=32).
ds = load_dataset("ajaykarthick/imdb-movie-reviews")
tokenizer = get_tokenizer(type="language")
ids, tokens = get_ids(ds["test"]["review"], tokenizer, max_tokens=32)

# Pick 20 featured token ids (presumably ranked by tf-idf — confirm against
# get_featured_ids) and zero out every other id in the matrix.
featured_ids = get_featured_ids(ids, tokenizer, "tf-idf", n_features=20)
feature_mask = np.isin(ids, featured_ids)
ids = np.where(feature_mask, ids, 0)

# One label per featured id, named after the token text for that id.
labels = [
  LabelModel(value=token_id, name=tokenizer.id_to_token(token_id))
  for token_id in featured_ids
]

100%|██████████| 10000/10000 [00:04<00:00, 2070.81it/s]


In [12]:
# Build the event graph from the masked id matrix.
G = EventGraph(ids)
# ids.shape[0] is passed as the number of sequences, ids.shape[1] as the sequence length.
sequilt = Sequilt(sequence_length=ids.shape[1], n_sequences=ids.shape[0], labels=labels)
# Bare last expression so the notebook displays the object.
sequilt

Sequilt(labels=[{'value': 59, 'name': 'movie'}, {'value': 5, 'name': 'film'}, {'value': 30, 'name': 'one'}, {'…

In [13]:
# Draw edges
from time import sleep

# Edges whose co-occurrence count is below this threshold are not drawn.
MIN_COOCCURRENCE = 5
# Pause before each draw call, in seconds.
DRAW_DELAY_SECONDS = 0.15

# Start from a fresh graph and an empty sequlet list each time this cell runs.
G = EventGraph(ids)
sequilt.sequlets = []

# The loop index is named `sequlet_id` rather than `id`: a top-level loop
# variable called `id` would shadow the builtin `id()` for the whole kernel.
# NOTE(review): G is mutated while G.sorted_edges is being iterated — confirm
# that sorted_edges tolerates this.
for sequlet_id, (event1, event2, cooccurence) in enumerate(G.sorted_edges):
  # Stops at the first edge below the threshold; assumes sorted_edges is in
  # descending co-occurrence order (the printed output is consistent with
  # this) — TODO confirm.
  if cooccurence < MIN_COOCCURRENCE:
    break
  print(event1, event2, cooccurence)
  sequlet = Sequlet(sequlet_id, [event1, event2])
  sleep(DRAW_DELAY_SECONDS)
  sequilt.draw_sequlet(sequlet)
  G.remove_events_from([event1, event2])
  
# # Draw nodes
# for event in G.events:
#   sequlet = Sequlet(len(sequilt), [event])
#   sequilt.draw_sequlet(sequlet)
#   sleep(0.15)
#   # G.remove_event(event)

Event(Position=0, Value=107, # Occurences=93) Event(Position=1, Value=59, # Occurences=545) 18
Event(Position=0, Value=59, # Occurences=522) Event(Position=1, Value=112, # Occurences=67) 16
Event(Position=0, Value=964, # Occurences=193) Event(Position=1, Value=131, # Occurences=57) 16
Event(Position=0, Value=30, # Occurences=249) Event(Position=1, Value=103, # Occurences=47) 13
Event(Position=2, Value=964, # Occurences=67) Event(Position=3, Value=131, # Occurences=52) 13
Event(Position=1, Value=339, # Occurences=72) Event(Position=2, Value=5, # Occurences=326) 12
Event(Position=1, Value=107, # Occurences=50) Event(Position=2, Value=59, # Occurences=468) 11
Event(Position=4, Value=964, # Occurences=56) Event(Position=5, Value=131, # Occurences=63) 10
Event(Position=0, Value=5, # Occurences=251) Event(Position=1, Value=12, # Occurences=64) 9
Event(Position=0, Value=172, # Occurences=60) Event(Position=1, Value=5, # Occurences=308) 9
Event(Position=2, Value=103, # Occurences=99) Event(Pos