In [1]:
import os

import numpy as np
import pandas as pd

from bertopic import BERTopic
from elasticsearch import helpers, Elasticsearch
from glom import glom
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connection settings are read from the environment so credentials do not have
# to be committed with the notebook. The fallbacks reproduce the original local
# docker setup; override ES_URL / ES_USER / ES_PASSWORD anywhere non-local.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://host.docker.internal:9200"),
    verify_certs=False,
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "123456"),
    ),
)
# Name of the index that query_vec searches.
index = "docs"

In [3]:
# Request basic cluster information from the server; a successful response
# also confirms that the client configured above can reach the cluster.
es.info()

ObjectApiResponse({'name': 'f7f569dd80ae', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'CW97Ie5ZSPKXZafOAxIPeQ', 'version': {'number': '8.15.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f97532e680b555c3a05e73a74c28afb666923018', 'build_date': '2024-10-09T22:08:00.328917561Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
def query_vec(query_word):
    """Fetch every document matching ``query_word`` and return them as a DataFrame.

    A ``multi_match`` query is run over the ``title`` and ``context`` fields
    (``minimum_should_match`` of 50%) against the module-level ``index`` using
    the module-level ``es`` client. All hits are streamed via ``helpers.scan``.

    Args:
        query_word: Search text passed as the ``multi_match`` query.

    Returns:
        A ``pandas.DataFrame`` built from the ``_source`` of each hit. The
        request limits ``_source`` to ``title``, ``title_vector``, ``context``,
        ``context_vector`` and ``date``.

    Raises:
        AssertionError: If the search returns no hits.
    """
    wanted_fields = ["title", "title_vector", "context", "context_vector", "date"]
    match_clause = {
        "query": query_word,
        "fields": ["title", "context"],
        "minimum_should_match": "50%",
    }
    search_body = {
        "_source": wanted_fields,
        "query": {"multi_match": match_clause},
    }

    # Materialise the scan generator so the hit count can be checked.
    hits = list(helpers.scan(es, query=search_body, index=index))
    assert len(hits) != 0

    # Keep only the stored document of each hit, dropping the hit metadata.
    sources = glom(hits, "*._source")
    return pd.DataFrame.from_dict(sources)

In [24]:
def init_topic_model(
    num_of_docs: int, analyzer_index: str = "dcard", random_state=42
):
    """Build an unfitted BERTopic model whose settings scale with corpus size.

    Args:
        num_of_docs: Number of documents the model will be fitted on. It is
            used to derive UMAP's ``n_neighbors`` (20% of the corpus) and
            BERTopic's ``min_topic_size`` (5% of the corpus), both floored at 2.
        analyzer_index: Elasticsearch index passed to the ``_analyze`` call
            used for tokenisation. Defaults to ``"dcard"``, the value that was
            previously hard-coded.
            NOTE(review): this differs from the notebook's ``index = "docs"``;
            confirm which index is intended.
        random_state: Seed forwarded to UMAP so repeated runs give the same
            reduction. Pass ``None`` to restore unseeded behaviour.

    Returns:
        A configured, unfitted ``BERTopic`` instance.
    """

    def tokenize_zh(text: str):
        """Tokenise ``text`` with the ``ik_smart`` analyzer via Elasticsearch."""
        response = es.indices.analyze(
            index=analyzer_index, analyzer="ik_smart", text=text
        ).body
        return glom(response, "tokens.*.token")

    n_components = 5
    umap_model = UMAP(
        n_neighbors=max(
            2, round(num_of_docs * 0.2)
        ),  # n_neighbors must be greater than 1
        n_components=n_components,
        min_dist=0.0,
        metric="cosine",
        init=(
            "spectral" if num_of_docs > n_components + 1 else "random"
        ),  # lmcinnes umap issue #201
        # UMAP is stochastic; without a seed the topics change between runs.
        random_state=random_state,
    )

    # token_pattern=None tells scikit-learn the default regex is intentionally
    # unused, which silences the warning emitted when a tokenizer is supplied.
    vectorizer_model = CountVectorizer(tokenizer=tokenize_zh, token_pattern=None)

    topic_model = BERTopic(
        language="chinese",
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        min_topic_size=max(
            2, round(num_of_docs * 0.05)
        ),  # Min cluster size must be greater than one
    )
    return topic_model

In [25]:
def fit_topic_model(vec_df):
    """Fit a topic model on ``vec_df`` and compute its topics over time.

    Args:
        vec_df: DataFrame with ``context`` (document text), ``context_vector``
            (embedding per document) and ``date`` (timestamp per document)
            columns.

    Returns:
        A ``(topic_model, topics_over_time)`` tuple: the fitted model and the
        result of its ``topics_over_time`` call using 20 bins.
    """
    contexts = vec_df["context"].tolist()
    dates = vec_df["date"].tolist()
    # Stack the per-document vectors into a single array for fitting.
    context_vectors = np.array(vec_df["context_vector"].tolist())

    model = init_topic_model(len(contexts))
    model.fit(contexts, context_vectors)

    dynamic_topics = model.topics_over_time(
        contexts,
        dates,
        nr_bins=20,
        datetime_format="%Y-%m-%d %H:%M:%S.%f",
    )
    return model, dynamic_topics

In [26]:
def gen_dtm(query_word):
    """Return the topics-over-time result for documents matching ``query_word``.

    Looks up the matching documents with ``query_vec``, fits a topic model on
    them with ``fit_topic_model``, and returns only the topics-over-time part
    of that result; the fitted model itself is discarded.
    """
    matched_docs = query_vec(query_word)
    fitted = fit_topic_model(matched_docs)
    # fit_topic_model returns (model, topics_over_time); keep the second item.
    return fitted[1]

In [27]:
# Run the whole pipeline (search, fit, topics over time) for one query term
# and display the returned table as the cell output.
gen_dtm("網紅")

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"金針菇, 瑄, 林, 觀看, 更新",1,2024-11-04 21:28:04.461035
1,-1,"是想, 為何, 秘, po, 家",1,2024-11-04 22:24:58.756250
2,-1,"粉, 家, 根本, 腦, 超",1,2024-11-05 00:16:33.452750
3,-1,"家, 影片, 頻道, 邪教, 課程",1,2024-11-05 06:47:04.890500
4,-1,"課程, 攻擊, 一直, 議員, 現在",1,2024-11-05 10:30:14.283500
5,-1,"傑, 揚, 講話, youtube, youtube.com",1,2024-11-05 15:09:11.024750
