In [2]:
# !pip install --no-deps bertopic
# !pip install numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml
# !pip install nbformat

In [1]:
import os
from collections import Counter
from getpass import getpass
from itertools import chain
from pprint import pprint

# import eland as ed
import numpy as np
import pandas as pd
import pyecharts
from bertopic import BERTopic
from elasticsearch import Elasticsearch, helpers
from glom import glom, Coalesce

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connection settings are read from the environment so that no secret is
# stored in the notebook or in its version-control history.
#   ES_HOST      Elasticsearch URL (defaults to the local docker host)
#   ES_USER      basic-auth user name (defaults to "elastic")
#   ES_PASSWORD  basic-auth password; prompted for interactively when unset
# To authenticate with an API key instead, pass
# `api_key=os.environ["ES_API_KEY"]` in place of `basic_auth`.
es = Elasticsearch(
    os.environ.get("ES_HOST", "http://host.docker.internal:9200"),
    # NOTE(review): certificate verification stays disabled as in the original
    # cell; enable it before pointing this notebook at a non-local cluster.
    verify_certs=False,
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: "),
    ),
)

In [3]:
# Settings applied to the "dcard" index: the default ingest pipeline, a raised
# _analyze token limit, and two custom analyzers that add a synonym filter on
# top of the IK tokenizers.
dcard_index_settings = {
    "index": {
        "default_pipeline": "tencentbac_conan_embedding_pipe",
        "analyze": {
            "max_token_count": 100000,
        },
        "analysis": {
            "analyzer": {
                "ik_smart_plus": {
                    "type": "custom",
                    "tokenizer": "ik_smart",
                    "filter": ["synonym"],
                },
                "ik_max_word_plus": {
                    "type": "custom",
                    "tokenizer": "ik_max_word",
                    "filter": ["synonym"],
                },
            },
            "filter": {
                "synonym": {
                    "type": "synonym",
                    "synonyms_path": "analysis-ik/dict/zh_synonym.txt",
                }
            },
        },
    }
}

# The index is closed while the settings are updated (analysis settings cannot
# be changed on an open index). The reopen sits in `finally` so that a rejected
# settings update does not leave "dcard" closed and unusable for later cells.
es.indices.close(index="dcard")
try:
    es.indices.put_settings(index="dcard", body=dcard_index_settings)
finally:
    reopen_response = es.indices.open(index="dcard")

# Keep the acknowledgement of the reopen as the cell output.
reopen_response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True})

In [51]:
# Search term matched against post titles and post bodies.
query_word = "A酸"

# Scan request: return only the ASTE annotation fields (plus the date) of
# posts whose title or context matches the search term.
body = dict(
    _source=[
        "title_aste.*",
        "context_aste.*",
        "date",
        "comments.content_aste.*",
    ],
    query=dict(
        multi_match=dict(
            query=query_word,
            fields=["title", "context"],
            minimum_should_match="50%",
        ),
    ),
)

# `helpers.scan` is used instead of a single `es.search(index="dcard", body=body)`
# call; the iterator it returns is materialised into a list of hits.
res = [hit for hit in helpers.scan(es, query=body, index="dcard")]

In [57]:
# Candidate glom paths into a list of hits, listed in priority order.
# NOTE(review): Coalesce presumably yields only the FIRST path that resolves,
# so title/comment annotations are ignored whenever `context_aste` resolves;
# confirm that this is intended rather than a union of all three.
coalesce_of_path = Coalesce(
    "*._source.context_aste",
    "*._source.title_aste",
    "*._source.comments.*.content_aste",
)

In [62]:
# glom returns one collection of records per hit; flatten them into a single
# list of records and load that into a DataFrame.
res_df = pd.DataFrame.from_dict(
    list(
        chain.from_iterable(
            glom(res, coalesce_of_path),
        )
    )
)

In [4]:
# Search term matched against post titles and post bodies.
q_word = "A酸"

# Scan request for the topic-modelling section: raw text, the stored
# embedding vectors and the post date of every matching post.
body = {}
# Alternatives tried earlier:
#   "_source": False
#   "fields": ["title_aste.*", "context_aste.*", "date", "comments.content_aste.*"]
body["_source"] = ["title", "title_vector", "context", "context_vector", "date"]
body["query"] = {
    "multi_match": {
        "query": q_word,
        "fields": ["title", "context"],
        "minimum_should_match": "50%",
    },
}

# To fetch every post instead of only the matching ones, swap the query for:
# body["query"] = {"match_all": {}}

In [5]:
# Lazy iterator over every hit of `body` in the "cloned-dcard" index
# (replaces an earlier `es.search(index="cloned-dcard", body=body)` call).
# Nothing is consumed here; a later cell turns the iterator into a list.
tmp = helpers.scan(es, query=body, index="cloned-dcard")

In [6]:
# Drain the scan iterator into a list so the hits can be traversed repeatedly.
l_tmp = [*tmp]

In [7]:
# One row per hit, built from each hit's `_source` document.
tmp_df = pd.DataFrame.from_dict(
    glom(l_tmp, "*._source"),
)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer


def tokenize_zh(text: str):
    """Tokenize ``text`` with the ``ik_smart`` analyzer of the "dcard" index.

    The text is sent to the cluster's analyze API through the module-level
    ``es`` client, so every call is a network round trip.

    Args:
        text: The string to split into tokens.

    Returns:
        The ``token`` value of every entry under ``tokens`` in the analyze
        response, as extracted by glom.
    """
    analysis = es.indices.analyze(index="dcard", analyzer="ik_smart", text=text)
    return glom(analysis.body, "tokens.*.token")


# Bag-of-words vectorizer that delegates tokenization to `tokenize_zh`.
vectorizer = CountVectorizer(tokenizer=tokenize_zh)

In [30]:
# Inputs for topic modelling, all taken from the scanned posts: raw text,
# the embedding vectors stored alongside it, and the post dates.
docs = tmp_df["context"].tolist()
embeddings = np.array(tmp_df["context_vector"].tolist())
timestamps = tmp_df["date"].tolist()


topic_model = BERTopic(
    language="chinese",
    vectorizer_model=vectorizer,
    calculate_probabilities=True,
    # A topic must cover about 5% of the corpus. The value is clamped to 2
    # because `round(len(docs) * 0.05)` is 0 or 1 for small result sets, which
    # is not a usable minimum cluster size.
    min_topic_size=max(2, round(len(docs) * 0.05)),
)
# The stored vectors are passed in together with the documents.
topics, probs = topic_model.fit_transform(docs, embeddings)

In [44]:
# Topic representation per time bin: the post dates are parsed with the given
# format and grouped into 20 bins.
topics_over_time = topic_model.topics_over_time(
    docs,
    timestamps,
    nr_bins=20,
    datetime_format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# Display the per-bin topic table computed above.
topics_over_time

In [46]:
# Plot the topics over time, limited to the top 20 topics.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [None]:
# topic_model.visualize_documents(docs, embeddings=embeddings, hide_document_hover=True, hide_annotations=True)
# topic_model.visualize_topics()

In [None]:
# new_topics = topic_model.reduce_outliers(
#     documents=docs,
#     topics=topics,
#     embeddings=embeddings,
#     strategy="embeddings",
# )
# topic_model.update_topics(docs, topics=new_topics)

In [None]:
# Per-document information from the fitted topic model.
# NOTE(review): `df` is reassigned to an unrelated frame in a later cell.
df = topic_model.get_document_info(docs)

In [None]:
# Display the document-info frame.
df

In [None]:
# Candidate glom paths into a search response body (`hits.hits.*`), listed in
# priority order. This rebinds the `coalesce_of_path` defined in an earlier
# cell, whose paths start at `*._source` instead.
coalesce_of_path = Coalesce(
    "hits.hits.*._source.context_aste",
    "hits.hits.*._source.title_aste",
    "hits.hits.*._source.comments.*.content_aste",
)
# glom(tmp.body, coalesce_of_path)
# glom(tmp.body, "hits.hits.*.fields.context_aste.*")
# NOTE(review): `tmp.body` requires `tmp` to be an `es.search` response, but
# when the notebook is run top to bottom `tmp` is the `helpers.scan` iterator
# at this point — confirm which cell is meant to produce `tmp` here.
# NOTE(review): this overwrites the `df` returned by `get_document_info`.
df = pd.DataFrame.from_dict(list(chain.from_iterable(glom(tmp.body, coalesce_of_path))))

In [None]:
# (value, count) pairs for column "a" of `df`.
# NOTE(review): presumably "a" is the aspect term of each ASTE record — confirm.
c_item = Counter(df["a"].to_list()).items()

In [None]:
# Display the (value, count) pairs.
c_item

In [None]:
from pyecharts.charts import WordCloud
from pyecharts import options as opts
from streamlit_echarts import st_pyecharts
import streamlit as st

In [None]:
# Word cloud of the counted terms. The chart id is fixed to "ws" because the
# injected JavaScript below refers to the chart as `chart_ws`.
wc = WordCloud(opts.InitOpts(chart_id="ws"))
wc.add(series_name="A酸", data_pair=c_item)
# Log the clicked word's event payload to the browser console.
wc.add_js_funcs(
    """
    chart_ws.on('click', function (params) { console.log(params); })
    """
)

In [None]:
import json

In [None]:
# Persist the chart options. The encoding is set explicitly so the file does
# not depend on the platform's default locale encoding.
with open("wc.json", "w", encoding="utf-8") as f:
    f.write(wc.dump_options())

In [None]:
# Parse the dumped chart options back into Python objects.
# `JSONDecoder.decode` is an instance method; calling it on the class passed
# the JSON text as `self` and raised a TypeError. `json.loads` does the parse.
json.loads(wc.dump_options())

In [None]:
# Display the chart options as produced by `dump_options_with_quotes`.
wc.dump_options_with_quotes()

In [None]:
# Render the word cloud inline in the notebook.
wc.render_notebook()

In [None]:
# Fetch only the inner-hit `_source` documents (plus the total hit count).
# The raw response gets its own name instead of being overwritten by the
# parsed result.
# NOTE(review): `body` as defined earlier in this notebook has no inner_hits
# clause — confirm which query is meant to be sent here.
aop_resp = es.search(
    index="dcard",
    body=body,
    filter_path=["hits.hits.inner_hits.*.hits.hits._source.*", "hits.total.value"],
)
# Flatten the per-hit record lists into one table. The following cells index
# `tmp` by column (`tmp["t"]`, `tmp.o[...]`), so it has to be a DataFrame and
# not the plain list that was produced before.
tmp = pd.DataFrame.from_dict(
    list(
        chain.from_iterable(
            glom(aop_resp.body, "hits.hits.**.hits.hits.*._source")
        )
    )
)

In [None]:
# Number of records per value of column "t".
tag_counter = Counter(tmp["t"])

# Counts of column "o" restricted to each of the three "t" values.
pos_o_counter = Counter(tmp.loc[tmp["t"] == "POS", "o"])
neg_o_counter = Counter(tmp.loc[tmp["t"] == "NEG", "o"])
nat_o_counter = Counter(tmp.loc[tmp["t"] == "NAT", "o"])

In [None]:
# Display the (tag, count) pairs.
tag_counter.items()

In [None]:
def aop_df_2_data(aop_df):
    """Convert a frame with columns "t" and "o" into two-level name/value data.

    Args:
        aop_df: DataFrame with a "t" column (outer grouping key) and an "o"
            column (values counted inside each group).

    Returns:
        A list with one dict per distinct "t" value, in order of first
        appearance: ``{"value": <row count>, "name": <t>, "children": [...]}``.
        ``children`` holds one ``{"value": <count>, "name": <o>}`` dict per
        distinct "o" value among the rows of that group. An empty frame
        (including one without any columns) yields an empty list.
    """
    # A frame built from an empty result set has no "t"/"o" columns at all;
    # return early instead of failing with a KeyError on the column lookup.
    if aop_df.empty:
        return []

    tags = aop_df["t"]
    tag_counter = Counter(tags)
    return [
        {
            "value": tag_count,
            "name": tag,
            "children": [
                {"value": word_count, "name": word}
                for word, word_count in Counter(aop_df.loc[tags == tag, "o"]).items()
            ],
        }
        for tag, tag_count in tag_counter.items()
    ]

In [None]:
# Build and display the nested name/value data for `tmp`.
aop_df_2_data(tmp)