In [14]:
import logging
import os
import time

import eland as ed
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from glom import glom

# Connection settings. Every value can be overridden through an environment
# variable; the fallbacks are the values that used to be hard-coded here, so
# the notebook behaves exactly as before when nothing is set.
# NOTE(review): the fallback password still lives in the source. Drop the
# default once ES_PASSWORD is provided by the environment.
# with open("config.yaml") as f:
#     config = yaml.safe_load(f)
idx_name = os.environ.get("ES_INDEX", "docs")
es = Elasticsearch(
    os.environ.get("ES_URL", "http://host.docker.internal:9200"),
    verify_certs=False,
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "123456"),
    ),
)

In [15]:
def query_vec(query_word):
    """Fetch every document matching ``query_word`` as a DataFrame.

    A ``multi_match`` query is run against the ``title`` and ``context``
    fields (``minimum_should_match`` of 50%) and all hits are pulled with
    ``helpers.scan`` from the index named by ``idx_name``.

    Args:
        query_word: Text to search for.

    Returns:
        DataFrame built from the hits' ``_source`` (title, title_vector,
        context, context_vector, date), restricted to rows where both
        ``context`` and ``context_vector`` are non-null.

    Raises:
        AssertionError: If the search returns no hits.
    """
    wanted_fields = ["title", "title_vector", "context", "context_vector", "date"]
    match_clause = {
        "query": query_word,
        "fields": ["title", "context"],
        "minimum_should_match": "50%",
    }
    search_body = {
        "_source": wanted_fields,
        "query": {"multi_match": match_clause},
    }

    # Materialise the scan generator so the hit count can be checked.
    hits = list(helpers.scan(es, query=search_body, index=idx_name))
    assert len(hits) != 0

    sources = glom(hits, "*._source")
    frame = pd.DataFrame.from_dict(sources)

    has_context = frame["context"].notnull()
    has_context_vector = frame["context_vector"].notnull()
    return frame[has_context & has_context_vector]

In [16]:
# Try the search helper with the keyword "台灣" (Taiwan). As the last expression
# of the cell, the returned DataFrame is rendered as the cell output.
query_vec("台灣")

Unnamed: 0,date,context,title_vector,context_vector,title
0,2024-11-06 13:31:51.000,是這樣定義的嗎？ 那我們用功耗來對照看看 https://i.imgur.com/0QjnV...,"[0.375591516494751, -0.8211070895195007, -0.30...","[-0.22514572739601135, -0.720793604850769, -0....",Re: [情報] 極客灣：英特爾酷睿Ultra 200S評測：無
1,2024-11-05 15:46:46.000,已買/未買/已付訂金（元）： 預算/用途：預算15k / 用途文書、上網、看yt CPU (...,"[-0.22785982489585876, -1.2610244750976562, -0...","[0.029860083013772964, -1.5114549398422241, -0...",[菜單] 15k文書影音機
2,2024-11-06 01:10:07.000,已買/未買/已付訂金（元）：未買 預算/用途：50K左右/全白/玩魔獸世界跟魔物獵人荒野 C...,"[-0.4163767695426941, -0.5726318359375, -0.701...","[-0.38636189699172974, -1.1678825616836548, -0...",[菜單] 55K全白遊戲機
3,2024-11-05 13:46:07.000,英特爾酷睿Ultra 200S評測：無藥可救 https://youtu.be/TFu2io...,"[0.5771134495735168, -0.7114483714103699, -0.0...","[0.12367042899131775, -1.107509970664978, -0.1...",[情報] 極客灣：英特爾酷睿Ultra 200S評測：無
4,2024-11-05 19:57:22.000,英特爾創下史上最大單季虧損 英特爾在Q3財報中認列159億美元的減值損失，以及28億美元的重...,"[0.47296786308288574, -0.36625590920448303, -0...","[0.6585003733634949, -0.947905421257019, -0.91...",[情報] 英特爾創下史上最大單季虧損
...,...,...,...,...,...
5399,2024-10-24 09:31:09.418,今年9月訂了三星65吋電視，10月安裝後開機沒多久螢幕顏色整個走鐘，打去客服搞了半天才安排工...,"[-0.7853200435638428, -0.8398605585098267, -0....","[-0.4827658236026764, -0.8651880025863647, -1....",三星電視超雷售服
5400,2024-11-01 02:28:06.690,這個東西是不是無腦買就好 34吋 21：9 2k 1800R 應該很適合賽車遊戲 什麼Mom...,"[-0.1196647435426712, -0.9279241561889648, -0....","[-0.765488862991333, -1.190038800239563, -0.35...",1萬元的OLED曲面
5401,2024-10-20 07:32:26.079,大家好我們是世新大學的學生 目前在準備一場商業競賽 想徵集各位大學生對於電子書平台/閱讀器的...,"[-1.0094527006149292, -1.3332679271697998, -1....","[-0.2539275288581848, -0.7227120995521545, -0....",#徵集問卷
5402,2024-11-08 18:16:01.262,"最近想買電競筆電，預算大約$35000左右，爬完問發現很推 ""ASUS TUF Gaming...","[-0.4873010516166687, -0.7593896985054016, -0....","[-0.5640811920166016, -1.3724745512008667, 0.1...",電競筆電詢問 - 魔物獵人荒野


In [None]:


def infer(text: str) -> list[AOP_dict]:
    """Extract the aspect-sentiment triplets of a single text.

    The text is passed through ``nlp``, the result is wrapped in ``EBS_Dict``
    and the wrapper's ``_.aspect_sentiment_triplets`` attribute is returned.

    Args:
        text: The text to analyse.

    Returns:
        The value of ``_.aspect_sentiment_triplets`` for the processed text.
    """
    parsed = nlp(text)
    wrapped = EBS_Dict(parsed)
    return wrapped._.aspect_sentiment_triplets

def aste_infer(texts: list[str]) -> "list[list[AOP_dict] | None]":
    """Run ``infer`` on every usable text, keeping positions aligned.

    Args:
        texts: Sequence of texts. Entries may also be ``None``, ``""`` or a
            pandas missing-value marker such as ``NaN``.

    Returns:
        A list with one entry per input: the result of ``infer(text)`` for
        non-empty strings and ``None`` for everything else.
    """
    # NaN is truthy, so a bare truthiness test would forward missing values
    # taken from a DataFrame column to ``infer``; require a real string.
    return [
        infer(text) if isinstance(text, str) and text else None
        for text in texts
    ]


def comments_aste_infer(comments: list[dict]) -> list[dict]:
    """Attach aspect-sentiment results to every non-empty comment.

    Args:
        comments: List of comment dicts, each with a ``"content"`` key.
            ``None`` or a float ``NaN`` is accepted and treated as "no
            comments" (presumably how a document without comments arrives
            through ``Series.apply`` - verify against the index mapping).

    Returns:
        A new list of dicts with the keys ``"content"`` and
        ``"content_aste"`` (the result of ``infer`` on the content). Comments
        whose content is empty are dropped.
    """
    # A missing value is not iterable; without this guard one document with
    # no comments raises TypeError and aborts the whole batch.
    if comments is None or isinstance(comments, float):
        return []

    return [
        {
            "content": comment["content"],
            "content_aste": infer(comment["content"]),
        }
        for comment in comments
        if comment["content"]
    ]

In [None]:
# Poll the index for documents whose status_code is "UN_ASTE", run the
# aspect-sentiment extraction on them in batches of at most five, and write
# the results back document by document.
# The loop stops on the first processing or update error. While nothing is
# waiting it sleeps with a growing backoff: 5, 25, 125, then 625 seconds.
backoff_count = 1
while True:
    ed_data = (
        ed.DataFrame(
            es,
            idx_name,
            columns=[
                "status_code",
                "link",
                "title",
                "title_aste",
                "date",
                "context",
                "context_aste",
                "comments",
            ],
        )
        .query("status_code == 'UN_ASTE'")
        .head(5)
    )

    if ed_data.empty:
        logging.info("No data to process")
        time.sleep(5**backoff_count)
        if backoff_count < 4:
            backoff_count += 1
        continue

    # Work arrived: go back to the shortest wait for the next idle period.
    backoff_count = 1
    print("Processing data")

    # Reset before the try so the error handler never reports the ids of a
    # previous batch, and never hits a NameError on the very first batch.
    pd_data = None
    try:
        pd_data = ed.eland_to_pandas(ed_data)
        pd_data["status_code"] = "ASTE_BY_TRANDITIONAL_NLP"
        pd_data["title_aste"] = aste_infer(pd_data["title"].values)
        pd_data["context_aste"] = aste_infer(pd_data["context"].values)
        pd_data["comments"] = pd_data["comments"].apply(comments_aste_infer)
    except Exception as e:
        print("Error in processing data")
        logging.error("Error in processing data")
        logging.exception(e)
        if pd_data is not None:
            logging.error("ID: %s", str(list(pd_data.index)))
        break

    print("Updating data")

    try:
        # Send only the changed fields per document, using the DataFrame
        # index as the document id, and write to the same index that was
        # read from (idx_name) instead of a separate hard-coded name.
        for idx, data in pd_data.iterrows():
            es.update(
                index=idx_name,
                id=idx,
                body={
                    "doc": {
                        "status_code": data["status_code"],
                        "title_aste": data["title_aste"],
                        "context_aste": data["context_aste"],
                        "comments": data["comments"],
                    },
                },
            )
    except Exception as e:
        print("Error in updating data")
        logging.error("Error in updating data")
        logging.exception(e)
        break

    print("Data processed")


In [1]:
import json

In [2]:
# Load the ABSA dataset. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding, which can fail on
# or garble non-ASCII text on systems whose default is not UTF-8.
with open("absa_dataset.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)