In [1]:
import datetime
import os
from itertools import chain, combinations
from pprint import pprint as print

import pandas as pd
import yaml
from elasticsearch import Elasticsearch, helpers
from glom import glom, Coalesce

In [2]:
# Load the notebook settings (index name, platforms and their boards).
with open("config.yaml", "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

# Connection settings are taken from the environment when present, so real
# credentials never have to be committed with the notebook. The fallbacks
# reproduce the previous hard-coded local-development values.
# NOTE(review): the fallback password is still in source; drop it once
# ES_PASSWORD is exported wherever this notebook runs.
es = Elasticsearch(
    os.environ.get("ES_HOST", "http://127.0.0.1:9200"),
    verify_certs=False,
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "123456"),
    ),
)

# Name of the index every query in this notebook runs against.
index = config["index"]

In [3]:
es.info()

ObjectApiResponse({'name': 'c4a7017a324d', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'CW97Ie5ZSPKXZafOAxIPeQ', 'version': {'number': '8.15.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f97532e680b555c3a05e73a74c28afb666923018', 'build_date': '2024-10-09T22:08:00.328917561Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
def query_from_es(
    query_words: list[str],
    query_range: tuple[datetime.date, datetime.date],
    selection: list[str],
    selection_broad: list[str],
    tern_seatch: bool,
):
    """Fetch every matching document from Elasticsearch.

    Uses the module-level ``es`` client and ``index`` name, and scrolls through
    the full result set with ``helpers.scan``.

    Args:
        query_words: The search terms. Either a list of terms or a single
            whitespace-separated string; both forms are accepted in both
            search modes.
        query_range: ``(start, end)`` dates, both inclusive. They are sent as
            ``yyyy-MM-dd`` strings and interpreted in the ``+08:00`` time zone.
        selection: Values the ``platform`` field must match (any of).
        selection_broad: Values the ``borad`` field must match (any of).
            NOTE(review): ``borad`` is the field name sent to Elasticsearch;
            presumably it matches the index mapping. Confirm before renaming.
        tern_seatch: When true, require *all* terms to be present in
            ``title_token.keyword`` or all in ``context_token.keyword``
            (``terms_set`` query). Otherwise run a full-text ``multi_match``
            over ``title`` and ``context``. The misspelt name is kept so
            existing callers keep working.

    Returns:
        A list of the raw hit dicts produced by ``helpers.scan``. Only the
        ``date`` field and the ``*_aste`` payloads are requested in
        ``_source``.
    """
    # Normalise the query to a list of terms. Previously the term-search branch
    # only worked with a string (it called ``.split()``) while the full-text
    # branch only worked with a list (a string was joined character by
    # character), so the two modes disagreed about the input type.
    if isinstance(query_words, str):
        terms = query_words.split()
    else:
        terms = list(query_words)

    if tern_seatch:
        # ``params.num_terms`` makes the required match count equal to the
        # number of supplied terms, i.e. every term must be present.
        text_clauses = [
            {
                "terms_set": {
                    field: {
                        "terms": terms,
                        "minimum_should_match_script": {"source": "params.num_terms"},
                    }
                }
            }
            for field in ("title_token.keyword", "context_token.keyword")
        ]
    else:
        text_clauses = [
            {
                "multi_match": {
                    "query": " ".join(terms),
                    "fields": ["title", "context"],
                    "minimum_should_match": "3<75%",
                }
            }
        ]

    body = {
        "_source": [
            "title_aste.*",
            "context_aste.*",
            "date",
            "comments.content_aste.*",
        ],
        "query": {
            "bool": {
                # Everything runs in filter context: no scoring is needed.
                "filter": [
                    {"bool": {"should": text_clauses}},
                    {"terms": {"platform": selection}},
                    {"terms": {"borad": selection_broad}},
                    {
                        "range": {
                            "date": {
                                "gte": query_range[0].isoformat(),
                                "lte": query_range[1].isoformat(),
                                "time_zone": "+08:00",
                                "format": "yyyy-MM-dd",
                            },
                        }
                    },
                ]
            }
        },
    }

    return list(helpers.scan(es, query=body, index=index, size=5000))

In [5]:
# Query window: from 30 days ago up to (and including) today.
lookback = datetime.timedelta(days=30)
today = datetime.date.today()
last_month = today - lookback

In [6]:
# Every configured platform name, plus the boards of all platforms flattened
# into one list (in configuration order).
platform = list(config["platforms"])
boards = [
    board
    for settings in config["platforms"].values()
    for board in settings["boards"]
]

In [7]:
# Fetch the last 30 days of hits for "台灣" across every configured platform and
# board, using term search (last argument True).
# NOTE(review): `tmp` is reassigned to unrelated objects further down the
# notebook, so the flattening loop must run before those cells.
tmp = query_from_es("台灣", (last_month, today), platform, boards, True)

In [8]:
# Where to look for the *_aste payload inside a single hit. Coalesce tries the
# specs in order and returns the first one that resolves.
# NOTE(review): because only the first match is returned, a hit that has
# "context_aste" never contributes its title or comment payloads. Confirm this
# is intended rather than collecting all three fields.
# NOTE(review): the "*" segment in the last path presumably relies on glom's
# star-path support; verify against the installed glom version.
coalesce_of_path = Coalesce(
    "_source.context_aste",
    "_source.title_aste",
    "_source.comments.*.content_aste",
)

In [9]:
len(tmp)

11035

In [10]:
# Flatten the hits into one record per a/o/p entry (presumably aspect, opinion
# and polarity score; confirm against the indexing pipeline).
list_dict = []
# Hits that could not be flattened, kept as (id, error) pairs so the data loss
# is visible instead of being silently swallowed.
skipped_hits = []
for hit in tmp:
    try:
        triples = glom(hit, coalesce_of_path)
        # The comments path yields one list per comment; flatten it one level.
        # An empty payload simply contributes no records.
        if triples and isinstance(triples[0], list):
            triples = chain.from_iterable(triples)
        for triple in triples:
            list_dict.append(
                {
                    "id": hit["_id"],
                    "date": hit["_source"]["date"],
                    "a": triple["a"],
                    "o": triple["o"],
                    "p": triple["p"],
                }
            )
    except Exception as exc:
        # Best-effort extraction: a hit without any usable payload (or with a
        # malformed entry) is skipped, but recorded for later inspection.
        hit_id = hit.get("_id") if isinstance(hit, dict) else None
        skipped_hits.append((hit_id, repr(exc)))

In [152]:
len(list_dict)

334969

In [11]:
# Build the record table and convert the "date" column to datetime64[ns].
aop_df = pd.DataFrame(list_dict).astype({"date": "datetime64[ns]"})

In [12]:
# Label each row from its "p" score: p >= 6 -> "POS", 4 < p < 6 -> "NAT",
# everything else (including missing scores) -> "NEG".
aop_df["t"] = (
    pd.Series("NEG", index=aop_df.index, dtype=object)
    .mask(aop_df["p"] > 4, "NAT")
    .mask(aop_df["p"] >= 6, "POS")
)

In [None]:
aop_df

In [18]:
list(aop_df["a"].value_counts().to_dict().items())

[('我', 8469),
 ('人', 6813),
 ('你', 5661),
 ('台灣', 5211),
 ('他', 4554),
 ('川普', 2766),
 ('政府', 1955),
 ('美國', 1913),
 ('我們', 1882),
 ('自己', 1795),
 ('大家', 1746),
 ('這', 1526),
 ('他們', 1445),
 ('中國', 1415),
 ('她', 1254),
 ('民進黨', 1082),
 ('柯文哲', 985),
 ('誰', 982),
 ('台灣人', 982),
 ('者', 919),
 ('來源', 918),
 ('標題', 915),
 ('公司', 806),
 ('記者', 786),
 ('國家', 778),
 ('民眾', 749),
 ('黃國昌', 745),
 ('賴清德', 736),
 ('總統', 720),
 ('內文', 663),
 ('中共', 662),
 ('民眾黨', 655),
 ('內容', 646),
 ('媒體', 593),
 ('颱風', 566),
 ('時間', 558),
 ('時候', 556),
 ('人民', 550),
 ('台積電', 547),
 ('們', 542),
 ('大法官', 523),
 ('連結', 523),
 ('日本', 516),
 ('立委', 491),
 ('問題', 490),
 ('你們', 489),
 ('國民黨', 473),
 ('新聞', 455),
 ('網友', 435),
 ('人士', 435),
 ('法院', 422),
 ('備註', 416),
 ('劉德華', 398),
 ('官員', 388),
 ('結果', 383),
 ('錢', 382),
 ('烏克蘭', 378),
 ('法官', 368),
 ('郭智輝', 356),
 ('人員', 353),
 ('關係', 353),
 ('經濟', 348),
 ('其中', 346),
 ('國', 346),
 ('青鳥', 345),
 ('立法院', 344),
 ('北檢', 338),
 ('政策', 335),
 ('東西', 333),
 ('世界', 330),
 (

In [293]:
aop_df.set_index("date").loc["2024/10/25"].reset_index()

Unnamed: 0,date,id,a,o,p,t
0,2024-10-25 11:52:49.141,jw1bDJMBx63wKxznie5G,波波,知,4.5054,NAT
1,2024-10-25 11:52:49.141,jw1bDJMBx63wKxznie5G,規則,打通,3.9600,NEG
2,2024-10-25 11:52:49.141,jw1bDJMBx63wKxznie5G,女兒,需要,5.4476,NAT
3,2024-10-25 11:52:49.141,jw1bDJMBx63wKxznie5G,資格,不符,3.8000,NEG
4,2024-10-25 11:52:49.141,jw1bDJMBx63wKxznie5G,對象,包含,5.1192,NAT
...,...,...,...,...,...,...
15992,2024-10-25 08:33:44.000,T06oEJMBuFMZnsg338W5,影響力,大,4.7196,NAT
15993,2024-10-25 08:33:44.000,T06oEJMBuFMZnsg338W5,地位,高,4.9450,NAT
15994,2024-10-25 14:05:56.829,fw3_DpMBx63wKxznn_6D,她,住在,5.0400,NAT
15995,2024-10-25 14:05:56.829,fw3_DpMBx63wKxznn_6D,人,住在,5.0400,NAT


In [382]:
aop_df.groupby("a")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EDE8194B50>

In [383]:
# One {"source", "target", "value"} record per distinct "a" value, where
# "value" is the sum of that group's "p" scores.
[
    {"source": "情感", "target": aspect, "value": group["p"].sum()}
    for aspect, group in aop_df.groupby("a")
]

[{'source': '情感', 'target': '', 'value': np.float64(202.7584)},
 {'source': '情感',
  'target': '\n\nwww.facebook.com',
  'value': np.float64(4.965000000000001)},
 {'source': '情感', 'target': '\n#', 'value': np.float64(11.84)},
 {'source': '情感', 'target': '\n10%', 'value': np.float64(7.2)},
 {'source': '情感', 'target': '\n10/25', 'value': np.float64(5.1412)},
 {'source': '情感', 'target': '\n15%\n', 'value': np.float64(7.2)},
 {'source': '情感', 'target': '\n26%', 'value': np.float64(4.3)},
 {'source': '情感', 'target': '\n29%', 'value': np.float64(6.0)},
 {'source': '情感', 'target': '\n38%', 'value': np.float64(7.2)},
 {'source': '情感', 'target': '\n39%\n', 'value': np.float64(5.8)},
 {'source': '情感', 'target': '\n4%', 'value': np.float64(4.4)},
 {'source': '情感', 'target': '\n40%', 'value': np.float64(3.2)},
 {'source': '情感', 'target': '\n40%\n', 'value': np.float64(3.4)},
 {'source': '情感', 'target': '\n48%', 'value': np.float64(7.2)},
 {'source': '情感', 'target': '\n62%\n', 'value': np.float64(7.

In [339]:
aop_df.groupby(["t", "o"]).size().sort_values(ascending=False).head(10)

t    o 
NAT  有     9170
     說     9121
     表示    4855
     報導    2670
     看     2667
     知道    2564
     想     2530
     覺得    2429
POS  好     2406
NAT  指出    2389
dtype: int64

In [370]:
# For each sentiment label, keep only the rows whose "o" value is among that
# label's ten most frequent "o" values. One frame per label, in group order.
select_o_aop_list = [
    group[group["o"].isin(group["o"].value_counts().head(10).index)]
    for _, group in aop_df.groupby(["t"])
]

In [385]:
pd.concat(select_o_aop_list)

Unnamed: 0,id,date,a,o,p,t
28,0g1bDJMBx63wKxznjfFO,2024-11-07 12:29:07.678,習俗,知道,5.4000,NAT
29,0g1bDJMBx63wKxznjfFO,2024-11-07 12:29:07.678,我,知道,5.4000,NAT
31,0g1bDJMBx63wKxznjfFO,2024-11-07 12:29:07.678,人,說,5.6034,NAT
34,0g1bDJMBx63wKxznjfFO,2024-11-07 12:29:07.678,我,知道,5.4000,NAT
41,0g1bDJMBx63wKxznjfFO,2024-11-07 12:29:07.678,口福,知道,5.4000,NAT
...,...,...,...,...,...,...
334607,7g2dDpMBx63wKxznMfRx,2024-10-27 11:49:53.383,大家,希望,7.0000,POS
334608,7g2dDpMBx63wKxznMfRx,2024-10-27 11:49:53.383,台灣,好,6.8000,POS
334671,8Q2dDpMBx63wKxznMfRx,2024-10-18 15:03:07.631,投票,喜歡,7.0000,POS
334708,4w2dDpMBx63wKxznMfRx,2024-10-18 05:37:14.907,心態,好,6.8000,POS


In [384]:
select_o_aop_list

[                          id                    date    a   o       p    t
 28      0g1bDJMBx63wKxznjfFO 2024-11-07 12:29:07.678   習俗  知道  5.4000  NAT
 29      0g1bDJMBx63wKxznjfFO 2024-11-07 12:29:07.678    我  知道  5.4000  NAT
 31      0g1bDJMBx63wKxznjfFO 2024-11-07 12:29:07.678    人   說  5.6034  NAT
 34      0g1bDJMBx63wKxznjfFO 2024-11-07 12:29:07.678    我  知道  5.4000  NAT
 41      0g1bDJMBx63wKxznjfFO 2024-11-07 12:29:07.678   口福  知道  5.4000  NAT
 ...                      ...                     ...  ...  ..     ...  ...
 334856  3w2dDpMBx63wKxznMfRx 2024-10-22 12:06:06.752  張啟楷   說  5.6034  NAT
 334862  3w2dDpMBx63wKxznMfRx 2024-10-22 12:06:06.752    他   說  5.6034  NAT
 334933  3Q2dDpMBx63wKxznMfRx 2024-10-19 22:59:38.912  蔡英文   說  5.6034  NAT
 334945  3Q2dDpMBx63wKxznMfRx 2024-10-19 22:59:38.912  蔡英文  表示  4.1894  NAT
 334958  2w2dDpMBx63wKxznMfRx 2024-10-29 03:00:21.710   獻金   有  4.3538  NAT
 
 [40726 rows x 6 columns],
                           id                    date    a 

In [389]:
# The first pair that combinations() would yield is simply the first two frames.
l_df, r_df = select_o_aop_list[0], select_o_aop_list[1]

In [391]:
# Pair up rows of the two frames that share the same document id.
# NOTE(review): "id" repeats on both sides, so this is a many-to-many join:
# every left row is combined with every right row of the same id.
merge_df = pd.merge(l_df, r_df, on=["id"], suffixes=("_l", "_r"))

In [408]:
merge_df["t_l"].unique().tolist()

['NAT']

In [409]:
# Row count for every (o_l, o_r) combination in the merged frame.
pair_sizes = merge_df.groupby(["o_l", "o_r"]).size()
for pair_and_count in pair_sizes.items():
    print(pair_and_count)

(('報導', '出現'), 402)
(('報導', '叫'), 153)
(('報導', '差'), 58)
(('報導', '強調'), 616)
(('報導', '打'), 121)
(('報導', '找'), 159)
(('報導', '署名'), 1320)
(('報導', '變成'), 129)
(('報導', '質疑'), 323)
(('報導', '達'), 387)
(('想', '出現'), 245)
(('想', '叫'), 414)
(('想', '差'), 258)
(('想', '強調'), 266)
(('想', '打'), 621)
(('想', '找'), 344)
(('想', '署名'), 260)
(('想', '變成'), 301)
(('想', '質疑'), 155)
(('想', '達'), 139)
(('指出', '出現'), 362)
(('指出', '叫'), 129)
(('指出', '差'), 92)
(('指出', '強調'), 808)
(('指出', '打'), 175)
(('指出', '找'), 183)
(('指出', '署名'), 1373)
(('指出', '變成'), 155)
(('指出', '質疑'), 530)
(('指出', '達'), 434)
(('有', '出現'), 1606)
(('有', '叫'), 1405)
(('有', '差'), 835)
(('有', '強調'), 1576)
(('有', '打'), 1611)
(('有', '找'), 1107)
(('有', '署名'), 1745)
(('有', '變成'), 895)
(('有', '質疑'), 1023)
(('有', '達'), 1153)
(('看', '出現'), 257)
(('看', '叫'), 327)
(('看', '差'), 291)
(('看', '強調'), 181)
(('看', '打'), 504)
(('看', '找'), 281)
(('看', '署名'), 226)
(('看', '變成'), 259)
(('看', '質疑'), 173)
(('看', '達'), 142)
(('知道', '出現'), 331)
(('知道', '叫'), 524)
(('知道', 

In [380]:
# Join every pair of per-label frames on document id, keeping only the
# id / opinion / label columns, and show each joined frame.
for l_df, r_df in combinations(select_o_aop_list, 2):
    l_df, r_df = (frame.loc[:, ["id", "o", "t"]] for frame in (l_df, r_df))
    print(l_df.merge(r_df, on=["id"], suffixes=("_l", "_r")))

                         id o_l  t_l o_r  t_r
0      9A1bDJMBx63wKxznkPM6   想  NAT   差  NEG
1      9A1bDJMBx63wKxznkPM6   有  NAT   差  NEG
2      9A1bDJMBx63wKxznkPM6   有  NAT   差  NEG
3      9A1bDJMBx63wKxznkPM6   說  NAT   差  NEG
4      Bg3mBpMBx63wKxznz9G6  表示  NAT  出現  NEG
...                     ...  ..  ...  ..  ...
57807  3w2dDpMBx63wKxznMfRx   說  NAT  出現  NEG
57808  3w2dDpMBx63wKxznMfRx   說  NAT   差  NEG
57809  3w2dDpMBx63wKxznMfRx   說  NAT   叫  NEG
57810  3w2dDpMBx63wKxznMfRx   說  NAT  出現  NEG
57811  3w2dDpMBx63wKxznMfRx   說  NAT   差  NEG

[57812 rows x 5 columns]
                         id o_l  t_l o_r  t_r
0      0g1bDJMBx63wKxznjfFO  知道  NAT   好  POS
1      0g1bDJMBx63wKxznjfFO  知道  NAT   好  POS
2      0g1bDJMBx63wKxznjfFO   說  NAT   好  POS
3      0g1bDJMBx63wKxznjfFO  知道  NAT   好  POS
4      0g1bDJMBx63wKxznjfFO  知道  NAT   好  POS
...                     ...  ..  ...  ..  ...
57643  3w2dDpMBx63wKxznMfRx   說  NAT  通過  POS
57644  3w2dDpMBx63wKxznMfRx  指出  NAT  通過  POS
57645  3

In [277]:
group_df = aop_df.groupby("t")["o"].value_counts().to_frame()

In [278]:
group_df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
t,o,Unnamed: 2_level_1
NAT,有,9170
NAT,說,9121
NAT,表示,4855
NAT,報導,2670
NAT,看,2667
...,...,...
POS,麥,1
POS,點亮,1
POS,點唱,1
POS,鼎盛,1


In [279]:
tmp = list(group_df.groupby(level=0))[0][1]

In [280]:
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,count
t,o,Unnamed: 2_level_1
NAT,有,9170
NAT,說,9121
NAT,表示,4855
NAT,報導,2670
NAT,看,2667
NAT,...,...
NAT,齊刷刷,1
NAT,齊發,1
NAT,齊開,1
NAT,龜縮成,1


In [234]:
tmp.loc["NAT"].iloc[10:]["count"].sum()

np.int64(171765)

In [238]:
tmp.loc["NAT"].iloc[0:10]

Unnamed: 0_level_0,count
o,Unnamed: 1_level_1
有,9170
說,9121
表示,4855
報導,2670
看,2667
知道,2564
想,2530
覺得,2429
指出,2389
認為,2331


In [281]:
o_df = tmp.loc["NAT"]

In [254]:
o_df[5:].sum()

count    184008
dtype: int64

In [282]:
o_df

Unnamed: 0_level_0,count
o,Unnamed: 1_level_1
有,9170
說,9121
表示,4855
報導,2670
看,2667
...,...
齊刷刷,1
齊發,1
齊開,1
龜縮成,1


In [283]:
# First five rows of o_df plus a single "others" row holding the summed count
# of all remaining rows.
pd.concat(
    [
        o_df.iloc[:5],
        pd.DataFrame({"count": [o_df["count"].iloc[5:].sum()]}, index=["others"]),
    ]
)

Unnamed: 0,count
有,9170
說,9121
表示,4855
報導,2670
看,2667
others,184008


In [273]:
o_df.query("count > 0")

Unnamed: 0,count
有,9170
說,9121
表示,4855
報導,2670
看,2667
others,184008


In [271]:
list(o_df.itertuples())

[Pandas(Index='有', count=9170),
 Pandas(Index='說', count=9121),
 Pandas(Index='表示', count=4855),
 Pandas(Index='報導', count=2670),
 Pandas(Index='看', count=2667),
 Pandas(Index='others', count=184008)]

In [267]:
int(list(o_df.iterrows())[0][1].iloc[0])

9170

In [212]:
[p for p, o in group_df.groupby(level=0)]

['NAT', 'NEG', 'POS']

In [214]:
group_df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
t,o,Unnamed: 2_level_1
NAT,有,9170
NAT,說,9121
NAT,表示,4855
NAT,報導,2670
NAT,看,2667
...,...,...
POS,麥,1
POS,點亮,1
POS,點唱,1
POS,鼎盛,1


In [209]:
list(group_df)

[(('NAT', '有'),
  count    9170
  Name: (NAT, 有), dtype: int64),
 (('NAT', '說'),
  count    9121
  Name: (NAT, 說), dtype: int64),
 (('NAT', '表示'),
  count    4855
  Name: (NAT, 表示), dtype: int64),
 (('NAT', '報導'),
  count    2670
  Name: (NAT, 報導), dtype: int64),
 (('NAT', '看'),
  count    2667
  Name: (NAT, 看), dtype: int64),
 (('NAT', '知道'),
  count    2564
  Name: (NAT, 知道), dtype: int64),
 (('NAT', '想'),
  count    2530
  Name: (NAT, 想), dtype: int64),
 (('NAT', '覺得'),
  count    2429
  Name: (NAT, 覺得), dtype: int64),
 (('NAT', '指出'),
  count    2389
  Name: (NAT, 指出), dtype: int64),
 (('NAT', '認為'),
  count    2331
  Name: (NAT, 認為), dtype: int64),
 (('NAT', '沒有'),
  count    1729
  Name: (NAT, 沒有), dtype: int64),
 (('NAT', '大'),
  count    1719
  Name: (NAT, 大), dtype: int64),
 (('NAT', '做'),
  count    1686
  Name: (NAT, 做), dtype: int64),
 (('NAT', '讓'),
  count    1686
  Name: (NAT, 讓), dtype: int64),
 (('NAT', '高'),
  count    1660
  Name: (NAT, 高), dtype: int64),
 (('NAT', '

In [200]:
aop_df.groupby("t").get_group("POS")["o"].value_counts()

o
好     2406
支持     961
喜歡     792
希望     712
愛      616
      ... 
豐收       1
擔綱       1
激化       1
歌訟       1
嘗到       1
Name: count, Length: 2579, dtype: int64

In [173]:
aop_df[(aop_df["date"] < "2024-11-02") & (aop_df["date"] >= "2024-11-01")]

Unnamed: 0,id,date,a,o,p,t
4542,eg4AD5MBx63wKxznQgDy,2024-11-01 05:38:27.706,人,住,4.2380,0
4543,eg4AD5MBx63wKxznQgDy,2024-11-01 05:38:27.706,我,用,4.6334,0
4544,eg4AD5MBx63wKxznQgDy,2024-11-01 05:38:27.706,網,差,3.4000,-1
4545,eg4AD5MBx63wKxznQgDy,2024-11-01 05:38:27.706,信,差,3.4000,-1
4546,eg4AD5MBx63wKxznQgDy,2024-11-01 05:38:27.706,本人,懂,3.9428,-1
...,...,...,...,...,...,...
334549,-Q2dDpMBx63wKxznMfRx,2024-11-01 08:27:26.006,影片,有,4.3538,0
334550,-Q2dDpMBx63wKxznMfRx,2024-11-01 08:27:26.006,中選會,想,5.2000,0
334551,-Q2dDpMBx63wKxznMfRx,2024-11-01 08:27:26.006,事情,壓下來,3.2600,-1
334552,-Q2dDpMBx63wKxznMfRx,2024-11-01 08:27:26.006,票,開完,5.3000,0


In [None]:
id_df = aop_df.groupby(pd.Grouper(key="date", freq="D"))["id"].agg(['unique'])

In [None]:
type(id_df)

In [None]:
list(enumerate(pd.concat([id_df, id_df], axis=1).loc["2024-10-26"]))

In [None]:
list(pd.concat([id_df, id_df], axis=1).loc["2024-10-26"].items())

In [None]:
str(datetime.date.fromtimestamp(1730310459813.0842/1000))

In [None]:
# NOTE(review): aop_df_date_p is only assigned in a later cell (the daily mean
# of "p"), so on a fresh kernel this line raises NameError unless that cell has
# been run first.
aop_df_date_p.loc[datetime.date.fromtimestamp(1730310459813.0842/1000)]

In [None]:
[i.isoformat()+"Z" for i in aop_df_date_p.index.to_list()]

In [None]:
id_df.loc["2024-10-26":"2024-10-27"].T

In [None]:
tmp = list(id_df.loc["2024-10-26":"2024-10-27"].T.iterrows())

In [None]:
import numpy as np

In [None]:
tmp[0][1].to_list()

In [None]:
np.concatenate(tmp[0][1].to_list()).tolist()

In [None]:
tmp = [(i, j) for i, j in enumerate(id_df.loc["2024-10-26":"2024-10-27"].iterrows())]

In [None]:
id_df

In [None]:
tmp[0][1][1].tolist()

In [None]:
id_df.loc["2024-10-26"].apply(lambda x: x[:5].tolist())

In [None]:
aop_df_date_p = aop_df.groupby(pd.Grouper(key="date", freq="D"))["p"].mean()

In [None]:
# "t" holds the string labels POS / NAT / NEG, which cannot be averaged
# directly. Map them to signed scores (+1 / 0 / -1) first; values that are
# already numeric pass through unchanged. aop_df itself is not modified.
T_SCORE = {"POS": 1, "NAT": 0, "NEG": -1}
aop_df_date_t = (
    aop_df.assign(
        t=pd.to_numeric(aop_df["t"].map(lambda label: T_SCORE.get(label, label)))
    )
    .groupby(pd.Grouper(key="date", freq="D"))["t"]
    .mean()
)

In [None]:
pd.DataFrame([aop_df_date_p, aop_df_date_p]).T.columns