# 0. 导入必要的库

In [None]:
import json
import numpy as np
from bertopic import BERTopic
from transformers.pipelines import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pandas as pd
from sklearn.metrics import silhouette_score # 移到这里，因为它在模型配置部分被使用

# 1. 数据导入与预处理

In [10]:
# (1) Input file locations -- change these to the actual paths on your machine.
tokenized_jsonl_path = "/Users/jhx/Documents/Code/黑神话女性数据/0数据/antifeminism_tokenized.jsonl"  # change to the actual path
embeddings_path = "/Users/jhx/Documents/Code/黑神话女性数据/0数据/antifeminism_emb.npy"           # change to the actual path
stopwords_path = "/Users/jhx/Documents/Code/stopword.txt"                                    # change to the actual path

# (2) Load the tokenized documents together with their original text.
#     Each non-blank line of the JSONL file is parsed as one JSON record; its
#     "tokens" list is joined with spaces and its "text" field is kept so the
#     original text can be restored in later outputs.
docs_tokens = []
docs_text = []
with open(tokenized_jsonl_path, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)
        try:
            text = record["text"]
        except KeyError:
            # Same exception type as a plain lookup, but say which line is bad.
            raise KeyError(
                f'line {line_no} of {tokenized_jsonl_path} has no "text" field'
            ) from None
        # Append to both lists together so they always stay index-aligned.
        docs_tokens.append(" ".join(record.get("tokens", [])))
        docs_text.append(text)

print("文本总数:", len(docs_tokens))
if docs_tokens:
    print("预览第一条分词结果:", docs_tokens[0])
    print("预览第一条原始文本:", docs_text[0][:100], "...")

# (3) Load the precomputed embedding matrix.
embeddings = np.load(embeddings_path)
print("加载完成的向量形状:", embeddings.shape)

# The embeddings are used row-by-row alongside docs_tokens later on, so a
# row-count mismatch (e.g. blank JSONL lines were skipped above) must be
# caught here instead of silently pairing documents with the wrong vectors.
if embeddings.shape[0] != len(docs_tokens):
    raise ValueError(
        f"embedding rows ({embeddings.shape[0]}) do not match the number of "
        f"documents ({len(docs_tokens)})"
    )

# (4) Load the stop-word list: one word per line, surrounding whitespace
#     removed and blank lines dropped.
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stop_words_list = [word for word in (raw.strip() for raw in f) if word]


文本总数: 2150
预览第一条分词结果: 黑神话 年度 游戏 舆论 机器 上映 异形 唯一 一位 华裔女性 莫名其妙 抱脸虫 袭击 产下 异形 黄祸论 变种 确实 肯定 喷惨
预览第一条原始文本: 黑神话不可能拿年度游戏的，不管做的多好。这些舆论机器就是用来搞我们的，看看刚上映的异形，唯一一位华裔女性莫名其妙就被抱脸虫袭击产下异形，这不就是黄祸论的新变种吗！有一说一 确实是 肯定会被喷惨 ​​ ...
加载完成的向量形状: (2150, 768)


# 2. 模型配置与训练

In [18]:
from sklearn.metrics import silhouette_score

# (1) Embedding model. The original note marks this as a placeholder that can
#     be replaced as needed; the precomputed `embeddings` array is what gets
#     passed to fit_transform below.
embedding_model = pipeline("feature-extraction", model="bert-base-chinese")

# (2) CountVectorizer used by BERTopic to build the topic-word representation.
vectorizer_model = CountVectorizer(
    stop_words=stop_words_list,
    ngram_range=(1, 1),
    max_df=0.999,
    min_df=5
)

# (3) Parameter grids to explore. Add more dicts to either list to try further
#     UMAP / HDBSCAN combinations; every pair is trained and scored.
umap_params_list = [
    {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0, "metric": "cosine", "random_state": 42},
    # {"n_neighbors": 10, "n_components": 5, "min_dist": 0.1, "metric": "cosine", "random_state": 42},
]
hdbscan_params_list = [
    {"min_cluster_size": 20, "min_samples": 5, "metric": "euclidean"},
    # {"min_cluster_size": 15, "min_samples": 3, "metric": "euclidean"},
]

best_silhouette = -1
best_topic_model = None
best_topics = None
best_probs = None
best_umap_params = None
best_hdbscan_params = None

# Train one model per parameter combination and score it with the silhouette
# coefficient computed on the original embeddings (cosine metric).
for up in umap_params_list:
    for hp in hdbscan_params_list:
        current_umap_model = UMAP(**up)
        current_hdbscan_model = HDBSCAN(**hp)
        current_topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            umap_model=current_umap_model,
            hdbscan_model=current_hdbscan_model,
            top_n_words=10
        )
        current_topics, current_probs = current_topic_model.fit_transform(docs_tokens, embeddings=embeddings)

        # Ignore noise documents (topic id -1) when scoring.
        valid_idx = [i for i, t in enumerate(current_topics) if t != -1]
        valid_labels = [current_topics[i] for i in valid_idx]
        n_clusters = len(set(valid_labels))
        # silhouette_score needs 2 <= n_clusters <= n_samples - 1 and raises
        # ValueError otherwise, so fall back to -1 in the degenerate cases
        # (everything is noise, or a single cluster was found).
        if 1 < n_clusters < len(valid_idx):
            score = silhouette_score(embeddings[valid_idx], valid_labels, metric="cosine")
        else:
            score = -1
        print(f"UMAP参数: {up}, HDBSCAN参数: {hp}, Silhouette Score: {score}")

        # Always keep the first model so that a run whose only score is the -1
        # fallback still yields a usable model instead of leaving it as None.
        if best_topic_model is None or score > best_silhouette:
            best_silhouette = score
            best_topic_model = current_topic_model
            best_topics = current_topics
            best_probs = current_probs
            best_umap_params = up
            best_hdbscan_params = hp

if best_topic_model is None:
    raise RuntimeError(
        "No UMAP/HDBSCAN parameter combination was evaluated; "
        "check umap_params_list and hdbscan_params_list."
    )

print(f"最佳参数：UMAP: {best_umap_params}, HDBSCAN: {best_hdbscan_params}, Silhouette Score: {best_silhouette}")

# (4) Automatically merge similar topics via reduce_topics(nr_topics="auto").
best_topic_model.reduce_topics(docs_tokens, nr_topics="auto")
# Refresh the cached assignments so they describe the reduced topics rather
# than the pre-merge clustering.
best_topics = best_topic_model.topics_
best_probs = best_topic_model.probabilities_
print("已自动合并相似主题。")

Device set to use mps:0


UMAP参数: {'n_neighbors': 15, 'n_components': 5, 'min_dist': 0.0, 'metric': 'cosine', 'random_state': 42}, HDBSCAN参数: {'min_cluster_size': 20, 'min_samples': 5, 'metric': 'euclidean'}, Silhouette Score: 0.30487388372421265
最佳参数：UMAP: {'n_neighbors': 15, 'n_components': 5, 'min_dist': 0.0, 'metric': 'cosine', 'random_state': 42}, HDBSCAN: {'min_cluster_size': 20, 'min_samples': 5, 'metric': 'euclidean'}, Silhouette Score: 0.30487388372421265
已自动合并相似主题。


# 3. 聚类结果输出与保存

In [19]:
# (0) Custom suffix appended to every output filename; change it by hand to
#     tell different tasks or experiments apart.
suffix = "antifeminism3.23-1"  # change as needed

# (1) Document-level results (topic id, probability, ...). The "Document"
#     column is overwritten with the original, un-tokenized text.
doc_info_df = best_topic_model.get_document_info(docs_tokens)
doc_info_df["Document"] = docs_text
doc_info_filename = f"topic_document_info_auto_{suffix}.csv"
doc_info_df.to_csv(doc_info_filename, index=False, encoding="utf-8-sig")
print(f"自动聚类后的文档主题信息已保存至 {doc_info_filename}")

# (2) Topic summary table: topic id, name, topic words, representative
#     documents and document count.
#     rename() ignores missing labels by default, so no column check is needed.
topic_info_df = best_topic_model.get_topic_info().rename(
    columns={"Count": "Number of Documents"}
)

# Look-up from tokenized document to original text, built once. setdefault
# keeps the first occurrence when several documents tokenize identically,
# which is the same document a list.index() search would have picked.
original_by_tokens = {}
for tokenized_doc, original_doc in zip(docs_tokens, docs_text):
    original_by_tokens.setdefault(tokenized_doc, original_doc)

# get_representative_docs() takes an optional topic id, not the documents;
# calling it without arguments returns the mapping for all topics.
rep_docs_dict = best_topic_model.get_representative_docs()
# Swap each representative (tokenized) document for its original text, keeping
# the tokenized form when no match is found.
rep_docs_original = {
    topic: [original_by_tokens.get(rep_doc, rep_doc) for rep_doc in rep_docs]
    for topic, rep_docs in rep_docs_dict.items()
}

topic_info_df["Representative_Docs"] = topic_info_df["Topic"].apply(lambda t: rep_docs_original.get(t, []))
if "Top_n_words" not in topic_info_df.columns:
    topic_info_df["Top_n_words"] = topic_info_df["Representation"]

topic_info_df = topic_info_df.sort_values(by="Topic")
topic_info_df = topic_info_df[["Topic", "Name", "Representation", "Representative_Docs", "Top_n_words", "Number of Documents"]]

topic_summary_filename = f"topic_summary_auto_{suffix}.csv"
topic_info_df.to_csv(topic_summary_filename, index=False, encoding="utf-8-sig")
print(f"自动聚类后的主题汇总信息已保存至 {topic_summary_filename}")

# (3) Topic words with their scores, one row per (topic, word) pair.
topics_dict = best_topic_model.get_topics()  # {topic_id: [(word, score), ...], ...}
data_rows = [
    {"Topic": t_id, "Word": word, "Score": word_score}
    for t_id, word_score_list in topics_dict.items()
    for word, word_score in word_score_list
]
# Explicit columns keep the CSV header intact even if there are no rows.
topic_word_scores_df = pd.DataFrame(data_rows, columns=["Topic", "Word", "Score"])
topic_word_filename = f"topic_word_scores_auto_{suffix}.csv"
topic_word_scores_df.to_csv(topic_word_filename, index=False, encoding="utf-8-sig")
print(f"自动聚类后的主题词得分表已保存为 {topic_word_filename}")

自动聚类后的文档主题信息已保存至 topic_document_info_auto_antifeminism3.23-1.csv
自动聚类后的主题汇总信息已保存至 topic_summary_auto_antifeminism3.23-1.csv
自动聚类后的主题词得分表已保存为 topic_word_scores_auto_antifeminism3.23-1.csv


# 4. 结果可视化

## 4-1 主题条形图 Topic Bar Chart

In [20]:
# Topic bar chart (top_n_topics=10): build the figure, save it as a standalone
# HTML file named after `suffix`, then display it.
fig_bar_filename = f"topic_barchart_auto_{suffix}.html"
fig_bar = best_topic_model.visualize_barchart(top_n_topics=10)

fig_bar.write_html(fig_bar_filename)
print(f"自动聚类后的主题条形图已保存为 {fig_bar_filename}")

fig_bar.show()

自动聚类后的主题条形图已保存为 topic_barchart_auto_antifeminism3.23-1.html


## 4-2 主题关系图 Intertopic Distance Map

In [21]:
# Intertopic distance map: build the figure, save it as a standalone HTML file
# named after `suffix`, then display it.
fig_topic_filename = f"topic_relationship_auto_{suffix}.html"
fig_topic = best_topic_model.visualize_topics()

fig_topic.write_html(fig_topic_filename)
print(f"自动聚类后的主题关系图已保存为 {fig_topic_filename}")

fig_topic.show()

自动聚类后的主题关系图已保存为 topic_relationship_auto_antifeminism3.23-1.html


## 4-3 文档分布图 Document DataMap

In [22]:
# Project the document embeddings to 2-D once, with a fixed seed, so the
# document map is reproducible.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42
).fit_transform(embeddings)

# Pass the projection through `reduced_embeddings=` so it is plotted as-is.
# Supplying it through `embeddings=` would have BERTopic treat it as raw
# embeddings and run another (unseeded) UMAP reduction on top of it.
fig_docs = best_topic_model.visualize_documents(
    docs_text,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True
)
fig_docs_filename = f"document_distribution_auto_{suffix}.html"
fig_docs.write_html(fig_docs_filename)
print(f"自动聚类后的文档分布图已保存为 {fig_docs_filename}")
fig_docs.show()


自动聚类后的文档分布图已保存为 document_distribution_auto_antifeminism3.23-1.html


## 4-4 主题相似度热图 Topic Similarity Heatmap

In [23]:
# Topic similarity heatmap: build the figure, save it as a standalone HTML
# file named after `suffix`, then display it.
fig_heatmap_filename = f"topic_heatmap_auto_{suffix}.html"
fig_heatmap = best_topic_model.visualize_heatmap()

fig_heatmap.write_html(fig_heatmap_filename)
print(f"自动聚类后的主题相似度热图已保存为 {fig_heatmap_filename}")

fig_heatmap.show()

自动聚类后的主题相似度热图已保存为 topic_heatmap_auto_antifeminism3.23-1.html


## 4-5 层次聚类 Hierarchical Clustering

In [24]:
def custom_distance_func(X):
    """Return the pairwise cosine distance matrix of X, clipped below at 0.

    The distance is 1 - cosine_similarity(X); np.maximum removes any negative
    entries from the result.
    """
    similarity = cosine_similarity(X)
    return np.maximum(0, 1 - similarity)


# Build the topic hierarchy with the cosine-based distance above, and use the
# same distance function when drawing it.
fig_hierarchy_filename = f"hierarchical_topics_auto_{suffix}.html"
hierarchical_topics = best_topic_model.hierarchical_topics(
    docs_tokens,
    distance_function=custom_distance_func,
)
fig_hierarchy = best_topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
    distance_function=custom_distance_func,
)

# Save as a standalone HTML file, report the filename, then display.
fig_hierarchy.write_html(fig_hierarchy_filename)
print(f"自动聚类后的层次聚类结果已保存为 {fig_hierarchy_filename}")
fig_hierarchy.show()

100%|██████████| 32/32 [00:00<00:00, 260.87it/s]


自动聚类后的层次聚类结果已保存为 hierarchical_topics_auto_antifeminism3.23-1.html
