本段開始進行BERTopic主題模型說明

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

os.chdir('/content/drive/MyDrive/SMA_final/') # switch the working directory to the project folder
os.listdir() # list the folder contents as a sanity check

In [None]:
# !pip install sentence_transformers
# !pip install bertopic

In [None]:
import pandas as pd
import re
import numpy as np
from collections import defaultdict
import multiprocessing
import jieba
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager
from sentence_transformers import SentenceTransformer

# Font setup: register the bundled TaipeiSansTCBeta font file with matplotlib,
# make it the default sans-serif family and set the base font size.
fontManager.addfont('./copy/TaipeiSansTCBeta-Regular.ttf')
plt.rcParams['font.sans-serif'] = ['Taipei Sans TC Beta']
plt.rcParams['font.size'] = '16'

# Transformer 模型下的主題詮釋與文本關聯：以 BERT 嵌入為基礎的主題分析

使用 BERT 中文語意嵌入、HDBSCAN 分群與 BERTopic 主題建模技術，進行主題抽取與語意詮釋。最後，我們利用 GPT API 為每個主題生成描述。

---

## Step 1：資料讀取與清洗


In [None]:
import os
import pandas as pd

# Load the dataset from CSV into a DataFrame
df = pd.read_csv('chiikawa_unified_data.csv')


In [None]:
# Clean the raw article text step by step.
# Missing articles become empty strings rather than the literal text "nan".
df['clean_content'] = df['artContent'].fillna('').astype(str)
# Remove URLs FIRST, while whitespace/newlines still terminate them. Once newlines
# have been turned into punctuation, `\S+` would also swallow the Chinese text
# that follows a URL up to the next space.
df['clean_content'] = df['clean_content'].str.replace(r'http\S+', '', regex=True).str.replace(r'www\S+', '', regex=True)
# Collapse every run of two or more newlines into a single full stop
df['clean_content'] = df['clean_content'].str.replace(r'\n{2,}', '。', regex=True)
# Turn each remaining single newline into a comma
df['clean_content'] = df['clean_content'].str.replace(r'\n', '，', regex=True)
# Drop emoji and special symbols. Keep CJK ideographs, hiragana/katakana, ASCII
# letters and digits, whitespace and basic punctuation, so that the custom
# dictionary entries and stopwords written in Latin letters, digits or kana
# (e.g. "Chiikawa", "537", "ちいかわ", "http") can still match later on.
df['clean_content'] = df['clean_content'].str.replace(r'[^\u4e00-\u9fffA-Za-z0-9\u3040-\u30ff\s.,!?;:、，。！？；：]', '', regex=True)

# Regex-based sentence splitting
def split_sentences(text):
    """Split *text* into sentences on runs of 。！？!? characters.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are discarded. Returns a list of strings.
    """
    stripped_pieces = (piece.strip() for piece in re.split(r'[。！？!?]+', text))
    return [piece for piece in stripped_pieces if piece]

df['sentences'] = df['clean_content'].apply(split_sentences)

# Preview the split result using the first row that has at least one sentence
sample_sentences = df[df['sentences'].str.len() > 0].iloc[0]['sentences']
print(f"斷句範例：{sample_sentences[:3]}")  # show the first 3 sentences

In [None]:
# Configure jieba's main dictionary and the user-defined vocabulary
jieba.set_dictionary('dict/dict.txt.big')  # dictionary file for Traditional Chinese

# Domain-specific words to register with jieba
custom_words = [
    # Main names
    "Chiikawa", "吉伊卡哇", "ちいかわ", "吉伊", "小可愛", "吉依", "寶寶",

    # Main characters
    "小八貓", "ハチワレ", "Hachiware", "八字瀏海", "小八", "藍色褲頭貓", "哈奇" , "哈吉",
    "兔兔", "うさぎ", "Usagi", "烏薩奇", "兔哥", "537",
    "小桃", "モモンガ", "Momonga", "飛鼠",
    "海獺勇者", "ラッコ", "Rakko",
    "栗子饅頭", "くりまんじゅう", "Kuri-Manjuu", "前輩",

    # Other characters
    "風獅", "シーサー", "Shisa", "獅薩",
    "拉麵鎧甲人", "手拿包鎧甲人", "勞動鎧甲人", "鎧甲人",
    "睡衣派對", "パジャマパーティーズ", "哥布林",

    # Series-specific terms
    "吉伊卡哇構文", "除草考試", "除草檢定", "小可愛族",
    "郎拉麵店", "三級除草證件", "討伐", "抽獎運",

    # Trait descriptions
    "膽小鬼", "愛哭鬼", "療癒", "二頭身", "啊哈", "吖哈",
    "又小又可愛", "なんか小さくてかわいいやつ",

    # Creator / production related
    "Nagano", "ナガノ", "動畫工房", "周邊商品", "貼圖", "一番賞"
]

for word in custom_words:
    jieba.add_word(word, freq=100)  # register with a raised frequency so the word is kept as one token

# Character synonym table: maps each alias (key) to its canonical name (value).
# NOTE(review): no code visible in this notebook applies this mapping — confirm
# whether aliases are meant to be normalized during preprocessing.
character_synonyms = {
    # Aliases of the main character
    "ちいかわ": "吉伊卡哇", "Chiikawa": "吉伊卡哇", "吉伊": "吉伊卡哇",
    "小可愛": "吉伊卡哇", "吉依": "吉伊卡哇",

    # Aliases of 小八貓
    "ハチワレ": "小八貓", "Hachiware": "小八貓", "八字瀏海": "小八貓",
    "小八": "小八貓", "藍色褲頭貓": "小八貓", "哈奇": "小八貓", "哈吉": "小八貓",

    # Aliases of 兔兔
    "うさぎ": "兔兔", "Usagi": "兔兔", "烏薩奇": "兔兔",
    "兔哥": "兔兔", "537": "兔兔",

    # Aliases of the other characters
    "モモンガ": "小桃", "Momonga": "小桃", "飛鼠": "小桃",
    "ラッコ": "海獺勇者", "Rakko": "海獺勇者",
    "くりまんじゅう": "栗子饅頭", "Kuri-Manjuu": "栗子饅頭", "前輩": "栗子饅頭",
    "シーサー": "風獅", "Shisa": "風獅", "獅薩": "風獅",
    "パジャマパーティーズ": "睡衣派對", "哥布林": "睡衣派對"
}

# Load stopwords
def load_stopwords(file_path):
    """Read a stop-word file (one word per line) into a set.

    The file is decoded as UTF-8 first (a leading BOM is tolerated) and as
    Big5 if that fails. Lines are stripped and blank lines are skipped, so the
    result never contains an empty string.

    Args:
        file_path: path of the stop-word file.

    Returns:
        A set of stop words. If the file cannot be read with either encoding,
        the error is printed and an empty set is returned.
    """
    last_error = None
    for encoding in ("utf-8-sig", "big5"):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return {line.strip() for line in f if line.strip()}
        except UnicodeDecodeError as e:
            # Wrong encoding guess: fall through to the next candidate.
            last_error = e
        except Exception as e:
            # Missing file, permission problem, bad path, ...: retrying with
            # another encoding cannot help.
            last_error = e
            break
    print(f"讀取停用詞檔案時發生錯誤: {last_error}")
    return set()

# Read the stop-word file and report how many entries were loaded
stopwords_path = "./dict/stopwords.txt"
stopwords = load_stopwords(stopwords_path)
print(f"已載入 {len(stopwords)} 個停用詞")

# Additional hand-picked stopwords, merged into the loaded set below
custom_stopwords = [
    # Pronouns
    "我", "你", "他", "她", "它", "我們", "你們", "他們", "她們", "它們", "這", "那", "這些", "那些", "誰", "什麼", "哪", "哪裡", "哪兒", "怎麼", "怎樣", "如何",

    # Conjunctions
    "和", "與", "而", "並", "或", "但", "但是", "然而", "所以", "因為", "因此", "如果", "雖然", "即使", "無論", "只要", "不論", "假如", "若", "若是",

    # Particles
    "的", "地", "得", "了", "著", "過", "吧", "嗎", "呢", "啊", "哦", "喔", "呀", "耶", "哎", "唉", "嗯", "嘿",

    # Adverbs
    "很", "非常", "極", "太", "更", "最", "又", "也", "都", "還", "只", "就", "才", "剛", "曾", "已", "將", "不", "沒", "別", "莫", "勿",

    # Prepositions
    "在", "從", "向", "往", "於", "對", "給", "為", "替", "由", "把", "被", "讓", "使", "隨", "跟", "靠", "據", "至", "到", "自",

    # Numerals and measure words
    "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "百", "千", "萬", "億", "兆", "個", "些", "每", "某", "各", "整", "全", "多", "少", "幾",

    # Time words
    "年", "月", "日", "時", "分", "秒", "天", "週", "季", "今", "昨", "明", "前", "後", "當", "正", "現", "曾", "已", "將", "會", "常", "永",

    # Direction / position words
    "上", "下", "左", "右", "前", "後", "內", "外", "中", "東", "西", "南", "北", "中間", "旁邊", "附近", "周圍",

    # Common verbs
    "是", "有", "無", "來", "去", "做", "看", "聽", "說", "想", "要", "能", "會", "可以", "應該", "必須", "需要", "得", "獲", "取", "予",

    # Common adjectives
    "好", "壞", "大", "小", "多", "少", "高", "低", "長", "短", "新", "舊", "快", "慢", "遠", "近", "輕", "重", "深", "淺",

    # Other common function words
    "之", "乎", "以", "其", "所", "者", "矣", "焉", "哉", "也", "然", "否", "兮", "乃", "且", "為", "則", "再", "又", "此", "夫", "亦",

    # Common short phrases
    "一個", "這個", "那個", "一些", "這些", "那些", "一下", "一點", "一直", "一定", "一般", "一樣", "不過", "不必", "不要", "可能", "可是", "只是", "就是", "如此", "如果", "這樣", "那樣", "這麼", "那麼", "因此", "所以", "然後", "接著", "其實", "其中", "其他", "其它", "之前", "之後", "之中", "之間",

    # Web / URL fragments
    "http", "www", "com", "cn", "org", "net", "html", "htm", "php", "jpg", "png", "gif", "..", "\r"
]
# Duplicates in the list are harmless because `stopwords` is a set
stopwords.update(custom_stopwords)



In [None]:
def preprocess(text, stopwords, synonyms=None):
    """Tokenize *text* with jieba and return the kept tokens joined by spaces.

    Args:
        text: the cleaned document string.
        stopwords: collection of tokens to drop.
        synonyms: optional mapping from alias token to canonical token. Each
            kept token found in the mapping is replaced by its canonical
            form; with the default ``None`` no token is rewritten.

    Returns:
        A single string of tokens separated by one space, the form expected by
        CountVectorizer-style tools.
    """
    synonyms = synonyms or {}
    filtered_words = [
        synonyms.get(w, w)  # normalize aliases only after the stop-word filter
        for w in jieba.cut(text)
        if w.strip() and w not in stopwords  # drop whitespace-only tokens and stopwords
    ]
    return " ".join(filtered_words)

# Apply to df["clean_content"], merging character aliases into their canonical names
df["processed_text"] = df["clean_content"].apply(lambda x: preprocess(x, stopwords, character_synonyms))

In [None]:
# Preview the first rows, including the newly added columns
df.head()

In [None]:
# Expose the preprocessed text as 'text' and the 'category' column as 'label',
# then discard every row where either of the two is missing.
df = (
    df.rename(columns={'processed_text': 'text', 'category': 'label'})
      .dropna(subset=['text', 'label'])
)

## Step 2：建立繁體中文嵌入與分詞工具
使用 `google-bert/bert-base-chinese` 作為語意嵌入模型，並搭配 `jieba` 進行繁體中文斷詞


In [None]:
# Collect the preprocessed documents, skipping those that are blank
docs_zh = [doc for doc in df['text'].tolist() if doc.strip()]

# Embed every document with the Chinese BERT checkpoint
bert_sentence_model = SentenceTransformer("google-bert/bert-base-chinese")
embeddings = bert_sentence_model.encode(docs_zh, show_progress_bar=True)

## Step 3：BERTopic 主題模型訓練（含自訂 jieba 分詞器）
我們使用 HDBSCAN 進行主題分群，並以 jieba 斷詞的 CountVectorizer 詞頻向量，搭配 BERTopic 的 c-TF-IDF，從每群中擷取代表性詞彙

In [None]:
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic


# HDBSCAN clusterer handed to BERTopic as `hdbscan_model` below
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='leaf'
)
# Chinese tokenizer used by the CountVectorizer
def tokenize_zh(text):
    """Tokenize *text* with jieba and return the non-blank tokens as a list.

    The documents fed to the vectorizer are already space-joined tokens, and
    jieba returns those separators as tokens of their own. Whitespace-only
    tokens are dropped here so they cannot end up as vocabulary terms.
    """
    return [token for token in jieba.lcut(text) if token.strip()]

# CountVectorizer that tokenizes with jieba.
# `token_pattern` is intentionally not passed: scikit-learn ignores it whenever a
# custom `tokenizer` is supplied and only emits a warning about the unused value.
jieba_vectorizer = CountVectorizer(
    tokenizer=tokenize_zh,
    stop_words=list(stopwords),
    analyzer='word',
)

# Build the BERTopic model from the embedding model, vectorizer and clusterer defined above
zh_topic_model = BERTopic(
    embedding_model=bert_sentence_model,
    vectorizer_model=jieba_vectorizer,
    hdbscan_model=hdbscan_model,
    verbose=True,
    top_n_words=30
)

# Fit the model, passing the embeddings that were already computed for docs_zh
topics, probs = zh_topic_model.fit_transform(docs_zh, embeddings)


## Step 4：主題檢視與關鍵詞提取
先將主題數縮減為 20 個，再查看各主題的代表性關鍵詞與文章數

In [None]:
# Reduce the model to nr_topics=20, then show the first 10 rows of the topic overview
zh_topic_model.reduce_topics(docs_zh, nr_topics=20)
zh_topic_model.get_topic_info().head(10)

互動式圖表探索主題空間與主題關鍵詞

In [None]:
# Render BERTopic's topic visualization as the cell output
zh_topic_model.visualize_topics()

## Step 5：使用 GPT 為主題自動生成描述
我們將每個主題的前 10 個關鍵詞輸入 GPT，請它推論這些關鍵詞所代表的主題，並以繁體中文簡要描述該主題的內容

In [None]:
# !pip install openai

In [None]:
# Read the API key from a local text file, trimming surrounding whitespace
with open("gpt_api_key.txt", "r", encoding="utf-8") as f:
    key = f.read().strip()

In [None]:
from openai import OpenAI
# Shared OpenAI client, used by call_gpt below
client = OpenAI(api_key=key)
def call_gpt(keywords):
    """Ask the chat model to describe the topic behind a list of keywords.

    Args:
        keywords: iterable of keyword strings belonging to one topic.

    Returns:
        The model's reply with surrounding whitespace removed. If anything
        raises while building the prompt or calling the API, the error is
        printed and an empty string is returned.
    """
    try:
        prompt = f"""你是一位主題詮釋專家。請根據以下關鍵詞推論它們所代表的主題，並以繁體中文簡要描述該主題的內容。

關鍵詞：{", ".join(keywords)}

請只回覆主題敘述，不需要列出主題類別，也不要加任何額外說明。"""
        chat_messages = [{"role": "user", "content": prompt}]
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=chat_messages,
            temperature=0.8,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as err:
        print(f"⚠️ Error: {err}")
        return ""

In [None]:
import openai
import time

# For every topic except id -1 (BERTopic's outlier topic), keep the words of
# its first ten (word, score) entries.
topic_keywords = {
    topic_id: [term for term, *_ in zh_topic_model.get_topic(topic_id)[:10]]
    for topic_id in zh_topic_model.get_topics()
    if topic_id != -1
}

topic_keywords


In [None]:
# Ask GPT to describe every topic and keep the answers keyed by topic id
gpt_labels = {}
for topic_id, keywords in topic_keywords.items():
    print(f"\n 主題 {topic_id} 關鍵字: {', '.join(keywords)}")
    label = call_gpt(keywords)
    gpt_labels[topic_id] = label  # store the description; the dict was otherwise left empty
    print(f"GPT 推論：{label}")
    time.sleep(1.5)  # pause between consecutive API calls
