In [None]:
# Mount Google Drive at /content/drive so the project files referenced below are reachable.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import ast
import csv

def load_dictionary(file_path):
    """Load the term -> term-id mapping from a text file.

    Every non-blank line is expected to have the form ``term: id``.

    Args:
        file_path: Path to a UTF-8 text file with one ``term: id`` entry per line.

    Returns:
        dict mapping each term (str) to its id (int), in file order.

    Raises:
        ValueError: If a non-blank line has no ``': '`` separator or its id
            is not an integer.
    """
    term_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the LAST separator so a term that itself contains ': '
            # still parses instead of breaking the two-value unpack.
            term, sep, term_id = entry.rpartition(': ')
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'term: id', got {entry!r}"
                )
            term_dict[term] = int(term_id)
    return term_dict

def load_vectors(file_path):
    """Load per-article vectors from a CSV file.

    The ``vector`` column holds each vector serialised as a Python list
    literal (e.g. ``"[0.0, 0.12, 0.0]"``); it is parsed into a float64
    numpy array.

    Args:
        file_path: Path to a UTF-8 CSV file containing a ``vector`` column.

    Returns:
        pandas.DataFrame with every original column; ``vector`` holds
        ``numpy.ndarray`` objects of dtype float64.

    Raises:
        ValueError / SyntaxError: If a ``vector`` cell is not a plain literal.
    """
    df = pd.read_csv(file_path, encoding='utf-8')
    # SECURITY: the cells come from an external file, so they must not be
    # passed to eval(), which would execute arbitrary expressions.
    # ast.literal_eval accepts only literals (lists, numbers, ...) and rejects
    # everything else.
    df['vector'] = df['vector'].apply(
        lambda cell: np.array(ast.literal_eval(cell), dtype=np.float64)
    )
    return df

def load_cluster_results(file_path):
    """Load clustering results and split each cluster's member list.

    The ``members`` column stores the member ids of a cluster as one
    comma-separated string; it is converted to a list of id strings.

    Args:
        file_path: Path to a UTF-8 CSV file containing a ``members`` column.

    Returns:
        pandas.DataFrame with every original column; ``members`` holds a
        ``list[str]`` per row (an empty list when the cell is empty).
    """
    # Force ``members`` to be read as text: a cluster with a single numeric
    # member would otherwise be inferred as a number and have no .split().
    cluster_df = pd.read_csv(file_path, encoding='utf-8', dtype={'members': str})

    def _parse_members(cell):
        # An empty cell is read as NaN (a float), which cannot be split.
        if pd.isna(cell):
            return []
        # Strip each id so "a, b" and "a,b" yield the same ids, and drop the
        # empty strings produced by stray or trailing commas.
        stripped = (part.strip() for part in str(cell).split(','))
        return [member for member in stripped if member]

    cluster_df['members'] = cluster_df['members'].apply(_parse_members)
    return cluster_df

def load_texts(file_path):
    """Load the raw posts and keep only the ``id`` and ``text`` columns.

    Args:
        file_path: Path to a UTF-8 CSV file containing at least the columns
            ``id`` and ``text``.

    Returns:
        A new pandas.DataFrame with exactly the columns ``['id', 'text']``.

    Raises:
        KeyError: If either column is missing from the file.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which is what pandas recommends for its
    # mixed-types DtypeWarning.
    df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
    # Return an explicit copy so callers can add columns to the result
    # without pandas treating it as a slice of the full frame.
    return df[['id', 'text']].copy()

# Load the term dictionary and the per-article vectors from the project's Drive folder.
dictionary = load_dictionary('/content/drive/MyDrive/IRTM_final_project/vectors/dictionary.txt')
vectors_df = load_vectors('/content/drive/MyDrive/IRTM_final_project/vectors/1220_vector.csv')




FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/IRTM_final_project/data/posts/1220.csv'

In [None]:
# Load the raw posts (id + text only) from the project's Drive folder.
texts_df = load_texts('/content/drive/MyDrive/IRTM_final_project/data/posts/1220.csv')

# Report how much data was loaded; `dictionary` and `vectors_df` come from the previous cell.
print(f"Dictionary Loaded: {len(dictionary)} terms")
print(f"Vectors Loaded: {vectors_df.shape[0]} articles")
print(f"Texts Loaded: {texts_df.shape[0]} articles")


Dictionary Loaded: 5000 terms
Vectors Loaded: 47967 articles
Cluster Results Loaded: 8 clusters
Texts Loaded: 80921 articles


  df = pd.read_csv(file_path, encoding='utf-8')


In [None]:
# Load the k-means clustering result: one row per cluster, with `members` split into a list of ids.
cluster_df = load_cluster_results('/content/drive/MyDrive/IRTM_final_project/cluster_result/kmeans.csv')

print(f"Cluster Results Loaded: {cluster_df.shape[0]} clusters")

Cluster Results Loaded: 8 clusters


In [None]:
import re
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus, then build the module-level English
# stop-word set and Porter stemmer that preprocess_text() reads.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    """Normalise one post into a space-separated token string.

    Punctuation and digits are removed. English words are lower-cased,
    filtered against the module-level ``stop_words`` and stemmed with the
    module-level ``stemmer``. Everything else is segmented with jieba.

    Args:
        text: The raw post. Non-string values are converted with ``str()``;
            missing values (``None`` / NaN) are treated as an empty post.

    Returns:
        str: English stems followed by the jieba tokens, joined by single
        spaces. Empty string for a missing post.
    """
    # A missing post arrives as None or float NaN. str() would turn it into
    # the literal token 'nan', which then surfaces as a cluster keyword.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    english_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    # jieba segments the whole string, so besides the Chinese words it also
    # emits whitespace tokens and the raw English words handled above.
    # Keeping those would count each English word twice (stemmed + raw) and
    # re-introduce the English stop words that were just filtered out.
    chinese_tokens = [
        token for token in jieba.lcut(text)
        if token.strip() and not re.fullmatch(r'[a-zA-Z]+', token)
    ]

    combined_tokens = english_tokens + chinese_tokens
    return ' '.join(combined_tokens)

# Add the normalised token string of every post as a new `processed_text` column.
texts_df['processed_text'] = texts_df['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def extract_keywords_tfidf(df, cluster_df, vectors_df, dictionary, top_n=10):
    """Pick each cluster's keywords by summed vector weight.

    For every cluster, the vectors of its member articles are stacked and
    summed per column; the terms with the largest positive sums are returned.

    Args:
        df: Unused; kept so existing callers keep working.
        cluster_df: DataFrame with ``cluster_name`` and ``members`` (a list of
            article ids) columns.
        vectors_df: DataFrame with ``id`` and ``vector`` (1-D numpy array)
            columns.
        dictionary: Mapping term -> term id.
        top_n: Maximum number of keywords per cluster.

    Returns:
        dict mapping cluster name -> list of at most ``top_n`` terms, highest
        score first. The list is shorter (possibly empty) when fewer terms
        have a positive score or the cluster has no vectors.
    """
    # NOTE(review): column i of every vector is taken to be the i-th term in
    # the dictionary's insertion (file) order -- confirm this matches how the
    # vectors were built.
    feature_names = list(dictionary.keys())

    cluster_keywords = {}

    for _, cluster in cluster_df.iterrows():
        cluster_name = cluster['cluster_name']
        cluster_members = cluster['members']

        cluster_vectors = vectors_df.loc[vectors_df['id'].isin(cluster_members), 'vector']

        # np.vstack raises on an empty sequence, so a cluster none of whose
        # members has a vector simply gets no keywords.
        if cluster_vectors.empty:
            cluster_keywords[cluster_name] = []
            continue

        # Stack the per-article arrays into an (articles x terms) matrix and
        # total each term's weight over the cluster.
        cluster_tfidf_matrix = np.vstack(cluster_vectors.to_list())
        tfidf_scores = cluster_tfidf_matrix.sum(axis=0)

        # Stable descending order: tied terms keep dictionary order, so the
        # result is deterministic.
        sorted_indices = np.argsort(-tfidf_scores, kind='stable')

        # A term with zero total weight never occurs in the cluster; do not
        # report it as a keyword just to fill up top_n.
        top_keywords = [
            feature_names[i] for i in sorted_indices[:top_n] if tfidf_scores[i] > 0
        ]

        cluster_keywords[cluster_name] = top_keywords

    return cluster_keywords

# Keywords per cluster from the precomputed article vectors.
cluster_keywords_tfidf = extract_keywords_tfidf(texts_df, cluster_df, vectors_df, dictionary)
print(cluster_keywords_tfidf)

{'cluster0': ['今天', 'the', '怎麼', '這個', 'day', '看到', '這樣', '演唱', '還是', '世界'], 'cluster1': ['真的', '看到', '這個', '今天', '不要', '可以', '時候', '這樣', '啊啊啊', '謝謝'], 'cluster2': ['可以', '我們', '大家', '一個', '台灣', '就是', '一起', '知道', '希望', '他們'], 'cluster3': ['喜歡', '真的', '一個', '這個', '還是', '時候', '自己', '一起', '大家', '看到'], 'cluster4': ['什麼', '知道', '到底', '時候', '真的', '可以', '這是', '意思', '這麼', '這個'], 'cluster5': ['不是', '有人', '怎麼', '真的', '而是', '可以', '知道', '看到', '因為', '這樣'], 'cluster6': ['自己', '一個', '覺得', '就是', '因為', '真的', '時候', '知道', '不要', '可以'], 'cluster7': ['nan', '右邊', '司機', '司法', '吃掉', '吃過', '各位', '各個', '各地', '城市']}


In [None]:
from collections import Counter

def extract_keywords_word_frequency(df, cluster_df, top_n=10):
    """Pick each cluster's keywords by raw token frequency.

    Args:
        df: DataFrame with ``id`` and ``processed_text`` (space-separated
            tokens) columns.
        cluster_df: DataFrame with ``cluster_name`` and ``members`` (a list of
            article ids) columns.
        top_n: Number of most frequent tokens to keep per cluster.

    Returns:
        dict mapping cluster name -> list of the ``top_n`` most frequent
        tokens among the cluster's member texts, most frequent first.
    """
    def _most_frequent_tokens(member_ids):
        # Concatenate the member texts and count whitespace-separated tokens.
        is_member = df['id'].isin(member_ids)
        member_texts = df[is_member]['processed_text']
        tally = Counter(' '.join(member_texts).split())
        return [token for token, _ in tally.most_common(top_n)]

    return {
        row['cluster_name']: _most_frequent_tokens(row['members'])
        for _, row in cluster_df.iterrows()
    }

# Keywords per cluster from raw token counts of the preprocessed texts.
cluster_keywords_word_freq = extract_keywords_word_frequency(texts_df, cluster_df)
print(cluster_keywords_word_freq)

{'cluster0': ['的', '我', '了', '是', '在', '你', '有', '都', '也', '好'], 'cluster1': ['真的', '的', '我', '了', '是', '好', '很', '在', '都', '你'], 'cluster2': ['的', '我', '是', '了', '在', '有', '你', '都', '也', '可以'], 'cluster3': ['的', '喜歡', '我', '你', '是', '好', '了', '很', '人', '都'], 'cluster4': ['的', '什麼', '我', '是', '了', '你', '在', '為', '都', '有'], 'cluster5': ['的', '我', '不是', '是', '有人', '了', '你', '在', '有', '也'], 'cluster6': ['的', '自己', '我', '你', '是', '了', '在', '有', '都', '人'], 'cluster7': ['nan']}


In [None]:
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer

def extract_keywords_chi2(df, cluster_df, top_n=10):
    """Pick each cluster's keywords with a chi-squared test.

    A bag-of-words matrix is built once from ``df['processed_text']``. For
    every cluster, each document is labelled 1 (member) or 0 (non-member) and
    the ``top_n`` terms with the highest chi-squared statistic are selected.

    Args:
        df: DataFrame with ``id`` and ``processed_text`` columns.
        cluster_df: DataFrame with ``cluster_name`` and ``members`` (a list of
            article ids) columns.
        top_n: Number of terms to select per cluster.

    Returns:
        dict mapping cluster name -> list of ``top_n`` terms, highest
        chi-squared score first.
    """
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['processed_text'])
    # The vocabulary does not change between clusters; look it up once.
    feature_names = vectorizer.get_feature_names_out()

    cluster_keywords = {}

    for _, cluster in cluster_df.iterrows():
        cluster_name = cluster['cluster_name']
        cluster_members = cluster['members']

        # 1 = document belongs to this cluster, 0 = it does not.
        # isin() hashes the member ids once instead of scanning the member
        # list for every document.
        y = df['id'].isin(cluster_members).to_numpy().astype(int)

        chi2_selector = SelectKBest(chi2, k=top_n)
        chi2_selector.fit(X, y)

        # get_support() lists the selected columns in vocabulary order, so
        # rank them by score to put the strongest term first. Undefined (NaN)
        # scores sort last; ties keep vocabulary order.
        scores = np.nan_to_num(chi2_selector.scores_, nan=-np.inf)
        selected = chi2_selector.get_support(indices=True)
        ranked = sorted(selected, key=lambda i: scores[i], reverse=True)
        top_keywords = [feature_names[i] for i in ranked]

        cluster_keywords[cluster_name] = top_keywords

    return cluster_keywords

# Keywords per cluster from the chi-squared test of term vs. cluster membership.
cluster_keywords_chi2 = extract_keywords_chi2(texts_df, cluster_df)
print(cluster_keywords_chi2)

{'cluster0': ['nan', '一個', '不是', '什麼', '可以', '喜歡', '大家', '我們', '真的', '自己'], 'cluster1': ['httpswaatwmexvc', '劉知珉', '好棒', '有夠', '真的', '絕了', '美短', '自己', '財神爺', '超級'], 'cluster2': ['一個', '一起', '他們', '可以', '台灣', '因為', '大家', '就是', '希望', '我們'], 'cluster3': ['che', 'dororo', 'yaaa', '三紀', '千變', '喜歡', '小久', '數碼', '萬化', '語句'], 'cluster4': ['httpswatchouttwreportsvqubfihsoqyrdarrma', 'mcdonaldstw', '什麼', '到底', '小黨', '意思', '我為', '清潔隊', '知道', '給什麼'], 'cluster5': ['不是', '依托', '大未必佳', '娜拉', '德伯格', '有人', '東亞', '神祇', '總幹事', '而是'], 'cluster6': ['一個', '事情', '人生', '他人', '別人', '因為', '生活', '自己', '自我', '覺得'], 'cluster7': ['nan', '一個', '什麼', '可以', '喜歡', '大家', '就是', '我們', '真的', '自己']}


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def extract_keywords_pmi(df, cluster_df, top_n=10):
    """Pick each cluster's keywords by pointwise mutual information (PMI).

    With ``n(t, c)`` the count of term ``t`` inside cluster ``c``, ``n(t)``
    its count in the whole corpus, ``N(c)`` the number of tokens in the
    cluster and ``N`` the number of tokens in the corpus::

        PMI(t, c) = log( n(t, c) * N / (n(t) * N(c)) )

    Args:
        df: DataFrame with ``id`` and ``processed_text`` columns.
        cluster_df: DataFrame with ``cluster_name`` and ``members`` (a list of
            article ids) columns.
        top_n: Maximum number of terms to return per cluster.

    Returns:
        dict mapping cluster name -> list of at most ``top_n`` terms, highest
        PMI first. Only terms that occur in the cluster are returned, so the
        list may be shorter than ``top_n``.
    """
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['processed_text'])
    feature_names = vectorizer.get_feature_names_out()

    # Corpus-wide count of every term, and the total number of tokens.
    term_freq = np.asarray(X.sum(axis=0)).ravel().astype(np.float64)
    total_terms = term_freq.sum()

    cluster_keywords = {}
    for _, cluster in cluster_df.iterrows():
        cluster_members = cluster['members']

        # Rows of X are in the same order as df, so a positional boolean mask
        # selects the cluster's documents without vectorising them again.
        in_cluster = df['id'].isin(cluster_members).to_numpy()
        term_freq_cluster = np.asarray(X[in_cluster].sum(axis=0)).ravel().astype(np.float64)
        cluster_total = term_freq_cluster.sum()

        # A term absent from the cluster has PMI -> -inf. Giving it 0 instead
        # would rank it above real terms whose PMI is negative.
        pmi = np.full(term_freq.shape, -np.inf)
        present = term_freq_cluster > 0
        # Normalise by the cluster's token count (not the document count) so
        # PMI is 0 exactly when the term is as frequent inside the cluster as
        # in the corpus overall.
        pmi[present] = np.log(
            (term_freq_cluster[present] * total_terms)
            / (term_freq[present] * cluster_total)
        )

        # Stable descending sort: tied terms keep vocabulary order.
        ranked = np.argsort(-pmi, kind='stable')[:top_n]
        top_keywords = [feature_names[i] for i in ranked if np.isfinite(pmi[i])]
        cluster_keywords[cluster['cluster_name']] = top_keywords

    return cluster_keywords

# Keywords per cluster ranked by PMI between term and cluster.
cluster_keywords_pmi = extract_keywords_pmi(texts_df, cluster_df)
print(cluster_keywords_pmi)

{'cluster0': ['______________________', '____________________________', 'aalison', 'aaliyah', 'aam', 'aardman', 'abcd', 'abcde', 'abcdlov', 'abcdlove'], 'cluster1': ['ablcubrysiusvzjytsllmgoob', 'allrisepimver', 'anyways', 'astros', 'banilaco', 'bape', 'bbtaocip', 'boiip', 'bounce', 'boxlogo'], 'cluster2': ['_______________________________________', 'aaaa', 'abajo', 'abnorm', 'abnormal', 'abrazo', 'acetylcysteine', 'achil', 'achilles', 'actndaa'], 'cluster3': ['akasakibunny', 'amorous', 'anakin', 'applevaundy', 'atgwxeecurkfaarjq', 'badale', 'badalee', 'bbin', 'besot', 'besotted'], 'cluster4': ['ab型', 'aespanext', 'ay', 'baebae', 'barcode', 'beark', 'bndsakuraaaa', 'bobbi', 'chianglulu', 'chili'], 'cluster5': ['allegi', 'allegiant', 'annnbrowni', 'annnbrownie', 'basse', 'bdm', 'beau', 'blackbrown', 'bojack', 'climbs'], 'cluster6': ['adelesomeon', 'adelesomeone', 'aekoopa', 'agus', 'airtist', 'algernon', 'allwillbewel', 'allwillbewell', 'ambitiouspassion', 'angelababy'], 'cluster7': ['n

In [None]:
def save_cluster_keywords_to_csv(cluster_keywords, file_path):
    """Write a cluster -> keywords mapping to a two-column CSV file.

    Args:
        cluster_keywords: dict mapping cluster name -> list of keyword strings.
        file_path: Destination path. The file is overwritten and encoded as
            UTF-8.

    The file starts with the header ``cluster_name,keywords``; each following
    row holds one cluster name and its keywords joined by ``', '``.
    """
    header = ['cluster_name', 'keywords']
    with open(file_path, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        # One row per cluster, in the mapping's iteration order.
        csv_writer.writerows(
            [name, ', '.join(terms)] for name, terms in cluster_keywords.items()
        )

# Persist the keywords of each extraction method to its own CSV under result/Kmeans.
save_cluster_keywords_to_csv(cluster_keywords_tfidf, '/content/drive/MyDrive/IRTM_final_project/result/Kmeans/cluster_keywords_tfidf.csv')
save_cluster_keywords_to_csv(cluster_keywords_word_freq, '/content/drive/MyDrive/IRTM_final_project/result/Kmeans/cluster_keywords_word_freq.csv')
save_cluster_keywords_to_csv(cluster_keywords_chi2, '/content/drive/MyDrive/IRTM_final_project/result/Kmeans/cluster_keywords_chi2.csv')
save_cluster_keywords_to_csv(cluster_keywords_pmi, '/content/drive/MyDrive/IRTM_final_project/result/Kmeans/cluster_keywords_pmi.csv')

In [None]:
from collections import Counter

def count_keyword_occurrences(cluster_keywords, texts_df, cluster_df):
    """Count how often each keyword occurs in the raw texts of its cluster.

    Args:
        cluster_keywords: dict mapping cluster name -> list of keywords.
        texts_df: DataFrame with ``id`` and ``text`` columns.
        cluster_df: DataFrame with ``cluster_name`` and ``members`` (a list of
            article ids) columns.

    Returns:
        dict mapping cluster name -> {keyword: number of non-overlapping
        substring occurrences over the cluster's texts}. The inner dict is
        empty when the cluster has no string-valued text.

    Raises:
        IndexError: If a cluster name is not present in ``cluster_df``.
    """
    occurrences_by_cluster = {}

    for name in cluster_keywords:
        terms = cluster_keywords[name]

        # Member ids of the first cluster_df row carrying this name.
        name_matches = cluster_df['cluster_name'] == name
        member_ids = cluster_df.loc[name_matches, 'members'].iloc[0]
        raw_texts = texts_df.loc[texts_df['id'].isin(member_ids), 'text']

        tally = Counter()
        for document in raw_texts:
            # Skip missing posts (NaN) and any other non-string value.
            if isinstance(document, str):
                for term in terms:
                    tally[term] += document.count(term)

        occurrences_by_cluster[name] = dict(tally)

    return occurrences_by_cluster

# Usage: count, per extraction method, how often each cluster's keywords occur in that cluster's raw texts.
keyword_occurrences_tfidf = count_keyword_occurrences(cluster_keywords_tfidf, texts_df, cluster_df)
keyword_occurrences_freq = count_keyword_occurrences(cluster_keywords_word_freq, texts_df, cluster_df)
keyword_occurrences_chi2 = count_keyword_occurrences(cluster_keywords_chi2, texts_df, cluster_df)
keyword_occurrences_pmi = count_keyword_occurrences(cluster_keywords_pmi, texts_df, cluster_df)
# NOTE(review): the output recorded under this cell lists the word-frequency keywords, not the PMI ones --
# presumably left over from an earlier run that printed keyword_occurrences_freq; confirm by re-running.
print(keyword_occurrences_pmi)


{'cluster0': {'的': 29166, '我': 12294, '了': 9941, '是': 11390, '在': 7355, '你': 5776, '有': 8520, '都': 3831, '也': 3258, '好': 7619}, 'cluster1': {'真的': 3571, '的': 6674, '我': 2220, '了': 1627, '是': 1762, '好': 1667, '很': 1066, '在': 860, '都': 653, '你': 689}, 'cluster2': {'的': 38490, '我': 18042, '是': 17398, '了': 9846, '在': 9801, '有': 11856, '你': 6432, '都': 5480, '也': 5124, '可以': 4358}, 'cluster3': {'的': 2688, '喜歡': 1742, '我': 1442, '你': 689, '是': 1044, '好': 866, '了': 541, '很': 565, '人': 646, '都': 369}, 'cluster4': {'的': 2450, '什麼': 2024, '我': 1740, '是': 1543, '了': 891, '你': 836, '在': 819, '為': 821, '都': 519, '有': 922}, 'cluster5': {'的': 2506, '我': 1357, '不是': 983, '是': 2226, '有人': 727, '了': 725, '你': 676, '在': 590, '有': 1423, '也': 379}, 'cluster6': {'的': 11762, '自己': 4726, '我': 4359, '你': 3018, '是': 5021, '了': 2460, '在': 2480, '有': 3251, '都': 1564, '人': 3505}, 'cluster7': {}}


In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import os

# Word-frequency dictionary for the cloud. Use the counts computed above for
# cluster6 instead of a hand-copied literal, so the figure follows the data
# whenever the notebook is re-run.
word_freq = keyword_occurrences_freq['cluster6']

# WordCloud needs a font file that contains CJK glyphs, and a bare
# 'simhei.ttf' only resolves if that file sits in the working directory.
# Try the original name first, then common install locations.
FONT_CANDIDATES = [
    'simhei.ttf',
    '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',  # apt package fonts-noto-cjk
    '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',            # apt package fonts-wqy-zenhei
    'C:/Windows/Fonts/simhei.ttf',
    'C:/Windows/Fonts/kaiu.ttf',
]
font_path = next((path for path in FONT_CANDIDATES if os.path.exists(path)), None)
if font_path is None:
    # Fail with an actionable message instead of PIL's bare
    # "OSError: cannot open resource".
    raise FileNotFoundError(
        "No CJK-capable font found. On Colab run `!apt-get -y install fonts-noto-cjk` "
        "or add the path of a Chinese .ttf/.ttc file to FONT_CANDIDATES."
    )

# Build the word cloud.
wordcloud = WordCloud(
    font_path=font_path,  # must be a font that supports Chinese characters
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate_from_frequencies(word_freq)

# Show the word cloud.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes
plt.show()


OSError: cannot open resource

In [None]:
import os

# Diagnostic: report whether the font file below is present on this machine.
font_path = 'C:/Windows/Fonts/kaiu.ttf'  # replace with your actual font path
print(
    "Font file exists!"
    if os.path.exists(font_path)
    else "Font file not found. Check the path."
)

Font file not found. Check the path.
