In [1]:
!pip install bertopic

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
#pip install matplotlib

In [3]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer

  from .autonotebook import tqdm as notebook_tqdm
2024-07-09 18:04:17.371403: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import csv

csv_file = 'papers_all_tags.csv'  # replace with the path to your CSV file
column_name = 'title'  # replace with the name of the column to read
year_limit = 2021  # replace with the year limit; only rows with year < year_limit are kept

# newline='' lets the csv module handle line endings itself, which the csv
# documentation requires so that quoted fields containing line breaks are
# parsed correctly.
with open(csv_file, 'r', encoding='utf-8', newline='') as file:
    listdata = [
        row[column_name]
        for row in csv.DictReader(file)
        if int(row['year']) < year_limit
    ]



In [5]:
# Sanity check: show the first two entries that were read from the CSV file.
print(listdata[:2])

['A uridine kinase-deficient mutant of 3T3 and a selective method for cells containing the enzyme', 'spoT, a new genetic locus involved in the stringent response in E. coli']


In [6]:
import torch

# Step 1 - Extract embeddings
# Load the sentence encoder and move it to the GPU when CUDA is available.
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
if torch.cuda.is_available():
    sentence_model = sentence_model.to(torch.device("cuda"))

# Encode every title once; the embeddings are reused by later cells.
embeddings = sentence_model.encode(listdata, show_progress_bar=True)

Batches: 100%|██████████| 476/476 [00:09<00:00, 48.94it/s]


In [7]:
# Step 2 - Reduce dimensionality
# UMAP is stochastic: without a fixed random_state every run yields a different
# projection and therefore different topics. Fixing the seed makes the
# notebook reproducible.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0,
    metric='cosine',
    random_state=42,
)


In [8]:
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Step 4 - Tokenize topics
# Candidate topic terms are 2- to 4-word n-grams with English stop words removed.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=1,
    ngram_range=(2, 4),
)


In [10]:
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True,
)


In [11]:
from bertopic.representation import KeyBERTInspired

# KeyBERT-inspired representation, registered under the "KeyBERT" key.
keybert_model = KeyBERTInspired()
representation_model = dict(KeyBERT=keybert_model)

In [12]:
# Assemble the topic model from the components configured in the cells above.
topic_model = BERTopic(
    # Pipeline components (steps 1-5).
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    # Additional topic representation(s).
    representation_model=representation_model,
    # NOTE(review): a custom vectorizer_model with ngram_range=(2, 4) is also
    # supplied; confirm which n-gram setting is actually intended to apply.
    n_gram_range=(1, 3),
    calculate_probabilities=True,
)


In [13]:
# Fit the model on the titles, passing the embeddings computed earlier so the
# documents are not encoded a second time.
topics, probabilities = topic_model.fit_transform(listdata, embeddings)

In [14]:
# Save the fitted model as "r5_cell_title" (it is saved again under the same
# name after the topics have been updated further down).
topic_model.save("r5_cell_title")



In [15]:
# Reduce outliers (topic -1)
# Reduce outliers using the pre-calculated embeddings
new_topics = topic_model.reduce_outliers(listdata, topics, strategy="embeddings", embeddings=embeddings)

In [16]:
# Recompute the topic representations for the outlier-reduced assignments.
# update_topics() falls back to default components for every model that is not
# passed in, so the custom vectorizer, c-TF-IDF and representation models are
# handed over again to keep the configuration that was used for fitting.
topic_model.update_topics(
    listdata,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)



In [17]:
# Save again under the same name so the stored model contains the updated topics.
topic_model.save("r5_cell_title")



In [18]:
# Load the saved model back from disk.
load_model = BERTopic.load("r5_cell_title")

In [19]:
# Display the topic frequency table (Topic / Count) of the loaded model.
load_model.get_topic_freq()

Unnamed: 0,Topic,Count
16,0,761
6,1,571
17,2,547
24,5,455
47,3,415
...,...,...
39,94,45
72,92,43
103,95,43
86,96,41


In [20]:
#topic_model.visualize_barchart()

In [21]:
#load_model.visualize_topics().write_html('modelviz.html')

In [22]:
import csv
from bertopic import BERTopic

# Load the saved model
topic_model = BERTopic.load("r5_cell_title")

# Read the data
csv_file = 'papers_all_tags.csv'  # replace with the path to your CSV file
column_name = 'title'  # replace with the name of the column to read
year_limit = 2020  # replace with the year limit; only rows with year > year_limit are kept

# newline='' lets the csv module handle line endings itself, which the csv
# documentation requires so that quoted fields containing line breaks are
# parsed correctly.
with open(csv_file, 'r', encoding='utf-8', newline='') as file:
    newdata = [
        row[column_name]
        for row in csv.DictReader(file)
        if int(row['year']) > year_limit
    ]

In [24]:
import csv

# Compute the similarity between every new title and the topics, and save the
# result to a CSV file with one row per (query, topic) pair.
similar_topics_path = 'r5_similar_topics.csv'

with open(similar_topics_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Query', 'Similar Topic', 'Similarity'])
    for query in newdata:
        # NOTE(review): top_n=104 is presumably the number of topics in the
        # model, so that every topic gets a score - confirm.
        similar_topics, similarity = topic_model.find_topics(query, top_n=104)
        writer.writerows(
            [query, topic, sim] for topic, sim in zip(similar_topics, similarity)
        )

# Report the file that was actually written.
print(f"Results saved to {similar_topics_path}")

Results saved to similar_topics.csv


In [25]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_topic_similarities(topic_model):
    """Compute the pairwise cosine similarity between all topics.

    The topic embeddings are used when the model has them; otherwise the
    c-TF-IDF matrix is used. In both cases the first ``topic_model._outliers``
    rows are skipped.

    Args:
        topic_model: A fitted BERTopic instance.

    Returns:
        similarity_matrix: Similarity matrix in which element [i, j] is the
            similarity between topic i and topic j.

    Example:
    ```python
    topic_similarities = get_topic_similarities(topic_model)
    print(topic_similarities)
    ```
    """
    if topic_model.topic_embeddings_ is None:
        topic_vectors = topic_model.c_tf_idf_[topic_model._outliers:]
    else:
        topic_vectors = np.array(topic_model.topic_embeddings_)[topic_model._outliers:]

    return cosine_similarity(topic_vectors)

In [26]:
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity

def save_topic_similarities_to_csv(topic_model, filepath):
    """Save the pairwise similarity between all topics to a CSV file.

    The similarity matrix is computed by ``get_topic_similarities`` (defined
    earlier in this notebook) instead of duplicating that logic here.

    File layout: the header row is ``Topic`` followed by the column indices
    ``0..N-1``; every following row starts with its row index and then holds
    that row of the similarity matrix.

    Args:
        topic_model: A fitted BERTopic instance.
        filepath: Path of the CSV file to write.

    Example:
    ```python
    save_topic_similarities_to_csv(topic_model, 'topic_similarities.csv')
    ```
    """
    similarity_matrix = get_topic_similarities(topic_model)

    # Explicit encoding so the output does not depend on the platform default.
    with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Topic'] + list(range(len(similarity_matrix))))
        for i, row in enumerate(similarity_matrix):
            writer.writerow([i] + list(row))

In [27]:
# Write the topic-to-topic similarity matrix of the loaded model to 'r5_topic_sim.csv'.
save_topic_similarities_to_csv(topic_model, filepath='r5_topic_sim.csv')

In [28]:
import numpy as np


def select_topics(doc_topic_similarity, topic_similarity, n):
    """Greedily select up to ``n`` topics that are relevant to a document but
    dissimilar to each other.

    The first pick is the topic with the highest document similarity. Every
    further pick maximises::

        doc_topic_similarity[t] - mean(topic_similarity[t][s] for s in selected)

    Ties are resolved in favour of the topic that appears first in
    ``topic_similarity``.

    Args:
        doc_topic_similarity: Mapping ``topic -> similarity`` between the
            document and that topic.
        topic_similarity: Mapping ``topic -> {other_topic -> similarity}``.
        n: Maximum number of topics to select.

    Returns:
        The selected topics in selection order. The list is empty when
        ``n <= 0`` or when no topic of ``topic_similarity`` has a document
        similarity.
    """
    if n <= 0:
        return []

    # Only topics with a known document similarity can be scored. Filtering
    # once up front avoids a KeyError in the second selection phase.
    remaining_topics = [
        topic for topic in topic_similarity if topic in doc_topic_similarity
    ]
    selected_topics = []

    # Phase 1: the topic most similar to the document comes first.
    best_topic = None
    best_score = float('-inf')
    for topic in remaining_topics:
        score = doc_topic_similarity[topic]
        if score > best_score:
            best_score = score
            best_topic = topic

    if best_topic is not None:
        selected_topics.append(best_topic)
        remaining_topics.remove(best_topic)

    # Phase 2: repeatedly add the topic with the best trade-off between
    # relevance to the document and redundancy with the topics chosen so far.
    while len(selected_topics) < n and remaining_topics:
        best_topic = None
        best_score = float('-inf')

        for topic in remaining_topics:
            neighbours = topic_similarity[topic]
            pairwise = [
                neighbours[chosen] for chosen in selected_topics if chosen in neighbours
            ]
            if not pairwise:
                # No redundancy estimate available, so the topic cannot be
                # ranked (taking the mean of an empty list would give NaN).
                continue

            score = doc_topic_similarity[topic] - np.mean(pairwise)
            if score > best_score:
                best_score = score
                best_topic = topic

        if best_topic is None:
            break

        selected_topics.append(best_topic)
        remaining_topics.remove(best_topic)

    return selected_topics



In [29]:
import csv

# Nested mapping: {document: {topic: similarity}}. Topics are kept as the
# strings read from the CSV file.
doc_topic_similarity = {}

# The file is written with encoding='utf-8', so it has to be read back with the
# same encoding rather than the platform default. newline='' is what the csv
# module expects for file objects.
with open('r5_similar_topics.csv', 'r', encoding='utf-8', newline='') as file:
    for row in csv.DictReader(file):
        similarity = float(row['Similarity'])
        doc_topic_similarity.setdefault(row['Query'], {})[row['Similar Topic']] = similarity



In [30]:
import csv

# Nested mapping: {topic: {other_topic: similarity}}. Both levels use the
# strings found in the CSV file as keys.
topic_similarity = {}

with open('r5_topic_sim.csv', 'r') as file:
    for record in csv.DictReader(file):
        # Every column except 'Topic' holds the similarity to another topic.
        topic_similarity[record['Topic']] = {
            other_topic: float(score)
            for other_topic, score in record.items()
            if other_topic != 'Topic'
        }


In [31]:
n = 5  # number of topics to select per document

# Select topics for every document
selected_topics = {
    document: select_topics(topic_similarity_scores, topic_similarity, n)
    for document, topic_similarity_scores in doc_topic_similarity.items()
}

# Save the results to a CSV file. The encoding is explicit because the header
# and the titles contain non-ASCII text, which the platform default encoding
# may not be able to represent.
selected_topics_path = 'r5_selected_topics.csv'
with open(selected_topics_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['文档', '主题集合', '主题集合与文档的相关性'])
    # The loop variable is deliberately not called `topics`, so the topic
    # assignments returned by fit_transform are not overwritten.
    for document, doc_topics in selected_topics.items():
        similarity_values = [doc_topic_similarity[document][topic] for topic in doc_topics]
        writer.writerow([document, doc_topics, similarity_values])

# Report the file that was actually written.
print(f"结果已保存到{selected_topics_path}文件中。")

结果已保存到selected_topics.csv文件中。


In [None]:
# Write the topic visualization to the HTML file 'review_cell_modelviz.html'.
topic_model.visualize_topics().write_html('review_cell_modelviz.html')