In [None]:
!pip install pandas==2.2.2 numpy==1.24.4 matplotlib==3.10.0 seaborn==0.13.2 gensim==4.3.3 pyLDAvis==3.4.1 d3blocks==1.4.11

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import entropy

from gensim import corpora
from gensim import models
from gensim.models import CoherenceModel
from gensim.models import LdaModel

import pyLDAvis
# pyLDAvis >= 3.3 ships its gensim adapter as `pyLDAvis.gensim_models`;
# older releases call it `pyLDAvis.gensim`. Import whichever exists.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis
from d3blocks import D3Blocks

Import preprocessed files

In [None]:
# Read the three pickled inputs used by the rest of the notebook.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted preprocessing run.
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


preprocessed_docs = _unpickle('preprocessed_docs.pkl')
bow_corpus = _unpickle('bow_corpus.pkl')
dictionary = _unpickle('dictionary.dict')

Plot the distribution of token counts per abstract

In [None]:
# One token count per preprocessed document, shown as a violin plot and
# written to 'violin_plot.png'.
num_tokens_per_abstract = list(map(len, preprocessed_docs))
dpi_value = 600

fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(y=num_tokens_per_abstract, color='skyblue', inner='point', ax=ax)
ax.set_title('Distribution of Mean Number of Tokens per Abstract')
ax.set_ylabel('Number of Tokens')
fig.savefig('violin_plot.png', dpi=dpi_value)
plt.show()

Run coherence model to calculate optimal number of topics for LDA topic modelling

In [None]:
# Candidate topic counts to evaluate. `max_topics` is treated as inclusive,
# hence the `+ 1`: a plain range(min, max, step) would silently skip it.
min_topics = 5
max_topics = 50
step_size = 5
topics_range = range(min_topics, max_topics + 1, step_size)

# LDA training is stochastic; a fixed seed makes the sweep repeatable.
RANDOM_SEED = 42

# Train one quick (single-pass) LDA model per candidate and record its c_v
# coherence. The loop uses its own variable names so that re-running this
# cell does not overwrite the final `lda_model` trained further down.
coherence_scores = []
for candidate_num_topics in topics_range:
    candidate_model = LdaModel(
        bow_corpus,
        num_topics=candidate_num_topics,
        id2word=dictionary,
        passes=1,
        random_state=RANDOM_SEED,
    )
    candidate_coherence = CoherenceModel(
        model=candidate_model,
        texts=preprocessed_docs,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    coherence_scores.append(candidate_coherence)

# Print the number of topics with the highest coherence score.
# nanargmax ignores NaN entries, so an undefined score can never be
# selected as the maximum.
optimal_num_topics = topics_range[int(np.nanargmax(coherence_scores))]
print("Optimal number of topics:", optimal_num_topics)

LDA topic model training and visualization with pyLDAvis

In [None]:
# Train the final LDA model. The topic count is fixed at 30 because the
# hand-written topic labels used later in this notebook list 30 names.
# NOTE(review): those labels describe one particular training run; with a
# different seed (or no seed) the topic order changes and they must be
# re-derived from the printout below.
RANDOM_SEED = 42
lda_model = LdaModel(
    bow_corpus,
    num_topics=30,
    id2word=dictionary,
    passes=20,
    random_state=RANDOM_SEED,
)

# print_topics() only returns 20 topics by default; num_topics=-1 lists
# every topic so all 30 can be inspected and labelled.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print(f'Topic: {idx}')
    print(f'Top words: {topic}')
    print()

# Save the model
with open('lda_model.pkl', 'wb') as f:
    pickle.dump(lda_model, f)

# Build the pyLDAvis view, write it to HTML, then display it inline.
# `gensimvis` is the pyLDAvis gensim adapter imported at the top of the
# notebook. display() must be the last expression of the cell, otherwise
# the object it returns is discarded and nothing is rendered.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_visualization.html')
pyLDAvis.display(vis)

TF-IDF model training and top TF-IDF words per topic

In [None]:
# Train TF-IDF model on the bag-of-words corpus and re-weight every document
tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

# Get the topic distribution for each document
doc_topics = [lda_model.get_document_topics(doc) for doc in bow_corpus]

# topic_tfidf_scores[t, w] accumulates, over all documents,
# P(topic t | document) * tfidf(word w, document).
topic_tfidf_scores = np.zeros((lda_model.num_topics, len(dictionary)))

for doc, doc_topic in zip(tfidf_corpus, doc_topics):
    if not doc or not doc_topic:
        continue  # nothing to add for an empty document / empty topic list
    word_ids = np.fromiter((word_id for word_id, _ in doc), dtype=np.intp)
    word_weights = np.fromiter((weight for _, weight in doc), dtype=float)
    topic_ids = np.fromiter((topic for topic, _ in doc_topic), dtype=np.intp)
    topic_probs = np.fromiter((prob for _, prob in doc_topic), dtype=float)
    # One vectorised update per document instead of a Python loop over every
    # (topic, word) pair. np.add.at accumulates correctly even if an index
    # were to repeat within a document.
    np.add.at(
        topic_tfidf_scores,
        (topic_ids[:, None], word_ids[None, :]),
        np.outer(topic_probs, word_weights),
    )

# Rank and print top 20 words by TF-IDF scores within each topic
N = 20
top_words_per_topic = []
for topic_idx in range(lda_model.num_topics):
    topic_scores = topic_tfidf_scores[topic_idx]
    top_word_indices = np.argsort(topic_scores)[::-1][:N]
    top_words_per_topic.append(
        [(dictionary[word_id], topic_scores[word_id]) for word_id in top_word_indices]
    )

for topic_idx, top_words in enumerate(top_words_per_topic):
    print(f'Topic {topic_idx}:')
    for word, score in top_words:
        print(f'{word}: {score}')
    print()

Evolution of topic proportions, calculate 5-year moving average to smooth the data

In [None]:
# Read preprocessed_docs.csv, whose 'Year' column holds the publication year.
file_name = 'preprocessed_docs.csv'
df = pd.read_csv(file_name, low_memory=False)

# Coerce 'Year' to numbers (unparseable values become NaN), replace missing
# years with the placeholder 1, store the column as int and renumber rows 0..n-1.
df = df.assign(
    Year=pd.to_numeric(df['Year'], errors='coerce').fillna(1).astype(int)
).reset_index(drop=True)

In [None]:
num_docs = len(preprocessed_docs)
all_years = sorted(df['Year'].unique())

# Documents are matched to publication years purely by position, so `df`
# must have at least one row per document that is processed.
corpus_subset = bow_corpus[:num_docs]
if len(df) < len(corpus_subset):
    raise ValueError(
        f"'df' has {len(df)} rows but {len(corpus_subset)} documents need a year"
    )
doc_years = df['Year'].to_numpy()[:len(corpus_subset)].tolist()

# Sum the topic probabilities of all documents published in the same year.
# Accumulating into a float array (rather than cell-by-cell into an empty,
# object-dtype DataFrame) is much faster and keeps the result numeric.
row_of_year = {year: row for row, year in enumerate(all_years)}
topic_sums = np.zeros((len(all_years), lda_model.num_topics))
year_has_docs = np.zeros(len(all_years), dtype=bool)
doc_topic_stream = lda_model.get_document_topics(corpus_subset, minimum_probability=0)
for year, doc_topics in zip(doc_years, doc_topic_stream):
    row = row_of_year[year]
    year_has_docs[row] = True
    for topic, prob in doc_topics:
        topic_sums[row, topic] += prob

# Years without any processed document stay undefined (NaN), not zero.
topic_sums[~year_has_docs] = np.nan
topic_proportions = pd.DataFrame(
    topic_sums, index=all_years, columns=range(lda_model.num_topics)
)

# Normalize to get proportions (each year's row sums to 1)
topic_proportions = topic_proportions.div(topic_proportions.sum(axis=1), axis=0)

# Filter only 1975-2023 for plotting (label-based slice, both ends inclusive)
topic_proportions = topic_proportions.loc[1975:2023]

In [None]:
# Write the per-year topic proportions to CSV; the index column is labelled 'Year'.
topic_proportions_csv = 'topic_proportions.csv'
topic_proportions.to_csv(topic_proportions_csv, index_label='Year')

In [None]:
# Human-readable topic labels; position i is the label of LDA topic i.
original_topic_names = [
    "Cell signaling", "Development", "CNS diseases", "Cardiovascular", "Age-related decline",
    "Risk factors", "Cell biology", "Gender", "Muscle", "Oxidative stress",
    "Bone", "Therapeutics", "Metabolism", "Neural tissue", "Clinics",
    "Healthcare", "General terms", "Brain structure", "Psychosocial", "Rodent studies",
    "Cancer", "Physical activity", "Demography", "Liver and kidney", "Genetics",
    "Analytics", "Cognition", "Physics", "Skin", "Clinical tests",
]

# Order in which the topics appear in the figure, one row of the grid per line.
desired_order = [
    "General terms", "Healthcare", "Cell biology", "Genetics", "Analytics",
    "Cell signaling", "Demography", "Clinical tests", "Age-related decline", "Rodent studies",
    "Clinics", "Psychosocial", "Oxidative stress", "Physics", "Therapeutics",
    "Risk factors", "Development", "Cognition", "CNS diseases", "Skin",
    "Neural tissue", "Brain structure", "Cancer", "Metabolism", "Physical activity",
    "Cardiovascular", "Gender", "Muscle", "Bone", "Liver and kidney",
]

# Translate each label in display order back to its topic index.
topic_mapping = {name: position for position, name in enumerate(original_topic_names)}
desired_indices = list(map(topic_mapping.__getitem__, desired_order))

# 6x5 grid, one panel per topic, each showing the 5-year moving average of
# that topic's yearly proportion.
fig, axes = plt.subplots(nrows=6, ncols=5, figsize=(26, 24))
axes = axes.flatten()
year_ticks = [1975, 1985, 1995, 2005, 2015, 2023]
for panel, panel_title, topic_index in zip(axes, desired_order, desired_indices):
    smoothed = topic_proportions[topic_index].rolling(window=5, min_periods=1).mean()
    panel.plot(topic_proportions.index, smoothed, label=f'Topic {topic_index}', color='navy')
    panel.set_title(panel_title)
    panel.set_xlabel('Year')
    panel.set_ylabel('Proportion')
    panel.grid(False)
    panel.set_xticks(year_ticks)
plt.tight_layout()
plt.savefig('topic_evolution.pdf')
plt.show()

Shannon entropy evolution (10-year moving average)

In [None]:
# Shannon entropy (log base 2) of every document's topic distribution,
# paired by position with the document's publication year.
doc_topic_stream = lda_model.get_document_topics(bow_corpus[:num_docs], minimum_probability=0)

years = []
shannon_entropy_per_document = []
for position, topic_dist in enumerate(doc_topic_stream):
    years.append(df.iloc[position]['Year'])
    probabilities = [prob for _, prob in topic_dist]
    shannon_entropy_per_document.append(entropy(probabilities, base=2))

# Mean entropy of all documents sharing a publication year.
entropy_data = pd.DataFrame({'Year': years, 'Shannon_Entropy': shannon_entropy_per_document})
entropy_by_year = entropy_data.groupby('Year')['Shannon_Entropy'].mean()

In [None]:
# Smooth the yearly mean entropy with a centred 10-row rolling mean, keep the
# 1975-2023 range, then plot and save the curve.
rolling_entropy = entropy_by_year.rolling(window=10, center=True).mean()
in_plot_range = (rolling_entropy.index >= 1975) & (rolling_entropy.index <= 2023)
rolling_entropy_filtered = rolling_entropy[in_plot_range]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    rolling_entropy_filtered.index,
    rolling_entropy_filtered.values,
    linestyle='-',
    color='#006d77',
    linewidth=2,
    label='10-Year Moving Average',
)
ax.set_xlabel('Year')
ax.set_ylabel('Mean Shannon Entropy')
ax.set_title('Mean Shannon Entropy of Topic Distributions Over the Years')
ax.set_xticks(np.arange(1975, 2024, 5))
ax.legend()
fig.savefig('mean_shannon_entropy_over_years.pdf', dpi=600)
plt.show()

Topic co-occurrence matrix generation and chord diagram plotting

In [None]:
# Topic distribution of every document, then a symmetric matrix counting how
# often two topics appear together among a document's five most probable topics.
topic_distributions = [lda_model.get_document_topics(doc) for doc in bow_corpus]
num_topics = lda_model.num_topics
co_occurrence_matrix = np.zeros((num_topics,) * 2)

for dist in topic_distributions:
    # Five most probable topics of this document (fewer if the document has fewer).
    ranked = sorted(dist, key=lambda pair: pair[1], reverse=True)[:5]
    top_ids = [topic_id for topic_id, _ in ranked]
    # Count every unordered pair once, in both matrix directions.
    for position, first in enumerate(top_ids):
        for second in top_ids[position + 1:]:
            co_occurrence_matrix[first, second] += 1
            co_occurrence_matrix[second, first] += 1

# Turn counts into the fraction of documents in which each pair co-occurs.
co_occurrence_matrix = co_occurrence_matrix / len(topic_distributions)

In [None]:
# Plotting using D3Blocks

# Human-readable label of each LDA topic; position i labels topic i.
topic_names = [
    "Cell signaling",
    "Development",
    "CNS diseases",
    "Cardiovascular",
    "Age-related decline",
    "Risk factors",
    "Cell biology",
    "Gender",
    "Muscle",
    "Oxidative stress",
    "Bone",
    "Therapeutics",
    "Metabolism",
    "Neural tissue",
    "Clinics",
    "Healthcare",
    "General terms",
    "Brain structure",
    "Psychosocial",
    "Rodent studies",
    "Cancer",
    "Physical activity",
    "Demography",
    "Liver and kidney",
    "Genetics",
    "Analytics",
    "Cognition",
    "Physics",
    "Skin",
    "Clinical tests"
]

# Fall back to a generic 'Topic i' label if the model has more topics than names.
labels = [topic_names[i] if i < len(topic_names) else f'Topic {i}' for i in range(num_topics)]

# One (source, target, weight) link per topic pair whose co-occurrence exceeds
# the threshold. Only the upper triangle (i < j) is visited, so each pair of
# the symmetric matrix is emitted once.
threshold = 0.033  # Adjust the threshold as needed
links = [
    (labels[i], labels[j], co_occurrence_matrix[i, j])
    for i in range(co_occurrence_matrix.shape[0])
    for j in range(i + 1, co_occurrence_matrix.shape[1])
    if co_occurrence_matrix[i, j] > threshold
]

# Stored under its own name: reusing `df` here would overwrite the
# publication-year DataFrame that earlier cells read, breaking any re-run of
# those cells after this one.
chord_links = pd.DataFrame(links, columns=['source', 'target', 'weight'])
d3 = D3Blocks(chart='Chord', frame=False)
# NOTE(review): '/content/...' is an absolute path (looks Colab-specific) —
# confirm it exists in the target environment.
d3.chord(chord_links, arrowhead=0, title='Topic Co-occurrence Chord Diagram', cmap='tab20', figsize=(800, 800), filepath='/content/chord_diagram.html')