In [None]:
!pip install pandas numpy seaborn matplotlib gensim

In [None]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from gensim import models
from gensim import corpora
from gensim.models import LdaModel

Load the pickled LDA model, preprocessed documents, bag-of-words corpus and dictionary

In [None]:
def _load_pickle(path):
    """Open the file at ``path`` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code while deserializing;
    # only point this at files you created yourself or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts consumed by the cells below, loaded one by one from the working directory.
lda_model = _load_pickle('lda_model.pkl')
preprocessed_docs = _load_pickle('preprocessed_docs.pkl')
bow_corpus = _load_pickle('bow_corpus.pkl')
# NOTE(review): 'dictionary.dict' is read with plain pickle like the other files —
# confirm it was written in a pickle-compatible way.
dictionary = _load_pickle('dictionary.dict')

Generate a co-occurrence matrix and plot a heatmap

In [None]:
# Topic assignments for every document in the bag-of-words corpus.
num_topics = lda_model.num_topics
topic_distributions = [lda_model.get_document_topics(bow) for bow in bow_corpus]

# Square matrix of pair counts: entry (a, b) is incremented once for every
# document in which topics a and b are both reported.
co_occurrence_matrix = np.zeros((num_topics, num_topics))

for doc_topics in topic_distributions:
    present_topics = [entry[0] for entry in doc_topics]
    # Visit each unordered pair once and update both mirrored cells,
    # which keeps the matrix symmetric.
    for position, first_topic in enumerate(present_topics):
        for second_topic in present_topics[position + 1:]:
            co_occurrence_matrix[first_topic, second_topic] += 1
            co_occurrence_matrix[second_topic, first_topic] += 1

# Turn pair counts into the fraction of documents containing each pair.
co_occurrence_matrix /= len(topic_distributions)

In [None]:
# Human-readable name for each topic id; the position in the list is the id (0-29).
topic_mapping = dict(enumerate([
    "Cell signaling",       # 0
    "Development",          # 1
    "CNS diseases",         # 2
    "Cardiovascular",       # 3
    "Age-related decline",  # 4
    "Risk factors",         # 5
    "Cell biology",         # 6
    "Gender",               # 7
    "Muscle",               # 8
    "Oxidative stress",     # 9
    "Bone",                 # 10
    "Therapeutics",         # 11
    "Metabolism",           # 12
    "Neural tissue",        # 13
    "Clinics",              # 14
    "Healthcare",           # 15
    "General terms",        # 16
    "Brain structure",      # 17
    "Psychosocial",         # 18
    "Rodent studies",       # 19
    "Cancer",               # 20
    "Physical activity",    # 21
    "Demography",           # 22
    "Liver and kidney",     # 23
    "Genetics",             # 24
    "Analytics",            # 25
    "Cognition",            # 26
    "Physics",              # 27
    "Skin",                 # 28
    "Clinical tests",       # 29
]))

# Order in which the topics are laid out along both heatmap axes.
topic_order = [
    "General terms",
    "Healthcare",
    "Cell biology",
    "Genetics",
    "Analytics",
    "Cell signaling",
    "Demography",
    "Clinical tests",
    "Age-related decline",
    "Rodent studies",
    "Clinics",
    "Psychosocial",
    "Oxidative stress",
    "Physics",
    "Therapeutics",
    "Risk factors",
    "Development",
    "Cognition",
    "CNS diseases",
    "Skin",
    "Neural tissue",
    "Brain structure",
    "Cancer",
    "Metabolism",
    "Physical activity",
    "Cardiovascular",
    "Gender",
    "Muscle",
    "Bone",
    "Liver and kidney",
]

# Reorder the co-occurrence matrix according to topic_order.
# Invert the id -> name mapping once instead of rebuilding key/value lists and
# scanning them for every matrix cell. setdefault keeps the first id seen for a
# name, which matches what a list.index() lookup on the values would return.
topic_name_to_index = {}
for topic_id, topic_label in topic_mapping.items():
    topic_name_to_index.setdefault(topic_label, topic_id)

# Row/column position k of the reordered matrix shows topic ordered_indices[k].
# A name in topic_order that is missing from topic_mapping raises KeyError here.
ordered_indices = [topic_name_to_index[topic_label] for topic_label in topic_order]

# np.ix_ selects the same index list along rows and columns, yielding a new
# array with both axes permuted consistently.
ordered_matrix = co_occurrence_matrix[np.ix_(ordered_indices, ordered_indices)]

# Draw the reordered co-occurrence matrix as a heatmap, labelling both axes
# with the topic names, then write it to a PDF and display it.
fig, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(
    ordered_matrix,
    annot=False,
    fmt=".2f",
    cmap="RdBu_r",
    xticklabels=topic_order,
    yticklabels=topic_order,
    ax=ax,
)
ax.set_title('Topic Co-occurrence Heatmap')
ax.set_xlabel('Topics')
ax.set_ylabel('Topics')
# Vertical labels on the x axis, horizontal labels on the y axis.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
fig.savefig('heatmap_cooccurrence.pdf')
plt.show()

For each topic, plot its 5 strongest and 5 weakest co-occurring topics (values normalized by that topic's maximum co-occurrence)

In [None]:
def top_bottom_co_occurrences(co_occurrence_matrix, num_topics, top_n=5):
    """Rank, for every topic, the other topics it co-occurs with most and least.

    Each row of ``co_occurrence_matrix`` is scaled by its own maximum and the
    remaining topics are ranked by that scaled value. The topic itself is
    always excluded from its own ranking.

    Parameters
    ----------
    co_occurrence_matrix : array-like, square
        Entry ``[a, b]`` is the co-occurrence score of topics ``a`` and ``b``.
    num_topics : int
        Number of leading rows (topics) to rank.
    top_n : int, default 5
        How many topics to return at each end of the ranking.

    Returns
    -------
    dict
        ``{topic_index: (top_indices, bottom_indices)}`` where ``top_indices``
        holds up to ``top_n`` topic indices in descending order of score and
        ``bottom_indices`` holds up to ``top_n`` indices in ascending order.
    """
    matrix = np.asarray(co_occurrence_matrix, dtype=float)

    # Scale each row by its maximum. A row whose maximum is 0 is divided by 1
    # instead, so it stays all-zero rather than turning into NaN.
    row_max = matrix.max(axis=1, keepdims=True)
    safe_row_max = np.where(row_max != 0, row_max, 1.0)
    normalized_matrix = matrix / safe_row_max

    ranking = {}
    for topic_index in range(num_topics):
        ascending = np.argsort(normalized_matrix[topic_index, :])
        # Drop the topic's own index explicitly; relying on a placeholder
        # diagonal value lets the topic leak into its own top/bottom list
        # whenever too few other scores lie on one side of that value.
        ascending = ascending[ascending != topic_index]
        top_indices = ascending[::-1][:top_n]
        bottom_indices = ascending[:top_n]
        ranking[topic_index] = (top_indices, bottom_indices)

    return ranking

# NOTE(review): this rebinds the name of the ranking function to its result, so
# the call below only works right after the function definition has been (re)run.
# The name is kept as-is because other cells may read the resulting dict.
top_bottom_co_occurrences = top_bottom_co_occurrences(co_occurrence_matrix, num_topics)

# RGBA bar colours: translucent red for the strongest, translucent blue for the weakest.
color_top = (1, 0, 0, 0.6)
color_bottom = (0, 0, 1, 0.6)

# Plot each topic individually with top 5 and bottom 5 co-occurrences
for topic_index, (top_indices, bottom_indices) in top_bottom_co_occurrences.items():
    topic_name = topic_mapping[topic_index]

    # Bars are drawn bottom-to-top: weakest first, then the strongest in
    # ascending order so the largest value ends up at the top of the chart.
    plot_indices = np.concatenate((bottom_indices, top_indices[::-1]))
    co_topic_names = [topic_mapping[idx] for idx in plot_indices]

    # Scale by this topic's largest co-occurrence. Computed once per topic;
    # a row maximum of 0 is replaced by 1 so the bars are 0 instead of NaN.
    topic_row = co_occurrence_matrix[topic_index, :]
    row_max = topic_row.max()
    scale = row_max if row_max != 0 else 1.0
    all_values = topic_row[plot_indices] / scale

    colors = [color_bottom] * len(bottom_indices) + [color_top] * len(top_indices)

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_positions = range(len(all_values))
    ax.barh(bar_positions, all_values, color=colors, edgecolor='none')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(co_topic_names, fontsize=18)
    ax.set_title(topic_name, fontsize=24, loc='center', x=0.4)  # Set the title to only the name of the topic

    # Strip the x axis, tick marks and frame so only the bars and labels remain.
    ax.xaxis.set_visible(False)
    ax.tick_params(axis='y', which='both', length=0)
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # Act on this figure explicitly rather than on pyplot's implicit current figure.
    fig.tight_layout()
    fig.savefig(f"top_bottom_cooccur_{topic_name}.pdf")
    plt.show()
    # Release the figure; one is created per topic and they would otherwise
    # accumulate for the lifetime of the kernel.
    plt.close(fig)