In [None]:
!pip install numpy pandas scikit-learn tqdm scipy matplotlib seaborn

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FuncFormatter
import seaborn as sns

Load preprocessed files (all documents)

In [None]:
# Leiden cluster assignments are stored as CSV; the analysis cell reads the
# 'leiden' column and attaches it to the UMAP coordinates.
file_name = 'leiden_labels_all.csv'
leiden_labels_all = pd.read_csv(file_name)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted step of this pipeline.

# UMAP coordinates (later wrapped in a two-column UMAP1/UMAP2 DataFrame).
with open('umap_result.pkl', mode='rb') as f:
    umap_result = pickle.load(f)

# Tokenised documents: each entry is joined with spaces before TF-IDF fitting.
with open('preprocessed_docs.pkl', mode='rb') as f:
    preprocessed_docs = pickle.load(f)

Build a matrix measuring how strongly each cluster's top TF-IDF words are present in every other cluster (all documents)

In [None]:
# Human-readable names for the Leiden cluster ids (all documents)
cluster_names = {
    0: 'Dementia', 1: 'Genetics', 2: 'Geriatrics', 3: 'Gender', 4: 'Cell signaling & stem cells',
    5: 'Exercise', 6: 'Diabetes', 7: 'Hormones', 8: 'Nutrition', 9: 'Statistics',
    10: 'Broad aging terminology', 11: 'Depression & psychology', 12: 'Immunology',
    13: 'Brain structure', 14: 'Memory & learning', 15: 'Healthcare', 16: 'Oxidative stress',
    17: 'Animal studies', 18: 'Muscle', 19: 'DNA damage', 20: 'Comparative studies',
    21: "Alzheimer's", 22: 'Vascular', 23: 'Skin', 24: 'Telomeres', 25: 'Cancer',
    26: 'Cell cycle & senescence', 27: 'Mitochondria', 28: 'Sleep', 29: 'Mouse studies',
    30: 'Bone', 31: 'Physics'
}

# Re-join every tokenised document into a single whitespace-separated string
filtered_texts = [' '.join(doc) for doc in preprocessed_docs]

# Fit TF-IDF on the joined texts, restricted to a 500-term vocabulary
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Map documents to clusters using Leiden labels.
# The three inputs must describe the same documents in the same order; check the
# lengths up front so a mismatch fails here instead of producing NaN clusters.
umap_df = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
if not (len(umap_df) == len(leiden_labels_all) == tfidf_matrix.shape[0]):
    raise ValueError(
        "Row count mismatch: "
        f"{len(umap_df)} UMAP rows, {len(leiden_labels_all)} Leiden labels, "
        f"{tfidf_matrix.shape[0]} TF-IDF rows"
    )
umap_df['cluster'] = leiden_labels_all['leiden'].to_numpy()

# Row labels must come from the cluster ids actually present in the data, in the
# same order as the rows are built; otherwise names can be silently misassigned.
cluster_ids = sorted(umap_df['cluster'].unique())
unnamed_ids = [cluster_id for cluster_id in cluster_ids if cluster_id not in cluster_names]
if unnamed_ids:
    raise ValueError(f"No entry in cluster_names for cluster id(s): {unnamed_ids}")

# Calculate the average TF-IDF vector per cluster
cluster_tfidf = []
for cluster_id in cluster_ids:
    # Index the sparse matrix with a plain NumPy boolean mask
    in_cluster = (umap_df['cluster'] == cluster_id).to_numpy()
    cluster_docs = tfidf_matrix[in_cluster]
    # .mean(axis=0) on a sparse matrix yields a 1 x n_terms result; flatten it to 1-D
    cluster_mean_tfidf = np.asarray(cluster_docs.mean(axis=0)).ravel()
    cluster_tfidf.append(cluster_mean_tfidf)

# vstack keeps the result 2-D even when there is only a single cluster
cluster_tfidf_matrix = np.vstack(cluster_tfidf)
cluster_tfidf_df = pd.DataFrame(
    cluster_tfidf_matrix,
    index=[cluster_names[cluster_id] for cluster_id in cluster_ids],
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Identify the top 20 words for each cluster (terms with the highest mean TF-IDF score)
top_words_per_cluster = {}
num_top_words = 20
for cluster in cluster_tfidf_df.index:
    top_words = cluster_tfidf_df.loc[cluster].nlargest(num_top_words).index
    top_words_per_cluster[cluster] = top_words

# Calculate the TF-IDF presence of top words from each cluster in all other clusters.
# Rows are the cluster the words come from, columns the cluster they are scored in;
# the diagonal is left at 0. The frame is initialised with 0.0 (float) because
# assigning float scores into an int64 frame relies on an upcast that pandas >= 2.1
# deprecates.
underrepresentation_matrix = pd.DataFrame(0.0, index=cluster_tfidf_df.index, columns=cluster_tfidf_df.index)
for cluster in top_words_per_cluster:
    for other_cluster in cluster_tfidf_df.index:
        if cluster != other_cluster:
            mean_tfidf_score = cluster_tfidf_df.loc[other_cluster, top_words_per_cluster[cluster]].mean()
            underrepresentation_matrix.loc[cluster, other_cluster] = mean_tfidf_score

# Visualize the matrix, ordering clusters from most to fewest documents
doc_counts = umap_df['cluster'].value_counts().sort_values(ascending=False)
sorted_cluster_labels = [cluster_names[c] for c in doc_counts.index if c in cluster_names]
underrepresentation_matrix_sorted = underrepresentation_matrix.loc[sorted_cluster_labels, sorted_cluster_labels]

plt.figure(figsize=(12, 10))
sns.heatmap(
    underrepresentation_matrix_sorted,
    cmap="RdBu_r",
    annot=False,
    fmt=".2f",
    cbar_kws={'label': 'Mean TF-IDF Score of Top Words'},
)

plt.title("Top Cluster-Specific Words in Other Clusters", fontsize=20)
plt.xlabel("Target Clusters", fontsize=20)
plt.ylabel("Top Words From Clusters", fontsize=20)
plt.xticks(rotation=45, ha="right", fontsize=16)
plt.yticks(rotation=0, fontsize=16)
plt.tight_layout()
plt.savefig('Top Cluster-Specific Words in Other Clusters (All).pdf', bbox_inches='tight')
plt.show()

Plot top 3 most and least represented relationships (all documents)

In [None]:
def format_ticks(x, pos):
    """Return the tick value ``x`` as text with exactly two decimal places.

    ``pos`` is accepted because the function is used as a ``FuncFormatter``
    callback; it does not influence the result.
    """
    return format(x, '.2f')

# One horizontal-bar panel per cluster: the three other clusters where this
# cluster's top words score lowest (blue) and the three where they score highest (red).
panel_clusters = list(underrepresentation_matrix_sorted.index)
ncols = 4
# Ceiling division so every cluster gets a panel; 32 clusters give an 8 x 4 grid
# and an 18 x 20 inch figure.
nrows = max(1, -(-len(panel_clusters) // ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 2.5 * nrows))
axes = axes.flatten()

for ax, cluster in zip(axes, panel_clusters):
    # Scores of this cluster's top words in every other cluster (self excluded)
    other_clusters = underrepresentation_matrix_sorted.loc[cluster].drop(index=cluster)
    # Cap the selection so the "most" and "least" groups can never overlap
    top_k = min(3, len(other_clusters) // 2)
    most_represented = other_clusters.nlargest(top_k).sort_values(ascending=True)
    least_represented = other_clusters.nsmallest(top_k).sort_values()
    combined_representation = pd.concat([least_represented, most_represented])
    colors = ['blue'] * len(least_represented) + ['red'] * len(most_represented)
    ax.barh(combined_representation.index, combined_representation.values, color=colors, alpha=0.6)
    # barh already labels the categorical y ticks; only the font size needs setting.
    # (set_yticklabels without fixed tick positions triggers a matplotlib warning.)
    ax.tick_params(axis='y', labelsize=12)
    ax.set_title(cluster, fontsize=16)
    ax.set_xlabel('Mean TF-IDF Score', fontsize=12)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['bottom'].set_visible(True)
    ax.xaxis.set_major_locator(MaxNLocator(nbins=3))
    ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

# Remove grid cells that did not receive a cluster (also safe when there are no clusters)
for unused_ax in axes[len(panel_clusters):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.subplots_adjust(hspace=1)
plt.suptitle('Most and Least Studied Relationships', fontsize=24, y=1.02)
plt.savefig('Top 3 Most Represented and Underrepresented Relationships (All).pdf', bbox_inches='tight')
plt.show()

Load preprocessed files (BoA documents)

In [None]:
# Leiden cluster assignments for the BoA subset; the analysis cell reads the
# 'leiden' column and attaches it to the UMAP coordinates.
file_name = 'leiden_labels_biology.csv'
leiden_labels_biology = pd.read_csv(file_name)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted step of this pipeline.

# UMAP coordinates (later wrapped in a two-column UMAP1/UMAP2 DataFrame).
with open('umap_result_biology.pkl', mode='rb') as f:
    umap_result_biology = pickle.load(f)

# Tokenised documents: each entry is joined with spaces before TF-IDF fitting.
with open('filtered_docs_biology.pkl', mode='rb') as f:
    filtered_docs_biology = pickle.load(f)

Build a matrix measuring how strongly each cluster's top TF-IDF words are present in every other cluster (BoA documents)

In [None]:
# Human-readable names for the Leiden cluster ids (BoA documents)
cluster_names = {
    0: 'GWAS',
    1: 'Gene regulation',
    2: 'Cancer',
    3: 'Epigenetics',
    4: 'Insulin/IGF pathway',
    5: 'Oxidative stress',
    6: 'Stem cells',
    7: 'Mesenchymal stem cells',
    8: 'Mouse models',
    9: 'Biochemistry',
    10: 'Neuroscience',
    11: 'Comparative studies',
    12: 'Skin',
    13: 'Immunology',
    14: 'Clinical genetics',
    15: 'Mitochondria',
    16: 'Protein biology',
    17: 'Senescence',
    18: 'Genomic stability',
    19: "Alzheimer's",
    20: 'Muscle',
    21: 'RNA biology',
    22: 'Receptor biology',
    23: 'Cytokines',
    24: 'Telomeres',
    25: 'Sirtuins & mTOR',
    26: 'Autophagy'
}

# Re-join every tokenised document into a single whitespace-separated string
filtered_texts = [' '.join(doc) for doc in filtered_docs_biology]

# Fit TF-IDF on the joined texts, restricted to a 500-term vocabulary
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Map documents to clusters using Leiden labels.
# The three inputs must describe the same documents in the same order; check the
# lengths up front so a mismatch fails here instead of producing NaN clusters.
umap_df = pd.DataFrame(umap_result_biology, columns=['UMAP1', 'UMAP2'])
if not (len(umap_df) == len(leiden_labels_biology) == tfidf_matrix.shape[0]):
    raise ValueError(
        "Row count mismatch: "
        f"{len(umap_df)} UMAP rows, {len(leiden_labels_biology)} Leiden labels, "
        f"{tfidf_matrix.shape[0]} TF-IDF rows"
    )
umap_df['cluster'] = leiden_labels_biology['leiden'].to_numpy()

# Row labels must come from the cluster ids actually present in the data, in the
# same order as the rows are built; otherwise names can be silently misassigned.
cluster_ids = sorted(umap_df['cluster'].unique())
unnamed_ids = [cluster_id for cluster_id in cluster_ids if cluster_id not in cluster_names]
if unnamed_ids:
    raise ValueError(f"No entry in cluster_names for cluster id(s): {unnamed_ids}")

# Calculate the average TF-IDF vector per cluster
cluster_tfidf = []
for cluster_id in cluster_ids:
    # Index the sparse matrix with a plain NumPy boolean mask
    in_cluster = (umap_df['cluster'] == cluster_id).to_numpy()
    cluster_docs = tfidf_matrix[in_cluster]
    # .mean(axis=0) on a sparse matrix yields a 1 x n_terms result; flatten it to 1-D
    cluster_mean_tfidf = np.asarray(cluster_docs.mean(axis=0)).ravel()
    cluster_tfidf.append(cluster_mean_tfidf)

# vstack keeps the result 2-D even when there is only a single cluster
cluster_tfidf_matrix = np.vstack(cluster_tfidf)
cluster_tfidf_df = pd.DataFrame(
    cluster_tfidf_matrix,
    index=[cluster_names[cluster_id] for cluster_id in cluster_ids],
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Identify the top 20 words for each cluster (terms with the highest mean TF-IDF score)
top_words_per_cluster = {}
num_top_words = 20
for cluster in cluster_tfidf_df.index:
    top_words = cluster_tfidf_df.loc[cluster].nlargest(num_top_words).index
    top_words_per_cluster[cluster] = top_words

# Calculate the TF-IDF presence of top words from each cluster in all other clusters.
# Rows are the cluster the words come from, columns the cluster they are scored in;
# the diagonal is left at 0. The frame is initialised with 0.0 (float) because
# assigning float scores into an int64 frame relies on an upcast that pandas >= 2.1
# deprecates.
underrepresentation_matrix = pd.DataFrame(0.0, index=cluster_tfidf_df.index, columns=cluster_tfidf_df.index)

for cluster in top_words_per_cluster:
    for other_cluster in cluster_tfidf_df.index:
        if cluster != other_cluster:
            mean_tfidf_score = cluster_tfidf_df.loc[other_cluster, top_words_per_cluster[cluster]].mean()
            underrepresentation_matrix.loc[cluster, other_cluster] = mean_tfidf_score

# Visualize the matrix, ordering clusters from most to fewest documents
doc_counts = umap_df['cluster'].value_counts().sort_values(ascending=False)
sorted_cluster_labels = [cluster_names[c] for c in doc_counts.index if c in cluster_names]
underrepresentation_matrix_sorted = underrepresentation_matrix.loc[sorted_cluster_labels, sorted_cluster_labels]

plt.figure(figsize=(16, 12))
sns.heatmap(
    underrepresentation_matrix_sorted,
    cmap="RdBu_r",
    annot=False,
    fmt=".2f",
    cbar_kws={'label': 'Mean TF-IDF Score of Top Words'},
)

plt.title("Top Cluster-Specific Words in Other Clusters (BoA)", fontsize=20)
plt.xlabel("Target BoA Clusters", fontsize=20)
plt.ylabel("Top Words From BoA Clusters", fontsize=20)
plt.xticks(rotation=45, ha="right", fontsize=16)
plt.yticks(rotation=0, fontsize=16)
plt.tight_layout()
plt.savefig('Top Cluster-Specific Words in Other Clusters (BoA).pdf', bbox_inches='tight')
plt.show()

Plot top 3 most and least represented relationships (BoA documents)

In [None]:
def format_ticks(x, pos):
    """Return the tick value ``x`` as text with exactly two decimal places.

    ``pos`` is accepted because the function is used as a ``FuncFormatter``
    callback; it does not influence the result.
    """
    return format(x, '.2f')

# One horizontal-bar panel per BoA cluster: the three other clusters where this
# cluster's top words score lowest (blue) and the three where they score highest (red).
panel_clusters = list(underrepresentation_matrix_sorted.index)
ncols = 4
# Ceiling division so every cluster gets a panel; 27 clusters give a 7 x 4 grid
# and a 16 x 16 inch figure.
nrows = max(1, -(-len(panel_clusters) // ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 16 * nrows / 7))
axes = axes.flatten()

for ax, cluster in zip(axes, panel_clusters):
    # Scores of this cluster's top words in every other cluster (self excluded)
    other_clusters = underrepresentation_matrix_sorted.loc[cluster].drop(index=cluster)
    # Cap the selection so the "most" and "least" groups can never overlap
    top_k = min(3, len(other_clusters) // 2)
    most_represented = other_clusters.nlargest(top_k).sort_values(ascending=True)
    least_represented = other_clusters.nsmallest(top_k).sort_values()
    combined_representation = pd.concat([least_represented, most_represented])
    colors = ['blue'] * len(least_represented) + ['red'] * len(most_represented)
    ax.barh(combined_representation.index, combined_representation.values, color=colors, alpha=0.6)
    # barh already labels the categorical y ticks; only the font size needs setting.
    # (set_yticklabels without fixed tick positions triggers a matplotlib warning.)
    ax.tick_params(axis='y', labelsize=12)
    ax.set_title(cluster, fontsize=16)
    ax.set_xlabel('Mean TF-IDF Score', fontsize=12)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['bottom'].set_visible(True)
    ax.xaxis.set_major_locator(MaxNLocator(nbins=3))
    ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

# Remove grid cells that did not receive a cluster (also safe when there are no clusters)
for unused_ax in axes[len(panel_clusters):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.subplots_adjust(hspace=1)
plt.suptitle('Most and Least Studied Relationships', fontsize=24, y=1.02)
plt.savefig('Top 3 Most Represented and Underrepresented Relationships (BoA).pdf', bbox_inches='tight')
plt.show()