In [None]:
!pip install numpy pandas scikit-learn tqdm scipy matplotlib seaborn

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FuncFormatter
import seaborn as sns

Load preprocessed files (all documents)

In [None]:
# Load the pickled UMAP result and the preprocessed documents.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('umap_result.pkl', 'rb') as umap_file:
    umap_result = pickle.load(umap_file)

with open('preprocessed_docs.pkl', 'rb') as docs_file:
    preprocessed_docs = pickle.load(docs_file)

# Leiden cluster labels; the 'leiden' column is used downstream.
file_name = 'leiden_labels_all.csv'
leiden_labels_all = pd.read_csv(file_name)

Build a matrix showing how strongly each cluster's top TF-IDF words are present in every other cluster (all documents)

In [None]:
# Human-readable names for the Leiden clusters, keyed by cluster id.
# NOTE(review): there is no entry for id 24 - confirm whether that cluster is
# absent from the labels file or was left out on purpose.
cluster_names = {
    0: 'Healthcare',
    1: 'Animal studies',
    2: 'Stem cells',
    3: 'Geriatrics',
    4: 'Molecular biology',
    5: 'Statistics',
    6: 'Social',
    7: 'Oxidative stress',
    8: 'Physics',
    9: 'Dementia',
    10: 'Diabetes and nutrition',
    11: 'Genetics',
    12: 'Mouse studies',
    13: 'Brain structure',
    14: 'Risk factors',
    15: 'Cognition',
    16: 'Gender',
    17: 'Muscle',
    18: 'Mitochondria and DNA damage',
    19: 'Cell cycle and senescence',
    20: 'Hormonal changes',
    21: 'Skin',
    22: 'Bone',
    23: 'Neural tissue',
    25: 'Exercise',
    26: 'Cancer',
    27: 'Vascular',
    28: 'Clinical evaluation',
    29: 'Telomeres',
    30: 'Sleep'
}

# Each document is a sequence of tokens; join them into one string per document
filtered_texts = [' '.join(doc) for doc in preprocessed_docs]

# Create TF-IDF vectorizer (vocabulary capped at 500 terms) and transform the texts
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Map documents to clusters using Leiden labels
umap_df = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
umap_df['cluster'] = leiden_labels_all['leiden']

# Calculate the average TF-IDF vector per cluster.
# The ids actually present in the data drive both the rows and their labels,
# so a row can never be paired with the name of a different cluster.
cluster_ids = sorted(umap_df['cluster'].unique())
cluster_tfidf = []
for cluster_id in cluster_ids:
    # Plain NumPy boolean mask for row selection on the sparse matrix
    in_cluster = (umap_df['cluster'] == cluster_id).to_numpy()
    cluster_docs = tfidf_matrix[in_cluster]
    # mean(axis=0) on a sparse matrix yields a 1 x n_terms result; flatten it to 1-D
    cluster_tfidf.append(np.asarray(cluster_docs.mean(axis=0)).ravel())

# vstack keeps the result 2-D even when only one cluster is present
cluster_tfidf_matrix = np.vstack(cluster_tfidf)
cluster_tfidf_df = pd.DataFrame(
    cluster_tfidf_matrix,
    # Fall back to a generic label for ids without an entry in cluster_names
    index=[cluster_names.get(cluster_id, f'Cluster {cluster_id}') for cluster_id in cluster_ids],
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Identify the top 20 words for each cluster (highest mean TF-IDF within the cluster)
num_top_words = 20
top_words_per_cluster = {
    cluster: cluster_tfidf_df.loc[cluster].nlargest(num_top_words).index
    for cluster in cluster_tfidf_df.index
}

# Calculate the TF-IDF presence of top words from each cluster in all other clusters.
# Row = cluster whose top words are scored; column = cluster in which they are
# measured. The frame is created as float so the scores can be stored without
# an int -> float upcast (deprecated in recent pandas).
underrepresentation_matrix = pd.DataFrame(
    0.0, index=cluster_tfidf_df.index, columns=cluster_tfidf_df.index
)
for cluster, top_words in top_words_per_cluster.items():
    # Mean score of this cluster's top words inside every cluster at once
    scores_in_all_clusters = cluster_tfidf_df[top_words].mean(axis=1)
    # A cluster is not compared with itself: the diagonal stays at 0
    scores_in_all_clusters.loc[cluster] = 0.0
    underrepresentation_matrix.loc[cluster] = scores_in_all_clusters

# Heatmap of the cross-cluster scores (rows: clusters providing the top words,
# columns: clusters in which those words are measured).
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    underrepresentation_matrix,
    cmap="RdBu_r",
    annot=False,
    fmt=".2f",
    cbar_kws={'label': 'Mean TF-IDF Score of Top Words'},
    ax=heatmap_ax,
)

heatmap_ax.set_title("Top Cluster-Specific Words in Other Clusters", fontsize=20)
heatmap_ax.set_xlabel("Clusters", fontsize=16)
heatmap_ax.set_ylabel("Clusters (Top Words)", fontsize=16)
# Slanted, right-aligned x labels; horizontal y labels
plt.setp(heatmap_ax.get_xticklabels(), rotation=45, ha="right", fontsize=11)
plt.setp(heatmap_ax.get_yticklabels(), rotation=0, fontsize=11)
heatmap_fig.tight_layout()
heatmap_fig.savefig('Top Cluster-Specific Words in Other Clusters (All).pdf', bbox_inches='tight')
plt.show()

Plot the three most and three least represented relationships for each cluster (all documents)

In [None]:
def format_ticks(x, pos):
    """Return the tick value `x` as a string with two decimal places.

    Has the (value, position) signature expected by matplotlib's
    FuncFormatter; `pos` is accepted but not used.
    """
    return '{:.2f}'.format(x)

# One small horizontal bar chart per cluster: the three clusters in which its
# top words score lowest (blue) and the three in which they score highest (red).
n_cols = 5
n_clusters = len(underrepresentation_matrix.index)
# Derive the row count from the number of clusters instead of hard-coding it
# (30 clusters still give the 6 x 5 grid and the 18 x 16 figure).
n_rows = max(1, (n_clusters + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(18, 16 * n_rows / 6),
    squeeze=False,  # always a 2-D array of axes, so flatten() is safe
)
axes = axes.flatten()

for ax, cluster in zip(axes, underrepresentation_matrix.index):
    # Scores of this cluster's top words in every other cluster (self excluded)
    other_clusters = underrepresentation_matrix.loc[cluster].drop(index=cluster)
    most_represented = other_clusters.nlargest(3).sort_values(ascending=True)
    least_represented = other_clusters.nsmallest(3).sort_values()
    # Lowest scores first so the bars grow from the bottom of the panel upwards
    combined_representation = pd.concat([least_represented, most_represented])
    colors = ['blue'] * len(least_represented) + ['red'] * len(most_represented)
    ax.barh(combined_representation.index, combined_representation.values, color=colors, alpha=0.6)
    ax.set_title(cluster)
    ax.set_xlabel('Mean TF-IDF Score')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['bottom'].set_visible(True)
    ax.xaxis.set_major_locator(MaxNLocator(nbins=3))
    ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

# Remove the grid cells that did not receive a cluster
for unused_ax in axes[n_clusters:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.subplots_adjust(hspace=0.5)
plt.suptitle('Most and Least Studied Relationships', fontsize=24, y=1.02)
plt.savefig('Top 3 Most Represented and Underrepresented Relationships (All).pdf', bbox_inches='tight')
plt.show()

Load preprocessed files (BoA documents)

In [None]:
# Load the pickled UMAP result and the filtered documents for the biology subset.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('umap_result_biology.pkl', 'rb') as umap_file:
    umap_result_biology = pickle.load(umap_file)

with open('filtered_docs_biology.pkl', 'rb') as docs_file:
    filtered_docs_biology = pickle.load(docs_file)

# Leiden cluster labels; the 'leiden' column is used downstream.
file_name = 'leiden_labels_biology.csv'
leiden_labels_biology = pd.read_csv(file_name)

Build a matrix showing how strongly each cluster's top TF-IDF words are present in every other cluster (BoA documents)

In [None]:
# Human-readable names for the Leiden clusters of the biology subset, keyed by cluster id
cluster_names = {
    0: 'Cell signaling',
    1: 'Oxidative stress',
    2: 'Senescence',
    3: 'Cell & molecular biology',
    4: 'Immunology',
    5: 'Mitochondria',
    6: 'Alzheimer\'s',
    7: 'Genetics (clinical)',
    8: 'Stem cells',
    9: 'Cancer',
    10: 'Protein biology',
    11: 'Genetics (biology)',
    12: 'Growth factors and cytokines',
    13: 'Neuroscience',
    14: 'Genomic stability',
    15: 'Mesenchymal stem cells',
    16: 'Muscle',
    17: 'Telomeres',
    18: 'RNA biology',
    19: 'Epigenetics',
    20: 'Autophagy',
    21: 'Sirtuins, mTOR and caloric restriction',
    22: 'Skin'
}

# Each document is a sequence of tokens; join them into one string per document
filtered_texts = [' '.join(doc) for doc in filtered_docs_biology]

# Create TF-IDF vectorizer (vocabulary capped at 500 terms) and transform the texts
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Map documents to clusters using Leiden labels
umap_df = pd.DataFrame(umap_result_biology, columns=['UMAP1', 'UMAP2'])
umap_df['cluster'] = leiden_labels_biology['leiden']

# Calculate the average TF-IDF vector per cluster.
# The ids actually present in the data drive both the rows and their labels,
# so a row can never be paired with the name of a different cluster.
cluster_ids = sorted(umap_df['cluster'].unique())
cluster_tfidf = []
for cluster_id in cluster_ids:
    # Plain NumPy boolean mask for row selection on the sparse matrix
    in_cluster = (umap_df['cluster'] == cluster_id).to_numpy()
    cluster_docs = tfidf_matrix[in_cluster]
    # mean(axis=0) on a sparse matrix yields a 1 x n_terms result; flatten it to 1-D
    cluster_tfidf.append(np.asarray(cluster_docs.mean(axis=0)).ravel())

# vstack keeps the result 2-D even when only one cluster is present
cluster_tfidf_matrix = np.vstack(cluster_tfidf)
cluster_tfidf_df = pd.DataFrame(
    cluster_tfidf_matrix,
    # Fall back to a generic label for ids without an entry in cluster_names
    index=[cluster_names.get(cluster_id, f'Cluster {cluster_id}') for cluster_id in cluster_ids],
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Identify the top 20 words for each cluster (highest mean TF-IDF within the cluster)
num_top_words = 20
top_words_per_cluster = {
    cluster: cluster_tfidf_df.loc[cluster].nlargest(num_top_words).index
    for cluster in cluster_tfidf_df.index
}

# Calculate the TF-IDF presence of top words from each cluster in all other clusters.
# Row = cluster whose top words are scored; column = cluster in which they are
# measured. The frame is created as float so the scores can be stored without
# an int -> float upcast (deprecated in recent pandas).
underrepresentation_matrix = pd.DataFrame(
    0.0, index=cluster_tfidf_df.index, columns=cluster_tfidf_df.index
)
for cluster, top_words in top_words_per_cluster.items():
    # Mean score of this cluster's top words inside every cluster at once
    scores_in_all_clusters = cluster_tfidf_df[top_words].mean(axis=1)
    # A cluster is not compared with itself: the diagonal stays at 0
    scores_in_all_clusters.loc[cluster] = 0.0
    underrepresentation_matrix.loc[cluster] = scores_in_all_clusters

# Heatmap of the cross-cluster scores (rows: clusters providing the top words,
# columns: clusters in which those words are measured).
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    underrepresentation_matrix,
    cmap="RdBu_r",
    annot=False,
    fmt=".2f",
    cbar_kws={'label': 'Mean TF-IDF Score of Top Words'},
    ax=heatmap_ax,
)

heatmap_ax.set_title("Top Cluster-Specific Words in Other Clusters", fontsize=20)
heatmap_ax.set_xlabel("Clusters", fontsize=16)
heatmap_ax.set_ylabel("Clusters (Top Words)", fontsize=16)
# Slanted, right-aligned x labels; horizontal y labels
plt.setp(heatmap_ax.get_xticklabels(), rotation=45, ha="right", fontsize=11)
plt.setp(heatmap_ax.get_yticklabels(), rotation=0, fontsize=11)
heatmap_fig.tight_layout()
heatmap_fig.savefig('Top Cluster-Specific Words in Other Clusters (BoA).pdf', bbox_inches='tight')
plt.show()

Plot the three most and three least represented relationships for each cluster (BoA documents)

In [None]:
def format_ticks(x, pos):
    """Return the tick value `x` as a string with two decimal places.

    Has the (value, position) signature expected by matplotlib's
    FuncFormatter; `pos` is accepted but not used.
    """
    return '{:.2f}'.format(x)

# One small horizontal bar chart per cluster: the three clusters in which its
# top words score lowest (blue) and the three in which they score highest (red).
n_cols = 4
n_clusters = len(underrepresentation_matrix.index)
# Derive the row count from the number of clusters instead of hard-coding it
# (23 clusters still give the 6 x 4 grid and the 16 x 16 figure).
n_rows = max(1, (n_clusters + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(16, 16 * n_rows / 6),
    squeeze=False,  # always a 2-D array of axes, so flatten() is safe
)
axes = axes.flatten()

for ax, cluster in zip(axes, underrepresentation_matrix.index):
    # Scores of this cluster's top words in every other cluster (self excluded)
    other_clusters = underrepresentation_matrix.loc[cluster].drop(index=cluster)
    most_represented = other_clusters.nlargest(3).sort_values(ascending=True)
    least_represented = other_clusters.nsmallest(3).sort_values()
    # Lowest scores first so the bars grow from the bottom of the panel upwards
    combined_representation = pd.concat([least_represented, most_represented])
    colors = ['blue'] * len(least_represented) + ['red'] * len(most_represented)
    ax.barh(combined_representation.index, combined_representation.values, color=colors, alpha=0.6)

    ax.set_title(cluster)
    ax.set_xlabel('Mean TF-IDF Score')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['bottom'].set_visible(True)
    ax.xaxis.set_major_locator(MaxNLocator(nbins=3))
    ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

# Remove the grid cells that did not receive a cluster
for unused_ax in axes[n_clusters:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.subplots_adjust(hspace=0.5)
plt.suptitle('Most and Least Studied Relationships', fontsize=24, y=1.02)
plt.savefig('Top 3 Most Represented and Underrepresented Relationships (BoA).pdf', bbox_inches='tight')
plt.show()