In [None]:
! pip install pandas==2.2.2 numpy==1.24.4 umap==0.1.1 umap-learn==0.5.7 scipy==1.13.1 scikit-learn==1.5.2 matplotlib==3.10.0 seaborn==0.13.2 scanpy==1.11.1 anndata==0.11.4 igraph==0.11.8 leidenalg==0.10.2 gensim==4.3.3

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scanpy as sc
from anndata import AnnData
import leidenalg
import re
from collections import defaultdict

Load preprocessed files

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files produced by
    # a trusted earlier step of this pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artefacts from the earlier preprocessing / topic-modelling step.
preprocessed_docs = _load_pickle('preprocessed_docs.pkl')
bow_corpus = _load_pickle('bow_corpus.pkl')
lda_model = _load_pickle('lda_model.pkl')
# Despite the .dict extension this file is read with pickle as well.
dictionary = _load_pickle('dictionary.dict')

Filter documents whose dominant topic is primarily related to the biology of aging (BoA)

In [None]:
# Determine dominant topic for each document
def get_dominant_topic(ldamodel, bow_corpus):
    """Return the highest-weighted topic id for every document in ``bow_corpus``.

    Parameters
    ----------
    ldamodel : object
        Anything that, when indexed with ``bow_corpus``, yields one sequence of
        ``(topic_id, weight)`` pairs per document.
    bow_corpus : iterable
        The corpus passed to ``ldamodel[...]``.

    Returns
    -------
    list
        One entry per document: the topic id with the largest weight (the first
        one listed wins ties), or ``None`` when the model returned no topics for
        that document.
    """
    dominant_topics = []
    for topic_list in ldamodel[bow_corpus]:
        if len(topic_list) == 0:
            # No topic reported for this document: record a placeholder instead of
            # failing with an IndexError, so the result stays aligned with the corpus.
            dominant_topics.append(None)
            continue
        # max() returns the first maximal pair, matching a stable descending sort.
        best_pair = max(topic_list, key=lambda pair: pair[1])
        dominant_topics.append(best_pair[0])
    return dominant_topics

dominant_topics = get_dominant_topic(lda_model, bow_corpus)

# Biology-of-aging related dominant topics (24, 0, 6, and 9 in our model)
boa_topic_ids = {24, 0, 6, 9}

# Keep only the documents whose dominant topic is one of the BoA topics
keep_flags = [topic in boa_topic_ids for topic in dominant_topics]
filtered_docs = [doc for doc, keep in zip(preprocessed_docs, keep_flags) if keep]

print(f'Number of filtered documents: {len(filtered_docs)}')

# Each document is joined into one space-separated string
filtered_texts = list(map(' '.join, filtered_docs))

# Save BoA filtered texts
with open("filtered_docs_biology.pkl", "wb") as out_file:
    pickle.dump(filtered_texts, out_file)

TF-IDF vectorization, PCA and UMAP of BoA documents

In [None]:
# Create TF-IDF vectorizer and transform the filtered texts
# (max_features caps the vocabulary at 500 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Perform PCA on the densified TF-IDF matrix.
# random_state only has an effect if scikit-learn picks a randomized solver; it is set so
# that pca_result (which the clustering further down is built on) is repeatable either way.
pca = PCA(n_components=30, random_state=42)
pca_result = pca.fit_transform(tfidf_matrix.toarray())

# Perform UMAP on PCA result.
# UMAP is stochastic; a fixed random_state makes the saved embedding and every figure
# drawn from it reproducible (umap-learn then runs without parallelism).
umap_model = umap.UMAP(n_neighbors=10, min_dist=0, n_components=2, random_state=42)
umap_result_biology = umap_model.fit_transform(pca_result)

# Plot UMAP.
# No per-point values are passed via `c`, so all points share one colour; a colormap and
# colorbar would carry no information here and are therefore not drawn.
plt.figure(figsize=(18, 12))
plt.scatter(umap_result_biology[:, 0], umap_result_biology[:, 1], s=1, alpha=0.2)
plt.title('UMAP Visualization of BoA Documents')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()

# Save UMAP embeddings
with open("umap_result_biology.pkl", "wb") as f:
    pickle.dump(umap_result_biology, f)

Leiden clustering

In [None]:
# Wrap the PCA coordinates in an AnnData container so scanpy can work on them
adata = AnnData(X=pca_result)

# Build the nearest-neighbour graph directly on X (use_rep='X'), then run Leiden on it
sc.pp.neighbors(adata, use_rep='X', n_neighbors=5)
sc.tl.leiden(adata, resolution=0.7)

# Persist the cluster label of every document (read back from adata.obs['leiden'])
leiden_labels_biology = adata.obs['leiden']
leiden_labels_biology.to_csv('leiden_labels_biology.csv', index=False)

# Colour the previously computed UMAP coordinates by cluster
cluster_codes = adata.obs['leiden'].astype('category').cat.codes
n_clusters = len(adata.obs['leiden'].unique())
umap_x = umap_result_biology[:, 0]
umap_y = umap_result_biology[:, 1]

plt.figure(figsize=(18, 12))
scatter = plt.scatter(umap_x, umap_y, c=cluster_codes, cmap='tab20', s=0.5, alpha=0.2)
plt.colorbar(scatter, ticks=range(n_clusters))
plt.title("UMAP with Leiden Clusters")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

Calculate top 10 differential words between clusters based on TF-IDF score

In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense TF-IDF table: one row per document, one column per vocabulary term.
# It is built once; the cluster labels are kept in a separate Series rather than in a
# 'cluster' column, so a vocabulary term that happens to be called "cluster" is not overwritten.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
cluster_ids = pd.Series(adata.obs['leiden'].astype(int).values, index=tfidf_df.index, name='cluster')

# Compute mean TF-IDF scores for each word within each cluster
cluster_means = tfidf_df.groupby(cluster_ids).mean()

# Differential score of a word for a cluster = its mean in that cluster minus the highest
# mean it reaches in any other cluster (positive only if the cluster is the top one).
differential_scores = {}
for cluster in cluster_means.index:
    other_clusters = cluster_means.drop(index=cluster)
    differential_scores[cluster] = cluster_means.loc[cluster] - other_clusters.max()

# Get the top N differential words for each cluster
N = 10
top_differential_words = {
    cluster: scores.sort_values(ascending=False).head(N).index.tolist()
    for cluster, scores in differential_scores.items()
}

# Convert each word to a string enclosed in single quotes for each cluster
top_differential_words_biology = {
    cluster: [f"'{word}'" for word in words] for cluster, words in top_differential_words.items()
}

# One row per cluster, columns "Top 1" ... "Top N"
top_differential_words_biology_df = pd.DataFrame(top_differential_words_biology).transpose()
top_differential_words_biology_df.columns = [f"Top {i+1}" for i in range(len(top_differential_words_biology_df.columns))]
top_differential_words_biology_df.to_csv('top_differential_words_biology.csv', index=True)

Calculate how often the top 3 differential words (stems) of each cluster appear in the documents of every cluster

In [None]:
# Replace every document by its str() form so it can be searched as text below
# (this rebinds filtered_docs; for an entry that is already a string str() is a no-op).
filtered_docs = list(map(str, filtered_docs))

# Read the Leiden labels back from the CSV written by the clustering step
leiden_labels_biology = pd.read_csv('leiden_labels_biology.csv')

# Define the word groups for each cluster
# Keys are cluster ids (the same ids used as keys of cluster_names); each value is a list of
# three word stems for that cluster.
# NOTE(review): this table is hard-coded, presumably copied from the differential-word output
# of one specific clustering run - re-check it whenever the clustering changes.
word_groups = {
    0: ['variant', 'genet', 'allel'],
    1: ['gene', 'express', 'transcript'],
    2: ['cancer', 'tumor', 'line'],
    3: ['methyl', 'epigenet', 'modif'],
    4: ['insulin', 'igf', 'lifespan'],
    5: ['oxid', 'stress', 'antioxid'],
    6: ['stem', 'embryon', 'progenitor'],
    7: ['bone', 'msc', 'marrow'],
    8: ['transgen', 'mous', 'model'],
    9: ['activ', 'enzym', 'rat'],
    10: ['neuron', 'astrocyt', 'brain'],
    11: ['young', 'old', 'mice'],
    12: ['skin', 'mmp', 'inflammatori'],
    13: ['lymphocyt', 'immun', 'infect'],
    14: ['mutat', 'patient', 'caus'],
    15: ['mitochondri', 'atp', 'mtdna'],
    16: ['protein', 'domain', 'interact'],
    17: ['senesc', 'cellular', 'arrest'],
    18: ['dna', 'repair', 'damag'],
    19: ['beta', 'app', 'amyloid'],
    20: ['muscl', 'skelet', 'mass'],
    21: ['mir', 'mirna', 'target'],
    22: ['receptor', 'bind', 'densiti'],
    23: ['alpha', 'subunit', 'tnf'],
    24: ['telomer', 'telomeras', 'length'],
    25: ['sirt', 'mammalian', 'delai'],
    26: ['autophagi', 'degrad', 'homeostasi']
}

# Define cluster names
# Maps each cluster id (0-26) to the human-readable label used on the figures.
# NOTE(review): the labels are hard-coded for one specific clustering result; a cluster id
# that is missing here cannot be looked up by the cells that index cluster_names directly.
cluster_names = {
    0: 'GWAS',
    1: 'Gene regulation',
    2: 'Cancer',
    3: 'Epigenetics',
    4: 'Insulin/IGF pathway',
    5: 'Oxidative stress',
    6: 'Stem cells',
    7: 'Mesenchymal stem cells',
    8: 'Mouse models',
    9: 'Biochemistry',
    10: 'Neuroscience',
    11: 'Comparative studies',
    12: 'Skin',
    13: 'Immunology',
    14: 'Clinical genetics',
    15: 'Mitochondria',
    16: 'Protein biology',
    17: 'Senescence',
    18: 'Genomic stability',
    19: "Alzheimer's",
    20: 'Muscle',
    21: 'RNA biology',
    22: 'Receptor biology',
    23: 'Cytokines',
    24: 'Telomeres',
    25: 'Sirtuins & mTOR',
    26: 'Autophagy'
}

# All distinct stems that occur in any word group
unique_words = list(set(word for words_list in word_groups.values() for word in words_list))

# Result table: one row per cluster name, one column per word-group id.
# It is created with float zeros because its cells are later filled with fractions;
# writing floats into an integer-typed frame triggers pandas' incompatible-dtype FutureWarning.
word_presence_matrix = pd.DataFrame(0.0, index=list(cluster_names.values()), columns=list(word_groups.keys()))
def check_word_group_presence(doc, word_group):
    """Return True if ``doc`` contains at least one word of ``word_group`` as a whole word.

    Parameters
    ----------
    doc : str
        Text to search.
    word_group : iterable of str
        Words (stems) to look for. Each one is matched literally between regex word
        boundaries, so 'telomer' does not match inside 'telomeras'.

    Returns
    -------
    bool
        True as soon as one word is found; False if none is found or the group is empty.
    """
    # re.escape keeps characters such as '.' or '+' in a word from being read as regex syntax.
    return any(re.search(r'\b' + re.escape(word) + r'\b', doc) for word in word_group)

# One row per document: its text plus the Leiden label read from leiden_labels_biology
preprocessed_docs_df = pd.DataFrame(filtered_docs, columns=['Abstract'])
n_docs, n_labels = len(preprocessed_docs_df), len(leiden_labels_biology)
if n_docs != n_labels:
    raise ValueError("The lengths of preprocessed_docs and leiden_labels_biology do not match!")
preprocessed_docs_df['leiden'] = leiden_labels_biology['leiden'].values

# Make sure every 'Abstract' entry is a string before it is searched
preprocessed_docs_df['Abstract'] = preprocessed_docs_df['Abstract'].astype(str)

# For every (cluster, word group) pair store the fraction of the cluster's documents
# that contain at least one word of the group
for cluster_id, cluster_label in cluster_names.items():
    in_cluster = preprocessed_docs_df['leiden'] == cluster_id
    cluster_abstracts = preprocessed_docs_df.loc[in_cluster, 'Abstract']
    cluster_size = len(cluster_abstracts)
    for group_id, group_words in word_groups.items():
        hits = sum(check_word_group_presence(text, group_words) for text in cluster_abstracts)
        if cluster_size > 0:
            fraction = hits / cluster_size
        else:
            # A cluster without documents gets 0 rather than a division by zero
            fraction = 0
        word_presence_matrix.at[cluster_label, group_id] = fraction

In [None]:
# Number of documents in each cluster, and the cluster ids ordered from largest to smallest
cluster_doc_counts = preprocessed_docs_df['leiden'].value_counts()
sorted_cluster_ids = cluster_doc_counts.sort_values(ascending=False).index.tolist()

# Rows (cluster names) and columns (word-group ids) both follow that size ordering
sorted_cluster_names = [cluster_names[cluster_id] for cluster_id in sorted_cluster_ids]
sorted_columns = sorted_cluster_ids
word_presence_matrix_sorted = word_presence_matrix.loc[sorted_cluster_names, sorted_columns]

# x-axis label of a column = its three stems, capitalised and comma-separated
x_labels = []
for group_id in sorted_columns:
    capitalised = [stem.capitalize() for stem in word_groups[group_id]]
    x_labels.append(', '.join(capitalised))

# Draw and save the heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(word_presence_matrix_sorted, cmap='RdBu_r', cbar=True, annot=False)
plt.title('Top Differential Words Presence in BoA Clusters', fontsize=24)
plt.xlabel('Top Differential Words', fontsize=24)
plt.ylabel('Clusters', fontsize=20)
# Ticks are placed at +0.5 so each label sits at the centre of its heatmap cell
tick_positions = np.arange(len(x_labels)) + 0.5
plt.xticks(ticks=tick_positions, labels=x_labels, rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.savefig('Heatmap of Word Presence in BoA Clusters.pdf', bbox_inches='tight')
plt.show()

Calculate and plot number of documents per cluster

In [None]:
# Convert UMAP results to a DataFrame
umap_df = pd.DataFrame(umap_result_biology, columns=['UMAP1', 'UMAP2'])

# Add leiden labels to the combined dataframe (column-wise concat aligns rows on the index)
combined_df = pd.concat([umap_df, leiden_labels_biology], axis=1)

# Replace cluster numbers with names
combined_df['Cluster Name'] = combined_df['leiden'].map(cluster_names)

# Count the number of documents per cluster, largest cluster first
cluster_counts = combined_df['Cluster Name'].value_counts().reset_index()
cluster_counts.columns = ['Cluster Name', 'Count']
cluster_counts = cluster_counts.sort_values(by='Count', ascending=False)

# Plot.
# seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the x variable and
# disabling the legend keeps one palette colour per bar without the deprecation warning.
plt.figure(figsize=(24, 16))
sns.barplot(
    data=cluster_counts,
    x='Cluster Name',
    y='Count',
    hue='Cluster Name',
    palette='RdBu',
    legend=False,
)
plt.xticks(rotation=90, fontsize=28)
plt.yticks(fontsize=28, fontfamily='serif')
plt.title('Number of Documents per BoA Cluster', fontsize=32)
plt.ylabel('Number of Documents', fontsize=28)
plt.xlabel('')
plt.savefig('Number of Documents per BoA Cluster.pdf', bbox_inches='tight')
plt.show()

Cosine similarity

In [None]:
def _mean_tfidf_vector(rows):
    """Average the per-document TF-IDF vectors of one cluster into a single vector."""
    return np.mean(np.stack(rows.values), axis=0)


# Attach each document's dense TF-IDF vector to its Leiden label
# (this adds a 'tfidf' column of arrays to leiden_labels_biology)
dense_tfidf = tfidf_matrix.toarray()
leiden_labels_biology['tfidf'] = list(dense_tfidf)

# One mean TF-IDF vector per cluster, indexed by cluster id
cluster_tfidf = leiden_labels_biology.groupby('leiden')['tfidf'].apply(_mean_tfidf_vector)

# Cluster ids ordered by document count (descending), limited to ids that have a mean vector
cluster_doc_counts = preprocessed_docs_df['leiden'].value_counts()
sorted_cluster_ids = cluster_doc_counts.sort_values(ascending=False).index.tolist()
sorted_cluster_ids = [cid for cid in sorted_cluster_ids if cid in cluster_tfidf.index]

# Pairwise cosine similarity between the cluster mean vectors, in that order
cluster_tfidf_sorted = cluster_tfidf.loc[sorted_cluster_ids]
mean_vectors = np.stack(cluster_tfidf_sorted.values)
cos_sim_matrix_sorted = cosine_similarity(mean_vectors)

# Label both axes of the similarity table with the cluster names
sorted_cluster_names = [cluster_names[cid] for cid in sorted_cluster_ids]
cos_sim_df_sorted = pd.DataFrame(cos_sim_matrix_sorted, index=sorted_cluster_names, columns=sorted_cluster_names)

# Draw and save the heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(cos_sim_df_sorted, annot=False, cmap='RdBu_r')
plt.title('Cosine Similarity Between BoA Clusters Based on TF-IDF', fontsize=24)
plt.xlabel('Clusters', fontsize=20)
plt.ylabel('Clusters', fontsize=20)
plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.savefig('Cosine Similarity Heatmap BoA.pdf', bbox_inches='tight')
plt.show()

Presence of the Hallmarks of Aging in each cluster

In [None]:
# Stems of interest
words = ['oxid', 'inflamm', 'senesc', 'mitochondri', 'genom', 'stem', 'insulin', 'metabol',
        'telomer', 'epigenet', 'autophagi', 'nutrient', 'microbi', 'intercellular']

# Add a column holding the cluster name of every document
preprocessed_docs_df['Cluster'] = preprocessed_docs_df['leiden'].map(cluster_names)

# For each cluster, count the documents that contain each stem.
# NOTE(review): `word in doc` is a plain substring test, so a stem also matches inside
# longer tokens (e.g. 'stem' inside 'system') - confirm that this is intended.
word_presence = {}
for cluster in preprocessed_docs_df['Cluster'].unique():
    cluster_docs = preprocessed_docs_df.loc[preprocessed_docs_df['Cluster'] == cluster, 'Abstract']
    word_presence[cluster] = [sum(word in doc for doc in cluster_docs) for word in words]

# Turn the counts into percentages of the cluster's documents, in long format:
# one row per (cluster, stem) pair
cluster_doc_count = preprocessed_docs_df['Cluster'].value_counts().to_dict()
data = [
    [cluster, word, percentage]
    for cluster, counts in word_presence.items()
    for word, percentage in zip(words, (np.array(counts) / cluster_doc_count.get(cluster, 1)) * 100)
]
df_plot = pd.DataFrame(data, columns=['Cluster', 'Word', 'Percentage'])
df_plot['Cluster'] = pd.Categorical(df_plot['Cluster'], categories=cluster_names.values(), ordered=True)
print(df_plot.head())

In [None]:
# Get doc counts per cluster ID (value_counts lists the most frequent id first)
cluster_doc_counts = preprocessed_docs_df['leiden'].value_counts()

# Map cluster IDs to names and use that order for the categorical 'Cluster' column
cluster_id_to_name = cluster_names
cluster_name_order = [cluster_id_to_name[cid] for cid in cluster_doc_counts.index if cid in cluster_id_to_name]
df_plot['Cluster'] = pd.Categorical(df_plot['Cluster'], categories=cluster_name_order, ordered=True)

# Capitalize words
df_plot['Word'] = df_plot['Word'].str.capitalize()

# Color gradient based on percentage of documents containing each stem.
# NOTE: the 'Color' column is stored on df_plot but is not passed to the scatterplot below,
# which colours the points through hue='Percentage' and `palette=cmap`.
norm = plt.Normalize(df_plot['Percentage'].min(), df_plot['Percentage'].max())
cmap = sns.color_palette("RdBu_r", as_cmap=True)
df_plot['Color'] = df_plot['Percentage'].apply(lambda x: cmap(norm(x)))

# Plotting: one dot per (word, cluster) pair; size and colour both encode the percentage
plt.figure(figsize=(20, 15))
ax = sns.scatterplot(
    data=df_plot,
    x='Word',
    y='Cluster',
    size='Percentage',
    legend=False,
    sizes=(20, 400),
    hue='Percentage',
    palette=cmap,
    alpha=0.8
)

# Optional label tweak
def replace_mtor(label):
    """Return 'Mammalian (mTOR)' for the tick label 'Mammalian'; any other label is returned unchanged."""
    if label == 'Mammalian':
        return 'Mammalian (mTOR)'
    return label

new_labels = [replace_mtor(label.get_text()) for label in ax.get_xticklabels()]
# Pin the tick positions before relabelling: set_xticklabels is only meant to be used with a
# fixed set of ticks and warns otherwise. The final font size (22) is set here directly
# instead of first using 12 and overriding it afterwards.
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels(new_labels, rotation=90, fontsize=22)

ax.set_xlabel('Word', fontsize=26)
ax.set_ylabel('BoA Cluster', fontsize=26)
plt.yticks(fontsize=22)
plt.title('Presence of the Hallmarks of Aging in each BoA Cluster', fontsize=28)
plt.tight_layout()
plt.savefig('Hallmarks of Aging in BoA Clusters.pdf', bbox_inches='tight')
plt.show()