In [None]:
!pip install pandas numpy umap umap-learn matplotlib scikit-learn scanpy anndata igraph leidenalg collections

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scanpy as sc
from anndata import AnnData
import leidenalg
import re
from collections import defaultdict

Load preprocessed files

In [None]:
# Google Drive folder holding the artifacts written by the LDA notebook
_LDA_DIR = '/content/drive/MyDrive/Unsupervised aging literature/LDA model docs/LDA model 4 with cleaned data'


def _unpickle(filename):
    """Return the object pickled in `filename` inside the LDA artifact folder."""
    # pickle executes code while loading: only point this at files you produced yourself
    with open(_LDA_DIR + '/' + filename, 'rb') as handle:
        return pickle.load(handle)


preprocessed_docs = _unpickle('preprocessed_docs.pkl')
bow_corpus = _unpickle('bow_corpus.pkl')
lda_model = _unpickle('lda_model.pkl')
dictionary = _unpickle('dictionary.dict')

Filter documents whose dominant topic is primarily related to the biology of aging (BoA)

In [None]:
# Determine dominant topic for each document
def get_dominant_topic(ldamodel, bow_corpus):
    """Return the highest-probability topic id for every document.

    Parameters
    ----------
    ldamodel
        Object indexed as ``ldamodel[bow_corpus]``. The result is iterated and every
        item must be an iterable of ``(topic_id, probability)`` pairs
        (presumably a gensim LDA model -- confirm against the pickled artifact).
    bow_corpus
        Passed unchanged as the index into ``ldamodel``.

    Returns
    -------
    list
        One entry per document, in corpus order: the topic id with the largest
        probability (the first one on ties), or ``None`` when the document's
        topic list is empty.
    """
    dominant_topics = []
    for topic_list in ldamodel[bow_corpus]:
        # max() finds the first maximal pair, like the previous stable descending sort,
        # without sorting; `default` keeps an empty topic list from raising and keeps
        # the output aligned one-to-one with the documents.
        best = max(topic_list, key=lambda pair: pair[1], default=None)
        dominant_topics.append(best[0] if best is not None else None)
    return dominant_topics

# Dominant topic of every document, in corpus order
dominant_topics = get_dominant_topic(lda_model, bow_corpus)

# Keep only the documents whose dominant topic is biology-of-aging related
# (topics 24, 0, 6, and 9 in our model)
is_boa_doc = [topic in {24, 0, 6, 9} for topic in dominant_topics]
filtered_docs = [doc for doc, keep in zip(preprocessed_docs, is_boa_doc) if keep]

print(f'Number of filtered documents: {len(filtered_docs)}')

# One space-joined string per kept document
filtered_texts = list(map(' '.join, filtered_docs))

# Persist the BoA filtered texts
with open("filtered_docs_biology.pkl", "wb") as out_file:
    pickle.dump(filtered_texts, out_file)

TF-IDF vectorization, PCA and UMAP of BoA documents

In [None]:
# Fixed seed: UMAP is stochastic, and the Leiden cluster ids that the hard-coded
# `word_groups` / `cluster_names` tables refer to are derived from this embedding.
# NOTE(review): those tables presumably came from an earlier, unseeded run -- re-check
# them against the clusters produced by the first seeded run.
RANDOM_STATE = 42

# Create TF-IDF vectorizer and transform the filtered texts
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Perform PCA on the (densified) TF-IDF matrix
pca = PCA(n_components=30, random_state=RANDOM_STATE)
pca_result = pca.fit_transform(tfidf_matrix.toarray())

# Perform UMAP on PCA result
umap_model = umap.UMAP(n_neighbors=10, min_dist=0, n_components=2, random_state=RANDOM_STATE)
umap_result_biology = umap_model.fit_transform(pca_result)

# Plot UMAP. No `c=` values are plotted, so no colormap or colorbar is requested:
# `cmap` would be ignored and the colorbar would describe nothing.
plt.figure(figsize=(18, 12))
plt.scatter(umap_result_biology[:, 0], umap_result_biology[:, 1], s=1, alpha=0.2)
plt.title('UMAP Visualization of BoA Documents')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()

# Save UMAP embeddings
with open("umap_result_biology.pkl", "wb") as f:
    pickle.dump(umap_result_biology, f)

Leiden clustering

In [None]:
# Wrap the 2-D UMAP embedding in an AnnData object so scanpy can cluster it
adata = AnnData(X=umap_result_biology)

# Compute the k-nearest neighbors graph and run Leiden clustering
sc.pp.neighbors(adata, n_neighbors=20, use_rep='X')
sc.tl.leiden(adata, resolution=0.05)

# Save Leiden labels.
# Later cells index `leiden_labels_biology['leiden']`, concatenate it positionally with
# the UMAP frame, compare labels with the integer keys of `cluster_names`, and add a
# column to it. That needs a DataFrame with an integer 'leiden' column on a default
# RangeIndex, not the string-categorical Series held in `adata.obs`.
leiden_labels_biology = pd.DataFrame({'leiden': adata.obs['leiden'].astype(int).to_numpy()})
leiden_labels_biology.to_csv('leiden_labels_biology.csv', index=False)

# Use original UMAP coordinates for plotting, colored by cluster code
plt.figure(figsize=(18, 12))
scatter = plt.scatter(umap_result_biology[:, 0], umap_result_biology[:, 1], c=adata.obs['leiden'].astype('category').cat.codes, cmap='tab20', s=0.5, alpha=0.2)
plt.colorbar(scatter, ticks=range(len(adata.obs['leiden'].unique())))
plt.title("UMAP with Leiden Clusters")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

Calculate top 10 differential words between clusters based on TF-IDF score

In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense TF-IDF frame: one row per document, one column per vocabulary term (built once)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Group by the label array directly instead of adding a 'cluster' column: a vocabulary
# term that happens to be called "cluster" would otherwise be silently overwritten.
cluster_labels = adata.obs['leiden'].astype(int).to_numpy()

# Compute mean TF-IDF scores for each word within each cluster
cluster_means = tfidf_df.groupby(cluster_labels).mean()
cluster_means.index.name = 'cluster'

# Differential score of a word in a cluster = its mean in that cluster minus its
# highest mean in any other cluster
differential_scores = {
    cluster: cluster_means.loc[cluster] - cluster_means.drop(index=cluster).max()
    for cluster in cluster_means.index
}

# Get the top N differential words for each cluster
N = 10
top_differential_words = {
    cluster: scores.sort_values(ascending=False).head(N).index.tolist()
    for cluster, scores in differential_scores.items()
}

# Convert each word to a string enclosed in single quotes for each cluster
top_differential_words_biology = {
    cluster: [f"'{word}'" for word in words] for cluster, words in top_differential_words.items()
}

# One row per cluster, columns "Top 1" .. "Top N"
top_differential_words_biology_df = pd.DataFrame(top_differential_words_biology).transpose()
top_differential_words_biology_df.columns = [f"Top {i+1}" for i in range(len(top_differential_words_biology_df.columns))]
top_differential_words_biology_df.to_csv('top_differential_words_biology.csv', index=True)

For every cluster, calculate the fraction of its documents that contain at least one of the top 3 differential words (stems) of each cluster

In [None]:
# Turn every filtered document into its string form so it can be regex-searched below
filtered_docs = list(map(str, filtered_docs))

# Word group (three stems) per cluster; the position in the list is the cluster id
word_groups = dict(enumerate([
    ['receptor', 'rat', 'hormon'],                 # 0
    ['oxid', 'stress', 'peroxid'],                 # 1
    ['arrest', 'cycl', 'senesc'],                  # 2
    ['factor', 'oocyt', 'inflammatori'],           # 3
    ['immun', 'infect', 'antibodi'],               # 4
    ['mitochondri', 'mitochondria', 'mtdna'],      # 5
    ['mice', 'app', 'transgen'],                   # 6
    ['mutat', 'genet', 'famili'],                  # 7
    ['cell', 'stem', 'adult'],                     # 8
    ['cancer', 'tumor', 'lung'],                   # 9
    ['protein', 'lifespan', 'kinas'],              # 10
    ['gene', 'express', 'transcript'],             # 11
    ['beta', 'alpha', 'tgf'],                      # 12
    ['neuron', 'astrocyt', 'hippocamp'],           # 13
    ['repair', 'dna', 'damag'],                    # 14
    ['bone', 'msc', 'marrow'],                     # 15
    ['muscl', 'synthesi', 'bodi'],                 # 16
    ['telomer', 'telomeras', 'length'],            # 17
    ['target', 'overexpress', 'downregul'],        # 18
    ['methyl', 'epigenet', 'chang'],               # 19
    ['autophagi', 'degrad', 'process'],            # 20
    ['sirt', 'mammalian', 'glucos'],               # 21
    ['skin', 'mmp', 'protect'],                    # 22
]))

# Human-readable name per cluster; the position in the list is the cluster id
cluster_names = dict(enumerate([
    'Cell signaling',                           # 0
    'Oxidative stress',                         # 1
    'Senescence',                               # 2
    'Cell & molecular biology',                 # 3
    'Immunology',                               # 4
    'Mitochondria',                             # 5
    "Alzheimer's",                              # 6
    'Genetics (clinical)',                      # 7
    'Stem cells',                               # 8
    'Cancer',                                   # 9
    'Protein biology',                          # 10
    'Genetics (biology)',                       # 11
    'Growth factors and cytokines',             # 12
    'Neuroscience',                             # 13
    'Genomic stability',                        # 14
    'Mesenchymal stem cells',                   # 15
    'Muscle',                                   # 16
    'Telomeres',                                # 17
    'RNA biology',                              # 18
    'Epigenetics',                              # 19
    'Autophagy',                                # 20
    'Sirtuins, mTOR and caloric restriction',   # 21
    'Skin',                                     # 22
]))


# Every distinct stem that appears in any word group
unique_words = list(set(word for words_list in word_groups.values() for word in words_list))

# Presence matrix: one row per named cluster, one column per word-group id.
# It is seeded with float zeros because the cells are later filled with fractions;
# seeding with int 0 would force an int -> float upcast on assignment.
# The dict views are materialised as lists so the axes are plain label lists.
word_presence_matrix = pd.DataFrame(0.0, index=list(cluster_names.values()), columns=list(word_groups.keys()))
def check_word_group_presence(doc, word_group):
    """Return True if `doc` contains at least one word of `word_group` as a whole word.

    Parameters
    ----------
    doc : str
        Text to search.
    word_group : iterable of str
        Words to look for. Each one is matched literally between word boundaries.

    Returns
    -------
    bool
        True as soon as any word is found, False otherwise (also for an empty group).
    """
    # re.escape keeps regex metacharacters in a word (e.g. '.' or '+') from being
    # interpreted as pattern syntax
    return any(re.search(r'\b' + re.escape(word) + r'\b', doc) for word in word_group)

# Pair every filtered document with its Leiden label ('leiden' column of leiden_labels_biology)
preprocessed_docs_df = pd.DataFrame(filtered_docs, columns=['Abstract'])
if len(preprocessed_docs_df) != len(leiden_labels_biology):
    raise ValueError("The lengths of preprocessed_docs and leiden_labels_biology do not match!")
preprocessed_docs_df['leiden'] = leiden_labels_biology['leiden'].values

# Make sure the 'Abstract' column holds strings only
preprocessed_docs_df['Abstract'] = preprocessed_docs_df['Abstract'].astype(str)

# For each named cluster, store the fraction of its documents that match each word group
for cluster, cluster_name in cluster_names.items():
    in_cluster = preprocessed_docs_df['leiden'] == cluster
    cluster_docs = preprocessed_docs_df[in_cluster]['Abstract']
    total_docs_in_cluster = len(cluster_docs)
    for group, words in word_groups.items():
        if total_docs_in_cluster > 0:
            matching_docs = sum(check_word_group_presence(doc, words) for doc in cluster_docs)
            fraction = matching_docs / total_docs_in_cluster
        else:
            # Empty cluster: nothing to count
            fraction = 0
        word_presence_matrix.at[cluster_name, group] = fraction

In [None]:
# Heatmap of word-group presence per cluster
# One tick label per word group: its stems, capitalized and comma-separated
x_labels = []
for group in word_groups.values():
    x_labels.append(', '.join(word.capitalize() for word in group))

# Columns in ascending word-group id order
word_presence_matrix = word_presence_matrix.sort_index(axis=1)

# Heatmap cells are one unit wide, so cell centers sit at i + 0.5
tick_positions = np.arange(len(x_labels)) + 0.5

plt.figure(figsize=(16, 12))
sns.heatmap(word_presence_matrix, cmap='RdBu_r', cbar=True, annot=False)
plt.title('Top Differential Words Presence in Clusters', fontsize=20)
plt.xlabel('Top Differential Words', fontsize=20)
plt.ylabel('Clusters', fontsize=20)
plt.xticks(ticks=tick_positions, labels=x_labels, rotation=90, fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('Heatmap of Word Presence in BoA Clusters.pdf', bbox_inches='tight')
plt.show()

Calculate and plot number of documents per cluster

In [None]:
# UMAP coordinates as a two-column frame
umap_df = pd.DataFrame(umap_result_biology, columns=['UMAP1', 'UMAP2'])

# Coordinates and Leiden labels side by side
combined_df = pd.concat([umap_df, leiden_labels_biology], axis=1)

# Translate cluster ids into cluster names
combined_df['Cluster Name'] = combined_df['leiden'].map(cluster_names)

# Documents per cluster, largest cluster first
cluster_counts = (
    combined_df['Cluster Name']
    .value_counts()
    .reset_index()
    .set_axis(['Cluster Name', 'Count'], axis=1)
    .sort_values(by='Count', ascending=False)
)

# Bar chart of the counts
plt.figure(figsize=(24, 16))
sns.barplot(data=cluster_counts, x='Cluster Name', y='Count', palette='RdBu')
plt.title('Number of Documents per Cluster', fontsize=28)
plt.xlabel('')
plt.ylabel('Number of Documents', fontsize=24)
plt.xticks(rotation=90, fontsize=18)
plt.yticks(fontsize=18, fontfamily='serif')
plt.savefig('Number of Documents per BoA Cluster.pdf', bbox_inches='tight')
plt.show()

Cosine similarity

In [None]:
# Assign TF-IDF vectors to clusters based on leiden labels
# (this adds a 'tfidf' column, one dense vector per document, to leiden_labels_biology)
leiden_labels_biology['tfidf'] = list(tfidf_matrix.toarray())

# Group by cluster labels and compute the mean TF-IDF vector for each cluster
cluster_tfidf = leiden_labels_biology.groupby('leiden')['tfidf'].apply(lambda x: np.mean(np.stack(x.values), axis=0))

# Restrict to clusters that have a name BEFORE computing similarities, so the matrix
# has exactly one row/column per label used below. Filtering afterwards would leave a
# matrix whose shape disagrees with the labels whenever a cluster is unnamed.
valid_clusters = [label for label in cluster_tfidf.index if label in cluster_names]
if not valid_clusters:
    raise ValueError("None of the Leiden cluster labels has an entry in cluster_names")
cluster_tfidf = cluster_tfidf.loc[valid_clusters]

# Compute cosine similarity matrix between cluster mean TF-IDF vectors
cos_sim_matrix = cosine_similarity(np.stack(cluster_tfidf.values))

# Create the cosine similarity DataFrame with correct labels
valid_cluster_names = [cluster_names[label] for label in valid_clusters]
cos_sim_df = pd.DataFrame(cos_sim_matrix, index=valid_cluster_names, columns=valid_cluster_names)

# Plot the heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(cos_sim_df, annot=False, cmap='RdBu_r')
plt.title('Cosine Similarity Between Clusters Based on TF-IDF', fontsize=20)
plt.ylabel('Clusters', fontsize=20)
plt.xlabel('Clusters', fontsize=20)
plt.xticks(rotation=90, fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('Cosine Similarity Heatmap BoA.pdf', bbox_inches='tight')
plt.show()

Presence of the Hallmarks of Aging in each cluster

In [None]:
# Words of interest (stems)
words = ['oxid', 'senesc', 'inflamm', 'mitochondri', 'genom', 'stem',
         'intercellular', 'telomer', 'epigenet', 'autophagi', 'mammalian', 'metabol', 'nutrient', 'microbi']

# Build the document/cluster table in this cell so it runs on a fresh kernel and can be
# re-run safely: mapping ids to names in place would turn every label into NaN the
# second time.
# NOTE(review): assumes the intended documents are the space-joined preprocessed texts
# (`filtered_texts`); the stems above are matched as substrings of them -- confirm.
df = pd.DataFrame({
    'Document': filtered_texts,
    'Cluster': pd.Series(adata.obs['leiden'].astype(int).to_numpy()).map(cluster_names),
})

# Count, per cluster, the documents that contain each word (substring match)
word_presence = {}
for cluster in df['Cluster'].unique():
    cluster_docs = df.loc[df['Cluster'] == cluster, 'Document']
    word_presence[cluster] = [sum(word in doc for doc in cluster_docs) for word in words]

# Convert the counts to the percentage of the cluster's documents containing the word
cluster_doc_count = df['Cluster'].value_counts().to_dict()
data = []
for cluster, counts in word_presence.items():
    total_docs = cluster_doc_count.get(cluster, 1)
    # `counts` is a plain list; go through an array so the division is element-wise
    percentages = np.asarray(counts) / total_docs * 100
    for word, percentage in zip(words, percentages):
        data.append([cluster, word, percentage])
df_plot = pd.DataFrame(data, columns=['Cluster', 'Word', 'Percentage'])
# Ordered categorical so clusters are kept in the order of cluster_names
df_plot['Cluster'] = pd.Categorical(df_plot['Cluster'], categories=list(cluster_names.values()), ordered=True)
print(df_plot.head())

In [None]:
# Capitalize the words in df_plot for plotting
df_plot['Word'] = df_plot['Word'].str.capitalize()

# Colormap shared by the dot colors (seaborn scales the numeric hue itself, so no
# separate Normalize object or per-row color column is needed)
cmap = sns.color_palette("RdBu_r", as_cmap=True)


def replace_mtor(label):
    """Return the display label for a word: 'Mammalian' becomes 'Mammalian (mTOR)'."""
    if label == 'Mammalian':
        return 'Mammalian (mTOR)'
    return label


# Relabel in a plotting copy of the data instead of rewriting tick labels afterwards:
# tick texts are not guaranteed to be populated before the figure is drawn, and
# set_xticklabels on non-fixed ticks triggers a matplotlib warning.
plot_data = df_plot.assign(Word=df_plot['Word'].map(replace_mtor))

# Plotting the dot plot with a color gradient and sizes proportional to percentages
plt.figure(figsize=(15, 10))
ax = sns.scatterplot(
    data=plot_data,
    x='Word',
    y='Cluster',
    size='Percentage',
    legend=False,
    sizes=(20, 400),
    hue='Percentage',
    palette=cmap,
    alpha=0.8
)

ax.set_xlabel('Word', fontsize=16)
ax.set_ylabel('Cluster', fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.title('Presence of the Hallmarks of Aging in each Cluster', fontsize=18)
plt.tight_layout()
plt.savefig('Hallmarks of Aging in BoA Clusters.pdf', bbox_inches='tight')
plt.show()