In [None]:
!pip install numpy==1.24.4 umap==0.1.1 umap-learn==0.5.7 pandas==2.2.2 scikit-learn==1.5.2 matplotlib==3.10.0 scipy==1.13.1 gensim==4.3.3

Collecting umap
  Using cached umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3542 sha256=d789940c47192ac27af4c1e854614759cc11fb013ee37f404143721228772af6
  Stored in directory: /root/.cache/pip/wheels/82/d8/73/e9eb3334baaad795ff0278363ff1aca7568bdf2793e452a527
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import entropy
from collections import defaultdict

Load preprocessed files

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pickled artefacts produced by earlier preprocessing steps
filtered_docs_biology = _load_pickle('filtered_docs_biology.pkl')
umap_result_biology = _load_pickle('umap_result_biology.pkl')
preprocessed_docs = _load_pickle('preprocessed_docs.pkl')
lda_model = _load_pickle('lda_model.pkl')
bow_corpus = _load_pickle('bow_corpus.pkl')
dictionary = _load_pickle('dictionary.dict')

# CSV of all documents with Year, Abstract, and Processed columns.
# Non-numeric years become NaN, and the frame gets a fresh 0..N-1 index.
file_name = 'preprocessed_docs.csv'
df = pd.read_csv(file_name, low_memory=False)
df = df.assign(Year=pd.to_numeric(df['Year'], errors='coerce'))
df = df.reset_index(drop=True)

# BoA Leiden cluster labels
file_name = 'leiden_labels_biology.csv'
leiden_labels_biology = pd.read_csv(file_name)

Generate a CSV file containing only the filtered Biology of Aging (BoA) documents, with the Year, Abstract, and Processed columns

In [None]:
# Find the positions in `preprocessed_docs` of the documents that also appear in
# `filtered_docs_biology`, and keep the matching rows of `df`.
def _doc_key(doc):
    """Return a hashable stand-in for `doc` (a list becomes a tuple)."""
    return tuple(doc) if isinstance(doc, list) else doc


try:
    # Hash the filtered documents once so each membership test is O(1) instead
    # of a linear scan over `filtered_docs_biology` for every document.
    filtered_doc_keys = {_doc_key(doc) for doc in filtered_docs_biology}
    filtered_indices = [
        i for i, doc in enumerate(preprocessed_docs)
        if _doc_key(doc) in filtered_doc_keys
    ]
except TypeError:
    # Documents that cannot be hashed: fall back to the plain (slow) scan.
    filtered_indices = [
        i for i, doc in enumerate(preprocessed_docs)
        if doc in filtered_docs_biology
    ]

# The selected rows keep their original row labels, i.e. their positions in
# `preprocessed_docs`; they are NOT renumbered 0..k-1.
filtered_df_with_year = df.loc[filtered_indices]

file_name = 'preprocessed_docs_biology.csv'
filtered_df_with_year.to_csv(file_name, index=False)

# From here on `df` refers to the filtered documents only
df = filtered_df_with_year

Plot the UMAP embedding split by publication-year range (roughly one panel per decade)

In [None]:
# Create a new column for year ranges
def classify_year_range(year):
    """Return the label of the year range that contains `year`.

    The label is 'FIRST-LAST' for one of the five ranges between 1975 and
    2023. Any value that falls in none of them (including NaN, for which every
    comparison is false) yields 'Unknown'.
    """
    bounds = (
        (1975, 1984),
        (1985, 1994),
        (1995, 2004),
        (2005, 2014),
        (2015, 2023),
    )
    for first, last in bounds:
        if first <= year <= last:
            return f'{first}-{last}'
    return 'Unknown'

# Label every filtered document with its year range
df['Year_Range'] = filtered_df_with_year['Year'].apply(classify_year_range)

# Create a dataframe with umap embeddings and year range.
# `umap_df` is indexed 0..k-1, whereas `df` keeps the row labels it was selected
# with (`df.loc[filtered_indices]`). An index-aligned concat would therefore pair
# the wrong rows and introduce NaNs, so the two are paired by position instead.
umap_df = pd.DataFrame(umap_result_biology, columns=['UMAP1', 'UMAP2'])
if len(umap_df) != len(df):
    raise ValueError(
        f'UMAP embedding has {len(umap_df)} rows but there are {len(df)} filtered documents'
    )
combined_df = umap_df.assign(Year_Range=df['Year_Range'].to_numpy())
year_ranges = ['1975-1984', '1985-1994', '1995-2004', '2005-2014', '2015-2023']

# Plot individual UMAPs for each year range, one panel per range
fig, axes = plt.subplots(1, len(year_ranges), figsize=(25, 5))
for ax, year_range in zip(axes, year_ranges):
    subset = combined_df[combined_df['Year_Range'] == year_range]
    ax.scatter(subset['UMAP1'], subset['UMAP2'], c='navy', s=0.5, alpha=0.2)
    ax.set_title(f'UMAP Biology of Aging ({year_range})')
    # UMAP axes carry no interpretable scale, so hide ticks and the frame
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

fig.tight_layout()
fig.savefig('Year ranges UMAP BoA.png', dpi=600)
plt.show()

Temporal evolution of cluster proportions

In [None]:
# Human-readable cluster names; a name's position in this list is its cluster id
_cluster_labels_in_id_order = [
    'GWAS',                    # 0
    'Gene regulation',         # 1
    'Cancer',                  # 2
    'Epigenetics',             # 3
    'Insulin/IGF pathway',     # 4
    'Oxidative stress',        # 5
    'Stem cells',              # 6
    'Mesenchymal stem cells',  # 7
    'Mouse models',            # 8
    'Biochemistry',            # 9
    'Neuroscience',            # 10
    'Comparative studies',     # 11
    'Skin',                    # 12
    'Immunology',              # 13
    'Clinical genetics',       # 14
    'Mitochondria',            # 15
    'Protein biology',         # 16
    'Senescence',              # 17
    'Genomic stability',       # 18
    "Alzheimer's",             # 19
    'Muscle',                  # 20
    'RNA biology',             # 21
    'Receptor biology',        # 22
    'Cytokines',               # 23
    'Telomeres',               # 24
    'Sirtuins & mTOR',         # 25
    'Autophagy',               # 26
]

# Mapping from integer cluster id to display name
cluster_names = dict(enumerate(_cluster_labels_in_id_order))

# Add Leiden labels to the dataframe.
# Assigning a Series aligns on index labels, but `df` keeps the row labels it was
# selected with (`df.loc[filtered_indices]`) while the label file is indexed
# 0..k-1. Assign by position instead; a length mismatch raises a ValueError.
df['Cluster'] = leiden_labels_biology['leiden'].to_numpy()

# Documents per (year, cluster), then each cluster's share of that year's
# documents, smoothed with a 5-row rolling mean (shorter windows at the start)
yearly_counts = df.groupby(['Year', 'Cluster']).size().unstack(fill_value=0)
total_docs_per_year = yearly_counts.sum(axis=1)
proportion_counts = yearly_counts.div(total_docs_per_year, axis=0)
moving_avg = proportion_counts.rolling(window=5, min_periods=1).mean()

# Keep only the years 1975 to 2023 (the smoothing above still sees earlier years)
in_plot_window = (moving_avg.index >= 1975) & (moving_avg.index <= 2023)
moving_avg_filtered = moving_avg[in_plot_window]

# Order the named clusters from most to fewest documents overall
total_per_cluster = yearly_counts.sum().sort_values(ascending=False)
sorted_cluster_names = {
    cluster: cluster_names[cluster]
    for cluster in total_per_cluster.index
    if cluster in cluster_names
}

# One panel per cluster on a 6x6 grid, saved as a single PDF
fig, axes = plt.subplots(nrows=6, ncols=6, figsize=(40, 25))
axes = axes.flatten()

for idx, (cluster, name) in enumerate(sorted_cluster_names.items()):
    ax = axes[idx]
    if cluster not in moving_avg_filtered.columns:
        ax.axis('off')
        continue
    ax.plot(moving_avg_filtered.index, moving_avg_filtered[cluster], linestyle='-', color='navy')
    ax.set_title(f'{name}', fontsize=24)
    ax.set_xlabel('Year', fontsize=24)
    ax.set_ylabel('Proportion of Documents', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.grid(False)

# Hide any unused subplots
for ax in axes[len(sorted_cluster_names):]:
    ax.set_visible(False)
plt.tight_layout()
plt.savefig('Cluster_Evolution_Panel_BoA.pdf')
plt.show()

Calculate mean publication year per cluster

In [None]:
# Attach the Leiden label to every document by position FIRST, and only then keep
# documents from 1975 or newer. Re-attaching the labels after the filter and index
# reset would hand the first m labels to the m surviving rows, shifting every
# label that follows a dropped row.
df_filtered = (
    df.assign(Cluster=leiden_labels_biology['leiden'].to_numpy())
    .loc[lambda frame: frame['Year'] >= 1975]
    .reset_index(drop=True)
)

# Calculate the mean year for each cluster and attach its display name
mean_year_per_cluster = df_filtered.groupby('Cluster')['Year'].mean().reset_index()
mean_year_per_cluster['Cluster_Name'] = mean_year_per_cluster['Cluster'].map(cluster_names)

# Plot the mean year for each cluster, most recent first
plt.figure(figsize=(12, 8))
mean_year_per_cluster_sorted = mean_year_per_cluster.sort_values('Year', ascending=False)
sns.barplot(x='Year', y='Cluster_Name', data=mean_year_per_cluster_sorted, palette='RdBu')
plt.xlabel('Mean Year of Publication')
plt.ylabel('Cluster')
plt.title('Mean Year of Publication per BoA Cluster')
# The x-axis is fixed to 2008-2012; bars outside that window are clipped
plt.xlim(2008, 2012)
plt.xticks(ticks=range(2008, 2013, 2))
plt.savefig('Mean year of publication per BoA cluster.pdf', bbox_inches='tight')
plt.show()

Cosine similarity trend

In [None]:
# Get topic distribution for each document using LDA model.
# Each document becomes a dense vector with one entry per topic; topics the
# model does not return for a document stay at 0.
topic_distributions = []
for doc_bow in bow_corpus:
    topic_vector = np.zeros(lda_model.num_topics)
    for topic_id, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0):
        topic_vector[topic_id] = prob
    topic_distributions.append(topic_vector)

df_topics = pd.DataFrame(topic_distributions)

# Add cluster labels and years.
# The Leiden labels are indexed 0..k-1 while `df` keeps the row labels it was
# selected with (`df.loc[filtered_indices]`), so attach them by position rather
# than by index label; a length mismatch raises a ValueError.
df['Leiden_Cluster'] = leiden_labels_biology['leiden'].to_numpy()
# These two assignments align on index labels: row i of `df_topics` receives the
# values of the `df` row labelled i, and rows with no match get NaN.
# NOTE(review): this assumes `df`'s row labels are document positions in
# `bow_corpus` — confirm that `bow_corpus` covers the same documents, in the same
# order, as `preprocessed_docs`.
df_topics['Year'] = df['Year']
df_topics['Cluster'] = df['Leiden_Cluster']

# Cosine similarity between two 1-D vectors
def cosine_similarity(v1, v2):
    """Return 1 minus the SciPy cosine distance between `v1` and `v2`.

    Note: this definition replaces the `cosine_similarity` imported from
    `sklearn.metrics.pairwise` at the top of the notebook.
    """
    distance = cosine(v1, v2)
    return 1 - distance

# Analyze topic distributions over time within each cluster: for every cluster,
# compare the mean topic vector of each year with that of the next year present
# in the data.
results = defaultdict(list)
years = sorted(df['Year'].dropna().unique())

# Topic columns are everything except the two metadata columns; selecting them
# by name avoids relying on 'Year' and 'Cluster' being the last two columns.
topic_columns = df_topics.columns.drop(['Year', 'Cluster'])

# Missing cluster ids are skipped: they match no rows and have no display name
for cluster in df['Leiden_Cluster'].dropna().unique():
    cluster_data = df_topics[df_topics['Cluster'] == cluster]

    # Mean topic vector per year, computed once per year. A year with no
    # documents in this cluster yields an all-NaN vector.
    mean_topics_by_year = {
        year: cluster_data.loc[cluster_data['Year'] == year, topic_columns].mean(axis=0).values
        for year in years
    }

    for year_1, year_2 in zip(years[:-1], years[1:]):
        # Similarity is NaN whenever either year has no documents in the cluster
        cosine_sim = cosine_similarity(mean_topics_by_year[year_1], mean_topics_by_year[year_2])
        results[cluster].append({
            'Year_1': year_1,
            'Year_2': year_2,
            'Cosine_Similarity': cosine_sim,
        })

# Convert results to a long DataFrame: one row per (cluster, consecutive year pair)
df_results = pd.DataFrame([
    {'Cluster': cluster, 'Year_1': res['Year_1'], 'Year_2': res['Year_2'],
     'Cosine_Similarity': res['Cosine_Similarity']}
    for cluster, result in results.items() for res in result
])

In [None]:
# Filter data from the year 1975 onwards
df_results_filtered = df_results[df_results['Year_1'] >= 1975].copy()
unique_clusters = sorted(df_results_filtered['Cluster'].unique())

# Slope of the smoothed year-to-year similarity, keyed by cluster display name
cluster_trends = {}

for cluster in unique_clusters:
    cluster_data = df_results_filtered[df_results_filtered['Cluster'] == cluster].copy()
    # 5-row moving average; a window containing any NaN similarity is NaN and is
    # dropped, as are the first four rows (incomplete windows)
    cluster_data['Cosine_MA'] = cluster_data['Cosine_Similarity'].rolling(window=5).mean()
    cluster_data = cluster_data.dropna(subset=['Cosine_MA'])

    # A slope needs at least two points; fitting on an empty series raises.
    if len(cluster_data) < 2:
        print(f'Skipping cluster {cluster}: fewer than two moving-average points')
        continue

    # Fit a linear regression of the moving average on the year; its single
    # coefficient is the trend
    X = cluster_data['Year_1'].values.reshape(-1, 1)
    y = cluster_data['Cosine_MA'].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    # Fall back to a generic label for cluster ids without a display name
    cluster_trends[cluster_names.get(cluster, f'Cluster {cluster}')] = slope

# Sort the clusters by trend (slope), ascending
sorted_trends = sorted(cluster_trends.items(), key=lambda item: item[1])

# Bar plot of cosine similarity trends
cluster_names_sorted = [name for name, _ in sorted_trends]
slopes_sorted = [slope for _, slope in sorted_trends]
plt.figure(figsize=(10, 8))
plt.barh(cluster_names_sorted, slopes_sorted, color='navy')
plt.xlabel('Trend (Slope of 5-Year Moving Average)')
plt.ylabel('Cluster')
plt.title('Trend of Cosine Similarity Across BoA Clusters')
plt.savefig('Trend of Cosine Similarity Across BoA Clusters.pdf', bbox_inches='tight')
plt.show()