# Investigate gene mentions across publications
- Calculate total gene mentions, the most frequently mentioned genes, gene mentions per article, and the average number of gene mentions per article
- Create output figures
- Investigate articles with a very high number of gene mentions (>= 20 genes per article)

## 1) Load libraries and datasets

In [None]:
# Standard library
import datetime
import os
import time

# Third-party: numerics, data frames, progress bars and plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

# Reaching this line means every import above resolved.
print("Success!")

In [None]:
# Switch into the output directory. The relative file names used below
# are resolved against it.
output_directory = "OUTPUT_DIRECTORY"
os.chdir(output_directory)
print("Current Working Directory:", os.getcwd())

# Read the two input tables. The gene file is read with header=None, so
# its first row is treated as data rather than as column names.
genes_file = "civic_genes.csv"
civic_df = pd.read_csv(genes_file, header=None)
NER_gene_list = pd.read_csv("BioBERT_file.csv")

# Report how many rows were loaded, then show the table itself.
n_loaded_articles = len(NER_gene_list)
print("\nTotal articles: {:,}".format(n_loaded_articles))
print(NER_gene_list)

## 2) Calculate gene mentions

In [None]:
# Per-gene and per-article mention statistics.
# The first column of civic_df supplies the labels of the gene columns
# that are selected from NER_gene_list.


def _with_share(value, denominator, number_spec=","):
    """Render value with thousands separators plus its percentage of denominator."""
    share = value / denominator * 100
    return f"{format(value, number_spec)} ({format(share, '.2f')}%)"


relevant_columns = civic_df.iloc[:, 0].tolist()
gene_matrix = NER_gene_list[relevant_columns]
total_articles = len(NER_gene_list)

# Total mentions per gene, largest first, each as a share of all gene mentions.
column_sums = gene_matrix.sum().sort_values(ascending=False)
total_gene_mentions = column_sums.sum()
column_sums_formatted = column_sums.apply(
    lambda n: _with_share(n, total_gene_mentions, ",.0f")
)

# Number of rows (articles) with a non-zero value per gene, largest first.
gene_article_counts = (gene_matrix > 0).sum().sort_values(ascending=False)
gene_article_counts_formatted = gene_article_counts.apply(
    lambda n: _with_share(n, total_articles)
)

# Statistics over the 'Sum_Gene_Mentions' column (one value per row).
per_article_mentions = NER_gene_list['Sum_Gene_Mentions']
total_mentions = per_article_mentions.sum()
average_mentions = per_article_mentions.mean()
mentions_distribution = per_article_mentions.value_counts().sort_index()
mentions_distribution_formatted = mentions_distribution.apply(
    lambda n: _with_share(n, total_articles)
)

# Output
report_sections = [
    ("\nTotal mentions per gene (with % of all gene mentions):\n",
     column_sums_formatted),
    ("\nNumber of articles each gene appears in (with % of total articles):\n",
     gene_article_counts_formatted),
    ("\nTop 10 genes by total mentions:\n",
     column_sums_formatted.head(10)),
    ("\nTop 10 genes by article count:\n",
     gene_article_counts_formatted.head(10)),
]
for heading, table in report_sections:
    print(heading)
    print(table.to_string())

print("\nTotal number of articles: {:,}".format(total_articles))
print("Total sum of 'Sum_Gene_Mentions': {:,}".format(total_mentions))
print("Average gene mentions per article: {:,.2f}".format(average_mentions))

print("\nDistribution of gene mentions per article (with % of total articles):\n")
print(mentions_distribution_formatted.to_string())

## 3) Create figures and an overview of gene mentions

In [None]:
# Plot 1: Mentions of individual genes over time
# One line per gene; the y-value is that gene's column summed per 'PubYear'.
plt.style.use('seaborn-v0_8-muted')
mpl.rcParams.update({'font.size': 11, 'axes.titleweight': 'bold'})

gene_columns = civic_df.iloc[:, 0].tolist()
mentions_by_year = NER_gene_list.groupby('PubYear')[gene_columns].sum()

# One distinct hue per gene.
palette = sns.color_palette("husl", len(gene_columns))

fig, ax = plt.subplots(figsize=(11, 8))
for position, gene in enumerate(gene_columns):
    ax.plot(
        mentions_by_year.index,
        mentions_by_year[gene],
        label=gene,
        linewidth=2,
        color=palette[position],
    )

ax.set_title("Mentions of individual genes over time", fontsize=14, pad=20)
ax.set_xlabel("Publication year", fontsize=12)
ax.set_ylabel("Number of mentions", fontsize=12)
ax.legend(
    title="Gene",
    loc='upper left',
    fontsize=9,
    title_fontsize=10,
    frameon=False,
)
ax.grid(axis='y', linestyle='--', alpha=0.6)

fig.tight_layout()
fig.savefig("plot1_mentions_over_time.png", dpi=300, bbox_inches='tight')
print("Plot 1 saved as: plot1_mentions_over_time.png")
plt.show()

In [None]:
# Plot 2: Distribution of genes mentioned per article over time
# Stacked bars: one bar per 'PubYear', one segment per distinct number of
# genes with a non-zero value in a row.
plt.style.use('seaborn-v0_8-muted')
mpl.rcParams.update({'font.size': 11, 'axes.titleweight': 'bold'})


def _thousands(value, _position):
    """Tick formatter: integer part of the tick value with thousands separators."""
    return f'{int(value):,}'


gene_columns = civic_df.iloc[:, 0].tolist()

# Adds a column to NER_gene_list: how many gene columns are > 0 in each row.
NER_gene_list['Genes_Mentioned_Count'] = (NER_gene_list[gene_columns] > 0).sum(axis=1)

# Count rows per (year, genes-mentioned) pair, then reshape to year x count.
group_sizes = NER_gene_list.groupby(['PubYear', 'Genes_Mentioned_Count']).size()
mentions_over_time = group_sizes.reset_index(name='Article_Count')
mentions_pivot = mentions_over_time.pivot(
    index='PubYear',
    columns='Genes_Mentioned_Count',
    values='Article_Count',
)
mentions_pivot = mentions_pivot.fillna(0)  # year/count pairs with no rows
ordered_counts = sorted(mentions_pivot.columns)
mentions_pivot = mentions_pivot.loc[:, ordered_counts]

fig, ax = plt.subplots(figsize=(11, 8))
colors = sns.color_palette("YlGnBu", n_colors=len(mentions_pivot.columns))
mentions_pivot.plot(kind='bar', stacked=True, ax=ax, color=colors, edgecolor='black')
ax.yaxis.set_major_formatter(mtick.FuncFormatter(_thousands))

# Recomputed here; neither value is used for the figure within this cell.
total_articles = len(NER_gene_list)
average_mentions = NER_gene_list['Sum_Gene_Mentions'].mean()

ax.set_title("Distribution of genes mentioned per article over time", fontsize=14, pad=15)
ax.set_xlabel("Publication year", fontsize=12)
ax.set_ylabel("Number of articles", fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

# Legend entries are listed in reverse of the plotting order.
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    list(reversed(handles)),
    list(reversed(labels)),
    title="Genes per article",
    loc='upper left',
    fontsize=9,
    title_fontsize=10,
    frameon=False,
)
ax.grid(axis='y', linestyle='--', alpha=0.6)

fig.tight_layout(rect=[0, 0.05, 1, 0.95])
fig.savefig("plot2_distribution_per_article.png", dpi=300, bbox_inches='tight')
print(" Plot 2 saved as: plot2_distribution_per_article.png")
plt.show()