# Investigate gene mentions across publications
- Calculate total gene mentions, most common gene mentions, gene mentions per article and average gene mentions
- Create output figures
- Investigate articles with a very high number of gene mentions (> 20 genes per article)

## 1) Load libraries and datasets

In [None]:
import os
import time
import datetime
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import numpy as np
print("Success!")

In [None]:
# Placeholder locations; replace both strings with real paths before running.
input_directory, output_directory = "INPUT_DIRECTORY", "OUTPUT_DIRECTORY"

# Report the working directory before and after switching to the output folder.
cwd_label = "Current Working Directory:"
print(cwd_label, os.getcwd())
os.chdir(output_directory)
print(cwd_label, os.getcwd())

In [None]:
# Load the article table; the path is relative to the current working directory.
NER_gene_list = pd.read_csv("cleaned_BioBERT_data.csv")

# Show the row count (one row per article) followed by the table itself.
print(f"\nTotal articles: {len(NER_gene_list):,}")
print(NER_gene_list)

In [None]:
# Load the Oncomine NGS panel table from the input directory.
# header=None: the first line of the file is read as data, not column names.
os.chdir(input_directory)
try:
    print("Current Working Directory:", os.getcwd())
    oncomine_df = pd.read_csv("oncomine_ngs_panel.csv", header=None)
    print("Oncomine DataFrame:\n", oncomine_df)
    print("Total Rows in oncomine_df:", len(oncomine_df))
finally:
    # Always switch back to the output directory, even when the read fails,
    # so later relative reads/writes do not land in the input folder.
    os.chdir(output_directory)
    print("Current Working Directory:", os.getcwd())

## 2) Calculate gene mentions

In [None]:
# Column names to aggregate: the first column of oncomine_df.
relevant_columns = oncomine_df.iloc[:, 0].tolist()

# Per-column totals across all rows, ranked from largest to smallest.
per_gene_totals = NER_gene_list[relevant_columns].sum()
column_sums = per_gene_totals.sort_values(ascending=False)
column_sums_formatted = column_sums.apply("{:,.0f}".format)

# Summary statistics of the per-article 'Sum_Gene_Mentions' column.
gene_mentions = NER_gene_list['Sum_Gene_Mentions']
total_mentions = gene_mentions.sum()
average_mentions = gene_mentions.mean()

# How many articles have each distinct 'Sum_Gene_Mentions' value, ordered by value.
mentions_distribution = gene_mentions.value_counts().sort_index()
mentions_distribution_formatted = mentions_distribution.apply("{:,}".format)

# --- Report ---
print("\nRanking of column sums from oncomine_df:\n")
print(column_sums_formatted.to_string())

print("\nTop 10 genes mentioned the most:\n")
print(column_sums_formatted.head(10).to_string())

print(f"\nTotal articles: {len(NER_gene_list):,}")
print(f"Total sum of 'Sum_Gene_Mentions': {total_mentions:,}")
print(f"Average number of gene mentions per article: {average_mentions:,.2f}")

print("\nDistribution of Sum_Gene_Mentions counts:\n")
print(mentions_distribution_formatted.to_string())

## 3) Create figure and overview of gene mentions

In [None]:
# Figure 1: Bar chart for ranking of summed relevant columns

%matplotlib inline  
fig, ax = plt.subplots(figsize=(14, 7))

# Sorting and calculating percentages
top_genes = column_sums.sort_values(ascending=False).head(15)
total_articles = len(NER_gene_list)
percentages = (top_genes / total_articles) * 100
average_mentions = column_sums.sum() / total_articles 

# Color gradient
colors = [plt.cm.Blues(1 - (i / len(top_genes))) for i in range(len(top_genes))]

# Plot the bar chart
bars = top_genes.plot(kind='bar', color=colors, edgecolor='black', ax=ax)
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))
ax.set_ylim(0, top_genes.max() * 1.2) 
for bar, value, percent in zip(bars.patches, top_genes, percentages):
    ax.text(
        bar.get_x() + bar.get_width() / 2, 
        bar.get_height() + (top_genes.max() * 0.04), 
        f"{int(value):,}\n({percent:.1f}%)", 
        ha='center', va='bottom', fontsize=10, fontweight='bold', color='black'
    )

# Titles and labels in sentence case
ax.set_title(f'Top 15 genes mentioned by articles (n={total_articles:,.0f} articles)', 
             fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel('Number of articles mentioning gene', fontsize=12)
ax.set_xlabel('Gene (out of 161 Oncomine NGS panel)', fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout(rect=[0, 0.05, 1, 0.9])
plt.show()

In [None]:
# Figure 2: Improved Plot for Sum_Gene_Mentions Distribution
%matplotlib inline  
fig, ax = plt.subplots(figsize=(12, 7))

# Create bar plot with green gradient colors
colors = sns.color_palette("Greens_r", len(mentions_distribution))  
ax = sns.barplot(x=mentions_distribution.index, y=mentions_distribution.values, 
                 palette=colors, edgecolor='black')
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))
ax.set_title(f'Distribution of gene mentions per article (n={total_articles:,.0f} articles)', 
             fontsize=14, fontweight='bold', pad=25)
plt.figtext(0.5, 0.6, f'Average number of gene mentions per article: {average_mentions:,.2f}', 
            fontsize=12, color='gray', ha='center')
ax.set_xlabel('Number of genes mentioned', fontsize=12, labelpad=10)
ax.set_ylabel('Number of articles', fontsize=12)
ax.set_xticks(range(len(mentions_distribution.index)))
ax.set_xticklabels(mentions_distribution.index, rotation=45, ha='right', fontsize=10)
ax.set_ylim(0, max(mentions_distribution.values) * 1.2)
ax.grid(axis='y', linestyle='--', alpha=0.7)
for bar, value in zip(ax.patches, mentions_distribution.values):
    ax.text(
        bar.get_x() + bar.get_width() / 2,  
        bar.get_height() + (max(mentions_distribution.values) * 0.02),  
        f"{value:,}",
        ha='center', va='bottom', fontsize=9, fontweight='bold', color='black'
    )
fig.tight_layout(rect=[0, 0.05, 1, 0.85])
plt.show()

## 4) Investigate high numbers of gene mentions in PaperTitle and Abstracts

In [None]:
# Keep only articles whose 'Sum_Gene_Mentions' is strictly greater than 20,
# restricted to title, abstract and count, ordered from most to fewest mentions.
report_columns = ['PaperTitle', 'Abstract', 'Sum_Gene_Mentions']
exceeds_20 = NER_gene_list['Sum_Gene_Mentions'] > 20
high_gene_mentions_df = (
    NER_gene_list.loc[exceeds_20, report_columns]
    .sort_values(by='Sum_Gene_Mentions', ascending=False)
)
count_above_20 = len(high_gene_mentions_df)

# Write a plain-text report: a one-line summary followed by the full table.
output_file = "high_gene_mentions.txt"
summary_line = f"Total number of rows where Sum_Gene_Mentions > 20: {count_above_20}\n\n"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(summary_line)
    f.write(high_gene_mentions_df.to_string(index=False))

# Echo the same information in the notebook output.
print(f"\nFiltered and sorted dataset saved to {output_file}")
print(f"\nTotal number of articles where Sum_Gene_Mentions > 20: {count_above_20} articles\n\n")
print(high_gene_mentions_df)

#### Check dataset and verify all articles with high gene mentions