# Extract 161 Oncomine genes with BioBERT and MyGene.info API

- Define the genes of interest, i.e., the 161 genes of the Oncomine NGS panel
- Load OpenAlex dataset after full cleaning
- Load BioBERT model to detect genes
- Connect to MyGene.info API to receive gene-associated products (i.e., proteins) and synonyms
- Extract all publications with gene mentions, and create binary matrix
- Drop all publications without gene mentions, and create new dataset of relevant articles

## 1) Install libraries and load dataset

In [None]:
!pip install transformers
from transformers import pipeline
# Load  BioBERT model
# NOTE(review): `biobert_model` is assigned again (with the
# "alvaroalon2/biobert_genetic_ner" checkpoint) in a later cell, before its
# first use in the extraction loop, so this load looks redundant -- confirm
# whether it is still needed.
biobert_model = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1", tokenizer="dmis-lab/biobert-base-cased-v1.1")
print("BioBERT model loaded successfully!")

In [None]:
import pandas as pd
import os
import re
import time
from fuzzywuzzy import process, fuzz
from tqdm import tqdm
import torch
print("Import successful!")

In [None]:
# Directory and file-name placeholders; replace them with real locations
# before running the notebook.
working_directory = "WORKING _DIRECTORY"
input_directory = "INPUT_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
articles_file = "CLEANED_ARTICLE_FILE.csv"
genes_file = "GENS_OF_INTEREST.csv"

# Work from the output directory and report where we ended up
os.chdir(output_directory)
print(f"Current Working Directory: {os.getcwd()}")

In [None]:
# Load OncoMine genes file: first column of a header-less CSV in the input directory
os.chdir(input_directory)
genes = pd.read_csv(genes_file, header=None)
# Drop empty cells and surrounding whitespace so every entry is a clean
# string; later cells call `.upper()` on each symbol and use it as a column name.
gene_list = genes[0].dropna().astype(str).str.strip().tolist()
print("Genes import successful!")

# Load the articles file
# NOTE(review): the articles CSV is read from `output_directory`, not
# `input_directory` -- confirm that is where the cleaned file lives.
os.chdir(output_directory)
articles = pd.read_csv(articles_file)
print("Article import successful!")

In [None]:
# Report how many articles and genes were loaded
print(f"Loaded full dataset of {len(articles):,} articles")
print(f"Loaded {len(gene_list):,} oncomine genes")

# Row and column counts, unpacked from the frame's shape in one step
num_rows, num_columns = articles.shape
print(f"\nThe articles df contains {num_rows:,} rows and {num_columns:,} columns.")

# List every column of the articles frame, one per line
column_names = articles.columns.tolist()

print("The column names in the articles df are:")
for col in column_names:
    print(col)

# The extraction below runs on `article_df`; here it is the full dataset
article_df = articles
print(f"\nThis execution will be processing: {len(article_df):,} articles")

In [None]:
# Build the NER pipeline from the "biobert_genetic_ner" checkpoint.
# Device 0 is requested when CUDA is available, otherwise -1.
biobert_model = pipeline(
    task="ner",
    model="alvaroalon2/biobert_genetic_ner",
    tokenizer="alvaroalon2/biobert_genetic_ner",
    device=-1 if not torch.cuda.is_available() else 0,
)
print("BioBERT model loaded successfully!")

## 2) Extract gene mentions in articles

In [None]:
# Function to fetch gene synonyms and associated proteins (from MyGene.info API)
def get_gene_synonyms(gene_symbol):
    """Fetch the upper-cased symbols, aliases and other names MyGene.info returns for a gene.

    Args:
        gene_symbol: Gene symbol sent as the ``q`` parameter of the query endpoint.

    Returns:
        A set of upper-cased strings collected from the ``symbol``, ``alias``
        and ``other_names`` fields of every hit (empty when there are no
        hits). If the request or the JSON decoding fails, a warning is printed
        and the set contains only ``gene_symbol`` upper-cased.

    NOTE(review): the query is not restricted to a species or to the symbol
    field, so hits may belong to other genes -- confirm this is intended.
    """
    # Local stdlib imports keep the function self-contained: the notebook
    # never imports an HTTP client at the top level.
    import http.client
    import json
    import urllib.parse
    import urllib.request

    # Percent-encode the symbol so unusual characters cannot break the URL.
    query = urllib.parse.quote(str(gene_symbol), safe="")
    url = f"https://mygene.info/v3/query?q={query}&fields=symbol,alias,other_names"
    try:
        # The timeout stops one stalled request from hanging the whole loop.
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = json.load(response)
    except (OSError, ValueError, http.client.HTTPException) as err:
        # Best-effort lookup: report the failure, then fall back to the symbol.
        print(f"Warning: MyGene.info lookup failed for {gene_symbol!r}: {err}")
        return {gene_symbol.upper()}

    synonyms = set()
    hits = payload.get("hits", []) if isinstance(payload, dict) else []
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        symbol = hit.get("symbol")
        if isinstance(symbol, str):
            synonyms.add(symbol.upper())
        for field in ("alias", "other_names"):
            values = hit.get(field) or []
            # A field holding a single value arrives as a plain string;
            # wrap it so it is not iterated character by character.
            if isinstance(values, str):
                values = [values]
            synonyms.update(value.upper() for value in values if isinstance(value, str))
    return synonyms


# Step 1: Map every upper-cased gene symbol to the synonym set fetched for it
expanded_gene_list = {symbol.upper(): get_gene_synonyms(symbol) for symbol in gene_list}
print(f"Expanded gene list contains {len(expanded_gene_list)} genes with synonyms.")


# Function to normalize and reconstruct gene names from BioBERT
def normalize_extracted_genes(found_terms):
    """Map extracted entity strings onto the keys of ``expanded_gene_list``.

    Each term is upper-cased and resolved in three stages: an exact key match,
    a key match on any word left after brackets, commas and hyphens are
    replaced by spaces, and finally a fuzzy match (``fuzz.ratio`` above 85)
    against the keys. Only the dictionary keys are consulted; the synonym sets
    stored as values are not used here.

    Returns the set of matched keys.
    """
    resolved = set()

    for raw_term in found_terms:
        candidate = raw_term.upper()

        # Stage 1: the whole term is a known key
        if candidate in expanded_gene_list:
            resolved.add(candidate)
            continue

        # Stage 2: look for a known key among the term's words
        words = re.sub(r"[\[\]\(\),-]", " ", candidate).split()
        resolved.update(word for word in words if word in expanded_gene_list)
        if any(word in resolved for word in words):
            continue

        # Stage 3: fuzzy match, reached only when no word matched
        best = process.extractOne(candidate, expanded_gene_list.keys(), scorer=fuzz.ratio)
        if best:
            symbol, score = best[:2]  # ignore any extra tuple elements
            if score > 85:
                resolved.add(symbol)

    return resolved
print("Success!")

In [None]:
# Function to process text with BioBERT
def process_biobert(text, model):
    """Run the NER model on *text* and return the normalized gene names found.

    Missing or blank text yields an empty set. Otherwise the token pieces of
    each entity (a ``B-`` tagged token followed by ``I-`` tagged tokens) are
    concatenated without separators after removing ``##`` markers, and the
    resulting strings are passed to ``normalize_extracted_genes``.
    """
    if pd.isna(text) or not text.strip():
        return set()

    terms = set()
    pieces = []
    for token in model(text):
        piece = token["word"].replace("##", "")
        tag = token["entity"]
        if tag.startswith("B-"):
            # A new entity starts: flush the one collected so far
            if pieces:
                terms.add("".join(pieces))
            pieces = [piece]
        elif tag.startswith("I-"):
            pieces.append(piece)

    # Flush the entity still open when the tokens run out
    if pieces:
        terms.add("".join(pieces))

    return normalize_extracted_genes(terms)
print("Success!")

# =================================================

## 3) Run gene extraction with BioBERT and sliding window approach

In [None]:
# Gene extraction with BioBERT and sliding window approach

# Function to fetch gene synonyms from MyGene.info API
def get_gene_synonyms(gene_symbol):
    """Fetch the upper-cased symbols, aliases and other names MyGene.info returns for a gene.

    Args:
        gene_symbol: Gene symbol sent as the ``q`` parameter of the query endpoint.

    Returns:
        A set of upper-cased strings collected from the ``symbol``, ``alias``
        and ``other_names`` fields of every hit (empty when there are no
        hits). If the request or the JSON decoding fails, a warning is printed
        and the set contains only ``gene_symbol`` upper-cased.

    NOTE(review): the query is not restricted to a species or to the symbol
    field, so hits may belong to other genes -- confirm this is intended.
    """
    # Local stdlib imports keep the function self-contained: the notebook
    # never imports an HTTP client at the top level.
    import http.client
    import json
    import urllib.parse
    import urllib.request

    # Percent-encode the symbol so unusual characters cannot break the URL.
    query = urllib.parse.quote(str(gene_symbol), safe="")
    url = f"https://mygene.info/v3/query?q={query}&fields=symbol,alias,other_names"
    try:
        # The timeout stops one stalled request from hanging the whole loop.
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = json.load(response)
    except (OSError, ValueError, http.client.HTTPException) as err:
        # Best-effort lookup: report the failure, then fall back to the symbol.
        print(f"Warning: MyGene.info lookup failed for {gene_symbol!r}: {err}")
        return {gene_symbol.upper()}

    synonyms = set()
    hits = payload.get("hits", []) if isinstance(payload, dict) else []
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        symbol = hit.get("symbol")
        if isinstance(symbol, str):
            synonyms.add(symbol.upper())
        for field in ("alias", "other_names"):
            values = hit.get(field) or []
            # A field holding a single value arrives as a plain string;
            # wrap it so it is not iterated character by character.
            if isinstance(values, str):
                values = [values]
            synonyms.update(value.upper() for value in values if isinstance(value, str))
    return synonyms

# Build the lookup: upper-cased gene symbol -> synonym set fetched for it
expanded_gene_list = {}
for gene in gene_list:
    expanded_gene_list[gene.upper()] = get_gene_synonyms(gene)
print(f"Expanded gene list contains {len(expanded_gene_list)} genes with synonyms.")

# Function to normalize extracted genes
def normalize_extracted_genes(found_terms):
    """Map extracted entity strings onto the keys of ``expanded_gene_list``.

    Each term is upper-cased and resolved in three stages: an exact key match,
    a key match on any word left after brackets, commas and hyphens are
    replaced by spaces, and finally a fuzzy match (``fuzz.ratio`` above 85)
    against the keys. Only the dictionary keys are consulted; the synonym sets
    stored as values are not used here.

    Returns the set of matched keys.
    """
    resolved = set()
    for raw_term in found_terms:
        candidate = raw_term.upper()

        # Stage 1: the whole term is a known key
        if candidate in expanded_gene_list:
            resolved.add(candidate)
            continue

        # Stage 2: look for a known key among the term's words
        words = re.sub(r"[\[\]\(\),-]", " ", candidate).split()
        resolved.update(word for word in words if word in expanded_gene_list)
        if any(word in resolved for word in words):
            continue

        # Stage 3: fuzzy match, reached only when no word matched
        best = process.extractOne(candidate, expanded_gene_list.keys(), scorer=fuzz.ratio)
        if best:
            symbol, score = best[:2]
            if score > 85:
                resolved.add(symbol)
    return resolved

# Function to split text into overlapping chunks for NER
def sliding_window_chunking(text, tokenizer, max_tokens=512, stride=256):
    """Split *text* into overlapping token windows, each decoded back to a string.

    Args:
        text: Text to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(tokens, skip_special_tokens=True)``.
        max_tokens: Maximum number of tokens per window. Special tokens are
            not counted, because the text is encoded without them.
        stride: Number of tokens the window start advances between chunks.

    Returns:
        A list of decoded chunk strings. Text of at most ``max_tokens`` tokens
        yields a single chunk. Longer text yields one window per stride step,
        ending with the window that reaches the last token; that final window
        may be shorter than ``max_tokens``. With ``stride <= max_tokens`` the
        windows cover every token.

    Raises:
        ValueError: If the text needs splitting and ``stride`` is not positive.

    NOTE(review): a model that adds special tokens to each chunk will see more
    than ``max_tokens`` tokens for a full window -- confirm its input limit.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return [tokenizer.decode(tokens, skip_special_tokens=True)]
    if stride <= 0:
        raise ValueError("stride must be a positive number of tokens")

    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start : start + max_tokens]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        # Keep the final, possibly shorter window so the tail of the text is
        # not lost, then stop: any later window would only repeat tokens
        # already covered.
        if start + max_tokens >= len(tokens):
            break
    return chunks

# Helper: assemble entity strings from one list of NER token results
def _collect_entity_terms(ner_results):
    """Concatenate the pieces of each ``B-``/``I-`` tagged entity into one string.

    ``##`` markers are removed and pieces are joined without separators; tokens
    carrying any other tag are skipped.
    """
    terms = set()
    pieces = []
    for token in ner_results:
        piece = token["word"].replace("##", "")
        tag = token["entity"]
        if tag.startswith("B-"):
            # A new entity starts: flush the one collected so far
            if pieces:
                terms.add("".join(pieces))
            pieces = [piece]
        elif tag.startswith("I-"):
            pieces.append(piece)
    if pieces:
        terms.add("".join(pieces))
    return terms


# Function to process text with BioBERT using sliding window
def process_biobert(text, model):
    """Run the NER model over overlapping chunks of *text*.

    Missing or blank text yields an empty set. Otherwise the text is split
    with ``sliding_window_chunking`` using ``model.tokenizer``, the entity
    strings of every chunk are pooled, and the pool is passed to
    ``normalize_extracted_genes``.
    """
    if pd.isna(text) or not text.strip():
        return set()
    found_terms = set()
    for chunk in sliding_window_chunking(text, model.tokenizer):
        found_terms |= _collect_entity_terms(model(chunk))
    return normalize_extracted_genes(found_terms)

print("Success!")

##### Gene extraction #####
# Record both a readable start time and a numeric timestamp for the runtime report
start_time = time.strftime("%Y-%m-%d %H:%M:%S")
start_timestamp = time.time()
print(f"Processing {len(article_df)} articles with BioBERT. Started at {start_time}")

# One comma-separated gene string per article, in row order
biobert_results = []
for index, row in tqdm(article_df.iterrows(), total=len(article_df), desc="Processing Articles"):
    title = row.get("PaperTitle", "")
    abstract = row.get("Abstract", "")
    # Union of the genes found in the title and in the abstract
    genes_biobert = process_biobert(title, biobert_model) | process_biobert(abstract, biobert_model)
    # Sort before joining: set iteration order is not stable between runs, so
    # an unsorted join would write the same genes in a different order each time.
    biobert_results.append(", ".join(sorted(genes_biobert)))
    print(f"Article {index+1}: {genes_biobert}")

# Create df_results while keeping all columns
df_results = article_df.copy()
df_results["BioBERT"] = biobert_results
num_articles = len(df_results)

# Generate filenames (the article count is embedded in both names)
output_file = f"sliding_window_filtered_articles_biobert_expanded_{num_articles}.csv"
runtime_file = f"sliding_window_filtered_articles_biobert_expanded_{num_articles}_runtime.txt"

end_time = time.strftime("%Y-%m-%d %H:%M:%S")
end_timestamp = time.time()
total_runtime = end_timestamp - start_timestamp
print(f"Processing completed at {end_time}. Total runtime: {total_runtime:.2f} seconds.")

# Save runtime details in a text file
with open(runtime_file, "w") as f:
    f.write(f"Processing of articles: {num_articles}\n")
    f.write(f"Processing started at: {start_time}\n")
    f.write(f"Processing completed at: {end_time}\n")
    f.write(f"Total runtime: {total_runtime:.2f} seconds\n")
print(f"Runtime details saved in: {runtime_file}")

In [None]:
# Save the extraction as CSV
os.chdir(output_directory)
# Same name pattern as the extraction cell: the article count is part of the file name
output_file = f"sliding_window_filtered_articles_biobert_expanded_{num_articles}.csv"

# Save full DataFrame to CSV, keeping all columns
df_results.to_csv(output_file, index=False)

In [None]:
# Binary matrix creation: reload the saved extraction results
os.chdir(output_directory)

# NOTE(review): the article count in this file name is hard-coded, while the
# cell that writes the file builds the name from `num_articles` -- confirm
# the two agree for the run being analysed.
input_file = "sliding_window_filtered_articles_biobert_expanded_2128318.csv"
BioBERT_dfslw = pd.read_csv(input_file)
print(BioBERT_dfslw.head(20))

# KEEP THE ORIGINAL BioBERT COLUMN UNCHANGED
# A copy is kept so it can be restored after the matrix is built; the working
# column gets missing values replaced by "" so it can be split on commas.
BioBERT_originalslw = BioBERT_dfslw["BioBERT"].copy()
BioBERT_dfslw["BioBERT"] = BioBERT_dfslw["BioBERT"].fillna("").astype(str)
print(BioBERT_dfslw.head(20))


In [None]:
# Extract ALL genes from BioBERT column correctly
# Each comma-separated string becomes a set of stripped, non-empty names, so
# the per-gene membership test below is a constant-time lookup.
BioBERT_dfslw["Extracted_Genesslw"] = BioBERT_dfslw["BioBERT"].apply(
    lambda x: {gene.strip() for gene in x.split(",") if gene.strip()}
)

# Create binary matrix for all genes in gene_list
# The normalization step stores gene names upper-cased, so each gene is
# compared in upper case while its column keeps the spelling from gene_list.
binary_gene_dataslw = {
    gene: BioBERT_dfslw["Extracted_Genesslw"].apply(
        lambda genes, symbol=gene.upper(): 1 if symbol in genes else 0
    )
    for gene in gene_list
}

# Convert to df and merge with BioBERT_df
binary_gene_dfslw = pd.DataFrame(binary_gene_dataslw)
BioBERT_dfslw = pd.concat([BioBERT_dfslw, binary_gene_dfslw], axis=1)

# Add a sum column to count gene mentions
BioBERT_dfslw["Sum_Gene_Mentions"] = binary_gene_dfslw.sum(axis=1)

# Restore the original BioBERT column (unchanged)
BioBERT_dfslw["BioBERT"] = BioBERT_originalslw

# Drop temporary extracted genes column
BioBERT_dfslw.drop(columns=["Extracted_Genesslw"], inplace=True)

print(BioBERT_dfslw.head(20))

# Save as CSV
# Change the working directory
os.chdir(output_directory)

output_filename = f"sliding_window_gene_binary_matrix_BioBERT_MyGeneinfo_{num_articles}.csv"
BioBERT_dfslw.to_csv(output_filename, index=False)

print(f"File saved as: {output_filename}")

In [None]:
# Ensure all gene columns are numeric
binary_gene_columnsslw = [col for col in BioBERT_dfslw.columns if col in gene_list]  # Select only valid gene columns
BioBERT_dfslw[binary_gene_columnsslw] = BioBERT_dfslw[binary_gene_columnsslw].apply(pd.to_numeric, errors='coerce')

# Calculate the total sum of the "Sum_Gene_Mentions" column
total_gene_mentionsslw = BioBERT_dfslw["Sum_Gene_Mentions"].sum()

# Calculate the total number of 1s in the binary matrix (ensuring correct columns are used)
total_binary_sumslw = BioBERT_dfslw[binary_gene_columnsslw].sum().sum()

# Create a dictionary with results
results_dict_slw = {
    "Metric": ["Total_Sum_Gene_Mentions", "Total_Binary_Matrix_Sum"],
    "Value": [total_gene_mentionsslw, total_binary_sumslw]
}

# Convert to DataFrame
results_df_slw = pd.DataFrame(results_dict_slw)

# Save to CSV file (tab-separated for .txt format)
results_df_slw.to_csv("sliding_window_Sum_Gene_Mentions.txt", sep="\t", index=False)

print("Results saved to 'sliding_window_Sum_Gene_Mentions.txt'")
# Report the two totals computed above; they should agree. These prints
# previously referenced names without the "slw" suffix, which are never defined.
print(f"Total sum of 'Sum_Gene_Mentions' column: {total_gene_mentionsslw}")
print(f"Cross-check: Total sum of all binary matrix values (1s in the matrix): {total_binary_sumslw}")

In [None]:
###### Create subset of hits only

# Change the working directory
os.chdir(output_directory)

# Load the previously saved CSV file
# NOTE(review): `BioBERT_df` is loaded here, but the filter below runs on the
# in-memory `BioBERT_dfslw`; in the visible notebook `BioBERT_df` is only used
# later for its length -- confirm which frame was meant to be filtered.
input_filename = f"sliding_window_gene_binary_matrix_BioBERT_MyGeneinfo_{num_articles}.csv"
BioBERT_df = pd.read_csv(input_filename)

# Drop rows where "Sum_Gene_Mentions" is 0
BioBERT_df_filteredslw = BioBERT_dfslw[BioBERT_dfslw["Sum_Gene_Mentions"] > 0]

# Count the number of remaining rows
num_filtered_rowsslw = len(BioBERT_df_filteredslw)

# Calculate the total sum of gene mentions after filtering
# (this overwrites the `total_gene_mentionsslw` computed for the unfiltered frame)
total_gene_mentionsslw = BioBERT_df_filteredslw["Sum_Gene_Mentions"].sum()

# Save the filtered dataframe as a new CSV file
output_filename = f"sliding_window_gene_binary_matrix_BioBERT_MyGeneinfo_hits_only{num_articles}.csv"
BioBERT_df_filteredslw.to_csv(output_filename, index=False)

# Print final row count and sum of gene mentions
print(f"Number of rows after filtering: {num_filtered_rowsslw}")
print(f"Total sum of 'Sum_Gene_Mentions' after filtering: {total_gene_mentionsslw}")
print(f"Filtered file saved as: {output_filename}")

In [None]:
# Summary of the gene filter: row counts before and after, then the hit rate in percent
_total_articles = len(BioBERT_dfslw)
_hit_articles = len(BioBERT_df_filteredslw)
articles_wogenementionsslw = _total_articles - _hit_articles
print(f"{articles_wogenementionsslw:,}")
print(f"Articles before gene filtering: {_total_articles:,}")
print(f"Articles without gene mentions: {articles_wogenementionsslw:,}")
print(f"Articles with gene mentions:      {_hit_articles:,}")
print(_hit_articles/_total_articles*100)

In [None]:
# Transfer dataset to new variable for cleaning and normalization
# (a copy, so the cleaning steps work on `BioBERT_df_filtered` rather than on the filtered frame itself)
BioBERT_df_filtered=BioBERT_df_filteredslw.copy()

# ===================================================

## 4) Cleaning and normalization of the new dataset

In [None]:
# Load dataset only if it's not already in memory
if "BioBERT_df_filtered" not in globals():
    print("Loading dataset from file...")
    # The hits-only file is saved with a single ".csv" extension; the doubled
    # ".csv.csv" used here before does not match the saved file name.
    # NOTE(review): the article count in the name is hard-coded -- confirm it
    # matches the run that produced the file.
    BioBERT_df_filtered = pd.read_csv("sliding_window_gene_binary_matrix_BioBERT_MyGeneinfo_hits_only2128318.csv")

# Make a copy so the cleaning below leaves `BioBERT_df_filtered` untouched
BioBERT_df_filtered_cleanedv1 = BioBERT_df_filtered.copy()
print(f"Length of copy:      {len(BioBERT_df_filtered_cleanedv1):,}")

In [None]:
# Start the timer
start_time = time.time()
tqdm.pandas()

### Clean PaperTitle
# Leading numbering artefacts that are stripped from the start of a title
title_patterns = [
    r'^\d{2,3}:\s*',  # Matches "00:" to "999:"
    r'^\d{4}:\s*',  # Matches "0000:" to "9999:"
    r'^\d{5}:\s*',  # Matches "00000:" to "99999:"
    r'^#\d{3,4}\s*',  # Matches "#000" to "#9999" (no space allowed after "#")
    r'^<PHONE>:\s*',  # Matches "<PHONE>: "
]
title_pattern = re.compile("|".join(title_patterns))

print(" Cleaning PaperTitle column...")
# Only real strings are cleaned. Missing values are passed through unchanged
# instead of being cast to str, which would store the literal text "nan".
BioBERT_df_filtered_cleanedv1["PaperTitle"] = BioBERT_df_filtered_cleanedv1["PaperTitle"].progress_apply(
    lambda x: title_pattern.sub('', x) if isinstance(x, str) else x
)

### Clean Abstract
# Patterns to remove:
abstract_patterns = [
    r'^\d{1,5}\^?\s+(?=Background)',  # Matches 1-5 digit numbers before "Background", with or without "^"
    r'^\d{1,5}\s+(?=Objectives[:]?|Abstract[:]?)'  # Matches 1-5 digit numbers before "Objectives" or "Abstract"
]
abstract_pattern = re.compile("|".join(abstract_patterns), re.IGNORECASE)

print("Cleaning Abstract column...")
# Same rule as for titles: clean strings, keep missing abstracts missing.
BioBERT_df_filtered_cleanedv1["Abstract"] = BioBERT_df_filtered_cleanedv1["Abstract"].progress_apply(
    lambda x: abstract_pattern.sub('', x) if isinstance(x, str) else x
)


# Save cleaned dataset
cleaned_file_path = "cleaned_BioBERT_data.csv"
BioBERT_df_filtered_cleanedv1.to_csv(cleaned_file_path, index=False)

# End the timer
end_time = time.time()
execution_time = round(end_time - start_time, 2)

# Display runtime and confirmation message
print(f"\n Cleaning complete! Dataset saved as '{cleaned_file_path}'.")
print(f" Total execution time: {execution_time} seconds.")

# Display first few rows
print(BioBERT_df_filtered_cleanedv1.head())
print(f"Length of full test dataset: {len(BioBERT_df_filtered_cleanedv1):,}")

In [None]:
# Number of rows in the cleaned dataset
len_gene_df = len(BioBERT_df_filtered_cleanedv1)

# Report the count and store it as plain text in the current directory
print(f"Length of full test dataset: {len_gene_df:,}")
with open("len_gene_df.txt", "w") as handle:
    handle.write(f"{len_gene_df}")
print("File saved!")

In [None]:
# Calculate values
articles_before = len(BioBERT_df)
articles_with_genes = len(BioBERT_df_filtered)
# Derived from the two counts above. The name `articles_wogenementions` read
# here before is never assigned in this notebook (the summary cell defines
# `articles_wogenementionsslw`), so the cell raised a NameError.
articles_without_genes = articles_before - articles_with_genes
# Guard against an empty input frame instead of dividing by zero
percentage_with_genes = (articles_with_genes / articles_before) * 100 if articles_before else 0.0

# Save output to a text file in the current directory
with open("gene_NER_article_statistics.txt", "w") as file:
    file.write(f"Articles before gene filtering: {articles_before:,}\n")
    file.write(f"Articles without gene mentions: {articles_without_genes:,}\n")
    file.write(f"Articles with gene mentions:      {articles_with_genes:,}\n")
    file.write(f"Percentage of relevant articles with gene mentions: {percentage_with_genes:.2f}%\n")

# Print summary
print(f"Articles before gene filtering: {articles_before:,}")
print(f"Articles without gene mentions: {articles_without_genes:,}")
print(f"Articles with gene mentions:      {articles_with_genes:,}")
print(f"Percentage of relevant articles with gene mentions: {percentage_with_genes:.2f}%")
print("File saved!")

In [None]:
# final dataset:cleaned_BioBERT_data