# SciSpaCy-based gene extraction

# 1) Install libraries and load dataset

In [None]:
# Install spaCy, SciSpaCy and the three SciSpaCy NER model packages that are
# loaded later in this notebook (bionlp13cg, jnlpba, craft).
# NOTE(review): mygene and fuzzywuzzy are imported below but are not installed
# here — confirm the runtime environment already provides them.
!pip install spacy
!pip install scispacy
# NOTE(review): the bionlp13cg model comes from release v0.5.1 while the other
# two come from v0.5.0, and scispacy itself is unpinned — confirm these
# versions are compatible with each other.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz 
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz
print("Success!")

In [None]:
import http.client
import json
import os
import re
import time
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import pandas as pd
import scispacy
import seaborn as sns
import spacy
from fuzzywuzzy import process, fuzz
from mygene import MyGeneInfo
from tqdm import tqdm
print("Import successful!")

In [None]:
# File names of the two input tables (resolved relative to whichever
# directory is current when they are read).
genes_file = "genes.csv"
articles_file = "articles.csv"

# Directory locations used throughout the notebook.
# NOTE(review): these values look like placeholders — replace with real paths.
output_directory = "OUTPUT_DIRECTORY"
input_directory = "INPUT_DIRECTORY"
working_directory = "WORKING_DIRECTORY"

# Start out in the output directory and report where we are.
os.chdir(output_directory)
print("Current Working Directory:", os.getcwd())

In [None]:
# Load the OncoMine gene symbols: a headerless CSV read from the input
# directory, with the genes taken from its first column.
os.chdir(input_directory)
genes = pd.read_csv(genes_file, header=None)
gene_list = genes[0].tolist()
print("Genes import successful!")

# Load the articles table from the output directory. If `full_articles`
# already exists as a global (e.g. the cell was run before), reuse it
# instead of reading the CSV again.
os.chdir(output_directory)
if "full_articles" in globals():
    print("Using preloaded full_articles from memory.")
else:
    full_articles = pd.read_csv(articles_file)
    print(f"Loaded {len(full_articles)} articles from CSV.")

# Only the first 100 articles are kept for the analysis.
articles = full_articles.head(100)
print("Article import successful!")
print(f"\nImported {len(articles)} articles with {len(articles.columns)} selected columns.")
print(f"Imported {len(gene_list):,} oncomine genes.")

# Record the subset's dimensions, then switch to the working directory.
num_rows, num_columns = articles.shape
os.chdir(working_directory)
print("\nCurrent Working Directory:", os.getcwd())

In [None]:
# Write the analysed article subset to the working directory so the exact
# input of this run can be inspected later.
ICIMTH_output = "ICIMTH_subset_data_file_for_analysis.csv"
os.chdir(working_directory)
articles.to_csv(ICIMTH_output)
print(f"\n Subset dataset for analysis saved as '{ICIMTH_output}'")

In [None]:
# Switch to the working directory, then load every SciSpaCy NER pipeline.
os.chdir(working_directory)
print("Current Working Directory:", os.getcwd())

# Short model key -> installed spaCy package name.
models = dict(
    bionlp13cg="en_ner_bionlp13cg_md",
    jnlpba="en_ner_jnlpba_md",
    craft="en_ner_craft_md",
)

# Short model key -> loaded pipeline, in the same order as `models`.
nlp_models = dict(zip(models.keys(), map(spacy.load, models.values())))

# 2) Run NER-based SciSpaCy gene extraction

In [None]:
os.chdir(working_directory)
print("Current Working Directory:", os.getcwd())

# Entity label to keep for each model key; the labels differ per model.
MODEL_ENTITY_LABELS = dict(
    bionlp13cg="GENE_OR_GENE_PRODUCT",
    jnlpba="PROTEIN",
    craft="GGP",
)

# MyGene.info API client instance.
mg = MyGeneInfo()

# Helpers to fetch gene synonyms and protein names from the MyGene.info REST API
def _upper_names(value):
    """Return upper-cased names from a field that is a string or a list of strings."""
    if isinstance(value, str):
        # A scalar value must not be iterated character by character.
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    return [item.upper() for item in value if isinstance(item, str)]


def get_gene_synonyms(gene_symbol):
    """Fetch known synonyms and protein names for a gene from MyGene.info.

    Sends one query to ``https://mygene.info/v3/query`` requesting the
    ``symbol``, ``alias``, ``other_names`` and ``protein`` fields, and
    collects every name found across all returned hits.

    Args:
        gene_symbol: Gene symbol to look up (a string).

    Returns:
        A set of upper-cased names. It always contains the upper-cased
        query symbol; if the request fails or the response cannot be
        parsed, that is its only member (best-effort lookup).
    """
    synonyms = {gene_symbol.upper()}

    # URL-encode the parameters so symbols containing spaces or reserved
    # characters still produce a valid request.
    query = urllib.parse.urlencode(
        {"q": gene_symbol, "fields": "symbol,alias,other_names,protein"}
    )
    url = f"https://mygene.info/v3/query?{query}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            payload = json.load(response)
    except (OSError, ValueError, http.client.HTTPException):
        # Network failure, timeout, HTTP error or malformed JSON: fall back
        # to the symbol alone rather than aborting the whole run.
        return synonyms

    hits = payload.get("hits", []) if isinstance(payload, dict) else []
    # NOTE(review): every returned hit is merged, so names belonging to
    # other genes that merely match the query text may be included —
    # consider restricting the query if that is not intended.
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        for field in ("symbol", "alias", "other_names"):
            synonyms.update(_upper_names(hit.get(field)))
        protein = hit.get("protein")
        if isinstance(protein, dict):
            synonyms.update(_upper_names(protein.get("name")))
    return synonyms

# Upper-cased gene symbol -> set of synonyms returned by get_gene_synonyms.
expanded_gene_list = dict(
    (gene.upper(), get_gene_synonyms(gene)) for gene in gene_list
)
print(f"Expanded gene list contains {len(expanded_gene_list)} genes with synonyms.")

# Map raw NER mentions onto the gene list: exact key match first, fuzzy match second
def normalize_extracted_entities(found_terms):
    """Resolve extracted mentions against the keys of ``expanded_gene_list``.

    Each term is upper-cased. If it equals a key of ``expanded_gene_list``,
    the key and its stored synonyms are added to the result. Otherwise the
    closest key according to ``fuzz.ratio`` is looked up and used only when
    its score is strictly greater than 85; terms without such a match
    contribute nothing.

    Args:
        found_terms: Iterable of extracted entity strings.

    Returns:
        A set containing the matched keys and their synonyms.
    """
    resolved = set()

    for raw_term in found_terms:
        candidate = raw_term.upper()

        # Exact hit: take the key and everything stored under it.
        if candidate in expanded_gene_list:
            resolved.add(candidate)
            resolved.update(expanded_gene_list[candidate])
            continue

        # No exact hit: fall back to the single best fuzzy match.
        closest = process.extractOne(candidate, expanded_gene_list.keys(), scorer=fuzz.ratio)
        if not closest:
            continue

        gene_key, similarity = closest[:2]
        if similarity > 85:
            resolved.add(gene_key)
            resolved.update(expanded_gene_list.get(gene_key, []))

    return resolved

def extract_entities_scispacy(text, nlp, entity_labels):
    """Extract upper-cased entity mentions that carry the requested label(s).

    Args:
        text: Text to analyse. Any value that is not a non-blank string
            (``None``, ``NaN``, numbers, whitespace-only strings) yields an
            empty set without calling ``nlp``.
        nlp: Callable taking a string and returning an object whose ``ents``
            attribute iterates over items with ``text`` and ``label_``
            attributes (e.g. a loaded spaCy pipeline).
        entity_labels: A single label, or a list/tuple/set of labels.

    Returns:
        A set of the upper-cased texts of all matching entities.
    """
    # Missing values from pandas arrive as None/NaN; treat every non-string
    # the same way instead of failing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        return set()

    # Normalise the label argument so that every collection type is matched
    # by membership and a single label by equality.
    if isinstance(entity_labels, (list, tuple, set, frozenset)):
        wanted_labels = set(entity_labels)
    else:
        wanted_labels = {entity_labels}

    return {ent.text.upper() for ent in nlp(text).ents if ent.label_ in wanted_labels}

##### Entity Extraction #####
# Runs every loaded SciSpaCy model over the title and abstract of each
# article, normalises the extracted mentions, writes one CSV per model with
# a 0/1 column per gene, and finally writes a per-model summary text file.
start_time = time.strftime("%Y-%m-%d %H:%M:%S")
start_timestamp = time.time()

print(f"Processing {len(articles)} articles with SciSpaCy models. Started at {start_time}")

# Extracted entities are upper-cased, so each gene column must be tested
# against the upper-cased symbol; otherwise symbols containing lower-case
# letters could never match.
gene_lookup_keys = {gene: str(gene).upper() for gene in gene_list}

# Processing articles
all_results = []
for model_name, nlp in nlp_models.items():
    entity_labels = MODEL_ENTITY_LABELS.get(model_name, "GENE_OR_GENE_PRODUCT")
    print(f"Using model: {model_name} (Extracting {entity_labels})")

    # One set of normalised entities per article, in row order. The sets are
    # kept as-is so the gene matrix does not depend on splitting a joined
    # string (names may themselves contain ", ").
    entity_sets = []
    for _, row in tqdm(articles.iterrows(), total=len(articles), desc=f"Processing with {model_name}"):
        title = row.get("PaperTitle", "")
        abstract = row.get("Abstract", "")

        extracted_entities = (
            extract_entities_scispacy(title, nlp, entity_labels)
            | extract_entities_scispacy(abstract, nlp, entity_labels)
        )
        entity_sets.append(normalize_extracted_entities(extracted_entities))

    df_results = articles.copy()
    df_results["Model"] = model_name
    # Sort so the written CSV is reproducible across runs.
    df_results[model_name] = [", ".join(sorted(entities)) for entities in entity_sets]
    df_results["Extracted_Entities"] = pd.Series(
        [sorted(entities) for entities in entity_sets],
        index=df_results.index,
        dtype=object,
    )

    # One 0/1 column per gene: 1 when the gene is among the article's entities.
    binary_entity_df = pd.DataFrame(
        {
            gene: [int(lookup_key in entities) for entities in entity_sets]
            for gene, lookup_key in gene_lookup_keys.items()
        },
        index=df_results.index,
    )
    df_results = pd.concat([df_results, binary_entity_df], axis=1)
    df_results["Sum_Entity_Mentions"] = binary_entity_df.sum(axis=1)

    csv_filename = f"scispacy_evaluation_{model_name}.csv"
    # errors="ignore" because not every listed helper column exists.
    df_results.drop(columns=["Extracted_Entities", "Model", "Entity_Label"], errors="ignore").to_csv(csv_filename, index=False)
    print(f"Model results saved as: {csv_filename}")
    all_results.append(df_results)
df_all_results = pd.concat(all_results, ignore_index=True)

# Total number of gene hits per model.
summary_results = df_all_results.groupby("Model")["Sum_Entity_Mentions"].sum().reset_index()
summary_results.columns = ["Model", "Total_Entity_Mentions"]

# Build the summary lines once and reuse them for the console and the file.
summary_lines = [
    f"Model: {row['Model']} | Total Entity Mentions: {row['Total_Entity_Mentions']}"
    for _, row in summary_results.iterrows()
]

print("\n### Extraction Summary ###")
for line in summary_lines:
    print(line)

# NOTE(review): the file name spells "sciscpacy"; kept unchanged in case
# other code reads this file — confirm before renaming.
summary_file = "sciscpacy_evaluation_summary.txt"
with open(summary_file, "w", encoding="utf-8") as f:
    f.write("### Extraction Summary ###\n")
    f.writelines(f"{line}\n" for line in summary_lines)
print(f"\nExtraction summary saved in: {summary_file}")