# Extraction of cancer types
- Extract cancer types from PaperTitles and Abstracts using NER-based SciSpaCy approach
- Connect to the CIViC API and match against the extracted cancer types
- Create a binary cancer matrix
- Create output statistics and figures

# 1) Set up libraries and datasets

## 1.1) Import libraries and models

In [None]:
#Import libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import requests
import time
import re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import ast
from tqdm import tqdm
import ast
import unicodedata
import swifter
from fuzzywuzzy import fuzz
from rapidfuzz import process
import sys
import json
import logging
print("Success!")

In [None]:
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz 
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz 
print("Success!")

In [None]:
import spacy
import scispacy
print(spacy.__version__)

# Load SciSpaCy model for cancer entity recognition
nlp_cancer_1 = spacy.load("en_ner_bionlp13cg_md")  # Stronger for cancer extraction
nlp_cancer_2 = spacy.load("en_ner_bc5cdr_md")  # Good for diseases extraction

# Stopwords to filter out non-relevant mentions
EXCLUDE_TERMS = {"anticancer", "cancerous", "non-cancerous", "precancerous", "cancer-related"}
print("Success!")

## 1.2) Load datasets

In [None]:
# Set the working directory and file paths
input_directory = "INPUT_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
gene_matrix = "filtered_gene_binary_matrix.csv"
full_dataset = "cleaned_BioBERT_data.csv"
print("Success!")

# Load full dataset
os.chdir(output_directory)
print("Current work directory:",os.getcwd())
full_df = pd.read_csv(full_dataset)
full_df = full_df.copy()
print(f"Length of full dataset: {len(full_df):,}")

# 2) Connect to CIVIC API for cancer types and extract DOID and MONDO

In [None]:
###### ======== All following steps are needed ! =============== ######
###### CIVIC disease extraction with synonyms from DOID ######
###### ======================================================== ######
# Define the GraphQL endpoint
url = "https://civicdb.org/api/graphql"

# Function to execute GraphQL queries
def run_query(query, variables=None, timeout=30):
    """Send a GraphQL query to the CIViC endpoint and return the decoded JSON.

    Parameters
    ----------
    query : str
        GraphQL query document.
    variables : dict, optional
        Values for the variables declared in ``query``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    dict
        Parsed JSON body of the response.

    Raises
    ------
    Exception
        If the server answers with a status code other than 200.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    response = requests.post(
        url,
        json={'query': query, 'variables': variables},
        headers=headers,
        timeout=timeout,
    )
    # Guard clause: anything but 200 is treated as a failed query.
    if response.status_code != 200:
        raise Exception(f"Query failed with status code {response.status_code}")
    return response.json()

# Fetch all disease IDs
all_disease_ids = []
end_cursor = None
while True:
    browse_query = """
    query ($after: String) {
      browseDiseases(first: 100, after: $after) {
        edges {
          node {
            id
          }
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    """
    variables = {"after": end_cursor}
    result = run_query(browse_query, variables)
    edges = result['data']['browseDiseases']['edges']
    all_disease_ids.extend([edge['node']['id'] for edge in edges])
    page_info = result['data']['browseDiseases']['pageInfo']
    if page_info['hasNextPage']:
        end_cursor = page_info['endCursor']
    else:
        break

# Retrieve synonyms for each disease with a progress bar
diseases_data = []
disease_query = """
query ($id: Int!) {
  disease(id: $id) {
    id
    name
    doid
    diseaseAliases
    diseaseUrl
  }
}
"""
print("\nFetching synonyms for each disease...")

for disease_id in tqdm(all_disease_ids, desc="Processing Diseases", unit="disease"):
    variables = {"id": disease_id}
    result = run_query(disease_query, variables)
    # A response may carry "data": null or "disease": null; dict.get only
    # falls back to its default when the key is absent, so coerce None to {}.
    disease = (result.get('data') or {}).get('disease') or {}
    if not disease:
        continue

    aliases = disease.get("diseaseAliases")
    diseases_data.append({
        "id": disease.get("id", "N/A"),
        "name": disease.get("name", "N/A"),
        "doid": disease.get("doid", "N/A"),
        # Aliases are stored as one comma-separated string; "None" marks no aliases.
        "synonyms": ", ".join(aliases) if aliases else "None",
        "diseaseUrl": disease.get("diseaseUrl", "N/A"),
    })
CIVIC_cancer_synonyms = pd.DataFrame(diseases_data)
CIVIC_cancer_synonyms.to_csv("CIVIC_cancer_synonyms.csv", index=False)
print("\n CSV file saved successfully: CIVIC_cancer_synonyms.csv")
print(f"\nTotal cancer types in CIViC: {CIVIC_cancer_synonyms.shape[0]}")
print(CIVIC_cancer_synonyms.head())
print("Length of dataset:",len(CIVIC_cancer_synonyms))

In [None]:
###### ======================================================================== ######
###### CIVIC disease extraction with MONDO synonyms from CIVIC MyDiseaseInfo    ######
###### ======================================================================== ######

if "CIVIC_cancer_synonyms" in globals():
    civic_df = CIVIC_cancer_synonyms.copy()
    print("civic_df loaded from globals")
else:
    os.chdir(output_directory)
    print("Loading dataset from file...")
    civic_df = pd.read_csv(output_directory + "/CIVIC_cancer_synonyms.csv")
    
# Define the CIViC GraphQL API endpoint
CIVIC_API_URL = "https://civicdb.org/api/graphql"
cancer_types = civic_df["name"].str.strip().str.lower().unique()  # Clean and case-insensitive
# Initialize an empty list to store results
results = []

# Loop through each cancer type with a progress bar
for cancer in tqdm(cancer_types, desc="Processing Cancer Types"):
    try:
        # Get Disease ID
        typeahead_query = f"""
        query {{
          diseaseTypeahead(queryTerm: "{cancer}") {{
            id
            name
          }}
        }}
        """

        response = requests.post(CIVIC_API_URL, json={"query": typeahead_query})
        # Initialize default entry in case of failure
        entry = {
            "query_name": cancer,
            "name": "N/A",
            "doid": "N/A",
            "aliases": "N/A",
            "mondoId": "N/A",
            "mondoDef": "N/A",
            "diseaseOntologyExactSynonyms": "N/A",
            "diseaseOntologyRelatedSynonyms": "N/A",
            "doDef": "N/A",
            "doDefCitations": "N/A",
            "icd10": "N/A",
            "icdo": "N/A",
            "mesh": "N/A",
            "ncit": "N/A",
            "omim": "N/A"
        }

        if response.status_code == 200:
            data = response.json()
            diseases = data.get("data", {}).get("diseaseTypeahead", [])

            if diseases:
                # Select the first matching disease ID
                disease_id = diseases[0]["id"]
                # Query MyDiseaseInfo for details
                disease_query = f"""
                query {{
                  disease(id: {disease_id}) {{
                    name
                    doid
                    diseaseAliases
                    myDiseaseInfo {{
                      mondoId
                      mondoDef
                      diseaseOntologyExactSynonyms
                      diseaseOntologyRelatedSynonyms
                      doDef
                      doDefCitations
                      icd10
                      icdo
                      mesh
                      ncit
                      omim
                    }}
                  }}
                }}
                """
                response = requests.post(CIVIC_API_URL, json={"query": disease_query})
                if response.status_code == 200:
                    disease_data = response.json().get("data", {}).get("disease", {})

                    if disease_data:
                        entry["name"] = disease_data.get("name", "N/A")
                        entry["doid"] = disease_data.get("doid", "N/A")
                        entry["aliases"] = ", ".join(disease_data.get("diseaseAliases", [])) if disease_data.get("diseaseAliases") else "N/A"
                        if disease_data.get("myDiseaseInfo"):
                            my_disease_info = disease_data["myDiseaseInfo"]
                            entry.update({
                                "mondoId": my_disease_info.get("mondoId", "N/A"),
                                "mondoDef": my_disease_info.get("mondoDef", "N/A"),
                                "diseaseOntologyExactSynonyms": ", ".join(my_disease_info.get("diseaseOntologyExactSynonyms", [])) if my_disease_info.get("diseaseOntologyExactSynonyms") else "N/A",
                                "diseaseOntologyRelatedSynonyms": ", ".join(my_disease_info.get("diseaseOntologyRelatedSynonyms", [])) if my_disease_info.get("diseaseOntologyRelatedSynonyms") else "N/A",
                                "doDef": my_disease_info.get("doDef", "N/A"),
                                "doDefCitations": ", ".join(my_disease_info.get("doDefCitations", [])) if my_disease_info.get("doDefCitations") else "N/A",
                                "icd10": my_disease_info.get("icd10", "N/A"),
                                "icdo": my_disease_info.get("icdo", "N/A"),
                                "mesh": my_disease_info.get("mesh", "N/A"),
                                "ncit": ", ".join(my_disease_info.get("ncit", [])) if my_disease_info.get("ncit") else "N/A",
                                "omim": my_disease_info.get("omim", "N/A")
                            })
        results.append(entry)
    except Exception as e:
        print(f"Skipping {cancer} due to an error: {e}")
# Persist the per-disease results collected by the loop above.
MONDO_df = pd.DataFrame(results)
MONDO_df.to_csv("CIVIC_cancers_with_MONDO.csv", index=False)
print("CSV file saved successfully: CIVIC_cancers_with_MONDO.csv")
print("Length of dataset:",len(MONDO_df))
# Compare against civic_df, which is defined on both the "globals" and the
# "load from file" branch of this cell. CIVIC_cancer_synonyms only exists
# when the previous cell ran in this session.
if len(MONDO_df) == len(civic_df):
    print("Same length of CIVIC DOID and MONDO dataset")
else:
    print("Different lengths of datasets")

In [None]:
###### ======================================================================== ######
###### CIVIC disease extraction with MONDO synonyms from MONDO EMBL API.        ######
###### ======================================================================== ######
# Load ALL synonyms directly using the MONDO API, as MyDiseaseInfo on CIVIC is not comprehensive enough
# Remove commas and spaces of the MONDO derived synonyms!
def clean_individual_cancer_name(name):
    """
    Normalise the separators inside a single cancer name.

    Every run of commas, pipes ("|") or hyphens is replaced by one space, and
    whitespace is then collapsed so words are separated by exactly one space
    with no leading or trailing blanks.
    """
    without_separators = re.sub(r"[,|-]+", " ", name)
    tokens = without_separators.split()
    return " ".join(tokens)

def fetch_synonyms(mondo_id):
    """
    Fetch the synonyms of a MONDO term from the OLS API as one cleaned string.

    Parameters
    ----------
    mondo_id : str
        MONDO identifier such as "MONDO:0005159". Missing values (NaN/None)
        and the placeholder strings "N/A", "nan", "none" or "" are treated as
        "no identifier" and never sent to the API.

    Returns
    -------
    str
        Synonyms joined with ", " after each was passed through
        clean_individual_cancer_name, or "N/A" when there is no identifier,
        the term has no synonyms, or the lookup fails. Failures are logged
        instead of being returned as "Error ..." text, because the returned
        value is stored in a synonyms column and would otherwise be treated
        as a synonym downstream.
    """
    # Non-strings (NaN, None, numbers) cannot be a MONDO ID.
    if not isinstance(mondo_id, str):
        return "N/A"
    mondo_id = mondo_id.strip()
    # The caller casts the column with astype(str), so missing IDs arrive as text.
    if mondo_id.lower() in {"", "n/a", "nan", "none"}:
        return "N/A"

    # Format the MONDO ID correctly (replace ":" with "_")
    mondo_id = mondo_id.replace(":", "_")
    # API endpoint
    url = f"https://www.ebi.ac.uk/ols/api/ontologies/mondo/terms?iri=http://purl.obolibrary.org/obo/{mondo_id}"
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            # 404 simply means "unknown term"; anything else is worth a log line.
            if response.status_code != 404:
                logging.warning("MONDO lookup for %s returned status %s", mondo_id, response.status_code)
            return "N/A"

        terms = response.json().get("_embedded", {}).get("terms", [])
        if not terms:
            return "N/A"
        synonyms = terms[0].get("synonyms") or []

        # Clean each cancer name, then join them back with commas
        cleaned_synonyms = [clean_individual_cancer_name(name) for name in synonyms]
        return ", ".join(cleaned_synonyms) if cleaned_synonyms else "N/A"
    except Exception as e:
        # Best-effort lookup: one failing term must not abort the whole column.
        logging.warning("MONDO lookup for %s failed: %s", mondo_id, e)
        return "N/A"

MONDO_df2 = MONDO_df.copy()  # Ensure MONDO ID is treated as a string
MONDO_df2["mondoId"] = MONDO_df2["mondoId"].astype(str)  
tqdm.pandas(desc="Fetching and cleaning synonyms from MONDO")
MONDO_df2["new_synonyms"] = MONDO_df2["mondoId"].progress_apply(fetch_synonyms)
MONDO_df2.to_csv("CIVIC_MONDO_df_with_cleaned_synonyms.csv", index=False)
print("\n CSV file saved successfully: CIVIC_MONDO_df_with_cleaned_synonyms.csv")
print(MONDO_df2.head(10))

In [None]:
# Check and filter for cancers in the "name" column
cancer_row = MONDO_df2[MONDO_df2["name"].str.lower() == "prostate cancer"]

if not cancer_row.empty:
    # Extract and print the synonyms
    synonyms = cancer_row["new_synonyms"].values[0] if "new_synonyms" in cancer_row else "No synonyms available"
    aliases = cancer_row["aliases"].values[0] if "aliases" in cancer_row else "No aliases available"
    print("Synonyms for 'Cancer'")
    print(synonyms)
    print("\n Aliases for 'Cancer'")
    print(aliases)
else:
    print("'Cancer' not found in the dataset.")

In [None]:
###### ===================================================================== ######
###### Display full CIVIC and MONDO dataset and merge aliases and synonyms   ######
###### ===================================================================== ######
# Load ALL synonyms directly using the MONDO API (as MyDiseaseInfo on CIVIC is not comprehensive enough)
# Reuse the in-memory frame when the MONDO cell ran in this session;
# otherwise fall back to the CSV that cell wrote.
if "MONDO_df2" in globals():
    CIVIC_DOID_MONDO_merged = MONDO_df2.copy()
    print("MONDO_df loaded from globals")
else:
    os.chdir(output_directory)
    print("Loading dataset from file...")
    # File name must match the one saved by the MONDO synonym-fetching cell.
    CIVIC_DOID_MONDO_merged = pd.read_csv(output_directory + "/CIVIC_MONDO_df_with_cleaned_synonyms.csv")

# Ensure all values are consistently lists
def ensure_list(x):
    """Coerce a cell value into a list of non-empty, stripped strings.

    - A string is split on commas.
    - A list keeps only its string items.
    - Anything else (NaN, None, numbers, ...) yields an empty list.
    """
    if isinstance(x, str):
        return [part.strip() for part in x.split(",") if part.strip()]
    if isinstance(x, list):
        return [item.strip() for item in x if isinstance(item, str) and item.strip()]
    # NaN floats and every other type carry no synonyms.
    return []

# Apply ensure_list to each column
for col in ['aliases', 'new_synonyms']:
    # No na_action='ignore' here: it would leave NaN cells as floats, and the
    # synonym merge below iterates over every cell. ensure_list itself maps
    # NaN to an empty list.
    CIVIC_DOID_MONDO_merged[col] = CIVIC_DOID_MONDO_merged[col].map(ensure_list)

# Function to merge and clean synonyms
def merge_synonyms(row):
    """Merge the 'aliases' and 'new_synonyms' cells of one row into a single string.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the keys 'aliases' and 'new_synonyms'.

    Returns
    -------
    str or None
        The distinct synonyms, sorted and joined with ", ", or None when
        neither column contributes any value.
    """
    merged = set()
    for col in ('aliases', 'new_synonyms'):
        values = row[col]
        # Only real collections are merged. A NaN cell is a float and would
        # make set.update raise TypeError.
        if isinstance(values, (list, tuple, set)):
            merged.update(values)
    return ", ".join(sorted(merged)) if merged else None

# Apply the function to create the new "synonyms" column
CIVIC_DOID_MONDO_merged["synonyms"] = CIVIC_DOID_MONDO_merged.apply(merge_synonyms, axis=1)
CIVIC_DOID_MONDO_merged.to_csv("CIVIC_MONDO_with_synonyms.csv", index=False)
print("CSV file saved successfully: CIVIC_MONDO_with_synonyms.csv")
print("Length of dataset:",len(CIVIC_DOID_MONDO_merged))
print(CIVIC_DOID_MONDO_merged.columns)

In [None]:
columns_to_check = ["synonyms", "aliases", "new_synonyms"]
# Calculate lengths and display results
lengths_df = CIVIC_DOID_MONDO_merged[columns_to_check].astype(str).applymap(len)
print(lengths_df)

In [None]:
# Cleaning: Remove N/As and short, general abbreviations 
def clean_individual_cancer_name(name):
    """
    Tidy a single cancer name.

    Steps, in order:
    - every run of hyphens becomes one space,
    - each parenthesised fragment "(...)" is dropped,
    - whitespace is collapsed to single spaces and trimmed at both ends.
    """
    dehyphenated = re.sub(r"[-]+", " ", name)
    without_parentheses = re.sub(r"\(.*?\)", "", dehyphenated)
    tokens = without_parentheses.split()
    return " ".join(tokens)

def clean_synonyms(synonyms):
    """
    Clean a comma-separated synonyms string.

    Each entry is stripped and passed through clean_individual_cancer_name
    (hyphens to spaces, parenthesised text removed). Entries are dropped when
    they are empty, equal "N/A" or the standalone word "familial" (checked
    case-insensitively, both before and after cleaning), or repeat an earlier
    entry ignoring case. The first spelling of each name is kept.

    Parameters
    ----------
    synonyms : str
        Comma-separated synonyms.

    Returns
    -------
    str
        The surviving names joined with ", " (possibly empty), or "N/A" when
        the input is not a string (NaN, None, ...).
    """
    if not isinstance(synonyms, str):
        return "N/A"

    placeholders = {"familial", "n/a"}
    seen = set()
    unique_names = []
    for raw_name in synonyms.split(","):
        raw_name = raw_name.strip()
        if not raw_name or raw_name.lower() in placeholders:
            continue
        name = clean_individual_cancer_name(raw_name)
        key = name.lower()
        # Cleaning can empty a name (e.g. "(abc)") or reduce it to a
        # placeholder. Skip those, otherwise the result has blank entries.
        if not name or key in placeholders or key in seen:
            continue
        seen.add(key)
        unique_names.append(name)
    # Join back into a comma-separated string
    return ", ".join(unique_names)

# Create a new cleaned dataset instead of modifying the original
CIVIC_cancer_final = CIVIC_DOID_MONDO_merged.copy()
CIVIC_cancer_final["synonyms"] = CIVIC_cancer_final["synonyms"].apply(clean_synonyms)
print("Successfully cleaned synonyms (case-insensitive duplicates removed, parentheses removed)!")

CIVIC_cancer_final.to_csv("CIVIC_cancer_final.csv", index=False)
print("\nCSV file saved successfully: CIVIC_cancer_final.csv")
print("Length of dataset:", len(CIVIC_cancer_final))

print("\nSample of cleaned synonyms:")
print(CIVIC_cancer_final.head(10)[["synonyms"]])

In [None]:
###### ======================================================= ######
###### Display and investigate the final CIVIC cancer dataset  ######
###### ======================================================= ######
if "CIVIC_cancer_final" in globals():
    CIVIC_cancer_final = CIVIC_cancer_final.copy()
    print("Dataset loaded from globals")
else:
    os.chdir(output_directory)
    print("Loading dataset from file...")
    CIVIC_cancer_final = pd.read_csv(output_directory + "/CIVIC_cancer_final.csv")
total_cancers = CIVIC_cancer_final['name'].nunique()
print(f"Total unique cancer types: {total_cancers}")
print("\nSummary of Cancer Types:")
print(CIVIC_cancer_final['name'].value_counts())
prostate_cancer_df_syn = CIVIC_cancer_final[CIVIC_cancer_final['name'].str.contains("prostate", case=False, na=False)]
print("\nFiltered List of Prostate Cancer Types:")
print(prostate_cancer_df_syn)

In [None]:
###### ======================================================= ######
###### Display and investigate the final CIVIC cancer dataset  ######
###### ======================================================= ######
if "CIVIC_cancer_final" in globals():
    CIVIC_cancer_final = CIVIC_cancer_final.copy()
    print("Dataset loaded from globals")
else:
    os.chdir(output_directory)
    print("Loading dataset from file...")
    CIVIC_cancer_final = pd.read_csv(output_directory + "/CIVIC_cancer_final.csv")
total_cancers = CIVIC_cancer_final['name'].nunique()
print(f"Total unique cancer types: {total_cancers}")
print("\nSummary of Cancer Types:")
print(CIVIC_cancer_final['name'].value_counts())
prostate_cancer_df_syn = CIVIC_cancer_final[CIVIC_cancer_final['name'].str.contains("prostate", case=False, na=False)]
print("\nFiltered List of Prostate Cancer Types:")
print(prostate_cancer_df_syn)
CIVIC_cancer_final["name"] = CIVIC_cancer_final["name"].str.strip()
CIVIC_cancer_final.loc[CIVIC_cancer_final["name"].str.lower() == "doid:0080202", "name"] = "Adenoid Cystic Carcinoma"
CIVIC_cancer_final["name"] = CIVIC_cancer_final["name"].apply(lambda x: x.title() if isinstance(x, str) else x)
CIVIC_cancer_final.to_csv("CIVIC_cancer_final.csv", index=False)
print("\nUpdated CIVIC_cancer_final.csv has been saved.")

# 3) Extract cancer mentions using SciSpaCy

In [None]:
#SciSpaCy models en_ner_bionlp13cg_md and en_ner_bc5cdr_md, refined for "tags"
#nlp_cancer_1 = spacy.load("en_ner_bionlp13cg_md")  # Recognizes 'CANCER'
#nlp_cancer_2 = spacy.load("en_ner_bc5cdr_md")  # Recognizes 'DISEASE'

# Stopwords to filter out false positives
EXCLUDE_TERMS = {"anticancer", "anti cancer", "anti-cancer", "anti-tumor",
                 "antitumor", "anti tumor", "cancerous", "non-cancerous", "precancerous", "cancer-related"}

# Function to clean extracted terms
def clean_term(term):
    """Return a lower-cased, trimmed, NFKC-normalised copy of ``term`` with
    ASCII and Unicode hyphens/dashes replaced by spaces."""
    lowered = term.lower().strip()
    normalised = unicodedata.normalize("NFKC", lowered)
    return re.sub(r'[-‐–—]', ' ', normalised)

def extract_cancer_mentions(text):
    """Extract cancer-related entity mentions from a piece of text.

    Two SciSpaCy pipelines are run over the text:
    - nlp_cancer_1: entities labelled "CANCER" are kept.
    - nlp_cancer_2: entities labelled "DISEASE" are kept only when the cleaned
      text contains "cancer" or "tumor".
    Every mention is passed through clean_term, de-duplicated, and filtered
    against EXCLUDE_TERMS.

    Parameters
    ----------
    text : str
        Text to analyse. Non-strings (NaN, None, ...) and blank strings yield
        an empty result.

    Returns
    -------
    list of str
        Distinct cleaned mentions in sorted order, so repeated runs give the
        same output.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    extracted_terms = set()  # Remove duplicates
    # SciSpaCy model 1 (en_ner_bionlp13cg_md) - Only extract entities labeled as CANCER
    for ent in nlp_cancer_1(text).ents:
        if ent.label_ == "CANCER":
            extracted_terms.add(clean_term(ent.text))
    # SciSpaCy model 2 (en_ner_bc5cdr_md) - Only extract entities labeled as DISEASE
    for ent in nlp_cancer_2(text).ents:
        if ent.label_ == "DISEASE":
            term = clean_term(ent.text)
            if "cancer" in term or "tumor" in term:
                extracted_terms.add(term)

    # clean_term turns hyphens into spaces, so the exclusion list needs the
    # same normalisation. Otherwise hyphenated entries such as "non-cancerous"
    # could never match an extracted term.
    excluded = {clean_term(term) for term in EXCLUDE_TERMS}
    return sorted(term for term in extracted_terms if term not in excluded)
tqdm.pandas()
def apply_cancer_extraction(df):
    """Add an 'Extracted_Cancer_Terms' column built from title plus abstract.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PaperTitle' and 'Abstract'.

    Returns
    -------
    pandas.DataFrame
        The same frame (modified in place) with the new column holding the
        result of extract_cancer_mentions for each row. Progress is shown via
        tqdm's progress_apply.
    """
    # Fill missing values first: astype(str) alone would turn NaN into the
    # literal text "nan", which would then be fed to the NER models.
    combined_text = (
        df["PaperTitle"].fillna("").astype(str) + " " + df["Abstract"].fillna("").astype(str)
    )
    df["Extracted_Cancer_Terms"] = combined_text.progress_apply(extract_cancer_mentions)
    return df

# ===========================
# Start timing
start_time = time.time()
df = full_df.copy()
print(f"Processing {len(df)} rows...")
df = apply_cancer_extraction(df)

end_time = time.time()
runtime = end_time - start_time

output_path = os.path.join(output_directory, "Extracted_Cancer_Terms.csv")
df.to_csv(output_path, index=False)
runtime_log_path = os.path.join(output_directory, "running_time_cancer_extraction.txt")
with open(runtime_log_path, "w") as f:
    f.write(f"Total execution time: {runtime:.2f} seconds\n")
print(f"File saved successfully: {output_path}")
print(f"Execution time logged in: {runtime_log_path}")

In [None]:
## Investigate dataset of cancers
print(len(df))
print(f"File saved successfully: {output_path}")
print(f"Execution time logged in: {runtime_log_path}")

# ============================================================

# 4) Match extracted cancers with CIVIC for binary matrix creation

## 4.1)  Load synonym dataset

In [None]:
# Load CIVIC cancer synonyms dataset
os.chdir(output_directory)
print("Loading dataset from file...")
CIVIC_cancer_final = pd.read_csv(output_directory + "/CIVIC_cancer_final.csv")

# Make a copy instead of overwriting the original dataset
CIVIC_cancer_synonyms_expanded = CIVIC_cancer_final.copy() #for further MONDO expansion
# Report the size of the dataset loaded in this cell. CIVIC_cancer_synonyms
# only exists if the CIViC API cells were run in the same session.
print("Original count of cancer types:", len(CIVIC_cancer_final))

# Remove rows where "name" is exactly "cancer"
CIVIC_cancer_synonyms_expanded = CIVIC_cancer_synonyms_expanded[
    ~CIVIC_cancer_synonyms_expanded["name"].str.lower().isin(["cancer", "carcinoma", "tumor", "tumour","solid tumors, advanced","solid tumor"])
]
print("Original count of cancer types after 'cancer' removal:", len(CIVIC_cancer_synonyms_expanded))

new_cancers = pd.DataFrame([
    {
        "id": "new1",
        "name": "metastatic castration-resistant prostate cancer",
        "doid": "N/A",
        "synonyms": ["mCRPC", "advanced prostate cancer", "CRPC", "castration-resistant PC",
                     "advanced-stage prostate cancer", 
                     "androgen-independent prostate cancer",
                     "androgen independent prostate cancer",
                     "metastatic castrate resistant prostate cancer",
                     "metastatic castrate,resistant prostate cancer",
                     "hormone-refractory prostate cancer",
                     "bone metastatic castration resistant prostate cancer",
                     "bone metastatic castration-resistant prostate cancer",
                     "metastatic prostate cancer castration resistant",
                     "bone metastatic crpc",
                     "metastatic prostate cancer castration-resistant",
                     "metastatic castration-resistance prostate cancer", "androgen-independent prostate cancer",
                     "metastatic castrate-resistant prostate cancer",
                     "brca1 mutated metastatic castration resistant prostate cancer"]
    },
    {
        "id": "new2",
        "name": "metastatic hormone-sensitive prostate cancer",
        "doid": "N/A",
        "synonyms": ["mHSPC", "castrationsensitive prostate cancer", 
                     "hormone-sensitive metastatic prostate cancer", "HSPC",
                     "hormone sensitive prostate cancer", 
                     "androgen-dependent prostate cancer",
                     "androgen dependent prostate cancer",
                     "metastatic castration-sensitive prostate cancer",
                     "androgen-dependent metastatic prostate cancer", "hormone-naïve prostate cancer"]
    }
])

CIVIC_cancer_synonyms_expanded = pd.concat([CIVIC_cancer_synonyms_expanded, new_cancers], ignore_index=True)
CIVIC_cancer_synonyms_expanded["synonyms"] = CIVIC_cancer_synonyms_expanded["synonyms"].apply(
    lambda x: [syn.strip() for syn in x.replace(";", ",").split(",") if syn.strip()] if isinstance(x, str) else x
)
print("Updated count of cancer types:", len(CIVIC_cancer_synonyms_expanded))
print("Successful exection!")

In [None]:
# Select only the columns "name" and "synonyms"
CIVIC_cancer_synonyms_expanded2 = CIVIC_cancer_synonyms_expanded[["name", "synonyms"]]
CIVIC_cancer_synonyms_expanded2.to_csv("CIVIC_cancer_synonyms_expanded2.csv", index=False)
print("\nThe file 'CIVIC_cancer_synonyms_expanded2.csv' has been saved successfully.")
print(CIVIC_cancer_synonyms_expanded2)

In [None]:
#Check CIVIC dataframe
print("Is 'carcinoma' still in the dataset?", any(CIVIC_cancer_synonyms_expanded["name"].str.lower() == "carcinoma"))
print("Is 'cancer' still in the dataset?", any(CIVIC_cancer_synonyms_expanded["name"].str.lower() == "cancer"))
print("Is 'tumor' still in the dataset?", any(CIVIC_cancer_synonyms_expanded["name"].str.lower() == "tumor"))
print("Is 'solid tumors, advanced' still in the dataset?", any(CIVIC_cancer_synonyms_expanded["name"].str.lower() == "solid tumors, advanced"))
print("Is 'solid tumor' still in the dataset?", any(CIVIC_cancer_synonyms_expanded["name"].str.lower() == "solid tumor"))
sorted_names = sorted(CIVIC_cancer_synonyms_expanded["name"].unique(), key=len)
print(CIVIC_cancer_synonyms_expanded2.columns)

## 4.2) Define function and clean extracted cancer terms

In [None]:
# ======================================================================
# Load Dataset with extracted cancer terms from SciSpaCy
# ======================================================================
print("Loading dataset from file...")
cancer_mapping_df = pd.read_csv(output_directory + "/Extracted_Cancer_Terms.csv")
print("Dataset loaded from csv.")
print(f"Length of dataset copy: {len(cancer_mapping_df):,}")

# Ensure "Extracted_Cancer_Terms" is in list format
cancer_mapping_df["Extracted_Cancer_Terms"] = cancer_mapping_df["Extracted_Cancer_Terms"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

original_length = len(cancer_mapping_df)
print(cancer_mapping_df.columns)
print("Original length:",original_length)

In [None]:
### ==================================================== ###
### CLEANING OF EXTRACTED CANCER FOR BETTER MATCHING     ###
### ==================================================== ###
tqdm.pandas()
cancer_mapping_df_cleaning = cancer_mapping_df.copy()

# List of prefixes to remove
prefixes_to_remove = [
    "age", 
    "aggressive", 
    "advance", "advanced",
    "alk positive", 
    "alk+", 
    "ampullary", 
    "anti", 
    "antitumor",
    "brca associated",
    "brca1 associated",
    "brca2 associated",
    "brca1", 
    "brca2", 
    "brca 1", 
    "brca 2", 
    "brca-mutated", 
    "brca-positive", 
    "braf", 
    "brca1-mutated",
    "brca mutant", "brca1 mutant","brca2 mutant",
    "brca2 altered",
    "brca1/2", 
    "brca2-mutated", 
    "brca deficient",
    "brca linked", 
    "brca negative", 
    "brca positive", 
    "cell line", 
    "cell lines", 
    "iii", "iiii", "iiiii", "iiiv", "iiv",
    "chemoresistant", "circulating",
    "disease", "diseases",
    "cisplatin", 
    "dna alterations", 
    "double negative", 
    "early stage",
    "germline",  
    "human",
    "kras mutant", "kras",
    "hypoxic",
    "intercellular",
    "late stage", 
    "line", 
    "lines",
    "mcf", "mcf 7", 
    "mediastinal", 
    "membrane", 
    "methylated", 
    "mice", 
    "mouse", 
    "murine",
    "mutation", 
    "n myc", 
    "organoids", 
    "pain", 
    "parp", 
    "patient", 
    "patients", 
    "platinum-sensitive", 
    "predisposition", 
    "sample", "samples", 
    "senescent", 
    "silenced",  
    "somatic", 
    "specific", 
    "specimen", 
    "specimens", 
    "stage", 
    "tissue", "tissues", "tnbc", "tumor dna", 
    "tp53", "p53",
    "tumor specimen", "tumorigenic", "xenograft", 
    "xenografts",
    "brcawt", "dna", "biopsis","abstract",
    "therapy related",
    "moderate",
    "advance stage","advanced stage",
    "chemo", "chemotherapy", "ilc"
]

# Define replacements for specific words
word_replacements = {
    "cancers": "cancer",
    "tumour": "tumor",
    "tumours": "tumor",
    "tumors": "tumor",
    "carcinomas": "carcinoma",
    "gliomas": "glioma",
    "adenocarcinomas": "adenocarcinoma"
}

def clean_cancer_terms(terms, prefixes=None, replacements=None):
    """Clean a list of extracted cancer terms for matching.

    For each term, in order:
    1. strip and lower-case it;
    2. remove leading digits, "+", ".", ",", parentheses, hyphens and spaces;
    3. remove every whole-word occurrence of each qualifier in ``prefixes``
       (anywhere in the term, in list order) together with trailing spaces;
    4. apply the substring substitutions in ``replacements``;
    5. drop repeated words, keeping the first occurrence of each.
    Terms that end up empty are discarded.

    Parameters
    ----------
    terms : iterable of str
        Raw terms.
    prefixes : iterable of str, optional
        Qualifiers to remove; defaults to the module-level prefixes_to_remove.
    replacements : dict, optional
        Substring substitutions; defaults to the module-level word_replacements.

    Returns
    -------
    list of str
        The cleaned, non-empty terms in input order.
    """
    if prefixes is None:
        prefixes = prefixes_to_remove
    if replacements is None:
        replacements = word_replacements

    # Build the patterns once per call. re.escape makes entries such as "alk+"
    # match literally instead of being read as a regex quantifier. The
    # lookarounds act like \b but also work when an entry starts or ends with
    # a non-word character.
    prefix_patterns = [
        re.compile(rf"(?<!\w){re.escape(prefix)}(?!\w)\s*") for prefix in prefixes
    ]

    cleaned_terms = []
    for term in terms:
        term = term.strip().lower()  # Make term lowercase for consistency
        # Remove numbers, commas, plus signs, spaces, and dots at the start
        term = re.sub(r"^[\+\.,\d\(\)\-\s]+", "", term)
        for pattern in prefix_patterns:
            term = pattern.sub("", term)
        for old_word, new_word in replacements.items():
            term = term.replace(old_word, new_word)
        # 'gastric cancer gastric cancer' -> 'gastric cancer' (order preserved)
        cleaned_term = " ".join(dict.fromkeys(term.split()))
        if cleaned_term:
            cleaned_terms.append(cleaned_term)
    return cleaned_terms

# Function to clean the list of cancer terms
def clean_extracted_terms(value):
    """Run clean_cancer_terms on list input; any other value yields an empty list."""
    return clean_cancer_terms(value) if isinstance(value, list) else []
    
# Apply cleaning to the "Extracted_Cancer_Terms" column
cancer_mapping_df_cleaning["Extracted_Cancer_Terms_cleaned"] = cancer_mapping_df_cleaning["Extracted_Cancer_Terms"].progress_apply(clean_extracted_terms)
cancer_mapping_df_cleaning.rename(columns={
    "Extracted_Cancer_Terms": "Extracted_Cancer_Terms_old", 
    "Extracted_Cancer_Terms_cleaned": "Extracted_Cancer_Terms"
}, inplace=True)

print(cancer_mapping_df_cleaning[['Extracted_Cancer_Terms_old', 'Extracted_Cancer_Terms']].head())
cancer_mapping_df_cleaning.to_csv("cleaned_extracted_cancer_terms.csv", index=False)
print("\n\n--> Full dataset has been saved to 'cleaned_extracted_cancer_terms.csv'.")
cancer_mapping_df_cleaning.head(2000).to_csv("cleaned_extracted_cancer_terms_2000.csv", index=False) #subset
print("--> The first 2000 rows have been saved to 'cleaned_extracted_cancer_terms_2000'.")

## 4.3) Run binary matrix creation, and create datasets

In [None]:
# Check and load dataset
df = cancer_mapping_df_cleaning.copy()
print(f"Length of dataset copy: {len(cancer_mapping_df_cleaning):,}")

# Ensure "Extracted_Cancer_Terms" is in list format
df["Extracted_Cancer_Terms"] = df["Extracted_Cancer_Terms"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Normalize cancer terms for better matching
def normalize_cancer_term(term):
    """Return a canonical lowercase form of a cancer term for dictionary lookup.

    Processing steps, in order:
      1. Unicode NFKC folding, then lowercasing and trimming.
      2. Removal of parenthesised text, e.g. "(NSCLC)".
      3. Replacement of the whole words cancers/carcinoma(s)/tumor(s) by "cancer".
      4. Removal of trailing digits, dots and slashes.
      5. Collapsing of repeated whitespace.

    Non-string values are returned unchanged.
    """
    if not isinstance(term, str):
        return term
    # Fold compatibility characters *before* lowercasing: NFKC can introduce
    # uppercase letters (e.g. "℃" -> "°C") that an earlier lower() would miss.
    term = unicodedata.normalize("NFKC", term).lower().strip()
    term = re.sub(r"\(.*?\)", "", term).strip()
    term = re.sub(r'\b(cancers|carcinoma|carcinomas|tumor|tumors)\b', 'cancer', term)
    term = re.sub(r'[\.\d/]+$', '', term)
    term = re.sub(r'\s+', ' ', term).strip()
    return term

# Expand synonyms
def expand_synonyms(df):
    """Build a long lookup table of cancer names and their synonyms.

    For every row of *df* one record is produced for the normalized "name"
    itself and one per non-blank entry of "synonyms"; each record points to
    the normalized name as "standard_name".

    NOTE(review): synonyms are expanded only when the cell holds a Python
    list; string cells (as produced by reading a CSV) are skipped — confirm
    that this is intended for the caller's input.

    Returns a DataFrame with the columns "cancer_type" and "standard_name".
    """
    records = []
    for _, entry in df.iterrows():
        standard = normalize_cancer_term(entry["name"])
        records.append({"cancer_type": standard, "standard_name": standard})
        raw_synonyms = entry["synonyms"]
        if not isinstance(raw_synonyms, list):
            continue
        # Blank synonyms are dropped before normalization
        aliases = [normalize_cancer_term(alias) for alias in raw_synonyms if alias.strip()]
        records.extend({"cancer_type": alias, "standard_name": standard} for alias in aliases)
    return pd.DataFrame(records)

# Load CIVIC dataset
CIVIC_cancer_synonyms_expanded2 = pd.read_csv("CIVIC_cancer_synonyms_expanded2.csv")
expanded_cancer_df = expand_synonyms(CIVIC_cancer_synonyms_expanded2)
# Remove rows where "cancer_type" is exactly "cancer"
expanded_cancer_df = expanded_cancer_df[expanded_cancer_df["cancer_type"].str.lower() != "cancer"]

# Create a dictionary mapping names and synonyms to standardized names
# (when a key occurs more than once, dict() keeps the value of its last occurrence)
cancer_mapping = dict(zip(expanded_cancer_df["cancer_type"], expanded_cancer_df["standard_name"]))

# Map extracted cancer terms
# Running total of terms without a match; incremented inside map_cancer_terms
no_match_count = 0
# Opening in "w" mode and closing immediately empties the log before terms are appended
open("unmatched_terms_log.txt", "w").close()
def map_cancer_terms(terms):
    """Map a list of extracted terms to standardized cancer names.

    Each term is normalized and looked up in ``cancer_mapping``. Misses
    increase the global ``no_match_count`` and the distinct misses of this
    call are appended to "unmatched_terms_log.txt".

    Returns the de-duplicated list of matched standard names, or an empty
    list when *terms* is not a list.
    """
    global no_match_count
    if not isinstance(terms, list):
        return []
    hits = []
    misses = set()
    for raw in terms:
        key = normalize_cancer_term(raw)
        if key not in cancer_mapping:
            no_match_count += 1
            misses.add(key)
            continue
        hits.append(cancer_mapping[key])

    # Touch the log only when this call produced at least one miss
    if misses:
        with open("unmatched_terms_log.txt", "a") as log:
            for miss in misses:
                log.write(miss + "\n")

    return list(set(hits))
# Register progress_apply on pandas objects, then map every row's terms
tqdm.pandas()
df["Mapped_Cancer_Terms"] = df["Extracted_Cancer_Terms"].progress_apply(map_cancer_terms)

# Identify Unmatched Terms
# cancer_mapping is keyed by normalized text, so membership has to be tested on
# the normalized form (exactly as map_cancer_terms does). Testing the raw term
# flagged terms as unmatched although they had been mapped, and it kept the
# later "metastatic" clean-up from ever finding a key.
df["Unmatched_Cancer_Terms"] = df["Extracted_Cancer_Terms"].apply(
    lambda terms: [
        normalized
        for normalized in map(normalize_cancer_term, terms)
        if normalized not in cancer_mapping
    ]
)

# Clean Unmatched Terms (Remove "Metastatic" at Start or End)
def clean_and_remap_unmatched_terms(terms):
    """Retry the lookup of unmatched terms after removing "metastatic".

    A leading "metastatic " or trailing " metastatic" (any letter case) is
    removed from each term and the remainder is looked up in
    ``cancer_mapping`` exactly as written — no further normalization is
    applied here.

    Returns the de-duplicated list of standard names found, or an empty
    list when *terms* is not a list.
    """
    if not isinstance(terms, list):
        return []
    stripped_terms = (
        re.sub(r"^metastatic\s+|\s+metastatic$", "", item, flags=re.IGNORECASE).strip()
        for item in terms
    )
    found = [cancer_mapping[candidate] for candidate in stripped_terms if candidate in cancer_mapping]
    return list(set(found))
# Second-chance mapping for the terms that did not match directly
df["Remapped_Cancer_Terms"] = df["Unmatched_Cancer_Terms"].apply(clean_and_remap_unmatched_terms)

# Combine Original & Remapped Matches
# Adding two list-valued columns concatenates the lists row by row; set() then removes duplicates
df["Final_Mapped_Cancer_Terms"] = df["Mapped_Cancer_Terms"] + df["Remapped_Cancer_Terms"]
df["Final_Mapped_Cancer_Terms"] = df["Final_Mapped_Cancer_Terms"].apply(lambda x: list(set(x)))

# Generate and save binary matrix
# Long format: one (PaperId, mapped_cancer) record per mapped term of each paper
binary_rows = [{"PaperId": row["PaperId"], "mapped_cancer": term} for _, row in df.iterrows() for term in row["Final_Mapped_Cancer_Terms"]]
binary_df = pd.DataFrame(binary_rows)
# Wide format: every present (PaperId, cancer) pair aggregates to 1, absent pairs are filled with 0
binary_matrix = binary_df.pivot_table(index="PaperId", columns="mapped_cancer", aggfunc=lambda x: 1, fill_value=0)
# Left merge keeps papers without any mapped cancer; fillna(0) is applied to every column of the merged frame
df_binary_matrix = df.merge(binary_matrix, on="PaperId", how="left").fillna(0)
# Drop a column that is literally named "cancer" (any letter case)
columns_to_remove = [col for col in df_binary_matrix.columns if col.lower() == "cancer"]
df_binary_matrix.drop(columns=columns_to_remove, inplace=True)
print(f"Removed columns: {columns_to_remove}")
# Everything to the right of df's own columns belongs to the binary matrix
df_binary_matrix["Cancer_Type_Sum"] = df_binary_matrix.iloc[:, df.shape[1]:].sum(axis=1)
df_binary_matrix.to_csv("binary_cancer_matrix_with_sum.csv", index=False) #save full matrix

# Drop rows where Cancer_Type_Sum == 0
# Split into papers with at least one cancer type and papers with none
df_binary_matrix_filtered = df_binary_matrix[df_binary_matrix["Cancer_Type_Sum"] > 0]
df_binary_matrix_filtered_zero = df_binary_matrix[df_binary_matrix["Cancer_Type_Sum"] == 0]

# Save the filtered dataset
df_binary_matrix_filtered.to_csv("binary_cancer_matrix_filtered.csv", index=False)
df_binary_matrix_filtered_zero.to_csv("binary_cancer_matrix_filtered_zero.csv", index=False)

# Calculate lengths of datasets
original_length = len(df_binary_matrix)
filtered_length = len(df_binary_matrix_filtered)
# Count the dropped rows from the zero-sum subset itself. Deriving the number as
# original - filtered made the consistency check below true by construction.
dropped_length = len(df_binary_matrix_filtered_zero)
if original_length:
    dropped_percentage = (dropped_length / original_length) * 100
    cancer_percentage = (filtered_length / original_length) * 100
else:
    # Empty input: report 0% instead of failing with a division by zero
    dropped_percentage = 0.0
    cancer_percentage = 0.0

# Print dataset sizes with thousand separators
print(f"\nOriginal dataset length: {original_length:,}")
print(f"Filtered dataset length (Cancer_Type_Sum > 0): {filtered_length:,} ({cancer_percentage:.2f}%)")
print(f"Dropped rows (Cancer_Type_Sum == 0): {dropped_length:,} ({dropped_percentage:.2f}%)")

# Verify if numbers add up correctly
# (fails when some rows are in neither subset, e.g. a missing or negative Cancer_Type_Sum)
if original_length == (filtered_length + dropped_length):
    print("\n--> The numbers add up correctly!")
else:
    print("\n--> Warning: The numbers do NOT add up correctly!")

print("\nBinary matrix saved successfully!")
print(f"\nTotal 'No match found for' messages: {no_match_count:,}")

In [None]:
# Check for cancer columns
column_name = "solid tumor"
if column_name not in df_binary_matrix_filtered.columns:
    print(f"Column '{column_name}' is NOT present in the DataFrame.")
else:
    print(f"Column '{column_name}' IS PRESENT in the DataFrame.")

# Investigate unmatched terms
# Read the log, trim every line and sort alphabetically
with open("unmatched_terms_log.txt", "r") as file:
    unmatched_terms = sorted(line.strip() for line in file)
# Write the sorted terms back, one per line
with open("unmatched_terms_log_sorted.txt", "w") as file:
    file.writelines(term + "\n" for term in unmatched_terms)

print("Unmatched terms sorted and saved to 'unmatched_terms_log_sorted.txt'.")

# =========================================================

# 5) Cancer parent mapping

In [None]:
# Load the CIVIC cancer/synonym table and print it for a visual check
CIVIC_cancer_synonyms = "CIVIC_cancer_synonyms.csv"
CIVIC_cancer_synonyms_df = pd.read_csv(CIVIC_cancer_synonyms)
print("Dataset loaded successfully!")
print("Length of dataset", len(CIVIC_cancer_synonyms_df))
print(CIVIC_cancer_synonyms_df)

In [None]:
# Reload the CIVIC cancer/synonym table for the parent lookup
CIVIC_cancer_synonyms = "CIVIC_cancer_synonyms.csv"
CIVIC_cancer_synonyms_df = pd.read_csv(CIVIC_cancer_synonyms)
print("Dataset loaded successfully!")
print("Length of dataset", len(CIVIC_cancer_synonyms_df))

##### Connect to Disease Ontology API
# Configure logging to suppress debug/info messages
logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
# Function to extract DOID from the diseaseURL column
def extract_doid(url):
    """Return the digits that follow "DOID:" in *url*.

    The number is returned as a string so that leading zeros are kept.
    Missing values and URLs without a DOID yield None.
    """
    if pd.isna(url):
        return None
    found = re.search(r"DOID:(\d+)", url)
    return found.group(1) if found else None

# Apply the extraction function
CIVIC_cancer_synonyms_df["clean_doid"] = CIVIC_cancer_synonyms_df["diseaseUrl"].apply(extract_doid)

# Disease Ontology API base URL
# The template already contains the "DOID:" prefix; only the identifier is filled in
DO_API_BASE = "https://api.disease-ontology.org/v1/terms/DOID:{}"

# Lists to track issues
missing_doid_records = []  # rows without a DOID
failed_api_requests = []  # rows whose request failed or raised
successful_but_no_nci = []  # rows answered with status 200 but without an NCI cross-reference

# Function to get NCI ID, Parent DOIDs, and Parent Names
def get_nci_and_parents(doid, row):
    """Fetch the NCI id and the parent terms of one disease.

    Parameters:
        doid: DOID number as produced by ``extract_doid`` (a "DOID:" prefix
            is tolerated); None, empty or NaN counts as missing.
        row: the source record; it is appended to the matching tracking list
            (``missing_doid_records``, ``failed_api_requests`` or
            ``successful_but_no_nci``).

    Returns:
        Tuple ``(nci_id, parent_doids, parent_names)``; the last two are
        comma-separated strings. Each element is None when unavailable, and
        all three are None when the DOID is missing or the request fails.
    """
    if not doid or pd.isna(doid):
        missing_doid_records.append(row)  # Track rows with missing DOIDs
        return None, None, None  # Skip if DOID is missing

    # DO_API_BASE already ends in "DOID:", so only the bare identifier may be
    # inserted; inserting the prefixed id requested ".../DOID:DOID:<n>".
    bare_doid = str(doid).strip()
    if bare_doid.upper().startswith("DOID:"):
        bare_doid = bare_doid[len("DOID:"):]
    doid_formatted = f"DOID:{bare_doid}"  # used for log messages only
    url = DO_API_BASE.format(bare_doid)
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            # Extract NCI ID (when several "NCI:" references exist, the last one is kept)
            nci_id = None
            if "xrefs" in data:
                for ref in data["xrefs"]:
                    if ref.startswith("NCI:"):
                        nci_id = ref.replace("NCI:", "")

            # Extract parent DOIDs
            parents = data.get("parents", [])
            parent_doids = ", ".join(parents) if parents else None

            # Extract parent names (one additional request per parent)
            parent_names = []
            for parent_doid in parents:
                parent_name = get_parent_name(parent_doid)
                if parent_name:
                    parent_names.append(parent_name)

            parent_names_str = ", ".join(parent_names) if parent_names else None

            # If no NCI ID, track it
            if nci_id is None:
                successful_but_no_nci.append(row)
            return nci_id, parent_doids, parent_names_str
        else:
            logging.warning(f"API Request Failed for DOID {doid_formatted} - Status Code: {response.status_code}")
            failed_api_requests.append(row)
            return None, None, None
    except Exception as e:
        # Best effort: a network or parsing problem must not abort the whole run
        logging.error(f"Error fetching DOID {doid_formatted}: {e}")
        failed_api_requests.append(row)
        return None, None, None

# Function to fetch parent disease name from DOID
def get_parent_name(doid):
    """Return the disease name for a parent DOID, or "Unknown" on any failure.

    *doid* may be the bare number or carry a "DOID:" prefix. The prefix is
    removed before the URL is built because ``DO_API_BASE`` already contains
    it; without this, a prefixed id requested ".../DOID:DOID:<n>".
    """
    bare_doid = str(doid).strip()
    if bare_doid.upper().startswith("DOID:"):
        bare_doid = bare_doid[len("DOID:"):]
    url = DO_API_BASE.format(bare_doid)
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            return data.get("name", "Unknown")
        else:
            return "Unknown"
    except Exception:
        # Best effort: callers treat "Unknown" as "no usable parent name"
        return "Unknown"

# Manually iterate with progress bar
# One result per row is collected in these lists, in row order
nci_ids = []
parent_doid_list = []
parent_name_list = []

for _, row in tqdm(CIVIC_cancer_synonyms_df.iterrows(), total=len(CIVIC_cancer_synonyms_df), desc="Fetching NCI IDs & Parents"):
    nci_id, parent_doids, parent_names = get_nci_and_parents(row["clean_doid"], row)
    nci_ids.append(nci_id)
    parent_doid_list.append(parent_doids)
    parent_name_list.append(parent_names)

# Assign the extracted data to the DataFrame
CIVIC_cancer_synonyms_df["NCI_ID"] = nci_ids
CIVIC_cancer_synonyms_df["parent_DOIDs"] = parent_doid_list 
CIVIC_cancer_synonyms_df["parent_names"] = parent_name_list

# Save updated data
output_file = "CIVIC_cancer_synonyms_with_NCI_and_parents.csv"
CIVIC_cancer_synonyms_df.to_csv(output_file, index=False)

# Save missing DOIDs separately
if missing_doid_records:
    missing_doid_df = pd.DataFrame(missing_doid_records)
    missing_doid_df.to_csv("CIVIC_cancers_missing_doid.csv", index=False)
    print(f"Saved {len(missing_doid_records)} records with missing DOIDs to 'CIVIC_cancers_missing_doid.csv'")

# Save failed API requests separately
if failed_api_requests:
    failed_api_df = pd.DataFrame(failed_api_requests)
    failed_api_df.to_csv("CIVIC_cancers_failed_requests.csv", index=False)
    print(f"Saved {len(failed_api_requests)} records that failed API requests to 'CIVIC_cancers_failed_requests.csv'")

# Save successful API calls that had no NCI ID separately
if successful_but_no_nci:
    successful_but_no_nci_df = pd.DataFrame(successful_but_no_nci)
    successful_but_no_nci_df.to_csv("CIVIC_cancers_successful_but_no_NCI.csv", index=False)
    print(f"Saved {len(successful_but_no_nci)} records that had successful API responses but no NCI ID to 'CIVIC_cancers_successful_but_no_NCI.csv'")

# Summary output with detailed breakdown
# The last total is the sum of the three tracking lists above
print(f"\nSummary:")
print(f"- Total records processed: {len(CIVIC_cancer_synonyms_df)}")
print(f"- Records with missing DOIDs: {len(missing_doid_records)}")
print(f"- Records that failed API requests: {len(failed_api_requests)}")
print(f"- Records with successful API responses but no NCI ID: {len(successful_but_no_nci)}")
print(f"- Total records with missing NCI_IDs: {len(missing_doid_records) + len(failed_api_requests) + len(successful_but_no_nci)}")
print(f"Processing complete! Updated file saved as {output_file}")

In [None]:
print(CIVIC_cancer_synonyms_df) # Show the table before the manual name fix
# Issue in the database, replace "DOID:0080202" with properly capitalized "Adenoid Cystic Carcinoma"
# (the comparison is done on the lowercased name, so any letter case of the placeholder matches)
CIVIC_cancer_synonyms_df.loc[CIVIC_cancer_synonyms_df["name"].str.lower() == "doid:0080202", "name"] = "Adenoid Cystic Carcinoma"
# Save updated data
output_file = "CIVIC_cancer_synonyms_with_NCI_and_parents.csv"
CIVIC_cancer_synonyms_df.to_csv(output_file, index=False)

In [None]:
## Cancer Cleaning

# Reuse the in-memory table when it exists; otherwise reload it from disk
file_path = "CIVIC_cancer_synonyms_with_NCI_and_parents.csv"
if "CIVIC_cancer_synonyms_df" not in globals() or CIVIC_cancer_synonyms_df is None:
    try:
        CIVIC_cancer_synonyms_df = pd.read_csv(file_path)
        print("Dataset successfully loaded.")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        # NOTE(review): the statements below access the frame unconditionally and
        # will fail on None — consider stopping here instead.
        CIVIC_cancer_synonyms_df = None

# Remove unnecessary words at the start
# NOTE(review): words_to_remove is not referenced below (the regex repeats the words),
# and the \b...\b pattern removes them anywhere in the text, not only at the start.
words_to_remove = ["malignant", "childhood", "adult", "juvenile"]
CIVIC_cancer_synonyms_df["parent_names"] = CIVIC_cancer_synonyms_df["parent_names"].str.replace(
    r"\b(?:malignant|childhood|adult|juvenile)\b", "", case=False, regex=True
).str.strip()

# Ensure all text transformations are complete, then clean spaces
CIVIC_cancer_synonyms_df["parent_names"] = (
    CIVIC_cancer_synonyms_df["parent_names"]
    .str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
    .str.strip()  # Remove leading and trailing spaces
)

# Create the "final_parent" column
def process_parent_name(row):
    """Pick the most useful parent name for one row.

    "parent_names" is a comma-separated string. Scanning from the last
    entry to the first, the first entry that is not a generic term
    (e.g. "cancer", "carcinoma", "unknown") is returned. When the field is
    missing or blank, or only generic terms are present, the row's own
    "name" is returned.
    """
    raw_parents = row["parent_names"]
    if pd.isna(raw_parents) or not raw_parents.strip():
        return row["name"]

    # Generic terms that carry no information about the cancer type
    unwanted_terms = {"cancer", "carcinoma", "adenocarcinoma", "unknown", "cell type cancer", 
                      "cell type benign neoplasm","autosomal dominant disease", "autosomal recessive disease", "syndrome"}

    candidates = [piece.strip() for piece in raw_parents.split(",")]
    for candidate in reversed(candidates):
        if candidate.lower() in unwanted_terms:
            continue
        return candidate
    # Nothing specific found: fall back to the row's own name
    return row["name"]
# Derive one parent label per row
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df.apply(process_parent_name, axis=1)


# Remove rows where both "name" and "final_parent" are only "cancer", "carcinoma", or "solid tumor"
CIVIC_cancer_synonyms_df = CIVIC_cancer_synonyms_df[
    ~(
        (CIVIC_cancer_synonyms_df["name"].str.lower().isin(["doid:", "cancer", "carcinoma", "solid cancer",
                                                            "solid tumor", "solid tumors, advanced"])) & 
        (CIVIC_cancer_synonyms_df["final_parent"].str.lower().isin(["none", "cancer", "carcinoma", "solid cancer",
                                                            "solid tumor", "solid tumors, advanced"]))
    )
]

# Remove unnecessary words from "final_parent" as well
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df["final_parent"].str.replace(
    r"\b(?:malignant|childhood|adult|juvenile|benign)\b", "", case=False, regex=True
).str.strip()

# Clean "final_parent" column (Replacing terms)
# The replacements run in sequence: "carcinoma" -> "cancer" already turns "adenocarcinoma"
# into "adenocancer", so the "adenocarcinoma" step finds nothing and the "adenocancer"
# step finishes the job.
CIVIC_cancer_synonyms_df["final_parent"] = (
    CIVIC_cancer_synonyms_df["final_parent"]
    .str.replace("neoplasm", "cancer", case=False, regex=True)
    .str.replace("carcinoma", "cancer", case=False, regex=True)
    .str.replace("adenocarcinoma", "cancer", case=False, regex=True)
    .str.replace("adenocancer", "cancer", case=False, regex=True)
)
# Harmonize British spelling
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df["final_parent"].str.replace("leukaemia", "leukemia", case=False, regex=True)

# Keyword mapping dictionary
# Keyword -> parent category. The consumers of this dictionary return the FIRST
# keyword (in insertion order) that matches, so the order of the entries matters.
# The dictionary used to list "glioblastoma" twice ("glioblastoma", later
# "glioma"); Python silently kept the later value at the first position. The
# duplicate is removed and the effective value is written in place.
keyword_mapping = {
    "skin": "skin cancer",
    "breast": "breast cancer",
    "mammary": "breast cancer",
    "mucinous": "mucinous cancer",
    "lung": "lung cancer",
    "bronchio": "lung cancer",
    "spindle cell": "spindle cell cancer",
    "acute myeloid leukemia": "acute myeloid leukemia",
    "salivary gland": "salivary gland cancer",
    "renal": "renal cancer",
    "prostate": "prostate cancer",
    "pancreatic": "pancreatic cancer",
    "medulloblastoma": "medulloblastoma",
    "lymphoblastic leukemia": "lymphoblastic leukemia",
    "myeloid": "myeloid cancer",
    "kidney": "kidney cancer",
    "head and neck": "head and neck cancer",
    "gastrointestinal": "gastrointestinal cancer",
    "neurofibroma": "neurofibroma",
    "ovarian": "ovarian cancer",
    "ovary": "ovarian cancer",
    "supratentorial ependymoma": "supratentorial ependymoma",
    "cervix": "cervix cancer",
    "cervical": "cervix cancer",
    "colorectal": "colon cancer",
    "colon": "colon cancer",
    "endometri": "endometrial cancer",
    "melano": "melanoma",
    "laryngeal": "laryngeal cancer",
    "glioma": "glioma",
    "bone": "bone cancer",
    "osteo": "bone cancer",
    "peritoneal": "peritoneal cancer",
    "astrocytoma": "astrocytoma",
    "glioblastoma": "glioma",
    "gastric": "gastric cancer",
    "mesothelioma": "mesothelioma",
    "esophag": "esophagus cancer",
    "thyroid": "thyroid cancer",
    "thymus": "thymus cancer",
    "uterus": "uterine cancer",
    "spinal": "spinal cancer",
    "hepatocellular": "liver cancer",
    "cholangio": "cholangio cancer",
    "bile duct": "biliary tract cancer",
    "gliosarcoma": "glioma",
    # NOTE(review): "myeloid" is listed earlier and matches every text that
    # contains "myeloid cancer", so this entry can never be the first match.
    "myeloid cancer": "hematologic cancer",
    "myeloproliferative cancer": "hematologic cancer",
    "myelodysplastic syndrome": "hematologic cancer",
    "essential thrombocythemia": "hematologic cancer",
    "myelofibrosis": "hematologic cancer",
    "barrett": "esophagus cancer",
    "fraumeni": "li-fraumeni syndrome",
    "liposarcoma": "liposarcoma",
    "papillary": "papillary cancer",
}

# Apply keyword mapping
# Substring match on the lowercased text: the first keyword (in dictionary order) that
# occurs in the value replaces it; non-string or unmatched values are kept unchanged
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df["final_parent"].apply(
    lambda x: next((v for k, v in keyword_mapping.items() if isinstance(x, str) and k in x.lower()), x)
)

# Classify leukemia and lymphoma
def classify_leukemia_lymphoma(name):
    """Collapse leukemia/lymphoma names into a coarse label.

    Returns "leukemia/lymphoma", "leukemia" or "lymphoma" depending on which
    of the words "leukemia"/"leukemic" and "lymphoma" occur in *name*
    (case-insensitive). Any other value, including non-strings, is returned
    unchanged.
    """
    if not isinstance(name, str):
        return name
    lowered = name.lower()
    mentions_leukemia = "leukemia" in lowered or "leukemic" in lowered
    mentions_lymphoma = "lymphoma" in lowered
    if mentions_leukemia:
        return "leukemia/lymphoma" if mentions_lymphoma else "leukemia"
    if mentions_lymphoma:
        return "lymphoma"
    return name

# Apply classification
# Every leukemia/lymphoma variant in "final_parent" is collapsed to its coarse label
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df["final_parent"].apply(classify_leukemia_lymphoma)
# Advanced keyword mapping with regex
def apply_keyword_mapping(name):
    """Map a parent name to its category using whole-word keyword matches.

    Anything mentioning "gallbladder" becomes "gallbladder cancer"; the
    standalone word "bladder" becomes "bladder cancer". Otherwise the first
    entry of ``keyword_mapping`` whose keyword occurs as a whole word decides
    the result. Values without a match and non-strings are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    lowered = name.lower()

    # "gallbladder" takes precedence so that it is never reported as "bladder"
    if "gallbladder" in lowered:
        return "gallbladder cancer"
    if re.search(r'\bbladder\b', lowered):
        return "bladder cancer"

    # First whole-word keyword hit wins; fall back to the unchanged name
    hits = (
        replacement
        for keyword, replacement in keyword_mapping.items()
        if re.search(rf'\b{re.escape(keyword)}\b', lowered)
    )
    return next(hits, name)
# Second pass: whole-word keyword mapping with the bladder/gallbladder special cases
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df["final_parent"].apply(apply_keyword_mapping)

# Convert the "final_parent" column to all lowercase for consistency
CIVIC_cancer_synonyms_df["final_parent"] = CIVIC_cancer_synonyms_df["final_parent"].str.lower()
# NOTE(review): this message is printed before the file is written further below
print(f"\nFinal dataset saved with all lowercase.")

# Extract unique values and their counts
final_parent_counts = CIVIC_cancer_synonyms_df["final_parent"].value_counts().sort_index()

# Display total count of unique values
print(f"Total unique 'final_parent' values after cleaning: {len(final_parent_counts)}\n")

# Same counts as a two-column table
final_parent_df = pd.DataFrame(final_parent_counts).reset_index()
final_parent_df.columns = ["final_parent", "count"]

for parent_name, count in final_parent_counts.items():
    print(f"{parent_name}: {count}")

# Save the cleaned CIVIC table
output_file = "CIVIC_cancer_synonyms_cleaned.csv"
CIVIC_cancer_synonyms_df.to_csv(output_file, index=False)
print(f"\nUpdated dataset saved as: {output_file}")

# =========================================================

# 6) Cancer occurrence calculation

In [None]:
# Check if df_binary_matrix_filtered is in globals() and load if necessary
if 'df_binary_matrix_filtered' in globals():
    # Copy so that later steps cannot modify the original matrix
    df_binary_matrix_filtered_figure = df_binary_matrix_filtered.copy()
    print("Dataset loaded from globals.")
else:
    os.chdir(output_directory)
    df_binary_matrix_filtered_figure = pd.read_csv("binary_cancer_matrix_filtered.csv")
    print("Dataset loaded from file.....")
print(df_binary_matrix_filtered_figure.columns.tolist())

In [None]:
# Identify relevant columns for calculation (from 'Final_Mapped_Cancer_Terms' to 'Cancer_Type_Sum')
# The slice covers the columns strictly between the two markers (both markers excluded)
relevant_columns = df_binary_matrix_filtered_figure.columns[df_binary_matrix_filtered_figure.columns.get_loc('Final_Mapped_Cancer_Terms') + 1: df_binary_matrix_filtered_figure.columns.get_loc('Cancer_Type_Sum')]

# Calculate the sum of each of these columns (which is the count of occurrences for each cancer type)
cancer_counts = df_binary_matrix_filtered_figure[relevant_columns].sum().sort_values(ascending=False)

# Rank the columns by their sum from highest to lowest (most frequently mentioned cancer types)
# (cancer_counts is already sorted in descending order by the statement above)
cancer_counts_sorted = cancer_counts.sort_values(ascending=False)
print(cancer_counts_sorted)
# Two-column table: cancer type and number of papers mentioning it
cancer_counts_df = cancer_counts.reset_index()
cancer_counts_df.columns = ["cancer_type", "count"]
print("\nCancer type occurrence df:")
print(cancer_counts_df.head())

In [None]:
# Look for cancer types
cancer_search_term = "primary"

# Check if the cancer type exists in the 'cancer_type' column (case-insensitive)
# str.contains performs a substring search, so partial names match as well
cancer_exists = cancer_counts_df['cancer_type'].str.contains(cancer_search_term, case=False, na=False)
if cancer_exists.any():
    found_cancer = cancer_counts_df[cancer_exists]
    print("Found the cancer type in the dataset:")
    print(found_cancer)
else:
    print(f"The cancer type '{cancer_search_term}' is not found in the dataset.")
    
# Get all unique cancer types from the 'cancer_type' column
all_cancer_types = cancer_counts_df['cancer_type'].unique()
# Display the list of all unique cancer types
print("List of all unique cancer types:")
print(all_cancer_types)

In [None]:
# Reload the cleaned CIVIC table (including "final_parent") from disk
CIVIC_file = "CIVIC_cancer_synonyms_cleaned.csv"
CIVIC_cancer_synonyms_df = pd.read_csv(CIVIC_file)

# Function to standardize cancer terms clearly
def standardize_cancer_terms(text):
    """Lowercase and trim *text* and unify tumor wording to "cancer".

    The substrings "adenocarcinoma", "tumor" and "carcinoma" are replaced in
    that order (plain substring replacement, so "tumors" becomes "cancers").
    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    unified = text.lower().strip()
    # "adenocarcinoma" must come before "carcinoma" to be replaced as a whole
    for wording in ("adenocarcinoma", "tumor", "carcinoma"):
        unified = unified.replace(wording, "cancer")
    return unified

# Apply standardization explicitly
# Both sides of the later lookup (count table and CIVIC names/synonyms) get the same wording
cancer_counts_df["cancer_type"] = cancer_counts_df["cancer_type"].apply(standardize_cancer_terms)
CIVIC_cancer_synonyms_df["name"] = CIVIC_cancer_synonyms_df["name"].apply(standardize_cancer_terms)
# astype(str) turns missing synonyms into the text "nan" before standardization
CIVIC_cancer_synonyms_df["synonyms"] = CIVIC_cancer_synonyms_df["synonyms"].astype(str).apply(standardize_cancer_terms)

# Function to find final_parent clearly using names/synonyms
def find_final_parent(cancer_type, synonyms_df):
    """Look up the "final_parent" of a cancer type.

    An exact match on the "name" column is tried first (first matching row
    wins). Otherwise the rows are scanned in order and the first row whose
    comma-separated "synonyms" contain *cancer_type* (after trimming and
    lowercasing each synonym) is used. Returns None when nothing matches.
    """
    by_name = synonyms_df.loc[synonyms_df["name"] == cancer_type, "final_parent"]
    if len(by_name) > 0:
        return by_name.values[0]

    for _, record in synonyms_df.iterrows():
        raw_synonyms = record["synonyms"]
        if not isinstance(raw_synonyms, str):
            continue
        aliases = [alias.strip().lower() for alias in raw_synonyms.split(",")]
        if cancer_type in aliases:
            return record["final_parent"]
    return None
# Attach the CIVIC parent category to every counted cancer type (None when not found)
cancer_counts_df["final_parent"] = cancer_counts_df["cancer_type"].apply(
    lambda x: find_final_parent(x, CIVIC_cancer_synonyms_df)
)

def apply_keyword_mapping_if_missing(row, keyword_mapping):
    """Fill a missing "final_parent" from keywords found in "cancer_type".

    Rows that already have a parent keep it. Otherwise the replacement of the
    first keyword (in dictionary order) contained in the cancer type is
    returned; without a hit the missing value is returned as it is.
    """
    current_parent = row["final_parent"]
    if not pd.isna(current_parent):
        return current_parent
    cancer_name = row["cancer_type"]
    hits = (
        replacement
        for keyword, replacement in keyword_mapping.items()
        if keyword in cancer_name
    )
    return next(hits, current_parent)

# Fallback for cancer types without a CIVIC parent: derive the category from keywords
cancer_counts_df["final_parent"] = cancer_counts_df.apply(
    lambda row: apply_keyword_mapping_if_missing(row, keyword_mapping), axis=1
)

def classify_leukemia_lymphoma(final_parent, cancer_type):
    """Return the coarse leukemia/lymphoma label for a row, if one applies.

    Both arguments are searched case-insensitively for "leukemia" and
    "lymphoma" (non-strings count as empty text). Depending on what is found
    the result is "leukemia/lymphoma", "leukemia" or "lymphoma"; otherwise
    *final_parent* is returned unchanged.
    """
    texts = [
        value.lower() if isinstance(value, str) else ""
        for value in (final_parent, cancer_type)
    ]
    found_leukemia = any("leukemia" in text for text in texts)
    found_lymphoma = any("lymphoma" in text for text in texts)

    # Lookup keyed by (leukemia found, lymphoma found)
    labels = {
        (True, True): "leukemia/lymphoma",
        (True, False): "leukemia",
        (False, True): "lymphoma",
    }
    return labels.get((found_leukemia, found_lymphoma), final_parent)
# Collapse leukemia/lymphoma variants, looking at both the parent and the cancer type
cancer_counts_df["final_parent"] = cancer_counts_df.apply(
    lambda row: classify_leukemia_lymphoma(row["final_parent"], row["cancer_type"]), axis=1
)
# Drop unspecific cancer types (generic "cancer"/"solid tumor" labels and the "doid:" placeholder)
cancer_counts_df = cancer_counts_df[
    ~(
        (cancer_counts_df["cancer_type"].str.lower().isin(["doid:", "cancer", "carcinoma", "solid cancer",
                                                            "solid tumor", "solid tumors, advanced"]))
)
]

# Save and show the per-cancer-type counts
cancer_counts_df.to_csv("cancer_counts_of_all_cancer_types.csv", index=False)
print("CSV file saved successfully as 'cancer_counts_of_all_cancer_types.csv'")
print("Length of dataset:", len(cancer_counts_df))
print(cancer_counts_df)

In [None]:
# Create cancer category occurrences

cancer_counts_df = pd.read_csv("cancer_counts_of_all_cancer_types.csv")
# cancer_df is only created in a later cell (from "binary_cancer_matrix_filtered.csv"),
# so a top-to-bottom run used to stop here with a NameError. Reuse it when it
# exists, otherwise count the rows of the same file directly.
if "cancer_df" in globals():
    cancer_df_length = len(cancer_df)
else:
    cancer_df_length = len(pd.read_csv("binary_cancer_matrix_filtered.csv", usecols=["PaperId"]))
# Sum the counts per parent category, most frequent first
cancer_category_occurrences = cancer_counts_df.groupby("final_parent", as_index=False)["count"].sum()
cancer_category_occurrences = cancer_category_occurrences.sort_values(by="count", ascending=False)
# Categories with fewer than 300 mentions are pooled into one "other cancers" row
other_cancers = cancer_category_occurrences[cancer_category_occurrences["count"] < 300]
other_cancers_sum = other_cancers["count"].sum()
cancer_category_occurrences = cancer_category_occurrences[cancer_category_occurrences["count"] >= 300]
other_cancers_row = pd.DataFrame({"final_parent": ["other cancers"], "count": [other_cancers_sum]})
cancer_category_occurrences = pd.concat([cancer_category_occurrences, other_cancers_row], ignore_index=True)
# Percentages are relative to the number of articles in the filtered matrix
total_mentions = cancer_df_length
cancer_category_occurrences["percentage"] = ((cancer_category_occurrences["count"] / total_mentions) * 100).round(2)

print("\nSummed Cancer Category Occurrences:")
print("Length of category dataset:", len(cancer_category_occurrences))
print(cancer_category_occurrences)

cancer_category_occurrences.to_csv("cancer_category_occurrences_with_percentages.csv", index=False)
print("\nCSV file saved successfully as 'cancer_category_occurrences_with_percentages.csv'")

In [None]:
####### FINAL OUTPUT SUMMARY ######

# Reload both datasets from disk so the summary reflects the saved files
full_dataset = "cleaned_BioBERT_data.csv"
full_df = pd.read_csv(full_dataset)
filtered_dataset = "binary_cancer_matrix_filtered.csv"
df_binary_matrix_filtered = pd.read_csv(filtered_dataset)

# Share of screened articles that mention at least one specific cancer type
total_screened_articles=len(full_df)
total_articles_with_specific_cancer_types=len(df_binary_matrix_filtered)
percentage_specific_cancer_articles = (total_articles_with_specific_cancer_types / total_screened_articles) * 100

# Both tables are taken from the preceding cells
number_of_specific_cancer_types=len(cancer_counts_df)
number_of_cancer_categories=len(cancer_category_occurrences)

# Create a formatted summary string
summary_text = f"""
######################## FINAL OUTPUT SUMMARY ########################

Total screened articles:               {total_screened_articles:,}
Total articles with specific cancers:  {total_articles_with_specific_cancer_types:,} ({percentage_specific_cancer_articles:.2f}%)
----------------------------------------------------------------------
Number of specific cancer types:       {number_of_specific_cancer_types:,}
Number of cancer categories:           {number_of_cancer_categories:,}

######################################################################
"""
print(summary_text)
# Keep a copy of the summary next to the other outputs
summary_file_path = "final_output_cancer_type_summary.txt"
with open(summary_file_path, "w") as file:
    file.write(summary_text)
print(f"Summary has been saved to {summary_file_path}.")

# Create matching of final parents and create new columns

In [None]:
# Ensure correct working directory
os.chdir(output_directory)

print("Loading datasets...")
# Load datasets
CIVIC_file = "CIVIC_cancer_synonyms_cleaned.csv"
CIVIC_cancer_synonyms_df = pd.read_csv(CIVIC_file)
cancer_df = pd.read_csv("binary_cancer_matrix_filtered.csv")

print("Datasets loaded successfully.")

# Count total columns at the beginning
total_columns_initial = len(cancer_df.columns)

# Define columns to ignore (metadata columns that are NOT cancer mentions for final parent matching)
ignore_columns = {"PaperTitle", "Citations", "CoFoS", "coFoS", "Lang", "Authors", 
                  "Abstract", "Language", "PubYear", "PubDate", "BioBERT", "Cancer_Type_Sum"}

# Count columns to be analyzed (excluding ignored ones)
# NOTE(review): every column that is not listed in ignore_columns is counted here,
# whether or not it is a cancer column — confirm that the ignore list is complete.
cancer_columns = [col for col in cancer_df.columns if col not in ignore_columns]
total_cancer_columns_initial = len(cancer_columns)

print(f"Total columns at start: {total_columns_initial}")
print(f"Total cancer-related columns at start (excluding ignored ones): {total_cancer_columns_initial}")

# Function to standardize cancer terms
def standardize_cancer_terms(text):
    """Return *text* lowercased and trimmed with tumor wording unified to "cancer".

    "adenocarcinoma", "tumor" and "carcinoma" are replaced as plain
    substrings, in that order. Non-string input is handed back untouched.
    """
    if not isinstance(text, str):
        return text
    return (
        text.lower()
        .strip()
        .replace("adenocarcinoma", "cancer")
        .replace("tumor", "cancer")
        .replace("carcinoma", "cancer")
    )
print("Standardizing cancer terms in CIVIC dataset...")
CIVIC_cancer_synonyms_df["name"] = CIVIC_cancer_synonyms_df["name"].apply(standardize_cancer_terms)
# astype(str) turns missing synonyms into the text "nan" before standardization
CIVIC_cancer_synonyms_df["synonyms"] = CIVIC_cancer_synonyms_df["synonyms"].astype(str).apply(standardize_cancer_terms)
print("Standardization complete.")

# Function to find final parent using names and synonyms
def find_final_parent(cancer_type, synonyms_df):
    """Look up the final parent of a cancer type by name, then by synonym.

    Args:
        cancer_type: Label to look up; compared as-is against the "name"
            column and against the lower-cased, trimmed synonym entries.
        synonyms_df: DataFrame with "name", "synonyms" (comma-separated
            string) and "final_parent" columns.

    Returns:
        The "final_parent" value of the first row whose name matches, else of
        the first row listing the label as a synonym, else None.
    """
    name_hits = synonyms_df.loc[synonyms_df["name"] == cancer_type, "final_parent"]
    if len(name_hits) > 0:
        return name_hits.values[0]

    # No direct name match: scan the synonym lists row by row.
    for raw_synonyms, parent in zip(synonyms_df["synonyms"], synonyms_df["final_parent"]):
        if not isinstance(raw_synonyms, str):
            continue
        candidates = [entry.strip().lower() for entry in raw_synonyms.split(",")]
        if cancer_type in candidates:
            return parent
    return None

# Map each cancer column to its final parent
print("Mapping cancer types to their final parent...")
cancer_type_to_final_parent = {}
for col in tqdm(cancer_columns, desc="Processing Cancer Types"):
    parent = find_final_parent(col, CIVIC_cancer_synonyms_df)
    # Keep only real, non-empty names. A missing final_parent comes back from
    # the CSV as NaN, which is truthy and would break the string concatenation.
    if isinstance(parent, str) and parent:
        cancer_type_to_final_parent[col] = parent + "_finalparent"
print(f"Mapped {len(cancer_type_to_final_parent)} cancer types to final parents.")

# Sorted so the order of the new columns is the same on every run
# (plain set iteration order of strings varies between Python processes).
unique_final_parents = sorted(set(cancer_type_to_final_parent.values()))
df_final_binary_matrix = cancer_df.copy()
print("Initializing final parent binary matrix...")
# Build all new columns at once and attach them with a single concat
new_columns = pd.DataFrame(0, index=cancer_df.index, columns=unique_final_parents)
df_final_binary_matrix = pd.concat([df_final_binary_matrix, new_columns], axis=1)

# Populate the final parent columns based on existing cancer mentions
print("Processing binary matrix transformation...")
for cancer_type, final_parent in tqdm(cancer_type_to_final_parent.items(), desc="Updating Binary Matrix"):
    # Reduce to a strict 0/1 flag before OR-ing so that a value above 1 in the
    # source column cannot leak into the parent column as a non-binary number.
    mentioned = cancer_df[cancer_type].fillna(0).astype(int).ne(0).astype(int)
    df_final_binary_matrix[final_parent] |= mentioned

# Column totals after the final-parent columns were added
total_columns_final = df_final_binary_matrix.shape[1]
# Columns that are still analyzed at the end (metadata columns excluded)
cancer_columns_final = [
    name for name in df_final_binary_matrix.columns
    if name not in ignore_columns
]
total_cancer_columns_final = len(cancer_columns_final)

print(f"Total columns at end: {total_columns_final}")
print(f"Total cancer-related columns at end (excluding ignored ones): {total_cancer_columns_final}")

# Write the matrix in row chunks so tqdm can show progress while saving
chunk_size = 5_000  # Number of rows per chunk
csv_filename = "final_parent_binary_matrix.csv"

print("Writing final parent binary matrix to CSV with progress tracking...")

with open(csv_filename, "w", encoding="utf-8", newline="") as f:
    # An empty slice writes the header line only
    df_final_binary_matrix.head(0).to_csv(f, index=False)
    # Each chunk is appended to the same open handle without a header
    for start in tqdm(range(0, len(df_final_binary_matrix), chunk_size), desc="Saving to CSV"):
        df_final_binary_matrix.iloc[start:start+chunk_size].to_csv(
            f, index=False, header=False, mode="a"
        )

print(f"CSV file saved successfully as '{csv_filename}'.")
print("\nPreview of Final Parent Binary Matrix:")
print(df_final_binary_matrix.head())

In [None]:
# Report the rows whose "_finalparent" columns sum to zero
print(len(df_final_binary_matrix))
final_parent_columns = [
    name for name in df_final_binary_matrix.columns
    if name.endswith("_finalparent")
]
rows_all_zero_finalparent = df_final_binary_matrix[final_parent_columns].sum(axis=1).eq(0)
num_rows_all_zero = rows_all_zero_finalparent.sum()
print(f"Number of rows where all '_finalparent' columns are 0: {num_rows_all_zero}")
if num_rows_all_zero > 0:
    print("\nRows where all '_finalparent' columns are 0:")
    # Show the paper identifier next to the (all-zero) final parent flags
    print(df_final_binary_matrix.loc[rows_all_zero_finalparent, ["PaperId"] + final_parent_columns])

In [None]:
# Read the saved Final Parent Binary Matrix back from disk
# (this replaces any copy that is already in memory)
file_path = "final_parent_binary_matrix.csv"
print("Reading final parent binary matrix from CSV...")
df_final_binary_matrix = pd.read_csv(file_path)
print("CSV loaded successfully.")

# Columns that hold the final parent flags
final_parent_columns = [
    name for name in df_final_binary_matrix.columns
    if name.endswith("_finalparent")
]

# Per-column totals, largest first
final_parent_sums = (
    df_final_binary_matrix[final_parent_columns]
    .sum()
    .sort_values(ascending=False)
)
print("\nTop 20 Final Parent Sums:")
print(final_parent_sums.head(20))

In [None]:
# Load the per-cancer-type counts table
file_path = "cancer_counts_of_all_cancer_types.csv"
df_cancer_occurrences = pd.read_csv(file_path)

# Print dataset information
print(f"Total rows: {len(df_cancer_occurrences)}")
print(f"Total columns: {len(df_cancer_occurrences.columns)}")
print("\nColumn Names:")
print(df_cancer_occurrences.columns.tolist())

# The label now matches the number of rows actually printed
# (it previously said "First 5 Rows" while showing 20).
print("\nFirst 20 Rows:")
print(df_cancer_occurrences.head(20))

# 7) Figure creation

In [None]:
# Top 20 cancer categories ranked by their "count" column
top_20_cancers = cancer_category_occurrences.nlargest(20, "count")

# Y-axis limit: 20% above the tallest bar, leaving room for the value labels
max_y_value = top_20_cancers["count"].max()
plt_ylim = max_y_value * 1.2

# "prostate cancer" is drawn in dark green, every other bar in royal blue
colors = [
    "royalblue" if cancer != "prostate cancer" else "darkgreen"
    for cancer in top_20_cancers["final_parent"]
]

plt.figure(figsize=(12, 6))
bars = plt.bar(top_20_cancers["final_parent"], top_20_cancers["count"], color=colors)

# Titles and axis labels
plt.title(f"Most frequently mentioned cancer types\n in {total_articles_with_specific_cancer_types:,} "
          f"({percentage_specific_cancer_articles:.2f}%) articles", fontsize=14, fontweight='bold')
plt.xlabel("Cancer type", fontsize=12, fontweight="bold")
plt.ylabel("Number of mentions", fontsize=12, fontweight="bold")

# Axis cosmetics
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.ylim(0, plt_ylim)
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Write the thousands-separated count slightly above each bar
for bar in bars:
    height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height * 1.02,
        f"{int(height):,}",
        ha="center", va="bottom", fontsize=10, rotation=45,
    )
plt.show()

In [None]:
###### Detailed figure of cancer types #####

# Restrict the plot to the first 20 entries of the sorted counts
top_20_cancer_counts = cancer_counts_sorted.head(20)
publication_number = len(df_binary_matrix_filtered)

# Bar plot of the top 20 cancer types
plt.figure(figsize=(12, 8))
# `bars` is the object returned by .plot(); its .patches are the drawn bars
bars = top_20_cancer_counts.plot(kind='bar', color='skyblue', edgecolor='black')

# Highlight "prostate cancer": dark green bar and bold tick label
for i, bar in enumerate(bars.patches):
    if top_20_cancer_counts.index[i] != "prostate cancer":
        continue
    bar.set_facecolor('darkgreen')
    bars.get_xticklabels()[i].set_fontweight('bold')

plt.title(
    f'Top 20 cancer types mentioned in publications (Total publications: {publication_number:,})',
    fontsize=16,
)
plt.xlabel('Cancer type', fontsize=12)
plt.ylabel('Number of publications', fontsize=12)

# Show y-axis ticks as integers with thousands separators
formatter = FuncFormatter(lambda x, _: f'{int(x):,}')
plt.gca().yaxis.set_major_formatter(formatter)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# =========================================================

# 6) Evaluation and other NER-based models for cancer type extraction

### 3.1) SciSpaCy "en_ner_bionlp13cg_md"

In [None]:
# Enable the pandas `.progress_apply` method used for the extraction below
tqdm.pandas()
def extract_cancer_mentions(text):
    """Return the distinct lower-cased entities in `text` that mention "cancer" or "tumor".

    Missing or blank text yields an empty list. The order of the returned
    list is not defined (it comes from a set).
    """
    if pd.isna(text) or text.strip() == "":
        return []
    # NOTE(review): `nlp_cancer` is not among the models loaded in the setup
    # cell (only nlp_cancer_1 / nlp_cancer_2) - confirm it is defined before
    # this cell is run.
    doc = nlp_cancer(text)
    mentions = set()
    for ent in doc.ents:
        lowered = ent.text.lower()
        if "cancer" in lowered or "tumor" in lowered:
            mentions.add(lowered)
    return list(mentions)
# Time the extraction over title + abstract on a copy of the full dataset
start_time = time.time()
df = full_df.copy()
# fillna("") first: astype(str) alone turns a missing title/abstract into the
# literal text "nan", which would then be fed to the NER model.
df.loc[:, "Extracted_Cancer_Terms"] = (
    df["PaperTitle"].fillna("").astype(str) + " " + df["Abstract"].fillna("").astype(str)
).progress_apply(extract_cancer_mentions)
end_time = time.time()
runtime = end_time - start_time

# Save the annotated dataset and log the elapsed time next to it
output_path = os.path.join(output_directory, "Extracted_Cancer_Terms.csv")
df.to_csv(output_path, index=False)
runtime_log_path = os.path.join(output_directory, "running_time_cancer_extraction.txt")
with open(runtime_log_path, "w") as f:
    f.write(f"Total execution time: {runtime:.2f} seconds\n")

print(f"File saved successfully: {output_path}")
print(f"Execution time logged in: {runtime_log_path}")

### 3.2) SciSpaCy "en_ner_bionlp13cg_md" and "en_ner_bc5cdr_md"

In [None]:
# Extracted terms that are dropped because they do not name a cancer type
EXCLUDE_TERMS = {
    "anticancer", "anti cancer", "anti-cancer",
    "antitumor", "anti tumor", "anti-tumor",
    "metastasis", "metastases",
    "cancerous", "non-cancerous", "precancerous", "cancer-related",
}
def clean_term(term):
    """Return `term` lower-cased with surrounding whitespace removed."""
    lowered = term.lower()
    return lowered.strip()
def extract_cancer_mentions(text):
    """Collect distinct cancer/tumor entity mentions found by both SciSpaCy models.

    Entities from `nlp_cancer_1` and `nlp_cancer_2` are cleaned with
    `clean_term`, kept when they contain "cancer" or "tumor", and dropped
    when they appear in `EXCLUDE_TERMS`. Missing or blank text yields an
    empty list; the order of the result is not defined.
    """
    if pd.isna(text) or text.strip() == "":
        return []

    # Run both pipelines first, then walk their entities in model order.
    docs = (nlp_cancer_1(text), nlp_cancer_2(text))
    candidates = set()
    for doc in docs:
        for ent in doc.ents:
            cleaned = clean_term(ent.text)
            if "cancer" in cleaned or "tumor" in cleaned:
                candidates.add(cleaned)

    kept = {cleaned for cleaned in candidates if cleaned not in EXCLUDE_TERMS}
    return list(kept)
# Time the two-model extraction; the result column is written onto full_df itself
start_time = time.time()
tqdm.pandas()
# fillna("") first: astype(str) alone turns a missing title/abstract into the
# literal text "nan", which would then be fed to the NER models.
full_df.loc[:, "Extracted_Cancer_Terms"] = (
    full_df["PaperTitle"].fillna("").astype(str) + " " + full_df["Abstract"].fillna("").astype(str)
).progress_apply(extract_cancer_mentions)
end_time = time.time()
runtime = end_time - start_time

# Save the annotated dataset and log the elapsed time next to it
output_path = os.path.join(output_directory, "Extracted_Cancer_Terms_Hybrid.csv")
full_df.to_csv(output_path, index=False)
runtime_log_path = os.path.join(output_directory, "running_time_cancer_extraction_hybrid.txt")
with open(runtime_log_path, "w") as f:
    f.write(f"Total execution time: {runtime:.2f} seconds\n")

print(f"File saved successfully: {output_path}")
print(f"Execution time logged in: {runtime_log_path}")

### 3.3) Combine the SciSpaCy approach with Swifter to speed up extraction

In [None]:
# Combine the two models and swifter
# Extracted terms that are dropped because they do not name a cancer type
EXCLUDE_TERMS = {
    "anticancer", "anti cancer", "anti-cancer",
    "antitumor", "anti tumor", "anti-tumor",
    "metastasis", "metastases",
    "cancerous", "non-cancerous", "precancerous", "cancer-related",
}
def clean_term(term):
    """Normalise a term: lower-case, trim, NFKC-normalise, and turn hyphens/dashes into spaces."""
    normalized = unicodedata.normalize("NFKC", term.lower().strip())
    # ASCII hyphen, Unicode hyphen, en dash and em dash all become one space
    return re.sub(r'[-‐–—]', ' ', normalized)
def extract_cancer_mentions(text):
    """Collect distinct cancer/tumor entity mentions found by both SciSpaCy models.

    Entities from `nlp_cancer_1` and `nlp_cancer_2` are normalised with
    `clean_term`, kept when they contain "cancer" or "tumor", and dropped
    when they match an entry of `EXCLUDE_TERMS`.

    Args:
        text: Title/abstract text; may be missing (None/NaN) or blank.

    Returns:
        A list of distinct normalised terms (order not defined); an empty
        list for missing or blank text.
    """
    if pd.isna(text) or text.strip() == "":
        return []

    # clean_term replaces hyphens with spaces, so the exclusion list has to go
    # through the same normalisation. Otherwise hyphenated entries such as
    # "non-cancerous" or "cancer-related" can never match a cleaned term.
    excluded = {clean_term(entry) for entry in EXCLUDE_TERMS}

    extracted_terms = set()
    # Run both pipelines first, then walk their entities in model order.
    for doc in (nlp_cancer_1(text), nlp_cancer_2(text)):
        for ent in doc.ents:
            term = clean_term(ent.text)
            if "cancer" in term or "tumor" in term:
                extracted_terms.add(term)

    return [term for term in extracted_terms if term not in excluded]
def apply_cancer_extraction(df):
    """Add an "Extracted_Cancer_Terms" column built from title + abstract.

    The "PaperTitle" and "Abstract" columns are joined with a space and passed
    row by row to `extract_cancer_mentions` through swifter's `apply`.

    Args:
        df: DataFrame with "PaperTitle" and "Abstract" columns; modified in place.

    Returns:
        The same DataFrame, with the new column added.
    """
    # fillna("") first: astype(str) alone turns a missing title/abstract into
    # the literal text "nan", which would then be fed to the NER models.
    combined_text = df["PaperTitle"].fillna("").astype(str) + " " + df["Abstract"].fillna("").astype(str)
    df["Extracted_Cancer_Terms"] = combined_text.swifter.apply(extract_cancer_mentions)
    return df
# Time the swifter-based extraction on a copy of the full dataset
start_time = time.time()
tqdm.pandas()
df = full_df.copy()
df = apply_cancer_extraction(df)
end_time = time.time()
runtime = end_time - start_time

# Save the annotated dataset
output_path = os.path.join(output_directory, "Extracted_Cancer_Terms.csv")
df.to_csv(output_path, index=False)

# Log the elapsed time to a text file in the output directory
runtime_log_path = os.path.join(output_directory, "running_time_cancer_extraction.txt")
with open(runtime_log_path, "w") as f:
    f.write(f"Total execution time: {runtime:.2f} seconds\n")

print(f"File saved successfully: {output_path}")
print(f"Execution time logged in: {runtime_log_path}")

In [None]:
# Disease Ontology matching
# Pause (in seconds) after each term lookup; also used for the run-time estimate
API_TIME_PER_TERM = 0.01
def get_disease_ontology_name(term, timeout=10):
    """Query the OLS search API (DOID ontology) for the standardized disease name.

    Args:
        term: Free-text disease/cancer term to look up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The "label" of the first search hit, or `term` unchanged when there
        is no hit, the response status is not 200, the request fails, or the
        response body does not have the expected structure.
    """
    try:
        # Passing the term via `params` URL-encodes it, so spaces, "&", "#"
        # and similar characters cannot corrupt the query string.
        response = requests.get(
            "https://www.ebi.ac.uk/ols/api/search",
            params={"q": term, "ontology": "doid"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Best effort: a network problem for one term must not abort the run.
        logging.warning("Disease Ontology lookup failed for %r: %s", term, exc)
        return term

    if response.status_code != 200:
        return term

    try:
        result = response.json()["response"]
        if result["numFound"] > 0:
            return result["docs"][0]["label"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        logging.warning("Unexpected Disease Ontology response for %r: %s", term, exc)
    return term

def standardize_extracted_terms(terms):
    """Map each extracted term to its Disease Ontology name and de-duplicate.

    Every term is looked up with `get_disease_ontology_name` (with a progress
    bar), followed by a pause of `API_TIME_PER_TERM` seconds. Anything that is
    not a non-empty list yields an empty list. The order of the result is not
    defined.
    """
    if not (isinstance(terms, list) and len(terms) > 0):
        return []

    resolved_names = []
    for raw_term in tqdm(terms, desc="Standardizing Cancer Terms", unit="term"):
        resolved_names.append(get_disease_ontology_name(raw_term))
        # Pause between consecutive API calls
        time.sleep(API_TIME_PER_TERM)
    return list(set(resolved_names))

# Select rows for testing
df_test = df.head(1000).copy()

# Flatten the per-row term lists once; rows with an empty list explode to NaN
# and are dropped.
all_terms = df_test["Extracted_Cancer_Terms"].explode().dropna()
total_terms = all_terms.nunique()
# One lookup (and one pause) happens per term per row, and repeats across rows
# are not de-duplicated, so the estimate uses the number of occurrences rather
# than the number of unique terms. It covers the pauses only, not the HTTP
# round trips.
total_term_lookups = len(all_terms)
estimated_time_sec = total_term_lookups * API_TIME_PER_TERM
estimated_time_min = estimated_time_sec / 60
print(f"Estimated time required for the test: {estimated_time_sec:.2f} seconds (~{estimated_time_min:.2f} minutes)")
tqdm.pandas()
df_test["Standardized_Cancer_Terms"] = df_test["Extracted_Cancer_Terms"].progress_apply(standardize_extracted_terms)

df_test.to_csv(os.path.join(output_directory, "Test_Standardized_Cancer_Terms.csv"), index=False)
print("Test run completed! Standardized cancer terms saved successfully!")