# SM-based extraction of therapies via CIVIC API
- Extract treatments, drugs and therapies from PaperTitles and Abstracts
- Connect to the CIViC API and match against the extracted drugs
- Create a binary matrix
- Create output statistics and figures

# 1) Set up libraries and datasets

## 1.1) Import libraries and models

In [None]:
#Import libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import requests
import time
import re
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import ast
import datetime
from tqdm import tqdm
import unicodedata
import swifter
from fuzzywuzzy import fuzz
from rapidfuzz import process
print("Success!")

## 1.2) Load datasets

In [None]:
# Placeholder locations; replace them with real paths before running the notebook.
input_directory, output_directory = "INPUT_DIRECTORY", "OUTPUT_DIRECTORY"

# File names of the input tables.
gene_matrix, full_dataset = (
    "filtered_gene_binary_matrix.csv",
    "cleaned_BioBERT_data.csv",
)
print("Success!")

In [None]:
# Switch into the output directory and read the article table from there.
os.chdir(output_directory)
print("Current Work Directory:", os.getcwd())

full_df = pd.read_csv(full_dataset)
n_articles = len(full_df)
print(f"Length of full test dataset: {n_articles:,}")

## 1.3) Connect to CIVIC API for therapies

In [None]:
# Download every therapy record from the CIViC GraphQL API, page by page,
# and store the result as a sorted table on disk.

# Define the GraphQL endpoint
url = "https://civicdb.org/api/graphql"

# The request headers are the same for every page, so they are built once.
headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
}

# Seconds to wait for one response; requests does not time out by default.
request_timeout = 60

# Therapy nodes collected across all pages
all_therapies = []
end_cursor = None
# Becomes True only when the last page was reached, so that a download that
# stopped early can be reported instead of being saved as if it were complete.
fetch_complete = False

while True:
    # Define the GraphQL query with pagination and therapyAliases
    query = f"""
    {{
      browseTherapies(first: 100, after: "{end_cursor if end_cursor else ''}") {{
        edges {{
          node {{
            id
            name
            therapyUrl
            ncitId
            therapyAliases
          }}
        }}
        pageInfo {{
          hasNextPage
          endCursor
        }}
      }}
    }}
    """

    # Send the POST request; a network failure ends the loop like an HTTP error does.
    try:
        response = requests.post(url, json={'query': query}, headers=headers, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Error: Unable to fetch data ({exc})")
        break

    if response.status_code != 200:
        print(f"Error: Unable to fetch data (Status Code: {response.status_code})")
        break

    data = response.json()
    if 'errors' in data:
        print("Errors:", data['errors'])
        break

    # Extract the therapies data
    therapies = [edge['node'] for edge in data['data']['browseTherapies']['edges']]
    all_therapies.extend(therapies)

    # Follow the cursor until the API reports that no further page exists.
    page_info = data['data']['browseTherapies']['pageInfo']
    if not page_info['hasNextPage']:
        fetch_complete = True
        break
    end_cursor = page_info['endCursor']

# An empty result has no "name" column, so fail with a clear message instead
# of a KeyError from sort_values.
if not all_therapies:
    raise RuntimeError("No therapies were retrieved from CIViC; nothing to save.")
if not fetch_complete:
    print("Warning: pagination stopped early; the therapy list may be incomplete.")

# Convert to DataFrame
CIVIC_therapies = pd.DataFrame(all_therapies)
CIVIC_therapies = CIVIC_therapies.sort_values(by="name", ascending=True).reset_index(drop=True)


# Print summary
print(f"Total Therapies in CIViC: {CIVIC_therapies.shape[0]}")
print(CIVIC_therapies.head())
print("\nColumns:", CIVIC_therapies.columns)

os.chdir(output_directory)
CIVIC_therapies.to_csv("CIVIC_therapies_with_aliases.csv", index=False)
print("\nCSV file saved successfully: CIVIC_therapies_with_aliases.csv")

In [None]:
# Investigate therapy dataset
# Define the therapy name to search for (case insensitive but exact)
therapy_name = "Trastuzumab".lower()

# Reuse the in-memory table when this session already built it; otherwise
# read the CSV written by the download cell.
if "CIVIC_therapies" in globals():
    CIVIC_therapies = CIVIC_therapies.copy()
else:
    CIVIC_therapies = pd.read_csv("CIVIC_therapies_with_aliases.csv")
filtered_data = CIVIC_therapies[CIVIC_therapies["name"].str.lower() == therapy_name]
therapy_aliases_list = filtered_data["therapyAliases"].tolist()

print("- Therapy-associated aliases and synonyms:", therapy_aliases_list)

######### Search aliases #########
# Define alias keyword to search for (case-insensitive exact match)
alias_keyword = "Herceptin".lower()

# Alias lists come back from CSV as their text representation. Parse them with
# ast.literal_eval, which only accepts Python literals, rather than eval, which
# would execute arbitrary code found in the file.
CIVIC_therapies["therapyAliases"] = CIVIC_therapies["therapyAliases"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Ensure exact match (case-insensitive, but no partial matching)
filtered_data = CIVIC_therapies[
    CIVIC_therapies["therapyAliases"].apply(
        lambda aliases: any(alias.lower() == alias_keyword for alias in aliases) if isinstance(aliases, list) else False
    )
]

therapy_names_list = filtered_data["name"].tolist()
print("\n\n- Alias-associated therapy:", therapy_names_list)

## Filter out unspecific abbreviations to avoid unspecific string matches, e.g.,
- Treatment "RT" (radiation therapy) matching RT-QPCR
- "Lysine" matching the amino acid lysine
- "Immunological" matching strings such as "immunological profiling"
- "Inhibitor", as this can be anything

In [None]:
# List the therapies that carry at least one alias of char_limit characters or fewer.
CIVIC_therapies_filtered = CIVIC_therapies.copy()
# Maximum alias length that is still treated as "short"
char_limit = 4


def _short_aliases(aliases):
    """Return the string aliases with at most char_limit characters ([] for non-lists)."""
    if not isinstance(aliases, list):
        return []
    return [alias for alias in aliases if isinstance(alias, str) and len(alias) <= char_limit]


CIVIC_therapies_filtered["filteredAliases"] = CIVIC_therapies_filtered["therapyAliases"].apply(_short_aliases)

# Keep only rows that actually have a short alias and show those aliases as text.
has_short_alias = CIVIC_therapies_filtered["filteredAliases"].apply(lambda x: len(x) > 0)
short_aliases_df = CIVIC_therapies_filtered[has_short_alias].copy()
short_aliases_df["filteredAliases"] = short_aliases_df["filteredAliases"].apply(', '.join)

# One row per therapy name
short_treamtnet_df = short_aliases_df.drop_duplicates(subset=["name"]).reset_index(drop=True)
print("\nFiltered Therapies with Names or Aliases ≤", char_limit, "characters:")
print(short_treamtnet_df.head(10)[["name","filteredAliases"]])
print("Number of short therapies:", len(short_treamtnet_df))

In [None]:
# Replace the alias lists by versions without aliases of char_limit characters
# or fewer, keep the originals for comparison, and export both views.


def _long_aliases(aliases):
    """Return the string aliases longer than char_limit characters ([] for non-lists)."""
    if not isinstance(aliases, list):
        return []
    return [alias for alias in aliases if isinstance(alias, str) and len(alias) > char_limit]


def _alias_count(aliases):
    """Return the number of aliases, counting anything that is not a list as 0."""
    return len(aliases) if isinstance(aliases, list) else 0


CIVIC_therapies_filtered["therapyAliases_new"] = CIVIC_therapies_filtered["therapyAliases"].apply(_long_aliases)
CIVIC_therapies_filtered["original_alias_count"] = CIVIC_therapies_filtered["therapyAliases"].apply(_alias_count)
CIVIC_therapies_filtered["new_alias_count"] = CIVIC_therapies_filtered["therapyAliases_new"].apply(len)

# The cleaned list takes over the "therapyAliases" name; the raw list moves to "therapyAliases_old".
CIVIC_therapies_filtered = CIVIC_therapies_filtered.rename(
    columns={"therapyAliases": "therapyAliases_old", "therapyAliases_new": "therapyAliases"}
)
CIVIC_therapies_filtered.to_csv("CIVIC_therapies_filtered.csv", index=False)
print("Length of filtered CIVIC treamtment dataset:", len(CIVIC_therapies_filtered))

# Therapies that lost at least one alias, exported for manual checking
specific_names = short_treamtnet_df["name"].tolist()
lost_alias_mask = CIVIC_therapies_filtered["name"].isin(specific_names)
CIVIC_therapies_removed = CIVIC_therapies_filtered[lost_alias_mask].copy()
print("Length of removed dataset:", len(CIVIC_therapies_removed))
print(CIVIC_therapies_removed[["name","therapyAliases_old","therapyAliases","original_alias_count","new_alias_count"]])
CIVIC_therapies_removed.to_csv("check_removed_aliases.csv", index=False)


In [None]:
######## Drop therapies whose name is a generic word (noise), e.g., inhibitor, lysine

# Lower-case therapy names to remove (compared against the whole name)
name_removal = ["lysine", "inhibitor"]

number_initial_treatments = len(CIVIC_therapies_filtered)
CIVIC_therapies_filtered = CIVIC_therapies_filtered.copy()

# Keep every row whose lower-cased name is not in the removal list.
rows_before = len(CIVIC_therapies_filtered)
is_noise_name = CIVIC_therapies_filtered["name"].str.lower().isin(name_removal)
CIVIC_therapies_filtered = CIVIC_therapies_filtered[~is_noise_name].copy()
rows_after = len(CIVIC_therapies_filtered)

number_dropped_treatments = rows_before - rows_after
number_filtered_treatments = len(CIVIC_therapies_filtered)

CIVIC_therapies_filtered.to_csv("CIVIC_therapies_filtered.csv", index=False)

print("Summary of Filtering Process:")
print(f"Initial number of treatments: {number_initial_treatments}")
print(f"Number of dropped treatments: {number_dropped_treatments}")
print(f"Number of filtered treatments: {number_filtered_treatments}")

# Report whether kept + dropped equals the initial row count.
counts_add_up = number_initial_treatments == (number_filtered_treatments + number_dropped_treatments)
print(
    "--> The counts add up correctly! (Filtered + Dropped = Initial numberss)"
    if counts_add_up
    else "--> The counts do NOT add up! There might be an issue in filtering."
)

### Add information from NCI Thesaurus API

In [None]:
##### Add hierarchy and information from NCI Thesaurus ####
# Base URL for the NCI EVS REST API
base_url = 'https://api-evsrest.nci.nih.gov/api/v1'

# Seconds to wait for one EVS response; requests does not time out by default.
NCIT_REQUEST_TIMEOUT = 30


def _ncit_get_json(path):
    """Return the decoded JSON of ``GET {base_url}/concept/ncit/{path}``.

    Returns ``None`` when the request fails at the network level, times out,
    or answers with a status code other than 200, so that one bad request
    does not abort a loop over many concepts.
    """
    url = f'{base_url}/concept/ncit/{path}'
    try:
        response = requests.get(url, headers={'accept': 'application/json'}, timeout=NCIT_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Warning: NCIt request failed for '{path}': {exc}")
        return None
    if response.status_code != 200:
        return None
    return response.json()


def _format_association(assoc):
    """Render one association record as ``"<type>: <relatedName> (Code: <relatedCode>)"``."""
    return (
        f"{assoc.get('type', 'Unknown Type')}: "
        f"{assoc.get('relatedName', 'Unknown Name')} "
        f"(Code: {assoc.get('relatedCode', 'N/A')})"
    )


# Function to get concept definitions
def get_concept_definitions(concept_code):
    """Return the definition texts of a concept, or [] when the lookup fails."""
    payload = _ncit_get_json(f'{concept_code}?include=definitions')
    if payload is None:
        return []
    return [d.get('definition', 'No Definition Found') for d in payload.get('definitions', [])]


# Function to get concept associations
def get_concept_associations(concept_code):
    """Return the formatted associations of a concept, or [] when the lookup fails."""
    payload = _ncit_get_json(f'{concept_code}/associations')
    if payload is None:
        return []
    return [_format_association(assoc) for assoc in payload]


# Function to get inverse associations
def get_concept_inverse_associations(concept_code):
    """Return the formatted inverse associations of a concept, or [] when the lookup fails."""
    payload = _ncit_get_json(f'{concept_code}/inverseAssociations')
    if payload is None:
        return []
    return [_format_association(inv_assoc) for inv_assoc in payload]


# Function to walk up the parent chain until it ends or repeats
def get_parent_hierarchy(concept_code):
    """Return ``[(code, name), ...]`` for the chain of first parents of a concept.

    Only the first parent listed at each level is followed. The walk stops
    when a concept has no parents, when a request fails, or when a code that
    was already visited comes up again (which protects against cycles).
    """
    hierarchy = []
    seen_codes = set()

    while concept_code:
        if concept_code in seen_codes:  # Stop if a duplicate parent is found
            break
        seen_codes.add(concept_code)

        parents = _ncit_get_json(f'{concept_code}/parents')
        if not parents:  # failed request or no parents left
            break
        parent = parents[0]  # Take the first parent only
        hierarchy.append((parent.get('code'), parent.get('name')))
        concept_code = parent.get('code')  # Move to next parent
    return hierarchy

# Enrich the filtered CIViC therapies with NCIt definitions, associations and
# parent hierarchy, then save the merged table.

# Set output directory and file names
output_directory = os.getcwd()  # Uses the current working directory
input_filepath = os.path.join(output_directory, "CIVIC_therapies_filtered.csv")
output_filepath = os.path.join(output_directory, "CIVIC_therapies_with_NCIT_information.csv")

# Read input CSV
CIVIC_ncit_df = pd.read_csv(input_filepath)
CIVIC_ncit_df_processed = CIVIC_ncit_df.copy()
ncit_data = []
# ncitId values already looked up. Each id is fetched once, and it appears once
# on the right-hand side of the merge, so the left merge cannot duplicate rows.
seen_ncit_ids = set()

for _, row in tqdm(CIVIC_ncit_df.iterrows(), total=len(CIVIC_ncit_df), desc="Processing Rows", unit="row"):
    raw_ncit_id = row['ncitId']

    # A missing id is NaN after read_csv. str(NaN) is the non-empty text "nan",
    # so missing values have to be detected before the string conversion.
    if pd.isna(raw_ncit_id):
        continue
    concept_code = str(raw_ncit_id).strip()
    if not concept_code or raw_ncit_id in seen_ncit_ids:
        continue
    seen_ncit_ids.add(raw_ncit_id)

    definitions = get_concept_definitions(concept_code)
    associations = get_concept_associations(concept_code)
    inverse_associations = get_concept_inverse_associations(concept_code)
    parent_hierarchy = get_parent_hierarchy(concept_code)
    # parent_1 is the direct parent, parent_2 its parent, and so on.
    parent_dict = {f'parent_{level}': name for level, (_code, name) in enumerate(parent_hierarchy, start=1)}
    data = {
        # Keep the value exactly as it appears in the input so the merge key matches.
        "ncitId": raw_ncit_id,
        "definitions": " | ".join(definitions),
        "associations": " | ".join(associations),
        "inverse_associations": " | ".join(inverse_associations),
    }
    data.update(parent_dict)  # Add parent hierarchy to the data
    ncit_data.append(data)

ncit_df = pd.DataFrame(ncit_data)
if ncit_data:
    CIVIC_ncit_df_processed = CIVIC_ncit_df_processed.merge(ncit_df, on="ncitId", how="left")
else:
    # Without any lookups ncit_df has no "ncitId" column and cannot be merged.
    print("Warning: no NCIt identifiers found; saving the table without NCIt information.")
CIVIC_ncit_df_processed.to_csv(output_filepath, index=False)
len_CIVIC_ncit_df_processed=len(CIVIC_ncit_df_processed)

print(f"\nData saved successfully to: {output_filepath}")
print("Length of dataset:",len_CIVIC_ncit_df_processed)

In [None]:
#### Defining the final parent ###
# NOTE: a later cell redefines get_final_parent with a fallback to the therapy
# name and writes to the same CSV file.

# Prefer the NCIt-enriched table from this session; otherwise reload it from disk.
if "CIVIC_ncit_df_processed" not in globals():
    output_filepath = os.path.join(output_directory, "CIVIC_therapies_with_NCIT_information.csv")
    CIVIC_ncit_df_finalparent = pd.read_csv(output_filepath)
    print("Loaded from CSV.")
else:
    CIVIC_ncit_df_finalparent = CIVIC_ncit_df_processed.copy()
    print("Using existing dataset in memory.")

# Print available columns
print("Available columns:", CIVIC_ncit_df_finalparent.columns.tolist())

# Collect the parent_<n> columns in numeric order (parent_1, parent_2, ...).
parent_columns = sorted(
    (col for col in CIVIC_ncit_df_finalparent.columns if col.startswith("parent_")),
    key=lambda col: int(col.split("_")[1]),
)

# Parent names that must not be chosen as final parent (exact matches)
restricted_parents = {
    "Drug, Food, Chemical or Biomedical Material",
    "Pharmacologic Substance",
    "Drug or Chemical by Structure",
    "Organic Chemical",
    "Therapeutic Procedure",
    "Clinical Intervention or Procedure",
    "Clinical or Research Activity",
    "Activity",
    "Antineoplastic Agent",
    "Cancer Diagnostic or Therapeutic Procedure",
    "Cancer Therapeutic Procedure",
    "Infusion Procedure",
    "Adjuvant Therapy"
}


def get_final_parent(row):
    """Return the highest-numbered acceptable parent of a row, or None.

    The parent columns are scanned from parent_n down to parent_1. Empty
    cells, restricted names and names starting with "Retired Concept" are
    skipped.
    """
    for col in reversed(parent_columns):
        parent = row[col]
        if pd.isna(parent):
            continue
        if parent in restricted_parents or parent.startswith("Retired Concept"):
            continue
        return parent
    return None


CIVIC_ncit_df_finalparent["final_parent"] = CIVIC_ncit_df_finalparent.apply(get_final_parent, axis=1)
final_output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent.csv")
CIVIC_ncit_df_finalparent.to_csv(final_output_filepath, index=False)
print("\nUpdated dataset with 'final_parent' column saved to:", final_output_filepath)

In [None]:
#### Defining the final parent ###
# Start from the NCIt-enriched table: the in-memory copy if it exists,
# otherwise the CSV saved by the enrichment step.
if "CIVIC_ncit_df_processed" in globals():
    CIVIC_ncit_df_finalparent = CIVIC_ncit_df_processed.copy()
    print("Using existing dataset in memory.")
else:
    output_filepath = os.path.join(output_directory, "CIVIC_therapies_with_NCIT_information.csv")
    CIVIC_ncit_df_finalparent = pd.read_csv(output_filepath)
    print("Loaded from CSV.")

# Print available columns
print("Available columns:", CIVIC_ncit_df_finalparent.columns.tolist())


def _parent_level(column_name):
    """Return the numeric level encoded in a ``parent_<n>`` column name."""
    return int(column_name.split("_")[1])


# Parent columns ordered by level (parent_1, parent_2, ...)
parent_columns = [col for col in CIVIC_ncit_df_finalparent.columns if col.startswith("parent_")]
parent_columns.sort(key=_parent_level)

# Names that are never accepted as final parent (exact matches)
restricted_parents = {
    "Drug, Food, Chemical or Biomedical Material",
    "Pharmacologic Substance",
    "Drug or Chemical by Structure",
    "Organic Chemical",
    "Therapeutic Procedure",
    "Clinical Intervention or Procedure",
    "Clinical or Research Activity",
    "Activity",
    "Antineoplastic Agent",
    "Cancer Diagnostic or Therapeutic Procedure",
    "Cancer Therapeutic Procedure",
    "Infusion Procedure",
    "Adjuvant Therapy",
    "Preventive Intervention",
    "Local Therapy",
    "Cancer Prevention"
}


def get_final_parent(row):
    """Return the final parent of a row, falling back to the therapy name.

    The parent columns are scanned from the highest level down to parent_1.
    The first value that is present, not restricted and does not start with
    "Retired Concept" is returned. When no such value exists, the row's
    "name" is returned (None if that is missing too).
    """
    candidates = (row[col] for col in reversed(parent_columns))
    chosen = next(
        (
            parent
            for parent in candidates
            if pd.notna(parent)
            and parent not in restricted_parents
            and not parent.startswith("Retired Concept")
        ),
        None,
    )
    if chosen is not None:
        return chosen

    # Every parent was restricted or missing: use the therapy's own name.
    return row["name"] if pd.notna(row["name"]) else None


# Compute the "final_parent" column row by row
CIVIC_ncit_df_finalparent["final_parent"] = CIVIC_ncit_df_finalparent.apply(get_final_parent, axis=1)

# Save updated DataFrame to CSV
final_output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent.csv")
CIVIC_ncit_df_finalparent.to_csv(final_output_filepath, index=False)

# Print confirmation message
print("\nUpdated dataset with 'final_parent' column saved to:", final_output_filepath)

In [None]:
# Report how many therapies received a final parent and how often each
# final parent occurs, then save the table with a per-row frequency column.
final_output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent.csv")
CIVIC_ncit_df_finalparent = pd.read_csv(final_output_filepath)
total_rows = len(CIVIC_ncit_df_finalparent)

# Count statistics
empty_final_parent_count = CIVIC_ncit_df_finalparent["final_parent"].isna().sum()
empty_final_parent_percentage = (empty_final_parent_count / total_rows) * 100
non_empty_final_parent_count = total_rows - empty_final_parent_count
non_empty_final_parent_percentage = (non_empty_final_parent_count / total_rows) * 100
final_parent_counts = CIVIC_ncit_df_finalparent["final_parent"].value_counts()

print(f"Total rows: {total_rows}")
print(f"Empty 'final_parent' count: {empty_final_parent_count} ({empty_final_parent_percentage:.2f}%)")
print(f"Non-empty 'final_parent' count: {non_empty_final_parent_count} ({non_empty_final_parent_percentage:.2f}%)\n")
print("Final Parent Counts:")
# value_counts is sorted by frequency; show the 50 most frequent parents,
# starting with the most frequent one.
print(final_parent_counts.head(50))

# Convert final parent counts to a DataFrame
final_parent_counts_df = final_parent_counts.reset_index()
final_parent_counts_df.columns = ["final_parent", "count"]
# Attach to every row how often its final parent occurs in the whole table.
CIVIC_ncit_df_finalparent["final_parent_count"] = CIVIC_ncit_df_finalparent["final_parent"].map(final_parent_counts)
final_output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent_updated.csv")
CIVIC_ncit_df_finalparent.to_csv(final_output_filepath, index=False)
num_distinct_final_parents = CIVIC_ncit_df_finalparent["final_parent"].nunique()
print(f"Number of distinct final parents: {num_distinct_final_parents}")

In [None]:
###### Treatment parent categorization
# Work on a copy so that the category column added below does not end up in
# CIVIC_ncit_df_finalparent itself.
CIVIC_ncit_df_finalparent_treatmentcategory = CIVIC_ncit_df_finalparent.copy()

# Treatment category -> set of "final_parent" values belonging to it (case-sensitive)
treatment_mapping = {
    "Chemotherapy": {
        "Chemotherapy",
        "Cytotoxic Chemotherapeutic Agent",
        "Adjuvant Chemotherapy",
        "Hydrocarbon",
        "Hyperthermic Intraperitoneal Chemotherapy",
        "Chemotherapy Regimen or Agent Combination",
    },
    "Targeted therapy": {
        "Targeted Therapy Agent",
        "Signal Transduction Inhibitor",
        "FGF/VEGF Receptor Tyrosine Kinase Inhibitor, PD173074",
        "Enzyme Inhibitor",
        "Angiogenesis Inhibitor",
        "VEGF/VEGFR Inhibitors",
        "Apoptotic Pathway-targeting Antineoplastic Agent",
    },
    "Biological": {
        "Antineoplastic Biological Agent",
        "Biological Therapy",
        "Biological Agent",
    },
    "Hormone therapy": {
        "Hormone Therapy Agent",
        "Antineoplastic Hormonal/Endocrine Agent",
        "Hormone Therapy",
    },
    "Immunotherapy": {
        "Immunotherapeutic Agent",
        "Antineoplastic Immunomodulating Agent",
    },
    "Anti-infective therapy": {"Anti-Infective Agent"},
    "Agent Affecting Nervous System": {"Agent Affecting Nervous System"},
    "Radiation therapy": {
        "Radiation Therapy",
        "Radiation Ionizing Radiotherapy",
    },
}


def categorize_treatment(final_parent):
    """Return the treatment category of a final parent.

    Missing values and parents that appear in no category of
    treatment_mapping are reported as "Other therapy".
    """
    if pd.isna(final_parent):
        return "Other therapy"
    # The first category (in dictionary order) that lists this parent wins.
    return next(
        (
            category
            for category, parent_values in treatment_mapping.items()
            if final_parent in parent_values
        ),
        "Other therapy",
    )

# Assign a treatment category to every therapy, save the table and print
# how many therapies fall into each category.
category_per_row = CIVIC_ncit_df_finalparent_treatmentcategory["final_parent"].apply(categorize_treatment)
CIVIC_ncit_df_finalparent_treatmentcategory["parent_treatment_category"] = category_per_row

output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent_treatmentcategory.csv")
CIVIC_ncit_df_finalparent_treatmentcategory.to_csv(output_filepath, index=False)
print(f"\nUpdated dataset with 'parent_treatment_category' column saved to: {output_filepath}")

# Frequency table, most common category first
treatment_category_counts = (
    CIVIC_ncit_df_finalparent_treatmentcategory["parent_treatment_category"].value_counts()
)
treatment_category_counts_df = treatment_category_counts.reset_index()
treatment_category_counts_df.columns = ["Treatment Category", "Count"]

print("\n=== Summary of Treatment Category Counts ===")
print(treatment_category_counts_df.to_string(index=False))

In [None]:
# Bar chart of the number of CIViC therapies per parent treatment category,
# rebuilt from the saved CSV file.
final_output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent_treatmentcategory.csv")
CIVIC_ncit_df_finalparent_treatmentcategory = pd.read_csv(final_output_filepath)

treatment_category_counts = (
    CIVIC_ncit_df_finalparent_treatmentcategory["parent_treatment_category"].value_counts()
)
treatment_category_counts_df = treatment_category_counts.reset_index()
treatment_category_counts_df.columns = ["Treatment Category", "Count"]

plt.figure(figsize=(10, 6))
plt.bar(
    treatment_category_counts_df["Treatment Category"],
    treatment_category_counts_df["Count"],
    color="skyblue",
)
plt.xlabel("Treatment parent category")
plt.ylabel("Count")
plt.title("Distribution of CIVIC treatment parent categories")
# Rotate the category labels so they do not overlap.
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Break the "Other therapy" category down by final parent.
is_other_therapy = (
    CIVIC_ncit_df_finalparent_treatmentcategory["parent_treatment_category"] == "Other therapy"
)
other_therapy_parents = CIVIC_ncit_df_finalparent_treatmentcategory[is_other_therapy]["final_parent"]

other_therapy_counts = other_therapy_parents.value_counts().reset_index()
other_therapy_counts.columns = ["final_parent", "Count"]
other_therapy_counts = other_therapy_counts.sort_values(by="Count", ascending=False)

# Rows without a final parent are not part of value_counts and so not of this total.
total_other_therapy_count = other_therapy_counts["Count"].sum()
print("\n=== Unique 'final_parent' Values for 'Other therapy' (Sorted by Frequency) ===")
print(other_therapy_counts.to_string(index=False))
print(f"\nTotal occurrences of 'Other therapy' final parents: {total_other_therapy_count}")

In [None]:
###### List the therapies that belong to one chosen final parent

final_parent = "Agent Targeting Cancer Metabolism"
final_output_filepath = os.path.join(output_directory, "CIVIC_ncit_df_finalparent.csv")
CIVIC_ncit_df_finalparent = pd.read_csv(final_output_filepath)

has_final_parent = CIVIC_ncit_df_finalparent["final_parent"] == final_parent
filtered_df = CIVIC_ncit_df_finalparent[has_final_parent]
if not filtered_df.empty:
    name_list = filtered_df["name"].dropna().tolist()
    print(f"\nEntries for final parent '{final_parent}':")
    for name in name_list:
        print("-", name)
else:
    print(f"No entries found for final parent: {final_parent}")

# Show the categorised table for reference.
print(CIVIC_ncit_df_finalparent_treatmentcategory)

## =====================================================

# String matching

In [None]:
# Article table to annotate (a copy, so full_df itself stays untouched)
treatment_mapping_df = full_df.copy()
print(f"Total number of rows of dataset to process: {len(treatment_mapping_df):,}")

# Therapy table: read it from disk only when this session has not built it.
if "CIVIC_therapies_filtered" not in globals():
    CIVIC_therapies_filtered = pd.read_csv("CIVIC_therapies_filtered.csv")
    print("CIVIC_therapies loaded from files")
else:
    CIVIC_therapies_filtered = CIVIC_therapies_filtered.copy()
    print("CIVIC_therapies_filtered loaded from globals")
print(f"Total number of CIVIC therapies: {len(CIVIC_therapies_filtered):,}")

In [None]:
# Mapping cancer therapy names and aliases onto scientific papers to detect mentions
# then summarizing how often therapies are found in the papers

start_time = time.time()
# Alias lists loaded from CSV arrive as text. ast.literal_eval parses Python
# literals only, whereas eval would execute whatever code the file contains.
CIVIC_therapies_filtered["therapyAliases"] = CIVIC_therapies_filtered["therapyAliases"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
# Lower-case the searched text once; missing values become the text "nan".
treatment_mapping_df["PaperTitle"] = treatment_mapping_df["PaperTitle"].astype(str).str.lower()
treatment_mapping_df["Abstract"] = treatment_mapping_df["Abstract"].astype(str).str.lower()
def find_matched_treatments(text, therapy_name, aliases):
    """Return the therapy terms mentioned in ``text`` as whole terms.

    Args:
        text: Text to search.
        therapy_name: Primary therapy name.
        aliases: Iterable of alternative names for the same therapy.

    Returns:
        A list with ``therapy_name`` (if found) followed by every distinct
        alias found, each spelled as passed in, or ``None`` when nothing
        matched. Matching is case-insensitive.
    """
    def _term_pattern(term):
        # Lookarounds are used in place of \b because \b needs a word
        # character on one side. A term that starts or ends with punctuation,
        # e.g. "FEC (regimen)", could therefore never match next to a space.
        return re.compile(rf"(?<!\w){re.escape(term.lower())}(?!\w)", re.IGNORECASE)

    matches = []
    if _term_pattern(therapy_name).search(text):
        matches.append(therapy_name)
    # dict.fromkeys drops repeated aliases while keeping their first-seen order.
    for alias in dict.fromkeys(aliases):
        if _term_pattern(alias).search(text):
            matches.append(alias)
    return matches if matches else None

# Column names present before any therapy indicator columns are added
original_columns = set(treatment_mapping_df.columns)
new_columns_dict = {}
treatment_matches_list = []
for index, row in tqdm(
    CIVIC_therapies_filtered.iterrows(),
    total=len(CIVIC_therapies_filtered),
    desc="Processing Therapies",
):
    therapy_name = row["name"]
    aliases = row["therapyAliases"]
    if not isinstance(aliases, list):
        aliases = []
    # One entry per article: the matched terms, or None when nothing matched.
    matches = treatment_mapping_df.apply(
        lambda x: find_matched_treatments(x["PaperTitle"] + " " + x["Abstract"], therapy_name, aliases),
        axis=1,
    )
    # 0/1 indicator column for this therapy
    new_columns_dict[therapy_name] = matches.apply(lambda x: 1 if x else 0)
    treatment_matches_list.append(matches)

# Append all indicator columns in one step.
new_columns_df = pd.DataFrame(new_columns_dict)
treatment_mapping_df = pd.concat([treatment_mapping_df, new_columns_df], axis=1)

# Per article, gather the match lists of all therapies; articles without any
# match get None instead of an empty list.
all_matches_df = pd.concat(treatment_matches_list, axis=1)
treatment_mapping_df["Treatment_matches"] = all_matches_df.apply(
    lambda row: [match for match in row if match is not None], axis=1
)
treatment_mapping_df["Treatment_matches"] = treatment_mapping_df["Treatment_matches"].apply(
    lambda x: x if len(x) > 0 else None
)

# The therapy columns lie between the last original column and "Treatment_matches".
if "Sum_Gene_Mentions" in treatment_mapping_df.columns:
    start_index = treatment_mapping_df.columns.get_loc("Sum_Gene_Mentions") + 1
else:
    start_index = len(original_columns)
end_index = treatment_mapping_df.columns.get_loc("Treatment_matches")
therapy_columns = (
    treatment_mapping_df.iloc[:, start_index:end_index].select_dtypes(include=['number']).columns
)
treatment_mapping_df["Sum_treatments"] = treatment_mapping_df[therapy_columns].sum(axis=1)

output_file_path = os.path.join(output_directory, "treatment_mapping_with_matches.csv")
treatment_mapping_df.to_csv(output_file_path, index=False)
print(f"File successfully saved at: {output_file_path}")

# Runtime bookkeeping
end_time = time.time()
total_time = end_time - start_time
print(f"Total runtime: {total_time:.2f} seconds")
runtime_log_path = os.path.join(output_directory, "running_time_treatment_mapping.txt")
with open(runtime_log_path, "w") as f:
    f.write(f"Total execution time: {total_time:.2f} seconds\n")

# Articles without and with at least one detected treatment
num_zero_treatments = (treatment_mapping_df["Sum_treatments"] == 0).sum()
num_nonzero_treatments = (treatment_mapping_df["Sum_treatments"] >= 1).sum()

print("\n===== Summary =====")
print(f"Rows with Sum_treatments == 0: {num_zero_treatments:,}")
print(f"Rows with Sum_treatments >= 1: {num_nonzero_treatments:,}")
print(f"Total sums: {(num_zero_treatments + num_nonzero_treatments):,}")
print(f"Len of dataset: {len(treatment_mapping_df):,}")

In [None]:
# Recompute and print how many articles have no / at least one detected treatment.
sum_treatments = treatment_mapping_df["Sum_treatments"]
num_zero_treatments = sum_treatments.eq(0).sum()
num_nonzero_treatments = sum_treatments.ge(1).sum()

print("\n===== Summary =====")
print(f"Rows with Sum_treatments == 0: {num_zero_treatments:,}")
print(f"Rows with Sum_treatments >= 1: {num_nonzero_treatments:,}")
print(f"Total sums: {(num_zero_treatments + num_nonzero_treatments):,}")
print(f"Len of dataset: {len(treatment_mapping_df):,}")

In [None]:
#### Write the treatment-detection summary to a text file and echo it
# Shares of articles without / with detected treatments
total_rows=len(treatment_mapping_df)
zero_treatment_percentage = (num_zero_treatments / total_rows) * 100
nonzero_treatment_percentage = (num_nonzero_treatments / total_rows) * 100

# Assemble the summary line by line; the empty first and last entries give
# the leading and trailing newline.
summary_lines = [
    "",
    "===== Summary =====",
    f"Rows with Sum_treatments == 0, i.e., no treatments: {num_zero_treatments:,} ({zero_treatment_percentage:.2f}%)",
    f"Rows with Sum_treatments >= 1, i.e., detected treatments: {num_nonzero_treatments:,} ({nonzero_treatment_percentage:.2f}%)",
    f"Total sums: {num_zero_treatments + num_nonzero_treatments:,}",
    f"Len of dataset: {total_rows:,}",
    "",
]
summary = "\n".join(summary_lines)

filename_therapy_categorization = "summary_runtime_therapy_categorization.txt"
with open(filename_therapy_categorization, "w") as file:
    file.write(summary)
# Read the file back to show what was written.
with open(filename_therapy_categorization, "r") as file:
    print(file.read())

In [None]:
# Reload the mapped articles and export those with at least one detected treatment.
# NOTE: this rebinds `treatment_mapping`, which an earlier cell uses for the
# category dictionary read by categorize_treatment.
treatment_mapping = "treatment_mapping_with_matches.csv"
treatment_mapping_df = pd.read_csv(treatment_mapping)

# Keep articles where Sum_treatments >= 1
has_treatment = treatment_mapping_df["Sum_treatments"] >= 1
filtered_treatment_mapping_df = treatment_mapping_df[has_treatment]

# Number of articles with at least one treatment
num_nonzero_treatments = len(filtered_treatment_mapping_df)
print(f"\nNumber of rows where 'Sum_treatments' >= 1: {num_nonzero_treatments:,}")

preview_columns = ["PaperId","PaperTitle", "Abstract", "Cisplatin", "Sum_treatments", "Treatment_matches"]
print(filtered_treatment_mapping_df[preview_columns])

filtered_csv_name = "filtered_treatment_mapping_with_matches.csv"
filtered_treatment_mapping_df.to_csv(filtered_csv_name, index=False)
print(f"\nFiltered DataFrame saved as '{filtered_csv_name}'")

In [None]:
# Compare the initial article table with the mapped one and count therapies.
print("\nNumber of articles in the initial dataset: {:,}".format(len(full_df)))
print("Number of columns of initial dataset: {:,}".format(len(full_df.columns)))
columns_list = full_df.columns.tolist()
print(columns_list)

print("\nNumber of articles in the mapped dataset:{:,}".format(len(treatment_mapping_df)))
print("Number of columns of mapped dataset:{:,}".format(len(treatment_mapping_df.columns)))
columns_list_mapped = treatment_mapping_df.columns.tolist()
print(columns_list_mapped)

#### Total treatments
# Every column added by the mapping step (therapy indicators plus bookkeeping)
total_treatments = list(set(treatment_mapping_df.columns) - set(full_df.columns))
# "Treatment_matches" and "Sum_treatments" are bookkeeping columns, not
# therapies. "Sum_treatments" is numeric and would otherwise be counted as a
# therapy with at least one match.
bookkeeping_columns = {"Treatment_matches", "Sum_treatments"}
treatment_columns = [col for col in total_treatments if col not in bookkeeping_columns]
# is_numeric_dtype covers every integer/float width, unlike `dtype in [int, float]`.
valid_treatment_cols = [
    col for col in treatment_columns
    if pd.api.types.is_numeric_dtype(treatment_mapping_df[col])
]
# Therapies mentioned in at least one article
valid_treatments = int((treatment_mapping_df[valid_treatment_cols].sum() >= 1).sum())

print(f"\nNumber of treatment columns with at least a sum of 1: {valid_treatments:,}")
print("\nNumber of total treatments from CIVIC", len(treatment_columns)) #Removed Lysine etc.
# NOTE(review): 565 is hard-coded; confirm it still matches the downloaded CIViC table.
print("Total number of treaments in CIVIC database: 565")
print("total number of treatments in publications (at least one match):",valid_treatments)

## Figure creation

In [None]:
# Summarize and export the top 30 most mentioned treatments
if "Sum_Gene_Mentions" in treatment_mapping_df.columns:
    start_index = treatment_mapping_df.columns.get_loc("Sum_Gene_Mentions") + 1
else:
    # Same fallback as the mapping step: the therapy columns start right after
    # the original article columns. Starting at 0 would add up text and
    # metadata columns as if they were therapies.
    start_index = len(full_df.columns)
end_index = treatment_mapping_df.columns.get_loc("Treatment_matches")
# Only numeric indicator columns are summed, as in the mapping step.
treatment_counts = (
    treatment_mapping_df.iloc[:, start_index:end_index]
    .select_dtypes(include=['number'])
    .sum()
    .sort_values(ascending=False)
)
top_30_treatments = treatment_counts.head(30)
# Percentages are relative to all treatment mentions, not only the top 30.
total_treatment_mentions = treatment_counts.sum()
treatment_summary = pd.DataFrame({
    "Treatment": top_30_treatments.index,
    "Count": top_30_treatments.values,
    "Percentage": (top_30_treatments.values / total_treatment_mentions * 100).round(2),
})
print("\nTop 30 most mentioned treatments:")
print(treatment_summary.to_string(index=False, justify='left'))
treatment_summary.to_csv(f"{output_directory}/Top_30_treatments_summary.csv", index=False)

In [None]:
# Plot the top 30 most mentioned treatments as a bar chart
if "Sum_Gene_Mentions" in treatment_mapping_df.columns:
    start_index = treatment_mapping_df.columns.get_loc("Sum_Gene_Mentions") + 1
else:
    # Same fallback as the mapping step: the therapy columns start right after
    # the original article columns. Starting at 0 would add up text and
    # metadata columns as if they were therapies.
    start_index = len(full_df.columns)
end_index = treatment_mapping_df.columns.get_loc("Treatment_matches")
# Only numeric indicator columns are summed, as in the mapping step.
treatment_counts = (
    treatment_mapping_df.iloc[:, start_index:end_index]
    .select_dtypes(include=['number'])
    .sum()
    .sort_values(ascending=False)
)
# NOTE: the variable keeps its historical name but holds the top 30 entries.
top_20_treatments = treatment_counts.head(30)
# Darkest blue for the most mentioned treatment, fading towards the right
colors = plt.cm.Blues(np.linspace(1, 0.5, len(top_20_treatments)))
# str() guards against column labels that are not strings.
formatted_labels = [str(name).capitalize() for name in top_20_treatments.index]
plt.figure(figsize=(12, 6))
bars = plt.bar(formatted_labels, top_20_treatments.values, color=colors, edgecolor='black')
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel("Treatment name", fontsize=12)
plt.ylabel("Number of mentions", fontsize=12)
# Thousands separators on the y axis
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{int(x):,}"))
total_rows = len(treatment_mapping_df)
plt.title(f"Top 30 most mentioned treatments in {num_nonzero_treatments:,} publications", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Summarize mentions per parent treatment category
metadata_columns = ["PaperTitle", "Abstract", "Sum_treatments", "Treatment_matches"]
therapy_columns = [col for col in treatment_mapping_df.columns if col not in metadata_columns]
treatment_to_parent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["parent_treatment_category"].to_dict()

# Add each mapped treatment's column sum to its parent category;
# columns without a parent mapping are ignored.
parent_counts = {}
for treatment in therapy_columns:
    if treatment not in treatment_to_parent_mapping:
        continue
    parent_category = treatment_to_parent_mapping[treatment]
    treatment_sum = treatment_mapping_df[treatment].sum()
    parent_counts[parent_category] = parent_counts.get(parent_category, 0) + treatment_sum

parent_counts_df = pd.DataFrame(list(parent_counts.items()), columns=["Parent Category", "Total Mentions"])
total_mentions = parent_counts_df["Total Mentions"].sum()
# Percentage of all mapped mentions that fall under each parent category.
parent_counts_df["Percentage"] = (parent_counts_df["Total Mentions"] / total_mentions * 100).round(2)
parent_counts_df = parent_counts_df.sort_values(by="Total Mentions", ascending=False)

print("\nTotal mentions per parent treatment category:")
print(parent_counts_df.to_string(index=False, justify='left'))

In [None]:
# Plot total mentions per parent treatment category
metadata_columns = ["PaperTitle", "Abstract", "Sum_treatments", "Treatment_matches"]
therapy_columns = [col for col in treatment_mapping_df.columns if col not in metadata_columns]
treatment_to_parent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["parent_treatment_category"].to_dict()

# Total mentions per parent category, accumulated over every mapped treatment column.
parent_counts = {}
for treatment in therapy_columns:
    if treatment in treatment_to_parent_mapping:
        parent_category = treatment_to_parent_mapping[treatment]
        treatment_sum = treatment_mapping_df[treatment].sum()
        if parent_category not in parent_counts:
            parent_counts[parent_category] = 0
        parent_counts[parent_category] += treatment_sum

parent_counts_df = pd.DataFrame(list(parent_counts.items()), columns=["Parent Category", "Total Mentions"])
parent_counts_df = parent_counts_df.sort_values(by="Total Mentions", ascending=False)

plt.figure(figsize=(12, 6))
# Dark-to-light blue gradient, one shade per category bar.
colors = plt.cm.Blues(np.linspace(1, 0.5, len(parent_counts_df)))
bars = plt.bar(parent_counts_df["Parent Category"], parent_counts_df["Total Mentions"], color=colors, edgecolor="black")
plt.title("Total mentions per parent treatment category", fontsize=14)
plt.xlabel("Parent treatment category", fontsize=12)
plt.ylabel("Number of mentions", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{int(x):,}"))
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Plot top 20 most mentioned final parent treatment categories
# Both tables are re-read from CSV files in the current working directory.
treatment_mapping_df = pd.read_csv("treatment_mapping_with_matches.csv")
CIVIC_ncit_df_finalparent_treatmentcategory = pd.read_csv("CIVIC_ncit_df_finalparent_treatmentcategory.csv")
metadata_columns = ["PaperTitle", "Abstract", "Sum_treatments", "Treatment_matches"]
therapy_columns = [col for col in treatment_mapping_df.columns if col not in metadata_columns]
treatment_to_finalparent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["final_parent"].to_dict()

# Accumulate each mapped treatment's column sum under its final parent category.
final_parent_counts = {}
for treatment in therapy_columns:
    if treatment in treatment_to_finalparent_mapping:
        final_parent_category = treatment_to_finalparent_mapping[treatment]
        treatment_sum = treatment_mapping_df[treatment].sum()
        final_parent_counts[final_parent_category] = final_parent_counts.get(final_parent_category, 0) + treatment_sum

# Build the summary once, after the loop. It used to be rebuilt inside the loop on every
# match and was left undefined when no column matched.
final_parent_counts_df = (
    pd.DataFrame(list(final_parent_counts.items()), columns=["Final Parent Category", "Total Mentions"])
    .sort_values(by="Total Mentions", ascending=False)
    .head(20)
)

plt.figure(figsize=(12, 6))
# Dark-to-light green gradient, one shade per bar.
colors = plt.cm.Greens(np.linspace(1, 0.5, len(final_parent_counts_df)))
bars = plt.bar(final_parent_counts_df["Final Parent Category"], final_parent_counts_df["Total Mentions"], color=colors, edgecolor="black")
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel("Final Parent Category", fontsize=12)
plt.ylabel("Number of mentions", fontsize=12)
plt.title("Top 20 most mentioned parents", fontsize=14)
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{int(x):,}"))
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Plot sunburst chart for parent and final parent treatment categories
metadata_columns = ["PaperTitle", "Abstract", "Sum_treatments", "Treatment_matches"]
therapy_columns = [col for col in treatment_mapping_df.columns if col not in metadata_columns]
# Coerce every non-metadata column to numbers (unparseable values become 0) so the sums below are numeric.
treatment_mapping_df[therapy_columns] = treatment_mapping_df[therapy_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
treatment_to_parent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["parent_treatment_category"].to_dict()
treatment_to_finalparent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["final_parent"].to_dict()

# One [parent, final parent, mentions] row per treatment that has both mappings.
hierarchy_data = []
for treatment in therapy_columns:
    treatment_sum = treatment_mapping_df[treatment].sum()
    if treatment in treatment_to_parent_mapping and treatment in treatment_to_finalparent_mapping:
        hierarchy_data.append([treatment_to_parent_mapping[treatment], treatment_to_finalparent_mapping[treatment], treatment_sum])

# Build the frame once, after the loop. It used to be rebuilt inside the loop on every
# match and was left undefined when no column matched.
hierarchy_df = pd.DataFrame(hierarchy_data, columns=["Parent Category", "Final Parent Category", "Total Mentions"])
hierarchy_df["Parent Category"] = hierarchy_df["Parent Category"].astype(str)
hierarchy_df["Final Parent Category"] = hierarchy_df["Final Parent Category"].astype(str)
# Collapse to one row per (parent, final parent) pair.
hierarchy_df = hierarchy_df.groupby(["Parent Category", "Final Parent Category"]).sum().reset_index()
fig = px.sunburst(hierarchy_df, path=["Parent Category", "Final Parent Category"], values="Total Mentions", color="Parent Category", title="Hierarchy of Treatment Categories and Final Parent Categories")
fig.show()

In [None]:
# Plot sunburst chart for parent and final parent treatment categories with mention counts
metadata_columns = ["PaperTitle", "Abstract", "Sum_treatments", "Treatment_matches"]
therapy_columns = [col for col in treatment_mapping_df.columns if col not in metadata_columns]
# Force numeric values (unparseable cells become 0) before summing.
numeric_block = treatment_mapping_df[therapy_columns].apply(pd.to_numeric, errors='coerce')
treatment_mapping_df[therapy_columns] = numeric_block.fillna(0)
category_lookup = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")
treatment_to_parent_mapping = category_lookup["parent_treatment_category"].to_dict()
treatment_to_finalparent_mapping = category_lookup["final_parent"].to_dict()

# One [parent, final parent, mentions] row per treatment that has both mappings.
hierarchy_data = [
    [treatment_to_parent_mapping[treatment], treatment_to_finalparent_mapping[treatment], treatment_mapping_df[treatment].sum()]
    for treatment in therapy_columns
    if treatment in treatment_to_parent_mapping and treatment in treatment_to_finalparent_mapping
]
hierarchy_df = pd.DataFrame(hierarchy_data, columns=["Parent Category", "Final Parent Category", "Total Mentions"])
for category_column in ("Parent Category", "Final Parent Category"):
    hierarchy_df[category_column] = hierarchy_df[category_column].astype(str)
hierarchy_df = hierarchy_df.groupby(["Parent Category", "Final Parent Category"]).sum().reset_index()
# Leaf label: final parent name followed by its mention count with thousands separators.
mention_labels = hierarchy_df["Total Mentions"].apply(lambda x: f"{int(x):,}")
hierarchy_df["Final Parent Category Label"] = hierarchy_df["Final Parent Category"] + " (" + mention_labels + ")"
fig = px.sunburst(
    hierarchy_df,
    path=["Parent Category", "Final Parent Category Label"],
    values="Total Mentions",
    color="Parent Category",
    title="Hierarchy of Treatment Categories and Final Parent Categories",
)
fig.show()

In [None]:
# Plot treemap
# hierarchy_df is not built in this cell; it must already exist from an earlier cell.
treemap_options = dict(
    path=["Parent Category", "Final Parent Category"],
    values="Total Mentions",
    color="Parent Category",
    title="Treemap of Treatment Categories and Final Parent Categories",
)
fig = px.treemap(hierarchy_df, **treemap_options)
fig.show()

In [None]:
# Preview the first 50 rows of the CIVIC/NCIt category table.
print(CIVIC_ncit_df_finalparent_treatmentcategory.head(50))

In [None]:
# Plot sunburst, treemap, and icicle charts for treatment hierarchy with total mentions
# Both tables are re-read from CSV files in the current working directory.
treatment_mapping_df = pd.read_csv("treatment_mapping_with_matches.csv")
CIVIC_ncit_df_finalparent_treatmentcategory = pd.read_csv("CIVIC_ncit_df_finalparent_treatmentcategory.csv")
metadata_columns = ["PaperTitle", "Abstract", "Sum_treatments", "Treatment_matches"]
therapy_columns = [col for col in treatment_mapping_df.columns if col not in metadata_columns]
# Coerce every non-metadata column to numbers (unparseable values become 0) so the sums below are numeric.
treatment_mapping_df[therapy_columns] = treatment_mapping_df[therapy_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
treatment_to_parent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["parent_treatment_category"].to_dict()
treatment_to_finalparent_mapping = CIVIC_ncit_df_finalparent_treatmentcategory.set_index("name")["final_parent"].to_dict()
hierarchy_data = []
for treatment in therapy_columns:
    treatment_sum = treatment_mapping_df[treatment].sum()
    if treatment in treatment_to_parent_mapping and treatment in treatment_to_finalparent_mapping:
        hierarchy_data.append([treatment_to_parent_mapping[treatment], treatment_to_finalparent_mapping[treatment], treatment_sum])
hierarchy_df = pd.DataFrame(hierarchy_data, columns=["Parent Category", "Final Parent Category", "Total Mentions"])
hierarchy_df = hierarchy_df.groupby(["Parent Category", "Final Parent Category"]).sum().reset_index()
# Label each parent with the total over ALL of its children. Using the per-row total
# gave the same parent a different label on every row, splitting it into several root nodes.
parent_totals = hierarchy_df.groupby("Parent Category")["Total Mentions"].transform("sum")
hierarchy_df["Parent Category"] = hierarchy_df["Parent Category"] + " (" + parent_totals.astype(int).astype(str) + ")"
# Leaf label: the (parent, final parent) pair's own total.
hierarchy_df["Final Parent Category"] = hierarchy_df["Final Parent Category"] + " (" + hierarchy_df["Total Mentions"].astype(int).astype(str) + ")"

# *** Sunburst Chart
fig_sunburst = px.sunburst(hierarchy_df, path=["Parent Category", "Final Parent Category"], values="Total Mentions", color="Parent Category", title="Sunburst Chart: Treatment Categories and Final Parent Categories")
fig_sunburst.show()

# *** Treemap
fig_treemap = px.treemap(hierarchy_df, path=["Parent Category", "Final Parent Category"], values="Total Mentions", color="Parent Category", title="Treemap: Treatment Categories and Final Parent Categories")
fig_treemap.show()

# *** Icicle Plot
fig_icicle = px.icicle(hierarchy_df, path=["Parent Category", "Final Parent Category"], values="Total Mentions", color="Parent Category", title="Icicle Plot: Treatment Categories and Final Parent Categories")
fig_icicle.show()

# Merging and hierarchy creation

# ==========================================================

# Evaluation: Previous methods to extract treatments and drugs from PaperTitle and Abstract

In [None]:
# Install spaCy, scispaCy and the two scispaCy NER model packages (release v0.5.1)
# that are loaded with spacy.load() in the cells below.
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz 
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz 
print("Success!")

In [None]:
# Load the SciSpaCy "en_ner_bionlp13cg_md" model once, inside the guard.
# The model used to be loaded a second time outside the try block, so a load failure
# raised before the error handler could report it.
import spacy  # spacy is not imported in the notebook's import cell

try:
    nlp_treatments = spacy.load("en_ner_bionlp13cg_md")
    print("SciSpaCy Model 'en_ner_bionlp13cg_md' loaded successfully!")
except Exception as e:
    # Best effort: report the problem and leave nlp_treatments undefined.
    print(f"Error loading SciSpaCy model: {e}")

In [None]:
# Extract drug mentions from text using binary string matching

tqdm.pandas(desc="Extracting drug mentions")
CIVIC_therapies_analysis["name"] = CIVIC_therapies_analysis["name"].str.lower().str.strip()


def _parse_alias_list(raw):
    """Return lower-cased, stripped aliases from a list or from its string representation.

    Strings such as "['a', 'b']" (as read back from CSV) are parsed with
    ast.literal_eval, which only accepts Python literals and never executes code.
    Values that are already lists are kept, so re-running the cell does not wipe
    the aliases. Anything else, including unparseable text, yields an empty list.
    """
    if isinstance(raw, str) and raw.startswith("["):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return []
    if not isinstance(raw, (list, tuple)):
        return []
    return [alias.strip().lower() for alias in raw if isinstance(alias, str)]


CIVIC_therapies_analysis["therapyAliases"] = CIVIC_therapies_analysis["therapyAliases"].apply(_parse_alias_list)
# Therapy name -> set containing the name itself plus all of its aliases.
drug_mapping = {}
for _, row in CIVIC_therapies_analysis.iterrows():
    drug_name = row["name"]
    aliases = row["therapyAliases"]
    drug_mapping[drug_name] = set([drug_name] + aliases)
# Flat, de-duplicated list of every searchable term (names and aliases).
all_drug_terms = list(set().union(*drug_mapping.values()))
def extract_drugs_binary(text):
    """Flag, for every term in ``all_drug_terms``, whether it occurs in *text*.

    Matching is a case-insensitive substring test. Missing or blank text gives
    a dict with every term set to 0.

    Returns:
        dict mapping each drug term to 1 (found) or 0 (not found).
    """
    if pd.isna(text) or text.strip() == "":
        return dict.fromkeys(all_drug_terms, 0)
    lowered = text.lower()
    return {drug: int(drug in lowered) for drug in all_drug_terms}

# Run the string matcher over title + abstract, then save the matrix and a summary.
# NOTE(review): `df` is not created in this cell; it must already exist from an earlier cell.
df['Title_Abstract'] = df['PaperTitle'].astype(str) + " " + df['Abstract'].astype(str)
# Size the estimate and the statistics on the frame actually being processed;
# previously `total_rows` was whatever an earlier plotting cell had left behind.
total_rows = len(df)

# Time one row to extrapolate the expected runtime (skipped for an empty frame).
if total_rows:
    test_row = df['Title_Abstract'].iloc[0]
    test_time_start = time.time()
    extract_drugs_binary(test_row)
    test_time_end = time.time()
    single_row_time = test_time_end - test_time_start
else:
    single_row_time = 0.0
expected_runtime_seconds = single_row_time * total_rows
print(f"Expected runtime: {expected_runtime_seconds:.2f} seconds (~{expected_runtime_seconds/60:.2f} minutes)")

start_time_execution = time.time()
match_dicts = df['Title_Abstract'].progress_apply(extract_drugs_binary)
# Building the frame from the list of dicts gives the same columns as .apply(pd.Series)
# without creating one Series per row.
binary_matrix = pd.DataFrame(match_dicts.tolist(), index=df.index)
binary_matrix["Sum_drug_mentions"] = binary_matrix.sum(axis=1)
df = pd.concat([df, binary_matrix], axis=1)
rows_with_drug_mentions = (df["Sum_drug_mentions"] >= 1).sum()
rows_with_no_mentions = (df["Sum_drug_mentions"] == 0).sum()
percentage_with_drugs = (rows_with_drug_mentions / total_rows) * 100 if total_rows else 0.0
output_path = "Drug_Binary_Matrix_String_Matching.csv"
df.to_csv(output_path, index=False)
summary_text = f"""
Total rows in dataset: {total_rows}
Rows with at least one drug mention: {rows_with_drug_mentions}
Rows with no drug mentions: {rows_with_no_mentions}
Percentage of articles with drug mentions: {percentage_with_drugs:.2f}%
"""
summary_output_path = "Drug_Matching_Summary.txt"
with open(summary_output_path, "w") as f:
    f.write(summary_text)
end_time_execution = time.time()
actual_runtime_seconds = end_time_execution - start_time_execution


print(summary_text)
print(f"Processing completed successfully! Results saved to {output_path} and summary saved to {summary_output_path}")
print(f"Actual runtime: {actual_runtime_seconds:.2f} seconds (~{actual_runtime_seconds/60:.2f} minutes)")

## Extract therapy mentions using SciSpaCy "en_ner_bionlp13cg_md" and "en_ner_bc5cdr_md"

In [None]:
# Load SciSpaCy models for drug extraction
# Both models are loaded in one guarded step; a failure is reported, not raised.
try:
    nlp_drug_1 = spacy.load("en_ner_bc5cdr_md")
    nlp_drug_2 = spacy.load("en_ner_bionlp13cg_md")
except Exception as e:
    print(f"Error Loading Models: {e}")
else:
    print("SciSpaCy Models Loaded Successfully!")
# Printed whether or not loading succeeded.
print("Success!")

In [None]:
# Stopwords to filter out false positives.
# Stored lower-cased because clean_term() lower-cases every term before it is
# compared against this set; upper-case entries could never match.
EXCLUDE_TERMS = {term.lower() for term in ("gene", "HRR", "DDR")}

# Function to clean extracted terms
def clean_term(term):
    """Return *term* Unicode-normalized, lower-cased, stripped, with hyphens/dashes as spaces.

    NFKC normalization runs before lower-casing so that compatibility characters
    which expand to upper-case letters are lower-cased as well.
    """
    term = unicodedata.normalize("NFKC", term)  # Normalize Unicode characters
    term = term.lower().strip()
    term = re.sub(r'[-‐–—]', ' ', term)  # Standardize hyphens
    return term
print("Success!")

In [None]:
# Entity labels treated as drug/chemical mentions: "CHEMICAL" is the BC5CDR label and
# "SIMPLE_CHEMICAL" is the BioNLP13CG label. Accepting only "CHEMICAL" discarded
# every entity found by the second model.
_CHEMICAL_LABELS = frozenset({"CHEMICAL", "SIMPLE_CHEMICAL"})


def extract_drug_mentions(text):
    """
    Extracts drug-related mentions from the text using two SciSpaCy models.

    Runs both `nlp_drug_1` and `nlp_drug_2` over *text*, keeps entities whose label
    is a chemical label, cleans them with `clean_term`, and drops anything listed
    in `EXCLUDE_TERMS`.

    Returns:
        Sorted list of unique cleaned terms; empty for missing or blank text.
    """
    if pd.isna(text) or text.strip() == "":
        return []
    doc1 = nlp_drug_1(text)
    doc2 = nlp_drug_2(text)

    extracted_terms = {
        clean_term(ent.text)
        for ent in list(doc1.ents) + list(doc2.ents)
        if ent.label_ in _CHEMICAL_LABELS
    }

    # Remove false positives; sort so the saved output is reproducible across runs.
    return sorted(term for term in extracted_terms if term not in EXCLUDE_TERMS)

def apply_drug_extraction(df):
    """
    Run `extract_drug_mentions` on title + abstract of every row via swifter.

    Adds the column "Extracted_Drugs_Chemicals" to *df* in place and returns *df*.
    """
    row_count = len(df)
    print(f"Starting extraction for {row_count} rows")
    # Title and abstract are stringified and joined with a single space.
    combined_text = df["PaperTitle"].astype(str) + " " + df["Abstract"].astype(str)
    df["Extracted_Drugs_Chemicals"] = combined_text.swifter.apply(extract_drug_mentions)
    print(f"Extraction complete for {len(df)} rows")
    return df
print("Success!")

In [None]:
# Extract drugs and therapies from dataset and save results
# The timer starts before the copy, so the logged runtime covers copy + extraction.
start_time = time.time()
tqdm.pandas()
df_drug_extraction = full_df.copy()
print(f"Length of dataset: {len(df_drug_extraction)}")
df_drug_extraction = apply_drug_extraction(df_drug_extraction)
end_time = time.time()
runtime = end_time - start_time

# Save the extracted terms next to the other outputs.
output_path = os.path.join(output_directory, "Extracted_Drug_Chemical_Terms.csv")
df_drug_extraction.to_csv(output_path, index=False)
print(f"Length of dataset after extraction: {len(df_drug_extraction)}")

# Log the measured runtime to a small text file.
runtime_log_path = os.path.join(output_directory, "running_time_drug_extraction.txt")
runtime_line = f"Total execution time: {runtime:.2f} seconds\n"
with open(runtime_log_path, "w") as f:
    f.write(runtime_line)

print(f"File saved successfully: {output_path}")
print(f"Execution time logged in: {runtime_log_path}")

In [None]:
# Print the specific columns from df_drug_extraction
# Row count and column index first, then the selected columns.
print(len(df_drug_extraction))
print(df_drug_extraction.columns)
columns_to_display = ["PaperId", "PaperTitle", "Abstract", "Extracted_Drugs_Chemicals"]
selected_view = df_drug_extraction[columns_to_display]
print(selected_view)

# =================================================
## Using BERT for therapy and drug extraction:
## "alvaroalon2/biobert_chemical_ner"

In [None]:
# Install and import the Hugging Face stack, then build a token-classification
# ("ner") pipeline around the alvaroalon2/biobert_chemical_ner checkpoint.
!pip install transformers torch
print("Success!")
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
print("Success!")

model_name = "alvaroalon2/biobert_chemical_ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# aggregation_strategy="simple" groups sub-word tokens into entities;
# device 0 (first GPU) is used when CUDA is available, otherwise -1 (CPU).
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)

In [None]:
def extract_drugs(text):
    """Return the unique 'CHEMICAL' entity words that the `nlp` pipeline finds in *text*.

    Missing or blank text yields an empty list. Words are returned in order of first
    appearance, so the output is reproducible.

    The earlier version also tokenized the text with truncation, but never passed
    that encoding to the pipeline; the unused call has been removed.
    NOTE(review): the pipeline therefore still receives the full text — confirm
    how inputs longer than the model limit should be handled.
    """
    if pd.isna(text) or not text.strip():
        return []
    # Apply the NER pipeline to the text
    entities = nlp(text)
    # Keep entities labeled as 'CHEMICAL', de-duplicated in first-seen order
    drugs = (entity['word'] for entity in entities if entity['entity_group'] == 'CHEMICAL')
    return list(dict.fromkeys(drugs))
print("Success!")

In [None]:
# Extract drug names using BioBERT model
tqdm.pandas(desc="Extracting drug names")
df = full_df.copy()
# Title and abstract, stringified and joined with a single space.
df['Title_Abstract'] = df['PaperTitle'].astype(str).str.cat(df['Abstract'].astype(str), sep=" ")
extracted_per_row = df['Title_Abstract'].progress_apply(extract_drugs)
df['Extracted_Drugs'] = extracted_per_row
print("Extraction completed successfully!")
print(df)

# Save the annotated frame into the output directory.
output_path = os.path.join(output_directory, "Extracted_Drugs_BioBERT.csv")
df.to_csv(output_path, index=False)
print(f"File saved successfully: {output_path}")

# ============================================================
## Using BERT for therapy and drug extraction:
## "allenai/biomed_roberta_base"

In [None]:
# Install/import dependencies and build a token-classification ("ner") pipeline
# around the allenai/biomed_roberta_base checkpoint.
# NOTE(review): this checkpoint appears to be a base language model rather than a
# fine-tuned NER model — confirm that the pipeline actually emits the 'CHEMICAL' /
# 'DRUG' entity groups that the next cell filters on.
!pip install torch transformers pandas tqdm
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from tqdm import tqdm
model_name = "allenai/biomed_roberta_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# No device argument is passed here, unlike the BioBERT cell.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
print("Success!")

In [None]:
def extract_drugs(text):
    """Return the distinct entity words labeled 'CHEMICAL' or 'DRUG' found in *text*.

    Missing or blank text yields an empty list. The result comes from a set, so
    its order is not defined.
    """
    if pd.isna(text) or text.strip() == "":
        return []

    wanted_groups = ['CHEMICAL', 'DRUG']
    unique_words = set()
    # Run the NER pipeline and keep only the wanted entity groups.
    for entity in nlp(text):
        if entity['entity_group'] in wanted_groups:
            unique_words.add(entity['word'])

    return list(unique_words)
print("Success!")

In [None]:
# Work on a copy so the original DataFrame stays untouched
df = full_df.copy()

# Register tqdm's progress_apply on pandas objects
tqdm.pandas(desc="Extracting drug names")

# Combined text field: PaperTitle and Abstract, stringified and joined with one space
title_text = df['PaperTitle'].astype(str)
abstract_text = df['Abstract'].astype(str)
df['Title_Abstract'] = title_text.str.cat(abstract_text, sep=" ")

# Run extract_drugs on every combined text
df['Extracted_Drugs'] = df['Title_Abstract'].progress_apply(extract_drugs)
print("Extraction completed successfully!")

# Written relative to the current working directory
output_path = "Extracted_Drugs_biomed_roberta_base.csv"
df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

# ============================================================
## Improve chemical NER BERT for therapy and drug extraction:
## 

In [None]:
# Extract drug names from text using BioBERT model alvaroalon2/biobert_chemical_ner
# Rebinds the global tokenizer/model/nlp names to this checkpoint.
model_name = "alvaroalon2/biobert_chemical_ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# aggregation_strategy="simple" groups sub-word tokens into entities; no device argument is passed here.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def split_text_into_chunks(text, max_length=512):
    """Tokenize *text* with the global `tokenizer` and split the tokens into chunks.

    Returns a list of token lists, each at most *max_length* tokens long; the last
    chunk may be shorter. Empty input gives an empty list.
    """
    tokens = tokenizer.tokenize(text)
    chunks = []
    for start in range(0, len(tokens), max_length):
        chunks.append(tokens[start:start + max_length])
    return chunks
def extract_drugs(text):
    """Return the unique 'CHEMICAL' entity words found in *text*, in first-seen order.

    A leading '##' (sub-word continuation marker) is stripped from each word.
    Missing or blank text yields an empty list.

    The former branch for words starting with 'r' rebuilt the word unchanged and
    has been removed; de-duplication now uses a dict instead of repeated list scans.
    """
    if pd.isna(text) or text.strip() == "":
        return []
    chemical_words = (entity['word'] for entity in nlp(text) if entity['entity_group'] == 'CHEMICAL')
    cleaned_words = (word[2:] if word.startswith('##') else word for word in chemical_words)
    # dict.fromkeys keeps the first occurrence of each word and preserves order.
    return list(dict.fromkeys(cleaned_words))
# Run the extractor over a copy of the full dataset and save the result.
tqdm.pandas(desc="Extracting drug names")
df = full_df.copy()
# Title and abstract, stringified and joined with a single space.
df['Title_Abstract'] = df['PaperTitle'].astype(str).str.cat(df['Abstract'].astype(str), sep=" ")
df['Extracted_Drugs'] = df['Title_Abstract'].progress_apply(extract_drugs)
# Written relative to the current working directory.
output_path = "Extracted_Drug_Names.csv"
df.to_csv(output_path, index=False)

print(f"Extraction completed successfully! Results saved to {output_path}")
print(df)

# =========================================================

# Match with CIVIC for binary matrix creation

In [None]:
# Load the CIVIC therapy dataset
# Reuse the in-memory 'CIVIC_therapies' table when it exists in the global
# environment; otherwise read the saved CSV.
CIVIC_therapies_analysis = (
    CIVIC_therapies.copy()
    if "CIVIC_therapies" in globals()
    else pd.read_csv("/path/to/CIVIC_therapies_with_aliases.csv")
)

print(CIVIC_therapies_analysis.head())
print("Length of dataset:",len(CIVIC_therapies_analysis))
print("\n\nSuccess!")

In [None]:
# Check and load dataset
if "df_drug_extraction" in globals():
    df_drug_extraction = df_drug_extraction.copy()
else:
    print("Loading dataset from file...")
    # os.path.join supplies the path separator that plain string concatenation omitted,
    # and matches how the file is written by the extraction cell.
    df_drug_extraction = pd.read_csv(os.path.join(output_directory, "Extracted_Drug_Chemical_Terms.csv"))
print(f"Length of dataset copy: {len(df_drug_extraction):,}")


def _as_term_list(value):
    """Return *value* as a list: lists pass through, list literals in strings are parsed, anything else becomes []."""
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return ast.literal_eval(value)
    return []  # e.g. NaN from an empty CSV cell


# Ensure that the 'Extracted_Drugs_Chemicals' column is processed as a list of strings
df_drug_extraction["Extracted_Drugs_Chemicals"] = df_drug_extraction["Extracted_Drugs_Chemicals"].apply(_as_term_list)

print(df_drug_extraction[["PaperId", "Extracted_Drugs_Chemicals"]].head())
print("Succes!")

In [None]:
# Normalize drug names for consistent matching
def normalize_drug_name(term):
    """Return a canonical lower-case form of a drug term for matching.

    Non-string input is returned unchanged. For strings: lower-case and strip,
    NFKC-normalize, turn hyphens/dashes into spaces, delete the generic words
    listed below, drop a trailing run of dots/digits/slashes, collapse whitespace,
    and remove repeated words (the first occurrence is kept).
    """
    if not isinstance(term, str):
        return term
    cleaned = unicodedata.normalize("NFKC", term.lower().strip())
    cleaned = re.sub(r'[-‐–—]', ' ', cleaned)
    generic_words = ("drug", "treatment", "therapy", "chemotherapy", "medication", "cancer")
    cleaned = re.sub(r'\b(' + '|'.join(generic_words) + r')\b', '', cleaned)
    cleaned = re.sub(r'[\.\d/]+$', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # dict.fromkeys keeps the first occurrence of each word in its original position.
    return ' '.join(dict.fromkeys(cleaned.split()))

print("Success!")

In [None]:
# Map extracted drug terms to known therapies and save results
no_match_count = 0
# Start each run with an empty log of unmatched terms.
open("unmatched_therapy_terms_log.txt", "w").close()
def map_therapy_terms(terms):
    """Map extracted terms to entries of `therapy_mapping` by substring match.

    Each term is normalized with `normalize_drug_name`; the first therapy whose
    lower-cased name contains the normalized term is taken as the match. Terms
    without a match increase the global `no_match_count` and are appended to
    "unmatched_therapy_terms_log.txt".

    Returns:
        List of unique matched therapies; [] when *terms* is not a list.
    """
    global no_match_count
    if not isinstance(terms, list):
        return []
    mapped_therapies = []
    unmatched_therapies = set()
    for term in terms:
        normalized_term = normalize_drug_name(term)
        # Non-string terms cannot be matched, and a term that normalizes to ""
        # (e.g. "chemotherapy") is a substring of every name, so it would always
        # "match" the first therapy. Count both as unmatched without logging them.
        if not isinstance(normalized_term, str) or not normalized_term:
            no_match_count += 1
            continue
        normalized_term = normalized_term.lower()
        matched_therapy = next((therapy for therapy in therapy_mapping if normalized_term in therapy.lower()), None)
        if matched_therapy:
            mapped_therapies.append(matched_therapy)
        else:
            no_match_count += 1
            unmatched_therapies.add(normalized_term)
    if unmatched_therapies:
        with open("unmatched_therapy_terms_log.txt", "a") as f:
            f.writelines(term + "\n" for term in unmatched_therapies)
    return list(set(mapped_therapies))
df_drug_extraction["Mapped_Therapies"] = df_drug_extraction["Extracted_Drugs_Chemicals"].progress_apply(map_therapy_terms)

print(df_drug_extraction[["PaperId", "Extracted_Drugs_Chemicals", "Mapped_Therapies"]].head(10))
df_drug_extraction.to_csv(os.path.join(output_directory, "Extracted_Drug_Chemical_Terms_with_mapped_therapies_1.csv"), index=False)

In [None]:
# Read the unmatched-term log back into a list of lines (newlines kept)
with open("unmatched_therapy_terms_log.txt", "r") as f:
    unmatched_therapy_terms_log = list(f)
first_unmatched_terms = unmatched_therapy_terms_log[:20]
print(first_unmatched_terms)  # Show first 20 unmatched terms

In [None]:
# Map extracted drug terms to therapy names and aliases
no_match_count = 0
# Start each run with an empty log of unmatched terms.
open("unmatched_therapy_terms_log.txt", "w").close()

# Normalize every CIVIC name and alias once, instead of re-normalizing the whole
# table for each extracted term. Each entry is (therapy name, [normalized name and aliases]).
therapy_candidates = []
for therapy in CIVIC_therapies_analysis.itertuples():
    aliases = therapy.therapyAliases if isinstance(therapy.therapyAliases, list) else []
    labels = [label for label in [therapy.name, *aliases] if isinstance(label, str)]
    therapy_candidates.append((therapy.name, [normalize_drug_name(label).lower() for label in labels]))

def map_therapy_terms(terms):
    """Map extracted terms to CIVIC therapy names by substring match on name or alias.

    The first therapy (in table order) whose normalized name or any normalized
    alias contains the normalized term is taken as the match. Terms without a
    match increase the global `no_match_count` and are appended to
    "unmatched_therapy_terms_log.txt".

    Returns:
        List of unique matched therapy names; [] when *terms* is not a list.
    """
    global no_match_count
    if not isinstance(terms, list):
        return []
    mapped_therapies = []
    unmatched_therapies = set()
    for term in terms:
        normalized_term = normalize_drug_name(term)
        # Non-string terms cannot be matched, and a term that normalizes to ""
        # is a substring of every candidate, so it would always "match" the
        # first therapy. Count both as unmatched without logging them.
        if not isinstance(normalized_term, str) or not normalized_term:
            no_match_count += 1
            continue
        normalized_term = normalized_term.lower()
        matched_name = next((name for name, candidates in therapy_candidates if any(normalized_term in candidate for candidate in candidates)), None)
        if matched_name is not None:
            mapped_therapies.append(matched_name)
        else:
            no_match_count += 1
            unmatched_therapies.add(normalized_term)
    if unmatched_therapies:
        with open("unmatched_therapy_terms_log.txt", "a") as f:
            f.writelines(term + "\n" for term in unmatched_therapies)
    return list(set(mapped_therapies))
df_drug_extraction["Mapped_Therapies"] = df_drug_extraction["Extracted_Drugs_Chemicals"].progress_apply(map_therapy_terms)

print(df_drug_extraction[["PaperId", "Extracted_Drugs_Chemicals", "Mapped_Therapies"]].head(10))
df_drug_extraction.to_csv(os.path.join(output_directory, "Extracted_Drug_Chemical_Terms_with_mapped_therapies_2.csv"), index=False)

In [None]:
# Read the unmatched-term log back into a list of lines (newlines kept)
with open("unmatched_therapy_terms_log.txt", "r") as f:
    unmatched_therapy_terms_log = [line for line in f]
preview_lines = unmatched_therapy_terms_log[:20]
print(preview_lines)  # Show first 20 unmatched terms

In [None]:
# Map extracted drug terms to therapy names and aliases separately from CIVIC dataset
no_match_count = 0
# Start each run with an empty log of unmatched terms.
open("unmatched_therapy_terms_log.txt", "w").close()

# Normalize the CIVIC names and aliases once, instead of once per extracted term.
# Names and aliases are kept apart because names are searched first.
normalized_therapy_names = [
    (therapy.name, normalize_drug_name(therapy.name).lower())
    for therapy in CIVIC_therapies_analysis.itertuples()
    if isinstance(therapy.name, str)
]
normalized_therapy_aliases = [
    (therapy.name, [normalize_drug_name(alias).lower() for alias in therapy.therapyAliases if isinstance(alias, str)])
    for therapy in CIVIC_therapies_analysis.itertuples()
    if isinstance(therapy.therapyAliases, list)
]

def map_therapy_terms(terms):
    """Map extracted terms to CIVIC therapy names: substring match on names first, then on aliases.

    Terms without a match increase the global `no_match_count` and are appended
    to "unmatched_therapy_terms_log.txt".

    Returns:
        List of unique matched therapy names; [] when *terms* is not a list.
    """
    global no_match_count
    if not isinstance(terms, list):
        return []
    mapped_therapies = []
    unmatched_therapies = set()
    for term in terms:
        normalized_term = normalize_drug_name(term)
        # Non-string terms cannot be matched, and a term that normalizes to ""
        # is a substring of every name, so it would always "match" the first
        # therapy. Count both as unmatched without logging them.
        if not isinstance(normalized_term, str) or not normalized_term:
            no_match_count += 1
            continue
        normalized_term = normalized_term.lower()
        matched_name = next((name for name, candidate in normalized_therapy_names if normalized_term in candidate), None)
        if matched_name is None:
            matched_name = next((name for name, candidates in normalized_therapy_aliases if any(normalized_term in candidate for candidate in candidates)), None)
        if matched_name is not None:
            mapped_therapies.append(matched_name)
        else:
            no_match_count += 1
            unmatched_therapies.add(normalized_term)
    if unmatched_therapies:
        with open("unmatched_therapy_terms_log.txt", "a") as f:
            f.writelines(term + "\n" for term in unmatched_therapies)
    return list(set(mapped_therapies))
df_drug_extraction["Mapped_Therapies"] = df_drug_extraction["Extracted_Drugs_Chemicals"].progress_apply(map_therapy_terms)

In [None]:
# Check if the mapping worked
# Show the first ten rows, then save the whole frame to the output directory.
preview_columns = ["PaperId", "Extracted_Drugs_Chemicals", "Mapped_Therapies"]
print(df_drug_extraction[preview_columns].head(10))
mapped_output_file = os.path.join(output_directory, "Extracted_Drug_Chemical_Terms_with_mapped_therapies_3.csv")
df_drug_extraction.to_csv(mapped_output_file, index=False)

In [None]:
# Normalize drug names and map extracted drug terms to therapies by exact match

def normalize_drug_name(term):
    """Return a canonical lower-case form of a drug term; non-strings are returned unchanged.

    Steps, in order: lower-case and strip, NFKC-normalize, then apply the regex
    substitutions listed below, and finally drop repeated words (keeping the
    first occurrence of each).
    """
    if not isinstance(term, str):
        return term
    filler_words = ["drug", "treatment", "therapy", "chemotherapy", "medication", "cancer"]
    substitutions = [
        (r'[-‐–—]', ' '),                                # hyphens and dashes become spaces
        (r'\b(' + '|'.join(filler_words) + r')\b', ''),  # generic words are removed
        (r'[\.\d/]+$', ''),                              # trailing dots, digits and slashes
        (r'\s+', ' '),                                   # collapse whitespace runs
    ]
    term = unicodedata.normalize("NFKC", term.lower().strip())
    for pattern, replacement in substitutions:
        term = re.sub(pattern, replacement, term)
    unique_words = []
    for word in term.strip().split():
        if word not in unique_words:
            unique_words.append(word)
    return ' '.join(unique_words)
print("Success!")

no_match_count = 0
# Start each run with an empty log of unmatched terms.
open("unmatched_therapy_terms_log.txt", "w").close()

# Exact-match lookup tables, built once: normalized label -> CIVIC therapy name.
# setdefault keeps the first therapy (in table order) for a label, which is the
# therapy a row-by-row scan would have returned.
therapy_by_name = {}
therapy_by_alias = {}
for therapy in CIVIC_therapies_analysis.itertuples():
    if isinstance(therapy.name, str):
        therapy_by_name.setdefault(normalize_drug_name(therapy.name).lower(), therapy.name)
    if isinstance(therapy.therapyAliases, list):
        for alias in therapy.therapyAliases:
            if isinstance(alias, str):
                therapy_by_alias.setdefault(normalize_drug_name(alias).lower(), therapy.name)

def map_therapy_terms(terms):
    """Map extracted terms to CIVIC therapy names by exact match on normalized name, then alias.

    Terms without a match increase the global `no_match_count` and are appended
    to "unmatched_therapy_terms_log.txt". Non-string terms are skipped.
    NOTE(review): terms that normalize to "" (e.g. "drug") match any therapy whose
    own name normalizes to "" — confirm whether that is intended.

    Returns:
        List of unique matched therapy names; [] when *terms* is not a list.
    """
    global no_match_count
    if not isinstance(terms, list):
        return []
    mapped_therapies = []
    unmatched_therapies = set()
    for term in terms:
        if not isinstance(term, str):
            continue  # cannot be normalized, matched or written to the log
        normalized_term = normalize_drug_name(term).lower()
        matched_name = therapy_by_name.get(normalized_term)
        if matched_name is None:
            matched_name = therapy_by_alias.get(normalized_term)
        if matched_name is not None:
            mapped_therapies.append(matched_name)
        else:
            no_match_count += 1
            unmatched_therapies.add(normalized_term)
    if unmatched_therapies:
        with open("unmatched_therapy_terms_log.txt", "a") as f:
            f.writelines(term + "\n" for term in unmatched_therapies)
    return list(set(mapped_therapies))
df_drug_extraction["Mapped_Therapies"] = df_drug_extraction["Extracted_Drugs_Chemicals"].progress_apply(map_therapy_terms)
print(df_drug_extraction[["PaperId", "Extracted_Drugs_Chemicals", "Mapped_Therapies"]].head())
df_drug_extraction.to_csv(os.path.join(output_directory, "Extracted_Drug_Chemical_Terms_with_mapped_therapies_4.csv"), index=False)

In [None]:
# Normalize drug names and map them to therapies using exact matching
def normalize_drug_name(term):
    """Return a canonical form of a drug term made only of a-z words; non-strings pass through.

    Compared with a plain lower-casing, this also treats underscores as
    separators and removes every character that is not a-z or whitespace.
    """
    if not isinstance(term, str):
        return term
    generic_words = ["drug", "treatment", "therapy", "chemotherapy", "medication", "cancer"]
    generic_word_pattern = r'\b(' + '|'.join(generic_words) + r')\b'
    text = unicodedata.normalize("NFKC", term.lower().strip())  # lower-case, then normalize Unicode
    text = re.sub(r'[-‐–—_]', ' ', text)           # hyphens, dashes and underscores become spaces
    text = re.sub(generic_word_pattern, '', text)  # remove generic words
    text = re.sub(r'[\.\d/]+$', '', text)          # remove trailing dots, digits and slashes
    text = re.sub(r'\s+', ' ', text).strip()       # collapse whitespace
    text = re.sub(r'[^a-z\s]', '', text)           # keep only a-z and whitespace
    # Drop repeated words anywhere in the term, keeping the first occurrence of each.
    return ' '.join(dict.fromkeys(text.split()))
print("Success!")


# Running total of extracted terms that could not be mapped to a therapy.
no_match_count = 0

# Truncate the unmatched-terms log so this run starts from an empty file.
with open("unmatched_therapy_terms_log.txt", "w"):
    pass

# Lookup tables derived from CIVIC_therapies_analysis. "source" remembers the
# DataFrame object they were built from so they are rebuilt if that global is
# rebound to a different object (in-place edits are NOT detected; re-run this
# cell after modifying the table in place).
_therapy_lookup_cache = {"source": None, "by_name": {}, "by_alias": {}}


def _get_therapy_lookups():
    """Return ``(by_name, by_alias)`` dicts mapping normalized keys to therapy names.

    Rows are visited in DataFrame order and only the first therapy seen for a
    given key is kept, which reproduces a first-match linear scan.
    """
    source = CIVIC_therapies_analysis
    if _therapy_lookup_cache["source"] is not source:
        by_name = {}
        by_alias = {}
        for therapy in source.itertuples():
            name_key = normalize_drug_name(therapy.name)
            if isinstance(name_key, str):
                by_name.setdefault(name_key, therapy.name)
            if isinstance(therapy.therapyAliases, list):
                for alias in therapy.therapyAliases:
                    alias_key = normalize_drug_name(alias)
                    if isinstance(alias_key, str):
                        by_alias.setdefault(alias_key, therapy.name)
        _therapy_lookup_cache.update(source=source, by_name=by_name, by_alias=by_alias)
    return _therapy_lookup_cache["by_name"], _therapy_lookup_cache["by_alias"]


def map_therapy_terms(terms):
    """Map extracted drug/chemical terms to therapy names by exact normalized match.

    Each term is normalized with ``normalize_drug_name`` and compared first
    against normalized therapy names and, failing that, against normalized
    therapy aliases from ``CIVIC_therapies_analysis``.

    Side effects:
        Increments the module-level ``no_match_count`` once per unmatched term
        and appends this call's distinct unmatched normalized terms (sorted)
        to ``unmatched_therapy_terms_log.txt``.

    Args:
        terms: List of extracted term strings. Any non-list value yields ``[]``.
            Non-string entries inside the list are skipped.

    Returns:
        List of distinct matched therapy names, in order of first match.
    """
    global no_match_count
    if not isinstance(terms, list):
        return []

    by_name, by_alias = _get_therapy_lookups()
    mapped_therapies = {}  # dict used as an insertion-ordered set
    unmatched_therapies = set()

    for term in terms:
        normalized_term = normalize_drug_name(term)
        if not isinstance(normalized_term, str):
            # normalize_drug_name returns non-strings unchanged; they cannot be matched.
            continue
        # NOTE(review): a term made only of removed generic words normalizes to ""
        # and is treated like any other key -- confirm this is intended.
        matched_name = by_name.get(normalized_term)
        if matched_name is None:
            matched_name = by_alias.get(normalized_term)
        if matched_name is not None:
            mapped_therapies[matched_name] = None
        else:
            no_match_count += 1
            unmatched_therapies.add(normalized_term)

    if unmatched_therapies:
        with open("unmatched_therapy_terms_log.txt", "a") as f:
            # Sorted so the log content does not depend on set iteration order.
            f.writelines(term + "\n" for term in sorted(unmatched_therapies))

    return list(mapped_therapies)
# Apply the exact-match therapy mapping to each row's extracted terms.
df_drug_extraction["Mapped_Therapies"] = (
    df_drug_extraction["Extracted_Drugs_Chemicals"]
    .progress_apply(map_therapy_terms)
)

# Show the leading rows for a quick visual check of the mapping.
print(
    df_drug_extraction.loc[
        :, ["PaperId", "Extracted_Drugs_Chemicals", "Mapped_Therapies"]
    ].head(5)
)

# Write the table, index excluded, to the output directory.
df_drug_extraction.to_csv(
    os.path.join(
        output_directory,
        "Extracted_Drug_Chemical_Terms_with_mapped_therapies_5.csv",
    ),
    index=False,
)

In [None]:
# Generate binary therapy matrix from mapped therapies

# Long format: one (PaperId, mapped_therapy) record per therapy mapped to a paper.
binary_rows = [
    {"PaperId": paper_id, "mapped_therapy": term}
    for paper_id, mapped in zip(df_drug_extraction["PaperId"], df_drug_extraction["Mapped_Therapies"])
    for term in mapped
]
binary_df = pd.DataFrame(binary_rows)

# Wide format: one row per PaperId, one column per therapy, 1 where the pair
# exists and 0 (fill_value) elsewhere.
binary_matrix = binary_df.pivot_table(index="PaperId", columns="mapped_therapy", aggfunc=lambda x: 1, fill_value=0)

# Left merge keeps every paper; papers absent from binary_matrix get NaN in
# the therapy columns.
metadata_column_count = df_drug_extraction.shape[1]
df_binary_matrix = df_drug_extraction.merge(binary_matrix, on="PaperId", how="left")

# Fill and cast ONLY the therapy columns. A frame-wide fillna(0) would also
# overwrite missing values in the original metadata columns with 0.
therapy_flags = df_binary_matrix.iloc[:, metadata_column_count:].fillna(0).astype(int)
df_binary_matrix = pd.concat(
    [df_binary_matrix.iloc[:, :metadata_column_count], therapy_flags],
    axis=1,
)

# Number of distinct mapped therapies per paper.
df_binary_matrix["Therapy_Sum"] = therapy_flags.sum(axis=1)
df_binary_matrix.to_csv("binary_therapy_matrix.csv", index=False)

print(df_binary_matrix.head())
print(df_binary_matrix.columns)

In [None]:
# Filter papers with mapped therapies and save separate outputs

# Rows with at least one mapped therapy (Therapy_Sum > 0).
df_binary_matrix_filtered = df_binary_matrix.loc[df_binary_matrix["Therapy_Sum"].gt(0)]
df_binary_matrix_filtered.to_csv("binary_therapy_matrix_filtered.csv", index=False)

# Rows with no mapped therapy at all (Therapy_Sum == 0), kept for inspection.
zero_rows_therapy_df = df_binary_matrix.loc[df_binary_matrix["Therapy_Sum"].eq(0)]
zero_rows_therapy_df.to_csv("zero_rows_therapy_df.csv", index=False)

# Report the size of each subset.
print(f"Filtered dataset size (papers with therapies): {len(df_binary_matrix_filtered)} rows")
print(f"Dropped dataset size (papers with no therapies): {len(zero_rows_therapy_df)} rows")

In [None]:
# Filter therapy matrix by Therapy_Sum and verify dataset size

os.chdir(output_directory)
# NOTE(review): this file name matches the already-filtered matrix saved
# earlier, so no zero-sum rows would be expected here -- confirm whether
# "binary_therapy_matrix.csv" was the intended input.
matrix_file = "binary_therapy_matrix_filtered.csv"
df_binary_matrix = pd.read_csv(matrix_file)
df_binary_matrix_filtered = df_binary_matrix[df_binary_matrix["Therapy_Sum"] > 0]
original_length = len(df_binary_matrix)
filtered_length = len(df_binary_matrix_filtered)
# Count zero-sum rows directly rather than as original - filtered, so the
# consistency check below can actually fail (e.g. on NaN or negative sums).
dropped_length = int((df_binary_matrix["Therapy_Sum"] == 0).sum())
# Guard against an empty input file to avoid ZeroDivisionError.
dropped_percentage = (dropped_length / original_length) * 100 if original_length else 0.0


print(f"\nOriginal dataset length: {original_length:,}")
print(f"Filtered dataset length (Therapy_Sum > 0): {filtered_length:,}")
print(f"Dropped rows (Therapy_Sum == 0): {dropped_length:,} ({dropped_percentage:.2f}%)")
if original_length == (filtered_length + dropped_length):
    print("\n--> The numbers add up correctly!")
else:
    print("\n--> Warning: The numbers do NOT add up correctly!")