# Normalization of cancers and treatments

# 1) Set up libraries and datasets

## 1.1) Import libraries and models

In [None]:
# Import libraries
import ast
import datetime
import json
import os
import re
import time

import community
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram
from tqdm import tqdm

print("Success!")

## 1.2) Load datasets

In [None]:
# Directory configuration. The values below are placeholders and must be
# replaced with real paths before the notebook is run.
variantscape_directory = "VARIANTSCAPE_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
input_directory = "INPUT_DIRECTORY"
figure_directory = "FIGURE_DIRECTORY"

# Make the variantscape directory the working directory and report it.
os.chdir(variantscape_directory)
print(f"Current directory: {os.getcwd()}")

In [None]:
# Load datasets
def _join_alias_list(raw_aliases):
    """Turn a stringified Python list of aliases into a comma-separated string.

    Values that are not strings starting with '[' (for example NaN or plain
    text) are returned unchanged. The list text is parsed with
    ``ast.literal_eval`` rather than ``eval`` so that only Python literals are
    accepted and nothing stored in the CSV can be executed as code.

    Raises:
        ValueError or SyntaxError: if a '['-prefixed string is not a valid
            Python literal.
    """
    if isinstance(raw_aliases, str) and raw_aliases.startswith('['):
        return ', '.join(ast.literal_eval(raw_aliases))
    return raw_aliases


#### In output directory
os.chdir(output_directory)
CIVIC_file = "CIVIC_cancer_synonyms_cleaned.csv"
CIVIC_cancer_synonyms_df = pd.read_csv(CIVIC_file)
print("- CIVIC cancer synonyms are loaded!")
print(CIVIC_cancer_synonyms_df.head(5))
print("\n","###" * 20, "\n")

## Treatment dataset
# Only these four columns are read from the treatment file.
columns_to_load = ['name', 'therapyAliases', 'final_parent', 'parent_treatment_category']
treatment_file = "CIVIC_ncit_df_finalparent_treatmentcategory.csv"
treatment_synonyms_df = pd.read_csv(treatment_file, usecols=columns_to_load)
# Flatten list-like alias strings such as "['a', 'b']" into "a, b".
treatment_synonyms_df['therapyAliases'] = treatment_synonyms_df['therapyAliases'].apply(_join_alias_list)
print("- Treatment synonyms are loaded!")
print(treatment_synonyms_df)
print(len(treatment_synonyms_df))
print("\n","###" * 20, "\n")

#### In variantscape directory
# The working directory is left at the variantscape directory, so relative
# file names used after this cell resolve there.
os.chdir(variantscape_directory)
variant_df = pd.read_csv("final_variant_df_for_analysis.csv", low_memory=False)
print("\n\nFull variant dataset to analyze loaded!")
len_variant_df_rows, len_variant_df_cols = variant_df.shape
print(f"\nContains {len_variant_df_rows:,} rows, {len_variant_df_cols:,} columns")
print("\nSuccess!")

# Step 1) Harmonize cancers

In [None]:
# Set of columns to ignore
ignore_columns = {
    'PaperId', 'PaperTitle', 'Citations', 'CoFoS', 'Authors', 'Abstract',
    'Language', 'PubYear', 'PubDate', 'BioBERT', 'Sum_Gene_Mentions',
    'Extracted_Cancer_Terms_old', 'Extracted_Cancer_Terms','Cancer_Type_Sum',
    'Mapped_Cancer_Terms', 'Unmatched_Cancer_Terms', 'Remapped_Cancer_Terms',
    'Final_Mapped_Cancer_Terms', 'Treatment_matches', 'Sum_treatments', 'Study_design',
    'total_variant_count','LLM_Prompt', 'LLM_Response', 'Cleaned_Variant_Gene_Pairs'
}

# Check first row values for 'Cancer' (case-insensitive).
# Row 0 is read once up front: calling variant_df.iloc[0] inside the
# comprehension would rebuild the whole row for every column.
first_row_labels = variant_df.iloc[0]
cancer_cols_in_row0 = [
    col for col in variant_df.columns
    if col not in ignore_columns
    and "cancer" in str(first_row_labels[col]).lower()
]
print(f"Number of relevant 'Cancer' columns (excluding ignored ones): {len(cancer_cols_in_row0)}")
print("Columns list:")
print(cancer_cols_in_row0)

In [None]:
# Ensure final_parent values are lowercase and unique
# (missing values are dropped before lower-casing; the result is a set used
# for exact-name lookups in classify_column).
final_parents_cleaned = set(CIVIC_cancer_synonyms_df["final_parent"].dropna().str.lower().unique())

# Keyword mapping dictionary: keyword searched for in a lower-cased column
# name -> harmonized cancer label returned for that column.
# classify_column walks this dict in insertion order and returns on the first
# keyword that matches, so the ORDER of the entries is part of the behavior.
# NOTE(review): "myeloid" is listed before "myeloid cancer"; any name that
# contains "myeloid cancer" also contains "myeloid", so the later
# "myeloid cancer" -> "hematologic cancer" entry looks unreachable. Confirm
# which of the two labels is intended.
keyword_mapping = {
    "skin": "skin cancer",
    "breast": "breast cancer",
    "mammary": "breast cancer",
    "mucinous": "mucinous cancer",
    "lung": "lung cancer", #this is Non-Small Cell Lung Cancer NSCLC!
    "bronchio": "lung cancer",
    "spindle cell": "spindle cell cancer",
    "acute myeloid leukemia": "acute myeloid leukemia",
    "salivary gland": "salivary gland cancer",
    "renal": "renal cancer",
    "prostate": "prostate cancer",
    "pancreatic": "pancreatic cancer",
    "medulloblastoma": "medulloblastoma",
    "lymphoblastic leukemia": "lymphoblastic leukemia",
    "myeloid": "myeloid cancer",
    "kidney": "kidney cancer",
    "head and neck": "head and neck cancer",
    "gastrointestinal": "gastrointestinal cancer",
    "neurofibroma": "neurofibroma",
    "ovarian": "ovarian cancer",
    "ovary": "ovarian cancer",
    "supratentorial ependymoma": "supratentorial ependymoma",
    "cervix": "cervix cancer",
    "cervical": "cervix cancer",
    "colorectal": "colon cancer",
    "colon": "colon cancer",
    "endometri": "endometrial cancer",
    "melano": "melanoma",
    "laryngeal": "laryngeal cancer",
    "glioma": "glioma",
    "bone": "bone cancer",
    "osteo": "bone cancer",
    "peritoneal": "peritoneal cancer",
    "astrocytoma": "astrocytoma",
    "glioblastoma": "glioblastoma",
    "gastric": "gastric cancer",
    "mesothelioma": "mesothelioma",
    "esophag": "esophagus cancer",
    "thyroid": "thyroid cancer",
    "thymus": "thymus cancer",
    "uterus": "uterine cancer",
    "spinal": "spinal cancer",
    "hepatocellular": "liver cancer",
    "cholangio": "cholangio cancer",
    "bile duct": "biliary tract cancer",
    "gliosarcoma": "glioma",
    "myeloid cancer": "hematologic cancer",
    "myeloproliferative cancer": "hematologic cancer",
    "myelodysplastic syndrome": "hematologic cancer",
    "essential thrombocythemia": "hematologic cancer",
    "myelofibrosis": "hematologic cancer",
    "barrett": "esophagus cancer",
    "fraumeni": "li-fraumeni syndrome",
    "liposarcoma": "liposarcoma",
    "papillary": "papillary cancer"
}

# Classifier for leukemia/lymphoma terms
def classify_leukemia_lymphoma(name):
    """Label a term as leukemia, lymphoma, both, or neither.

    The check is a case-insensitive substring search for "leukemia" or
    "leukemic" and for "lymphoma".

    Args:
        name: Term to inspect; anything that is not a string yields None.

    Returns:
        "leukemia/lymphoma", "leukemia", "lymphoma", or None when neither
        family is mentioned.
    """
    if not isinstance(name, str):
        return None

    text = name.lower()
    mentions_leukemia = "leukemia" in text or "leukemic" in text
    mentions_lymphoma = "lymphoma" in text

    if mentions_leukemia:
        return "leukemia/lymphoma" if mentions_lymphoma else "leukemia"
    return "lymphoma" if mentions_lymphoma else None

# Classify each column name
def classify_column(col_name):
    """Map a raw cancer column name to a harmonized cancer label.

    The rules are tried in this order and the first one that applies wins:

    1. the lower-cased name is itself a member of ``final_parents_cleaned``;
    2. a ``keyword_mapping`` keyword occurs at the start of a word in the
       name (keywords are tried in dict insertion order);
    3. the name mentions leukemia and/or lymphoma;
    4. otherwise the name is reported as unmatched.

    Args:
        col_name: Column name (string) to classify.

    Returns:
        A ``(match_type, matched_value)`` tuple. ``match_type`` is one of
        "final_parent match", "keyword match",
        "leukemia/lymphoma classification" or "not matched"; for
        "not matched" the value is the original, un-lowered ``col_name``.
    """
    col_lower = col_name.lower()
    # 1. Direct match with cleaned final_parents
    if col_lower in final_parents_cleaned:
        return ("final_parent match", col_lower)
    # 2. Keyword mapping
    # Only the START of the keyword is anchored at a word boundary. Several
    # keywords are stems ("endometri", "melano", "esophag", "osteo", ...);
    # requiring a boundary after them as well meant they could never match
    # the full words they stand for ("endometrial", "melanoma", ...).
    for keyword, replacement in keyword_mapping.items():
        if re.search(rf"\b{re.escape(keyword)}", col_lower):
            return ("keyword match", replacement)
    # 3. Leukemia/Lymphoma classification
    leukemia_lymphoma_result = classify_leukemia_lymphoma(col_lower)
    if leukemia_lymphoma_result:
        return ("leukemia/lymphoma classification", leukemia_lymphoma_result)
    # 4. No match — return original name as value
    return ("not matched", col_name)

# Classify every detected cancer column and keep one record per column
match_info = []
for column in cancer_cols_in_row0:
    kind, value = classify_column(column)
    match_info.append({"column_name": column, "match_type": kind, "matched_value": value})

# Fixed-width overview table: column name | match type | matched value
table_header = f"\n{'Column Name':<40} | {'Match Type':<30} | Matched Value"
print(table_header)
print("-" * 100)
for entry in match_info:
    print(f"{entry['column_name']:<40} | {entry['match_type']:<30} | {entry['matched_value']}")

# Number of distinct harmonized labels produced by the classification
unique_matched_values = {entry['matched_value'] for entry in match_info}
print(f"\nTotal unique matched cancer types: {len(unique_matched_values)}")

# Tally the records per match type in a single pass over match_info
match_types = ["final_parent match", "keyword match", "leukemia/lymphoma classification", "not matched"]
count_by_type = dict.fromkeys(match_types, 0)
for entry in match_info:
    if entry["match_type"] in count_by_type:
        count_by_type[entry["match_type"]] += 1
for kind in match_types:
    print(f"\nTotal columns with match type '{kind}': {count_by_type[kind]}")

# Sum of the per-type counts printed above
total_columns = sum(count_by_type.values())
print(f"\nTotal sum of columns: {total_columns}")

In [None]:
# Rename the cancer columns to their harmonized labels, merge columns that end
# up with the same label, and assemble cleaned_df_v1 as
# [ignored columns | other non-cancer columns | merged cancer columns].

# Inspect all columns before renaming (including cancer and non-cancer columns)
total_columns_before = variant_df.shape[1]
print(f"\nTotal columns in variant_df (before renaming): {total_columns_before}")

# Original cancer column name -> harmonized label from the classification step
column_rename_map = {
    row["column_name"]: row["matched_value"] for row in match_info
}

# Extract cancer-related columns and rename them based on the classification result
cancer_data = variant_df[cancer_cols_in_row0].copy()
cancer_data_renamed = cancer_data.rename(columns=column_rename_map)

# Re-count the columns whose row-0 value contains 'cancer' (excluding ignored
# columns). Row 0 is read once instead of being rebuilt for every column.
first_row_labels = variant_df.iloc[0]
columns_with_cancer_in_index0 = [
    col for col in variant_df.columns
    if col not in ignore_columns and 'cancer' in str(first_row_labels[col]).lower()
]
print(f"\nNumber of columns in index 0 with 'Cancer' in their names (excluding ignored columns): {len(columns_with_cancer_in_index0)}")

# Merge columns that now share a name: drop the label row (index 0), coerce to
# integers (non-numeric values become 0) and take the per-name maximum, which
# acts as a logical OR for 0/1 flags.
cancer_data_cleaned = cancer_data_renamed.drop(index=0).copy()
cancer_data_cleaned = cancer_data_cleaned.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
merged_cancer_binary_df = cancer_data_cleaned.T.groupby(level=0).max().T

# Count how many cancer columns were merged
total_cancer_columns_after_merge = len(merged_cancer_binary_df.columns)  # Columns after merging
total_cancer_columns_before_merge = len(cancer_data_renamed.columns)  # Columns before merging
columns_merged = total_cancer_columns_before_merge - total_cancer_columns_after_merge
print(f"\nTotal cancer columns merged: {columns_merged}")

# Inspect non-cancer columns (keep them separate).
# A set makes the membership test O(1) per column instead of scanning the list.
cancer_column_set = set(cancer_cols_in_row0)
non_cancer_columns = [
    col for col in variant_df.columns
    if col not in cancer_column_set and col not in ignore_columns
]
print(f"\nTotal non-cancer columns: {len(non_cancer_columns)}")

# Merge the final non-cancer columns with the merged cancer columns (preserving column order).
# ignore_columns may be a set, whose iteration order is arbitrary; walking
# variant_df.columns keeps the ignored columns in their original order and
# makes the column layout of cleaned_df_v1 reproducible between runs.
ignored_columns_in_final = [col for col in variant_df.columns if col in ignore_columns]
ignored_df      = variant_df[ignored_columns_in_final]
non_cancer_df   = variant_df[non_cancer_columns]
cancer_df       = merged_cancer_binary_df

# Add "Cancer" to index 0 for the merged cancer columns ONLY.
# The merged frame lost its label row, so it is re-aligned to the full index
# (row 0 becomes NaN), cast to object so a string can live next to the
# numbers, and labelled before concatenation. Missing labels in the other
# columns are left as NaN instead of being relabelled as "Cancer".
cancer_with_label_df = cancer_df.reindex(variant_df.index).astype(object)
cancer_with_label_df.iloc[0] = "Cancer"
cleaned_df_v1 = pd.concat([ignored_df, non_cancer_df, cancer_with_label_df], axis=1)

total_columns_after_merge = cleaned_df_v1.shape[1]
print(f"\nTotal columns after renaming and merging (final merged DataFrame): {total_columns_after_merge}")

columns_removed_by_merging = total_columns_before - total_columns_after_merge
print(f"\nTotal columns REMOVED by merging: {columns_removed_by_merging}")

# Verify the math check: the expected total is derived independently from the
# number of cancer columns that were merged away, so a mismatch is detectable
# (subtracting columns_removed_by_merging here would always print True).
calculated_total_after_merge = total_columns_before - columns_merged
print(f"\nCalculated Total columns AFTER renaming and merging: {calculated_total_after_merge}")
print(f"Total columns AFTER renaming and merging (actual): {total_columns_after_merge}")
print(f"Do the values match? {calculated_total_after_merge == total_columns_after_merge}")

# Columns of the final frame whose NAME contains 'cancer' (not the row-0 label)
cancer_columns_after_merge = [
    col for col in cleaned_df_v1.columns
    if 'cancer' in col.lower() and col not in ignore_columns
]
print(f"\nTotal 'Cancer' columns after merging (excluding ignored columns): {len(cancer_columns_after_merge)}")

print("\nFinal Dataset (first 5 rows):")
print(cleaned_df_v1.head())

final_dataset_name = "cleaned_df_v1"
print(f"\nThe name of the final dataset after this execution is: {final_dataset_name}")

In [None]:
# Report the size of cleaned_df_v1
row_count, column_count = cleaned_df_v1.shape
print(f"Length rows: {row_count:,}")
print(f"Length columns: {column_count:,}")

# Column names, as a plain list
print("\nAll column names in cleaned_df_v1:")
print(cleaned_df_v1.columns.tolist())

# Row 0 is fetched once and reused for the checks below
label_row = cleaned_df_v1.iloc[0]
print("\nValues in index 0 (first row) of cleaned_df_v1:")
print(label_row.tolist())

# How many row-0 entries are missing, and in which columns
missing_label_mask = label_row.isna()
empty_values_in_index0 = missing_label_mask.sum()
print(f"\nNumber of empty (NaN) values in index 0 (first row): {empty_values_in_index0}")

columns_with_empty_values = cleaned_df_v1.columns[missing_label_mask].tolist()
print(f"\nColumns with NaN values in index 0: {columns_with_empty_values}")

# Persist the frame (written relative to the current working directory)
cleaned_df_v1.to_csv("cleaned_df_v1.csv", index=False)
print("\ncleaned_df_v1 has been kindly saved as 'cleaned_df_v1.csv'")

# Step 2) Treatment normalization

In [None]:
# Size of cleaned_df_v1
rows_v1, cols_v1 = cleaned_df_v1.shape
print(f"Total rows: {rows_v1:,}")
print(f"Total columns: {cols_v1:,}")

# Collect the non-ignored columns whose row-0 value mentions 'treatment'
# (case-insensitive)
row0_labels = cleaned_df_v1.iloc[0]
columns_with_treatment_in_index0 = []
for column in cleaned_df_v1.columns:
    if column in ignore_columns:
        continue
    if 'treatment' in str(row0_labels[column]).lower():
        columns_with_treatment_in_index0.append(column)

print("\nColumns with 'treatment' in index 0 (excluding ignored columns):")
for column in columns_with_treatment_in_index0:
    print(column)

In [None]:
# Overall shape of cleaned_df_v1
shape_rows, shape_cols = cleaned_df_v1.shape
print(f"Total rows: {shape_rows:,}")
print(f"Total columns: {shape_cols:,}")

# Frequency of each distinct value found in row 0, i.e. how many columns
# carry each value
metadata_row = cleaned_df_v1.iloc[0]
metadata_counts = metadata_row.value_counts()

print("\nUnique values in index 0 and how many columns have each:")
print(metadata_counts)

In [None]:
# Build cleaned_df_v2: a copy of cleaned_df_v1 without the columns labelled
# 'Initial df' in row 0 (except the protected ones) and without the columns
# listed in ignore_columns.

# Copy cleaned_df_v1 into cleaned_df_v2
cleaned_df_v2 = cleaned_df_v1.copy()
# Columns that are kept even when their row-0 label is 'Initial df'
exclude_columns = ['PaperId', 'PaperTitle','Abstract']

# Identify columns with metadata "Initial df" in row 0, excluding the columns
# above. Row 0 is read once instead of being rebuilt for every column.
row0_labels = cleaned_df_v2.iloc[0]
columns_to_drop = [
    col for col in cleaned_df_v2.columns
    if col not in exclude_columns and row0_labels[col] == 'Initial df'
]

# Drop the specified columns in ignore_columns.
# NOTE: this rebinds ignore_columns (a set in the cancer harmonization step)
# to a list that no longer contains PaperId, PaperTitle, Abstract and
# Study_design, so those four columns are not dropped here.
ignore_columns = [
    'Citations', 'CoFoS', 'Authors',
    'Language', 'PubYear', 'PubDate', 'BioBERT', 'Sum_Gene_Mentions',
    'Extracted_Cancer_Terms_old', 'Extracted_Cancer_Terms', 'Cancer_Type_Sum',
    'Mapped_Cancer_Terms', 'Unmatched_Cancer_Terms', 'Remapped_Cancer_Terms',
    'Final_Mapped_Cancer_Terms', 'Treatment_matches', 'Sum_treatments',
    'total_variant_count', 'LLM_Prompt', 'LLM_Response', 'Cleaned_Variant_Gene_Pairs'
]

# Add the ignore columns to the list of columns to drop, ensuring the columns
# are present in the DataFrame and not already selected through their
# 'Initial df' label (otherwise they would be listed twice).
already_selected = set(columns_to_drop)
columns_to_drop += [
    col for col in ignore_columns
    if col in cleaned_df_v2.columns and col not in already_selected
]
print("Columns to be dropped based on metadata (Initial df) and ignore_columns:")
print(columns_to_drop)

# Column count before dropping
columns_before = cleaned_df_v1.shape[1]

# Drop the identified columns
cleaned_df_v2 = cleaned_df_v2.drop(columns=columns_to_drop)
cleaned_df_v2.reset_index(drop=True, inplace=True)

# Column count after dropping (must be taken after the drop to be meaningful)
columns_after = cleaned_df_v2.shape[1]
columns_dropped = columns_before - columns_after

print(f"\nNumber of columns in cleaned_df_v1: {columns_before}")
print(f"Number of columns in cleaned_df_v2: {cleaned_df_v2.shape[1]}")
print(f"Total number of columns dropped: {columns_dropped}")

# Cross-check: columns present in v1 but absent from v2
dropped_columns = [col for col in cleaned_df_v1.columns if col not in cleaned_df_v2.columns]
print("\nDropped columns:")
print(dropped_columns)
print("\nCleaned DataFrame (cleaned_df_v2) after dropping columns:")
print(cleaned_df_v2.head())

In [None]:
# Distinct values of the Study_design column, in order of first appearance
unique_study_designs = cleaned_df_v2['Study_design'].unique()

# One-column table of those values with a default integer index
study_design_df = pd.DataFrame({'Study_Design': unique_study_designs})
print("Study designs within dataset:")
print(study_design_df)

In [None]:
# Add Study weights: cleaned_df_v3 is cleaned_df_v2 plus a 'Study_weight'
# column placed directly after 'Study_design'.

# Create a v3 copy of cleaned_df_v2
cleaned_df_v3 = cleaned_df_v2.copy()

# Weight per study design, following the evidence pyramid
study_design_weights = {
    'Systematic review study': 1.0,      # Highest weight for Systematic Reviews & Meta-Analyses
    'Clinical study': 1.0,               # High-quality clinical studies (RCTs, Cohorts)
    'Observational/RWE study': 0.9,      # Real-world evidence, Cohort studies, Case-control studies
    'Case report study': 0.9,            # Case reports and case series
    'In vivo/Animal study': 0.8,         # Animal studies - mechanistic but not clinical
    'In vitro study': 0.7,               # Lab-based studies - valuable but less direct relevance
    'In silico study': 0.7,              # Computational predictions - hypothesis-generating but unverified
    'Undefined': 0.5                     # Weight for the explicit 'Undefined' design
}

# Map each study design to its weight; designs that are not in the mapping
# (including missing values) get 0.0. The column is stored as object dtype so
# that the string label for the metadata row can be written next to the
# numbers without an incompatible-dtype assignment.
study_weights = cleaned_df_v3['Study_design'].map(study_design_weights).fillna(0.0)
cleaned_df_v3['Study_weight'] = study_weights.astype(object)

# The metadata row (index 0) carries the label 'Study' instead of a weight
cleaned_df_v3.loc[0, 'Study_weight'] = 'Study'

# Move 'Study_weight' so that it sits right after 'Study_design'
columns = cleaned_df_v3.columns.tolist()
study_design_index = columns.index('Study_design')

columns.insert(study_design_index + 1, columns.pop(columns.index('Study_weight')))
cleaned_df_v3 = cleaned_df_v3[columns]

# Generate output
print(cleaned_df_v3.head(10))
output_file = 'cleaned_df_v3.csv'
cleaned_df_v3.to_csv(output_file, index=False)
print(f"\n\nNumber of columns in dataset v2: {len(cleaned_df_v2.columns):,}")
print(f"Number of rows in dataset v2: {len(cleaned_df_v2):,}")
print(f"Number of columns in dataset v3: {len(cleaned_df_v3.columns):,}")
print(f"Number of rows in dataset v3: {len(cleaned_df_v3):,}")
# The original message contained the typo "\n7n"; two newlines were intended.
print(f"\n\nFile successfully saved as: '{output_file}'")

# ==============================================

In [None]:
# Split the metadata row off into its own one-row frame and turn the remaining
# non-text columns of cleaned_df_v4 into 0/1 flags.

# Extract metadata from row 0 and store it in a mapping DataFrame
cleaned_df_v4 = cleaned_df_v3.copy()
metadata_row = cleaned_df_v4.iloc[0]
metadata_mapping = pd.DataFrame(metadata_row).T

# Drop the metadata row (index 0) from cleaned_df_v4
cleaned_df_v4 = cleaned_df_v4.drop(0)
cleaned_df_v4.reset_index(drop=True, inplace=True)

# Harmonize binary matrix to integers (0 and 1) for all columns except text columns
text_columns = ['PaperId', 'PaperTitle', 'Abstract', 'Study_design',"Study_weight"]

for col in cleaned_df_v4.columns:
    if col not in text_columns:
        # Non-numeric entries become NaN first ...
        cleaned_df_v4[col] = pd.to_numeric(cleaned_df_v4[col], errors='coerce')
        # ... then only exact 1 and 0 are kept; every other value becomes NaN
        cleaned_df_v4[col] = cleaned_df_v4[col].apply(lambda x: 1 if x == 1.0 else (0 if x == 0.0 else np.nan))

# Generate output
cleaned_df_v4.to_csv('cleaned_df_v4.csv', index=False)
metadata_mapping.to_csv('metadata_mapping.csv', index=False)
print("Metadata Mapping DataFrame:")
# The heading above was previously printed without the frame it announces
print(metadata_mapping)
print("\nCleaned DataFrame (cleaned_df_v4) after dropping metadata row and harmonizing binary matrix:")
print(cleaned_df_v4.head(3))

In [None]:
# Work on every column except the identifier/text columns and the study weight
non_flag_columns = ['PaperId', 'PaperTitle', 'Abstract', 'Study_design','Study_weight']
subset_df = cleaned_df_v4.drop(columns=non_flag_columns)

# Keep the columns whose non-missing values are all 0 or 1
binary_columns = []
for name in subset_df.columns:
    observed = subset_df[name].dropna()
    if observed.isin([0, 1]).all():
        binary_columns.append(name)

print("Columns containing only 1 and 0:")
for name in binary_columns:
    print(name)

# Column sums (number of 1s for 0/1 columns), largest first
sums = subset_df[binary_columns].sum()
sorted_sums = sums.sort_values(ascending=False)
print("Sorted list of binary columns by count of 1s (from highest to lowest):")
for name, total in sorted_sums.items():
    print(f"{name}: {total}")

In [None]:
# Sanity checks on cleaned_df_v4: binary columns, text columns, missing values
# and all-zero columns. Nothing is modified; results are only printed.

# Validation 1: Check binary columns for consistency (should only contain 0 and 1)
binary_columns = [col for col in cleaned_df_v4.columns if col not in ['PaperId', 'PaperTitle', 'Abstract','Study_design','Study_weight']]

# Check if all values in these columns are either 0 or 1
# (a missing value also fails this check, because NaN is neither 0 nor 1)
binary_valid = True
for col in binary_columns:
    if not cleaned_df_v4[col].isin([0, 1]).all():
        print(f"Warning: Column '{col}' contains values other than 0 and 1.")
        binary_valid = False
if binary_valid:
    print("\nAll binary columns are valid (only contain 0 and 1).")

# Validation 2: Check text columns to ensure they don't contain numeric values.
# Missing values are NaN, which is a float; they are excluded here so that an
# empty cell is not reported as a number (missing values are covered by
# Validation 3 below).
text_columns = ['PaperId', 'PaperTitle', 'Abstract', 'Study_design']
text_valid = True
for col in text_columns:
    has_numeric_value = cleaned_df_v4[col].apply(
        lambda x: isinstance(x, (int, float)) and not pd.isna(x)
    ).any()
    if has_numeric_value:
        print(f"Warning: Column '{col}' contains numeric values.")
        text_valid = False

if text_valid:
    print("\nAll text columns contain valid string values.")

# Validation 3: Check for missing values
missing_values = cleaned_df_v4.isnull().sum()
if missing_values.any():
    print("\nMissing values found in the following columns:")
    print(missing_values[missing_values > 0])
else:
    print("\nNo missing values found.")


# Checking for columns that only contain 0s (sum = 0)
zero_only_columns = [col for col in binary_columns if cleaned_df_v4[col].sum() == 0]

if zero_only_columns:
    print("\nColumns that contain only 0s (sum = 0):")
    for col in zero_only_columns:
        print(f" - {col}")
else:
    print("\nNo columns found that only contain 0s (sum = 0).")

print("\nValidation Summary:")
print(f"Binary columns valid: {binary_valid}")
print(f"Text columns valid: {text_valid}")

In [None]:
# Final df and csv
# The two writes below are kept commented out; the same two files are already
# written in the cell that builds cleaned_df_v4 and metadata_mapping.
#cleaned_df_v4.to_csv('cleaned_df_v4.csv', index=False)
#metadata_mapping.to_csv('metadata_mapping.csv', index=False)
# Show the final (rows, columns) shape and the first five rows.
print(cleaned_df_v4.shape)
print(cleaned_df_v4.head())