# Variant normalization

- Load dataset from LLM ("llama33-70b": "meta-llama/Llama-3.3-70B-Instruct") variant extraction
- Normalization in 3 steps:
    - 1. Clean and normalize based on rules (e.g., merge protein changes such as p.V600E and V600E)
    - 2. Normalize based on CIVIC API and Aliases
    - 3. Normalize based on ClinVar and Ensembl HGVS Notations
   
   
- Create binary matrix (columns have been merged, so duplicated variants are removed)
- Check which proportion is related to Oncomine genes and prostate cancer genes 
- Output figure of extracted variants

# 1) Set up libraries and datasets

In [None]:
# Import libraries
import os
import sys
import re
import math
import time
import logging
import pprint

from datetime import datetime, timedelta
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import requests
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm

print("Success!")


In [None]:
# Set the working directory and file paths
# NOTE(review): the three values below look like placeholders that have to be
# replaced with real directory paths before the notebook is run — confirm.
input_directory = "INPUT_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
LLM_directory = "LLM_DIRECTORY"

# Relative file names used by the following cells are resolved against this
# directory until a later cell calls os.chdir() again.
os.chdir(output_directory)
print("Current Working Directory:", os.getcwd())

# 2) Rule-based normalization and matrix creation

In [None]:
# Compute total runtime
# Reads the per-batch runtimes from a text log, sums them and writes/prints a
# small summary report.
# Define input and output file paths
input_file = "final_runtime_summary_llama33-70b_prompt3.txt"
output_file = "final_runtime_summary_statistics_variant_extraction_llama33-70b_prompt3.txt"
batch_runtimes = []
with open(input_file, "r", encoding="utf-8") as file:
    for line in file:
        # Only whole-number values directly after "Batch Runtime:" are captured.
        match = re.search(r'Batch Runtime:\s*(\d+)', line)
        if match:
            batch_runtimes.append(int(match.group(1)))

# Break the total down into days / hours / minutes / seconds.
total_seconds = sum(batch_runtimes)
minutes, seconds = divmod(total_seconds, 60)
hours, minutes = divmod(minutes, 60)
days, hours = divmod(hours, 24)

# Round to the nearest minute and carry the overflow upwards.
if seconds >= 30: minutes += 1
if minutes >= 60: hours, minutes = hours + 1, 0
if hours >= 24: days, hours = days + 1, 0

# Report lines: one entry per batch, followed by the totals. The min/h totals
# are rounded up (math.ceil); the last line uses the rounded d/h/min breakdown
# and omits every component that is zero.
output_content = [f"\n--- Batch execution times ---"] + \
                 [f"Batch {i}: {runtime:,} sec" for i, runtime in enumerate(batch_runtimes, start=1)] + \
                 [f"\n--- Summary statistics ---",
                  f"Total execution time: {total_seconds:,} sec",
                  f"Total execution time: {math.ceil(total_seconds / 60):,} min",
                  f"Total execution time: {math.ceil(total_seconds / 3600):,} h",
                  f"Total execution time: {' '.join(filter(None, [f'{days}d' if days else '', f'{hours}h' if hours else '', f'{minutes} min' if minutes else '']))}"]

with open(output_file, "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(output_content) + "\n")

print("\n".join(output_content))
print(f"\n Saved in '{output_file}'.")

In [None]:
#####################################################################################
### 1 Load dataset with LLM-identified variants and extract to pairs ###
#####################################################################################

# Load dataset
# Change into the LLM directory exactly once: repeating os.chdir() with a
# relative path would be resolved against the new working directory and fail,
# and with an absolute path the second call does nothing.
os.chdir(LLM_directory)
file_path = "final_LLM_variant_extraction_llama33-70b_prompt3.csv"
initial_variant_df = pd.read_csv(file_path)
print("Dataset loaded")

# Remember the original shape for the summaries printed later on.
initial_rows = len(initial_variant_df)
initial_columns = len(initial_variant_df.columns)

print(f"Length of dataset: {initial_rows:,} rows")
print(f"Length of columns: {initial_columns:,} columns before variant and gene extraction")

# Work on a copy so the raw input stays available unchanged.
variant_df = initial_variant_df.copy()

# Extract gene and variant pairs
# PaperId values that cannot be parsed as a number are replaced by 0.
variant_df["PaperId"] = pd.to_numeric(variant_df["PaperId"], errors='coerce').fillna(0).astype('int64')
variant_column = "LLM_Response"
def extract_variant_gene_pairs(text):
    """Pull ``(variant, gene)`` tuples out of one LLM response.

    The response is searched for fragments of the form
    ``Variant: <variant>, Gene: <gene>``; line breaks are treated as spaces.

    Args:
        text: Raw response text, or a missing value (NaN/None).

    Returns:
        List of ``(variant, gene)`` string tuples in order of appearance;
        empty when the text is missing or nothing matches.
    """
    if pd.isna(text):
        return []
    flattened = " ".join(text.split("\n"))
    pair_pattern = r"Variant:\s*([\w\d\.\*-]+),\s*Gene:\s*([\w\d]+)"
    return [(hit.group(1), hit.group(2)) for hit in re.finditer(pair_pattern, flattened)]

# One list of (variant, gene) tuples per row.
variant_df["Variant_Gene_Pairs"] = variant_df[variant_column].apply(extract_variant_gene_pairs)

# max_variants: largest number of distinct raw pairs found in a single row; it
# later determines how many Variant_n / Gene_n columns are created.
# total_tuples: all extracted tuples, duplicates within a row included.
max_variants = variant_df["Variant_Gene_Pairs"].apply(lambda x: len(set(x)) if isinstance(x, list) else 0).max()
total_tuples = variant_df["Variant_Gene_Pairs"].apply(len).sum()

print(f"\n\nAfter variant and gene tuple extraction")
print(f"Max number of unique variant-gene pairs in a row: {max_variants}")
print(f"Total variant-gene tuples extracted across all rows: {total_tuples:,}")
print(f"Length of columns: {len(variant_df.columns):,} columns (+1 added)")

#####################################################################################
### 2 Clean and deduplicate variant-gene pairs ###
#####################################################################################

# --- Helper Functions for Normalization ---
def remove_hgvs_prefix(variant):
    """Strip one leading ``c.`` or ``p.`` (any letter case) from *variant*."""
    hgvs_prefix = re.compile(r"^[cp]\.", re.IGNORECASE)
    return hgvs_prefix.sub("", variant)

# Three-letter (lower-case) to one-letter amino acid codes.
aa_3_to_1 = {
    "val": "V", "gly": "G", "glu": "E", "asp": "D", "thr": "T", "met": "M",
    "ala": "A", "leu": "L", "ser": "S", "pro": "P", "cys": "C", "phe": "F",
    "his": "H", "lys": "K", "asn": "N", "tyr": "Y", "trp": "W", "ile": "I",
    "arg": "R", "gln": "Q"
}

# Compiled once instead of on every call: <three-letter code><position><three-letter code>,
# case-insensitive and delimited by word boundaries.
_AA_SUBSTITUTION_PATTERN = re.compile(
    rf"(?i)\b({'|'.join(aa_3_to_1.keys())})(\d+)({'|'.join(aa_3_to_1.keys())})\b"
)


def normalize_amino_acids(variant):
    """Rewrite three-letter substitutions in one-letter form.

    Example: ``Val600Glu`` becomes ``V600E``. Text that does not contain a
    complete ``<aa><position><aa>`` pattern is returned unchanged.

    Args:
        variant: Variant string to normalize.

    Returns:
        The variant with every matching substitution replaced.
    """
    def replacer(match):
        ref = match.group(1).lower()
        pos = match.group(2)
        alt = match.group(3).lower()
        return f"{aa_3_to_1[ref]}{pos}{aa_3_to_1[alt]}"

    return _AA_SUBSTITUTION_PATTERN.sub(replacer, variant)


# Canonical exon-level label -> spellings that are folded into it.
exon_map = {
    "exon19del": ["19del", "del19", "ex19del", "exon19deletion"],
    "exon20ins": ["ex20ins", "ins20", "exon20insertion", "ex20insertion"]
}

def map_exon_variant(variant):
    """Return the canonical exon label when *variant* contains a known spelling.

    The comparison is a case-insensitive substring test; the first canonical
    entry with a matching spelling wins. Without a match the input is returned
    unchanged.

    NOTE(review): because this is a substring test, any variant that merely
    contains e.g. "19del" is mapped as well — confirm this is intended.
    """
    lowered = variant.lower()
    for canonical, spellings in exon_map.items():
        for spelling in spellings:
            if spelling in lowered:
                return canonical
    return variant

# Cleaning configuration
# All values below are read by clean_variant_gene_pairs().
# unwanted_variants is compared against the fully cleaned variant (lower-case,
# with "-", "_", "." and whitespace already removed), so the entry "c." can
# never match at that point.
unwanted_variants = {"vus", "c.", "loss", "indel"}  
# A cleaned variant containing any of these substrings is dropped.
remove_if_contains = {"mutation", "mutated", "mut", "mutant", "sensitive", 
                      "deficient", "null", "amplified", "deficiency", "deletion",
                      "loxp", "loss", "knockdown", "flox"}  
# Length limits (in characters) for cleaned variants and gene symbols.
max_variant_length = 40  
max_gene_length = 10  
min_variant_length = 2
min_gene_length = 2
# Exact (cleaned variant, gene symbol) pairs that are always dropped.
specific_variant_gene_removals = {("p53", "TP53")}

# Define gene alias map
# Maps a reported gene name (upper-case) to the symbol used in the matrix.
# The map is created here when it does not exist yet; entries added to it
# beforehand (e.g. by another cell) are kept and extended.
# Keys are looked up in clean_variant_gene_pairs() after "-", "_" and
# whitespace have been removed from the gene name, so keys containing "-"
# (e.g. "HER-2") are only reachable through their hyphen-free spelling.
try:
    gene_alias_map
except NameError:
    gene_alias_map = {}

gene_alias_map.update({
    "CASPASE": "CASP1",
    "CBRAF": "BRAF",
    "CHECK2": "CHEK2",
    "CKIT": "KIT",
    "CMET": "MET",
    "CMYC": "MYC",
    "CTNNB": "CTNNB1",
    "CYP19": "CYP19A1",
    "EGFR2": "ERBB2",
    "ER": "ESR1",
    "ERB": "ERBB2",
    "ERB2": "ERBB2",
    "ERBB": "ERBB2",
    "ERRB2": "ERBB2",
    "ERRB3": "ERBB3",
    "ERRB4": "ERBB4",
    "ESR": "ESR1",
    "FACND2": "FANCD2",
    "FCR": "FCGR3A",
    "FCRIIIA": "FCGR3A",
    "FP": "PTGFR",
    "GAL": "GAL1",
    "GALECTIN": "LGALS1",
    "GCSF": "CSF3",
    "GLUT3": "SLC2A3",
    "GNB2L1": "RACK1",
    "GNAS1": "GNAS",
    "GP130": "IL6ST",
    "GPX": "GPX1",
    "GQ": "GNAQ",
    "HER-2": "ERBB2",
    "HER-3": "ERBB3",
    "HER-4": "ERBB4",
    "HER2": "ERBB2",
    "HER2-NEU": "ERBB2",
    "HER2/NEU": "ERBB2",
    "HER2NEU": "ERBB2",
    "HER3": "ERBB3",
    "HER4": "ERBB4",
    "TELOMERASE": "TERT"
})

# Gene terms that are rejected outright (checked after alias resolution in
# clean_variant_gene_pairs). Each term is listed once.
invalid_gene_set = {
    # generic words / protein families
    "ANDROGEN", "CATENIN", "CSF", "CYP450", "VARIANT", "VITAMIN",
    "ISOCITRATE", "MICRORNA", "HISTONE3", "SURVIVIN", "GALECTIN",
    # cytogenetic bands and bare numbers
    "14", "2Q35", "6Q25", "8Q24", "9Q31",
    # ambiguous or non-symbol abbreviations
    "FCR", "FCRIIIA", "GCSF", "GAL", "GQ", "FP", "GM", "G11",
    "FNACA", "FOPNL", "CIRCHIBADH",
    # uncharacterised loci and lincRNAs
    "LOC146880", "LOC643714", "LOC730100",
    "LINC00951", "LINC01614", "LINC02183", "LINC02869",
}



# Define main cleaning function
def clean_variant_gene_pairs(pairs):
    """Normalize, filter and de-duplicate the (variant, gene) tuples of one row.

    Gene names are upper-cased, stripped of "-", "_" and whitespace, resolved
    through ``gene_alias_map`` and rejected when listed in ``invalid_gene_set``.
    Variants lose a leading "p53" (TP53 only), have three-letter amino acids
    and exon spellings normalized, lose a leading "c."/"p." and all "-", "_",
    "." and whitespace characters, and are lower-cased. Pairs failing any of
    the configured filters are skipped.

    Args:
        pairs: Iterable of ``(variant, gene)`` string tuples.

    Returns:
        List of unique ``(cleaned_variant, gene_symbol)`` tuples in order of
        first appearance.
    """
    placeholder_genes = {"NONE", "NO", "NOT", "UNKNOWN", ""}
    kept = []
    already_kept = set()

    for raw_variant, raw_gene in pairs:
        # Gene: normalize spelling first, then resolve aliases.
        symbol = re.sub(r"[-_\s]", "", raw_gene.strip().upper())
        symbol = gene_alias_map.get(symbol, symbol)
        if symbol in invalid_gene_set:
            continue

        # Variant: apply the normalization steps in their fixed order.
        name = raw_variant.strip()
        if symbol == "TP53" and name.lower().startswith("p53"):
            name = name[3:]
        name = map_exon_variant(normalize_amino_acids(name))
        name = re.sub(r"^[cp]\.", "", name, flags=re.IGNORECASE)
        name = re.sub(r"[-_\s\.]", "", name).lower()

        # Any single failing rule drops the pair.
        rejected = (
            name == symbol.lower()
            or len(name) < min_variant_length
            or len(symbol) < min_gene_length
            or (name, symbol) in specific_variant_gene_removals
            or name in unwanted_variants
            or any(term in name for term in remove_if_contains)
            or symbol in placeholder_genes
            or len(name) > max_variant_length
            or len(symbol) > max_gene_length
            or name == "none"
        )
        if rejected:
            continue

        key = (name, symbol)
        if key in already_kept:
            continue
        already_kept.add(key)
        kept.append(key)

    return kept

# progress_apply only exists on pandas objects after tqdm.pandas() has been
# called, so register it before the first use.
tqdm.pandas()

# Clean the raw pairs row by row and report the effect.
variant_df["Cleaned_Variant_Gene_Pairs"] = variant_df["Variant_Gene_Pairs"].progress_apply(clean_variant_gene_pairs)
cleaned_rows, cleaned_cols = variant_df.shape
print(f"After initial rule-based cleaning: {cleaned_rows:,} rows, {cleaned_cols:,} columns.")
total_cleaned_tuples = variant_df["Cleaned_Variant_Gene_Pairs"].apply(len).sum()
print(f"Total variant-gene tuples after cleaning: {total_cleaned_tuples:,}")

#####################################################################################
### 3 Expand variant and gene pairs into separate columns for matrix creation ###
#####################################################################################
# Column layout: Variant_1, Gene_1, Variant_2, Gene_2, ... up to max_variants.
variant_columns = []
gene_columns = []
all_columns = []
for slot in range(1, max_variants + 1):
    variant_columns.append(f"Variant_{slot}")
    gene_columns.append(f"Gene_{slot}")
    all_columns.extend((f"Variant_{slot}", f"Gene_{slot}"))

# One flat, NaN-padded row of values per row of cleaned pairs.
expanded_data = []
for pairs in tqdm(variant_df["Cleaned_Variant_Gene_Pairs"], desc="Expanding variant-gene pairs"):
    flat_values = [item for pair in pairs for item in pair]
    padding = [np.nan] * (len(all_columns) - len(flat_values))
    expanded_data.append(flat_values + padding)

# Reuse the index of variant_df so the new columns line up row by row.
expanded_df = pd.DataFrame(expanded_data, columns=all_columns)
expanded_df.index = variant_df.index
variant_df = pd.concat([variant_df, expanded_df], axis=1)

#####################################################################################
### Variant Name Cleaning (After Expansion) for Multiple Genes ###
#####################################################################################
def standardize_variant_gene_names(row):
    """Normalize the spelling of every Variant_n / Gene_n value in *row*.

    Variants are stripped, lower-cased and freed of "-", "_" and whitespace;
    genes are stripped and upper-cased. Missing values are left untouched.
    The number of slots is taken from the module-level ``max_variants``.

    Returns:
        The same row object, modified in place.
    """
    for slot in range(1, max_variants + 1):
        variant_key, gene_key = f"Variant_{slot}", f"Gene_{slot}"
        raw_variant = row.get(variant_key)
        raw_gene = row.get(gene_key)

        if pd.notna(raw_variant):
            row[variant_key] = re.sub(r"[-_\s]", "", str(raw_variant).strip().lower())

        if pd.notna(raw_gene):
            row[gene_key] = str(raw_gene).strip().upper()
    return row

def remove_gene_from_variant(row, max_variants):
    """Drop a leading gene symbol from each variant of *row*.

    For every slot with both a gene and a variant, a variant starting with
    ``<GENE>-`` or ``<GENE>`` (case-insensitive) loses that prefix. The prefix
    is kept when it is directly followed by ``*`` or ``.``.

    Args:
        row: Mapping-like row with Variant_n / Gene_n entries.
        max_variants: Number of slots to inspect.

    Returns:
        The same row object, modified in place.
    """
    for slot in range(1, max_variants + 1):
        variant_key = f"Variant_{slot}"
        raw_variant = row.get(variant_key)
        raw_gene = row.get(f"Gene_{slot}")
        if pd.isna(raw_gene) or pd.isna(raw_variant):
            continue

        symbol = str(raw_gene).strip().upper()
        text = str(raw_variant).strip()
        shouted = text.upper()

        if shouted.startswith(f"{symbol}-") and re.match(rf"^{symbol}[-][*\.]", text, re.IGNORECASE) is None:
            # "<GENE>-<variant>": remove the symbol together with the hyphen.
            row[variant_key] = text[len(symbol) + 1:]
        elif shouted.startswith(symbol) and re.match(rf"^{symbol}[*\.]", text, re.IGNORECASE) is None:
            row[variant_key] = text[len(symbol):]
    return row

def remove_alias_from_variant(row, gene_alias_map, max_variants):
    """Drop a leading gene alias from each variant of *row*.

    For every slot with both a gene and a variant, the aliases that map to the
    slot's gene are tried against the lower-cased variant; the first alias the
    variant starts with is removed. Aliases are tried longest first, so that
    e.g. "her2neu" is removed as a whole instead of only its "her2" part.

    Args:
        row: Mapping-like row with Variant_n / Gene_n string entries.
        gene_alias_map: Dict mapping alias -> canonical gene symbol.
        max_variants: Number of slots to inspect.

    Returns:
        The same row object, modified in place. Variants without a matching
        alias prefix are left exactly as they were.
    """
    # canonical symbol -> lower-case aliases, longest alias first
    alias_reverse_map = defaultdict(list)
    for alias, canonical in gene_alias_map.items():
        alias_reverse_map[canonical].append(alias.lower())
    for alias_list in alias_reverse_map.values():
        alias_list.sort(key=len, reverse=True)

    for i in range(max_variants):
        variant_col = f"Variant_{i+1}"
        gene_col = f"Gene_{i+1}"
        variant = row.get(variant_col)
        gene = row.get(gene_col)
        if pd.notna(variant) and pd.notna(gene):
            gene = gene.strip().upper()
            variant = variant.strip().lower()
            for alias in alias_reverse_map.get(gene, []):
                if variant.startswith(alias):
                    row[variant_col] = variant[len(alias):]
                    break
    return row

# Run the three row-wise clean-up passes in order: spelling normalization,
# gene-symbol prefix removal, alias prefix removal.
# progress_apply is only available on pandas objects after tqdm.pandas() has
# been called.
variant_df = variant_df.progress_apply(standardize_variant_gene_names, axis=1)
variant_df = variant_df.progress_apply(remove_gene_from_variant, axis=1, args=(max_variants,))
print("Applied variant name standardization and gene name removal after expansion for BRAF, EGFR, TP53, and others.")
variant_df = variant_df.progress_apply(remove_alias_from_variant, axis=1, args=(gene_alias_map, max_variants))
print("Removed alias prefixes (e.g., HER2) from variants based on mapped gene.")

#####################################################################################
### Remove short genes and variants again ###
#####################################################################################
def final_remove_short_entries(row):
    """Blank out Variant_n / Gene_n values that are shorter than allowed.

    Variants shorter than ``min_variant_length`` and genes shorter than
    ``min_gene_length`` are replaced by NaN; each cell is checked on its own.
    The number of slots is taken from the module-level ``max_variants``.

    Returns:
        The same row object, modified in place.
    """
    for slot in range(1, max_variants + 1):
        limits = (
            (f"Variant_{slot}", min_variant_length),
            (f"Gene_{slot}", min_gene_length),
        )
        for key, shortest_allowed in limits:
            if key not in row:
                continue
            value = row[key]
            if pd.notna(value) and len(value) < shortest_allowed:
                row[key] = np.nan
    return row
variant_df = variant_df.progress_apply(final_remove_short_entries, axis=1)
print("Final cleaning: Removed short variants and genes after all processing.")

#####################################################################################
### 4 Create binary matrix in the next cell! ###
#####################################################################################


In [None]:
# Collect and print all unique Variant_n / Gene_n combinations

# Select only the numbered slot columns and pair them by slot number. A plain
# startswith("Variant_") test also selects "Variant_Gene_Pairs", which would
# shift every variant column against its gene column when the two lists are
# zipped.
slot_ids = []
for col in variant_df.columns:
    slot_match = re.fullmatch(r"Variant_(\d+)", str(col))
    if slot_match and f"Gene_{slot_match.group(1)}" in variant_df.columns:
        slot_ids.append(slot_match.group(1))
slot_ids.sort(key=int)
variant_cols = [f"Variant_{slot_id}" for slot_id in slot_ids]
gene_cols = [f"Gene_{slot_id}" for slot_id in slot_ids]

variant_gene_set = set()
for variant_col, gene_col in zip(variant_cols, gene_cols):
    # Only rows where both halves of the pair are present.
    pairs = variant_df[[variant_col, gene_col]].dropna()
    for variant, gene in zip(pairs[variant_col], pairs[gene_col]):
        if isinstance(variant, str) and isinstance(gene, str):
            cleaned_variant = variant.strip().lower()
            cleaned_gene = gene.strip().upper()
            variant_gene_set.add((cleaned_variant, cleaned_gene))
unique_variant_gene_pairs = sorted(variant_gene_set)
print("\n Unique (Variant, Gene) pairs found in Variant_n / Gene_n columns:")
for variant, gene in unique_variant_gene_pairs:
    print(f"{variant} — {gene}")
print(f"\n Total unique Variant-Gene pairs: {len(unique_variant_gene_pairs):,}")

In [None]:
# Collect and print all unique Genes from Gene_n columns
gene_cols = [col for col in variant_df.columns if col.startswith("Gene_")]

# Every string value of every Gene_n column, stripped and upper-cased.
unique_genes = {
    gene.strip().upper()
    for gene_col in gene_cols
    for gene in variant_df[gene_col].dropna()
    if isinstance(gene, str)
}

# Shortest symbols first, alphabetical within the same length.
unique_genes_sorted = sorted(unique_genes, key=lambda symbol: (len(symbol), symbol))
print("\n Unique gene names found across Gene_n columns (sorted by length):\n")
for gene in unique_genes_sorted:
    print(gene)
print(f"\n Total unique genes: {len(unique_genes_sorted):,}")

In [None]:
#####################################################################################
### 4 Create binary matrix ###
#####################################################################################
# Pass 1: collect the variant-gene IDs of every row once. A dict keeps the IDs
# unique in first-seen order with constant-time lookups (a list membership
# test here grows quadratically with the number of distinct IDs).
variant_gene_ids_by_row = []
first_seen_ids = {}
for index, row in tqdm(variant_df.iterrows(), total=len(variant_df), desc="Collecting variant-gene IDs"):
    row_ids = []
    for variant_col, gene_col in zip(variant_columns, gene_columns):
        if pd.notna(row[variant_col]) and pd.notna(row[gene_col]):
            variant = str(row[variant_col]).strip().lower()
            gene = str(row[gene_col]).strip().upper()
            variant_gene_id = f"{variant}_{gene}"
            row_ids.append(variant_gene_id)
            first_seen_ids.setdefault(variant_gene_id, None)
    variant_gene_ids_by_row.append((index, row_ids))
variant_gene_pairs = list(first_seen_ids)
print(f"Total unique variant-gene columns: {len(variant_gene_pairs):,}")

# Pre-allocate the binary matrix with the correct columns
binary_matrix = pd.DataFrame(0, index=variant_df.index, columns=variant_gene_pairs)

# Pass 2: mark the presence of each collected ID (setting a cell twice is harmless).
for index, row_ids in tqdm(variant_gene_ids_by_row, total=len(variant_gene_ids_by_row), desc="Building binary matrix"):
    for variant_gene_id in row_ids:
        binary_matrix.at[index, variant_gene_id] = 1

binary_matrix["PaperId"] = variant_df["PaperId"]
print("Binary matrix created")

In [None]:
# Investigate binary matrix
# 1) Column names must be unique.
cols = binary_matrix.columns.tolist()
duplicates = [col for col, count in Counter(cols).items() if count > 1]
if not duplicates:
    print("All column names are unique.")
else:
    print("Duplicate column names found!")
    # Print duplicates
    print(f"Duplicate columns ({len(duplicates)}):")
    for col in duplicates:
        print(f"  {col}")

# 2) How many variant-gene columns are set at least once?
binary_cols = [col for col in binary_matrix.columns if col != "PaperId"]
column_totals = binary_matrix[binary_cols].sum(axis=0)
non_zero_columns = column_totals >= 1
active_count = non_zero_columns.sum()
inactive_count = len(binary_cols) - active_count
print(f"Total binary variant-gene columns: {len(binary_cols):,}")
print(f"Columns with at least one 1:       {active_count:,}")
print(f"Columns with all zeros:            {inactive_count:,}")

In [None]:
#####################################################################################
### 5 Merge binary matrix with dataset and add total_variant_count ###
#####################################################################################
tqdm.pandas(desc="Merging and calculating total variant count")

# binary_matrix was built on variant_df.index, so the two frames are joined
# row by row on the index. Merging on "PaperId" multiplies rows whenever an ID
# occurs more than once (all unparsable IDs were set to 0 on load).
binary_columns = [col for col in binary_matrix.columns if col != "PaperId"]
final_variant_matrix_df = pd.concat([variant_df, binary_matrix[binary_columns]], axis=1)

# Number of distinct variant-gene pairs flagged in each row.
final_variant_matrix_df["total_variant_count"] = final_variant_matrix_df[binary_columns].sum(axis=1)
print("Binary matrix created and merged with PaperId!")

In [None]:
#####################################################################################
################################ 6 FINAL SUMMARY ####################################
#####################################################################################
# Count statistics using 'total_variant_count'
# Percentages are relative to the number of rows of the initially loaded dataset.
rows_with_variants = (final_variant_matrix_df["total_variant_count"] >= 1).sum()
rows_without_variants = (final_variant_matrix_df["total_variant_count"] == 0).sum()
percentage_with_variants = (rows_with_variants / initial_rows) * 100
percentage_without_variants = (rows_without_variants / initial_rows) * 100

matrix_output_path = os.path.join(LLM_directory, "full_variant_dataframe_matrix.csv")

# The frame is written in slices of chunk_size rows; num_chunks is computed
# but not used by the loop below.
chunk_size = 10000
num_chunks = math.ceil(len(final_variant_matrix_df) / chunk_size)

with open(matrix_output_path, mode='w', newline='', encoding='utf-8') as f:
    for i, chunk_start in enumerate(tqdm(range(0, len(final_variant_matrix_df), chunk_size), desc="Saving CSV in chunks")):
        chunk_end = chunk_start + chunk_size
        chunk = final_variant_matrix_df.iloc[chunk_start:chunk_end]
        # Only the first slice writes the header; the index is never written.
        header = (i == 0) 
        chunk.to_csv(f, index=False, header=header)
        
print("\n##################################### FINAL SUMMARY ######################################")
print(f" Initial dataset: {initial_rows:,} rows, {initial_columns:,} columns.")
print(f" After binary matrix creation: {len(final_variant_matrix_df):,} rows, {len(final_variant_matrix_df.columns):,} columns.")

print("\n----------------- Variant Count Statistics -----------------")
print(f" Rows with at least 1 variant: {rows_with_variants:,} ({percentage_with_variants:.2f}%)")
print(f" Rows with 0 variants: {rows_without_variants:,} ({percentage_without_variants:.2f}%)")
print(f" Total variants across all rows: {final_variant_matrix_df['total_variant_count'].sum():,}")

print(f"\n\nLength of rows:      {len(final_variant_matrix_df):,}")
print(f"Length of columns:    {len(final_variant_matrix_df.columns):,}")
print(final_variant_matrix_df)

print("\nProcessing completed successfully. Final dataset saved as: full_variant_dataframe_matrix.csv")
print("############################################################################################")

# Variant normalization

In [None]:
# Show the merged dataset (pandas prints a truncated view for large frames).
print(final_variant_matrix_df)

## 1) Variant normalization with CIVIC Aliases

In [None]:
# Work inside the LLM directory; the CSV files of the following cells are
# read and written with relative names.
os.chdir(LLM_directory)
# Columns of the merged dataset that are not variant-gene indicator columns.
# NOTE: a later cell reassigns this name to a shorter list.
non_binary_fields = {
    "PaperId",
    "PaperTitle",
    "Abstract",
    "LLM_Prompt",
    "LLM_Response",
    "Variant_Gene_Pairs",
    "Cleaned_Variant_Gene_Pairs",
    "total_variant_count"
}

- 1) Fetch CIVIC "Variant" information (Variant ID, Variant Name, Aliases) --> common comparator: "Variant ID"
- 2) Fetch CIVIC "Molecular profiles" information (ID, Name, Description, Score, Variants, Assertion)  --> common comparator: "ID"
- 3) Fetch CIVIC "Gene" information (Molecular Profile ID, but with duplicates!) -->

In [None]:
# Load matrix only if not already in memory
# NOTE(review): the export cell writes this CSV with index=False, so
# index_col=0 turns the first data column into the index when the file is
# re-read — confirm that this is intended.
if "final_variant_matrix_df" not in globals():
    print("Loading binary matrix from CSV...")
    final_variant_matrix_df = pd.read_csv("full_variant_dataframe_matrix.csv", index_col=0, low_memory=False)
else:
    print("Using existing 'final_variant_matrix_df' from memory.")
    
# Filter the DataFrame to include only rows where 'total_variant_count' is >= 1
filtered_df = final_variant_matrix_df[final_variant_matrix_df["total_variant_count"] >= 1]
num_rows = len(filtered_df)
print(f"Number of all rows: {len(final_variant_matrix_df)}")
print(f"Number of rows where 'total_variant_count' >= 1: {num_rows:,}")

# Save the filtered DataFrame as a new variable and export to CSV
# (this export includes the index, because to_csv is called with defaults)
filtered_variant_matrix_df = filtered_df.copy()  # Save as a new variable
filtered_variant_matrix_df.to_csv("filtered_variant_matrix_df.csv")
print("Filtered DataFrame saved successfully as 'filtered_variant_matrix_df.csv'.")

# Detect true binary matrix columns (variant_gene format + binary values)
non_binary_fields = ["PaperId", "PaperTitle", "Abstract", "total_variant_count"]

# "<lower-case variant>_<UPPER-CASE gene>" column names only.
variant_gene_column = re.compile(r"^[a-z0-9\*\.\-]+_[A-Z0-9]+$")
binary_columns = []
for col in filtered_variant_matrix_df.columns:
    if col in non_binary_fields or not isinstance(col, str):
        continue
    if not variant_gene_column.match(col):
        continue
    # Keep the column only when all of its non-missing values are 0 or 1.
    if filtered_variant_matrix_df[col].dropna().isin([0, 1]).all():
        binary_columns.append(col)

# Sum variant-gene mentions and group by variant name (before "_")
binary_matrix_sums = filtered_variant_matrix_df[binary_columns].sum()
variant_names = binary_matrix_sums.index.to_series().map(
    lambda column_name: column_name.rsplit("_", 1)[0]
)
variant_counts = (
    binary_matrix_sums
    .groupby(variant_names)
    .sum()
    .sort_values(ascending=False)
)
variant_counts_df = variant_counts.to_frame(name="Binary Matrix Mentions")


variant_counts_df.to_csv("variant_counts_summary.csv")
print("Variant counts summary saved successfully as 'variant_counts_summary.csv'.")
print("\n--------- Variant Mention Summary ---------")
print(f"Unique variants detected: {len(variant_counts):,}")
print(f"Total variant-gene mentions: {binary_matrix_sums.sum():,}")
print("\nTop 20 variants:")
top_variants = variant_counts_df.head(20)
print(top_variants.to_string())

In [None]:
### Fetch Variant information
# Helper method to run the API query
def run_query(query, variables):
    """POST a GraphQL query to the CIViC API and return the decoded JSON.

    Args:
        query: GraphQL query string.
        variables: Dict of query variables.

    Returns:
        The parsed JSON response body.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.post(
        'https://civicdb.org/api/graphql',
        json={'query': query, 'variables': variables},
        timeout=60,
    )
    if response.status_code != 200:
        raise Exception(f"Query failed with code {response.status_code}. {query}")
    return response.json()

# The GraphQL query to fetch variants and their aliases
# ($after is the pagination cursor; None requests the first page)
query = """
query variants($after: String) {
  variants(after: $after) {
    pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      id
      name
      variantAliases
    }
  }
}
"""

# Pagination settings
hasNextPage = True
previousPageEnd = None
variant_data = []
# Used only as the total of the progress bar; it does not limit the download.
total_variants = 10000 

# Progress Bar
with tqdm(total=total_variants, desc="Fetching Variants", unit="variants") as pbar:
    while hasNextPage:
        variables = {"after": previousPageEnd}
        resp = run_query(query, variables)
        data = resp['data']['variants']
        
        # Collect variant names and aliases
        # (aliases are stored as one comma-separated string, "None" when empty)
        for variant in data['nodes']:
            aliases = variant.get('variantAliases', [])
            variant_data.append({
                'Variant ID': variant['id'],
                'Variant Name': variant['name'],
                'Aliases': ", ".join(aliases) if aliases else "None"
            })

        # Update progress bar
        pbar.update(len(data['nodes']))

        # Check if there are more pages
        hasNextPage = data['pageInfo']['hasNextPage']
        previousPageEnd = data['pageInfo']['endCursor']

# Convert collected data to DataFrame
CIVIC_variants_df = pd.DataFrame(variant_data)
pprint.pprint(CIVIC_variants_df.head())
csv_filename = "variants_and_aliases_CIVIC.csv"
CIVIC_variants_df.to_csv(csv_filename, index=False)
print(f"\nCSV saved as {csv_filename}")
print(f"\nTotal variants collected: {len(CIVIC_variants_df)}")
print("\nTable of variants and aliases:")
print(CIVIC_variants_df.head()) 

In [None]:
# Fetch molecular profiles
# Downloads all CIViC molecular profiles page by page (300 per request),
# flattens them into one row per profile and saves the table as CSV.

# API Endpoint and Headers
url = "https://civicdb.org/api/graphql"
headers = {"Content-Type": "application/json"}

# GraphQL Query
query = """
query browseMolecularProfiles($after: String) {
  molecularProfiles(first: 300, after: $after) {
    edges {
      node {
        id
        name
        description
        molecularProfileScore
        variants {
          id
          name
        }
        assertions {
          nodes{
            id
            name
            description
            disease{
              id
              name
            } 
          }
        } 
      }
    }
    pageInfo {
      endCursor
      hasNextPage
    }
    totalCount
  }
}
"""

# Fetch total count first for progress bar
# (this request is used only for totalCount; the loop below requests the
# first page again)
response = requests.post(url, json={'query': query, 'variables': {"after": None}}, headers=headers)
response_json = response.json()
total_profiles = response_json.get("data", {}).get("molecularProfiles", {}).get("totalCount", 0)

# Initialize variables for fetching data
all_molecular_profiles = []
variables = {"after": None}

# Progress Bar
with tqdm(total=total_profiles, desc="Fetching Molecular Profiles", unit="profiles") as pbar:
    while True:
        response = requests.post(url, json={'query': query, 'variables': variables}, headers=headers)
        response_json = response.json()
        
        if 'data' in response_json:
            molecular_profiles = response_json["data"]["molecularProfiles"]["edges"]
            all_molecular_profiles.extend(molecular_profiles)
            
            # Update progress bar
            pbar.update(len(molecular_profiles))

            # Check if more pages exist
            page_info = response_json["data"]["molecularProfiles"]["pageInfo"]
            if not page_info["hasNextPage"]:
                break
            variables["after"] = page_info["endCursor"]
        else:
            # A response without a 'data' key stops the download; profiles
            # collected so far are kept.
            print("Error in response:", response_json.get('errors'))
            break

print(f"\nTotal profiles fetched: {len(all_molecular_profiles)}")

# Convert Data to DataFrame
data = []
for profile in all_molecular_profiles:
    node = profile["node"]
    profile_id = node["id"]
    name = node["name"]
    description = node["description"]
    score = node["molecularProfileScore"]
    
    # Variant names of the profile, joined with "; "
    variants = "; ".join([v["name"] for v in node.get("variants", [])])
    
    # One "<id>: <name> (<disease>)" entry per assertion; assertion_desc is
    # read but not included in the entry.
    assertions = []
    for assertion in node.get("assertions", {}).get("nodes", []):
        assertion_id = assertion["id"]
        assertion_name = assertion["name"]
        assertion_desc = assertion["description"]
        disease_name = assertion["disease"]["name"] if assertion["disease"] else "N/A"
        assertions.append(f"{assertion_id}: {assertion_name} ({disease_name})")
    
    assertions_text = " | ".join(assertions)
    
    data.append([profile_id, name, description, score, variants, assertions_text])

# Create DataFrame with a specific variable name
molecular_profiles_df = pd.DataFrame(
    data, columns=["ID", "Name", "Description", "Score", "Variants", "Assertions"]
)

csv_filename = "molecular_profiles_CIVIC.csv"
molecular_profiles_df.to_csv(csv_filename, index=False)
print(f"CSV saved as {csv_filename}")

In [None]:
### Extract genes as CSV
# Pages through the CIViC GraphQL `genes` connection, flattens every
# gene -> variant -> molecular profile (with its assertions) into one row,
# and writes the rows to genes_CIVIC.csv.

# API Endpoint and Headers
url = "https://civicdb.org/api/graphql"
headers = {"Content-Type": "application/json"}

# GraphQL Query for Genes
query = """
query browseGenes($after: String) {
    genes(first: 300, after: $after) {
        nodes {
            id
            name
            description
            variants {
                nodes {
                    id
                    name
                    molecularProfiles {
                        nodes {
                            id
                            name
                            description
                            assertions {
                                nodes {
                                    id
                                    name
                                    description
                                }
                            }
                        }
                    }
                }
            }
        }
        pageInfo {
            endCursor
            hasNextPage
        }
        totalCount
    }
}
"""

# Seconds to wait for each page before giving up instead of hanging forever
request_timeout = 60


def _connection_nodes(parent, field):
    """Return parent[field]["nodes"] as a list, treating null/missing levels as empty."""
    return ((parent or {}).get(field) or {}).get("nodes") or []


# Initialize variables for fetching data
all_genes = []
variables = {"after": None}
total_genes = 0
total_known = False

# Progress Bar (its total is filled in from the first page, so that page is
# not requested twice just to read totalCount)
with tqdm(total=None, desc="Fetching Genes", unit="genes") as pbar:
    while True:
        response = requests.post(
            url,
            json={'query': query, 'variables': variables},
            headers=headers,
            timeout=request_timeout,
        )
        response_json = response.json()

        # A GraphQL failure is reported as {"data": null, "errors": [...]},
        # so the mere presence of the "data" key does not mean success.
        genes_page = (response_json.get("data") or {}).get("genes")
        if not genes_page:
            print("Error in response:", response_json.get('errors'))
            break

        if not total_known:
            total_genes = genes_page.get("totalCount") or 0
            pbar.total = total_genes
            pbar.refresh()
            total_known = True

        genes = genes_page.get("nodes") or []
        all_genes.extend(genes)

        # Update progress bar
        pbar.update(len(genes))

        # Check if more pages exist
        page_info = genes_page["pageInfo"]
        if not page_info["hasNextPage"]:
            break
        variables["after"] = page_info["endCursor"]

print(f"\nTotal genes fetched: {len(all_genes)}")

# Convert Data to DataFrame: one row per (gene, variant, molecular profile)
data = []
for gene in all_genes:
    gene_id = gene["id"]
    gene_name = gene["name"]
    gene_description = gene["description"]

    # Handle nested variants
    for variant in _connection_nodes(gene, "variants"):
        variant_id = variant["id"]
        variant_name = variant["name"]

        # Handle nested molecular profiles
        for profile in _connection_nodes(variant, "molecularProfiles"):
            profile_id = profile["id"]
            profile_name = profile["name"]
            profile_desc = profile["description"]

            # Handle assertions: rendered as "id: name (description)" joined by " | "
            assertions = [
                f"{assertion['id']}: {assertion['name']} ({assertion['description']})"
                for assertion in _connection_nodes(profile, "assertions")
            ]
            assertions_text = " | ".join(assertions) if assertions else "None"

            # Append row to dataset
            data.append([
                gene_id, gene_name, gene_description,
                variant_id, variant_name,
                profile_id, profile_name, profile_desc,
                assertions_text
            ])

# Create DataFrame with a meaningful name
genes_df = pd.DataFrame(data, columns=[
    "Gene ID", "Gene Name", "Gene Description",
    "Variant ID", "Variant Name",
    "Molecular Profile ID", "Molecular Profile Name", "Molecular Profile Description",
    "Assertions"
])

# Save DataFrame to CSV
csv_filename = "genes_CIVIC.csv"
genes_df.to_csv(csv_filename, index=False)
print(f"CSV saved as {csv_filename}")


In [None]:
# Merging datasets of variants and molecular profiles
# Reads both CIViC exports back from disk, outer-joins them on "Variant ID"
# and stores the combined table as CIVIC_merged_variants_molprofiles.csv.

variants_df = pd.read_csv("variants_and_aliases_CIVIC.csv")
print(f"Length of variants_df: {len(variants_df):,}")

molecular_profiles_df = pd.read_csv("molecular_profiles_CIVIC.csv")
print(f"Length of molecular_profiles_df: {len(molecular_profiles_df):,}")

# NOTE(review): the molecular-profile "ID" column is relabelled "Variant ID" so
# it can serve as the join key — confirm both IDs really share one ID space.
molecular_profiles_df = molecular_profiles_df.rename(columns={"ID": "Variant ID"})
CIVIC_variant_merged_df = pd.merge(
    variants_df,
    molecular_profiles_df,
    on="Variant ID",
    how="outer",
)

merged_csv_filename = "CIVIC_merged_variants_molprofiles.csv"
CIVIC_variant_merged_df.to_csv(merged_csv_filename, index=False)
print(f"Merged CSV saved as: {merged_csv_filename}")
print(f"Total rows: {len(CIVIC_variant_merged_df):,}")

# Name-like columns of interest, restricted to those present after the merge
selected_columns = ["Variant Name", "Variants", "Name", "Aliases"]
available_columns = set(CIVIC_variant_merged_df.columns)
relevant_columns = [name for name in selected_columns if name in available_columns]


In [None]:
# Merging with gene dataframe
# Left-joins the gene columns onto the variant/molecular-profile table, matching
# (Name, Variant ID) against (Molecular Profile Name, Molecular Profile ID).

CIVIC_variant_merged_df = pd.read_csv("CIVIC_merged_variants_molprofiles.csv")

genes_selected_columns = ["Gene ID", "Gene Name", "Gene Description", "Molecular Profile Name", "Molecular Profile ID"]
genes_df = pd.read_csv("genes_CIVIC.csv")[genes_selected_columns]

# The right-hand join keys only duplicate the left-hand ones, so they are
# dropped again once the join is done.
CIVIC_final_merged_df = pd.merge(
    CIVIC_variant_merged_df,
    genes_df,
    how="left",
    left_on=["Name", "Variant ID"],
    right_on=["Molecular Profile Name", "Molecular Profile ID"],
).drop(columns=["Molecular Profile Name", "Molecular Profile ID"])

final_merged_csv_filename = "CIVIC_merged_variants_molprofiles_genes.csv"
CIVIC_final_merged_df.to_csv(final_merged_csv_filename, index=False)

print(f"Final merged CSV saved as: {final_merged_csv_filename}")
print(f"Total rows: {len(CIVIC_final_merged_df):,}")

# Normalize with CIVIC

In [None]:
# Processing binary matrix and matching with CIVIC data
# Finds CIViC variants whose name, or any of whose aliases, equals
# (case-insensitively) the variant part of a binary matrix column, and writes
# them to matched_variants_and_aliases.csv.

# Load datasets
CIVIC_merged_df = pd.read_csv("CIVIC_merged_variants_molprofiles_genes.csv")
matrix_file = "filtered_variant_matrix_df.csv"
final_variant_matrix_df = pd.read_csv(matrix_file, index_col=0, low_memory=False)
matrix_rows, matrix_cols = final_variant_matrix_df.shape
print(f"Loaded matrix with {matrix_rows:,} rows and {matrix_cols:,} columns.")

# Remove columns starting with "Variant_" or "Gene_"
columns_to_remove = [
    name for name in final_variant_matrix_df.columns
    if name.startswith(("Variant_", "Gene_"))
]
if columns_to_remove:
    final_variant_matrix_df.drop(columns=columns_to_remove, inplace=True)
    print(f"\nRemoved {len(columns_to_remove)} columns starting with 'Variant_' or 'Gene_'.")

# Select and detect valid binary columns: "<variant>_<GENE>" shaped names that
# are not metadata
metadata_columns = [
    "PaperId", "PaperTitle", "Abstract", "LLM_Prompt", "LLM_Response",
    "Variant_Gene_Pairs", "Cleaned_Variant_Gene_Pairs", "total_variant_count"
]
binary_name_pattern = re.compile(r"^[a-zA-Z0-9\*\.\-]+_[A-Z0-9]+$")
binary_columns = []
for name in final_variant_matrix_df.columns:
    if name in metadata_columns or not isinstance(name, str):
        continue
    if binary_name_pattern.match(name):
        binary_columns.append(name)
print(f"\nDetected {len(binary_columns):,} binary columns for processing.")

# Convert binary columns to numeric (0s and 1s); unparsable cells become 0
for name in binary_columns:
    as_numbers = pd.to_numeric(final_variant_matrix_df[name], errors='coerce')
    final_variant_matrix_df[name] = as_numbers.fillna(0).astype(int)

# Sum the binary columns and total count
binary_matrix_sums = final_variant_matrix_df[binary_columns].sum()
total_binary_matrix_count = binary_matrix_sums.sum()
print(f"\nTotal mentions across all variants: {total_binary_matrix_count:,}")


# Extract and process variant names
def _variant_part(column_name):
    """Return the part of a column name before its last underscore-delimited token."""
    return re.split(r'_(?=[^_]+$)', column_name)[0]


extracted_variants = binary_matrix_sums.index.to_series().apply(_variant_part)
print("\nSample of extracted variant names:")
print(extracted_variants.head(20))

# Match with CIVIC dataset
extracted_variant_set = {name.lower() for name in extracted_variants}
matches = []
for raw_name, raw_aliases in zip(CIVIC_merged_df["Variant Name"], CIVIC_merged_df["Aliases"]):
    variant_name = "" if pd.isna(raw_name) else str(raw_name).strip().lower()
    aliases = "" if pd.isna(raw_aliases) else raw_aliases
    alias_list = [token.strip().lower() for token in aliases.split(", ") if token]
    matched_aliases = [token for token in alias_list if token in extracted_variant_set]
    if not matched_aliases and variant_name not in extracted_variant_set:
        continue
    matches.append({
        "Variant Name": variant_name,
        "Matched Aliases": ", ".join(matched_aliases)
    })

matched_variants_df = pd.DataFrame(matches)
print(f"\nNumber of matches found: {len(matched_variants_df)}")
print("\nPreview of matched variants:")
print(matched_variants_df.head(50))
matched_variants_df.to_csv("matched_variants_and_aliases.csv", index=False)
print("\nMatched variants saved to 'matched_variants_and_aliases.csv'")

In [None]:
# Normalization and merging of CIViC Synonyms
# Loads the variant matrix and the CIViC match table. If an input is missing,
# only the first missing file is reported and nothing is loaded.
matrix_file = "filtered_variant_matrix_df.csv"
matches_file = "matched_variants_and_aliases.csv"

# Checked lazily and in order, so the matches file is only probed when the
# matrix file exists.
first_missing = next(
    (path for path in (matrix_file, matches_file) if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    print(f"\nError: The file '{first_missing}' was not found. Please upload it.")
else:
    final_variant_matrix_df = pd.read_csv(matrix_file, index_col=0, low_memory=False)
    matched_variants_df = pd.read_csv(matches_file)

In [None]:
# New normalization process
# Folds matrix columns whose variant part is a CIViC alias of another column's
# variant part into that column (row-wise max). Two columns are only merged
# when they share the same suffix (the text after the last underscore).

# Extract prefix and suffix for each column
extracted_column_variants = {}
column_suffixes = {}
for col in final_variant_matrix_df.columns:
    parts = col.rsplit('_', 1)
    if len(parts) == 2:
        extracted_column_variants[col] = parts[0]
        column_suffixes[col] = parts[1]
    else:
        extracted_column_variants[col] = col
        column_suffixes[col] = ""

original_column_count = final_variant_matrix_df.shape[1]

# Index the columns by lower-cased prefix once, so each CIViC row is resolved
# by dictionary lookup instead of rescanning every column.
columns_by_prefix = defaultdict(list)
for col, short_col in extracted_column_variants.items():
    columns_by_prefix[short_col.lower()].append(col)
column_position = {col: pos for pos, col in enumerate(extracted_column_variants)}

# Build merge dictionary and alias dictionary
# NOTE: alias_dict records every candidate pair, including pairs that the
# suffix filter below later rejects.
merge_dict = {}
alias_dict = []
for _, row in matched_variants_df.iterrows():
    variant_name = str(row["Variant Name"]).strip() if pd.notna(row["Variant Name"]) else ""
    matched_aliases = []
    if pd.notna(row["Matched Aliases"]):
        matched_aliases = [alias.strip() for alias in str(row["Matched Aliases"]).split(",") if alias.strip()]

    adjusted_variant_name = variant_name.lower()
    adjusted_matched_aliases = [alias.lower() for alias in matched_aliases]

    # Columns are listed in matrix order, each at most once.
    actual_variant_col = columns_by_prefix.get(adjusted_variant_name, [])
    actual_alias_cols = sorted(
        {col for alias in adjusted_matched_aliases for col in columns_by_prefix.get(alias, ())},
        key=column_position.__getitem__,
    )

    if actual_variant_col and actual_alias_cols:
        keep_col = actual_variant_col[0]
        merge_dict.setdefault(keep_col, []).extend(actual_alias_cols)
        for alias_col in actual_alias_cols:
            alias_dict.append({"Kept Name": keep_col, "Dropped Alias": alias_col})

# Merge candidates before suffix filtering
print("\nMerge Candidates Before Suffix Filtering:")
for keep_col, alias_cols in merge_dict.items():
    print(f"{keep_col}: {alias_cols}")

# Track columns to drop after merging
merged_columns = set()

# Merging alias columns with suffix filtering
print("\nMerging alias columns with suffix filtering...")
for keep_col in tqdm(merge_dict, desc="Merging columns"):
    # A column that was already folded into another one is dropped at the end;
    # merging into it as well would discard both sides of a reciprocal alias.
    if keep_col in merged_columns:
        print(f"Skipping merge for {keep_col}: already merged into another column")
        continue

    # De-duplicate (the same pair can come from several CIViC rows) and never
    # merge a column into itself.
    merge_cols = [
        col for col in dict.fromkeys(merge_dict[keep_col])
        if col in final_variant_matrix_df.columns and col != keep_col
    ]

    if not merge_cols:
        continue

    keep_col_suffix = column_suffixes.get(keep_col, "")
    valid_merge_cols = [col for col in merge_cols if column_suffixes.get(col, "") == keep_col_suffix]

    if not valid_merge_cols:
        print(f"Skipping merge for {keep_col}: No alias matches suffix '{keep_col_suffix}'")
        continue

    final_variant_matrix_df[keep_col] = final_variant_matrix_df[[keep_col] + valid_merge_cols].max(axis=1)
    merged_columns.update(valid_merge_cols)
    print(f"Merged {valid_merge_cols} into {keep_col}")

# Drop merged alias columns
final_variant_matrix_df.drop(columns=list(merged_columns), inplace=True)

# Optional: Sort columns alphabetically
final_variant_matrix_df = final_variant_matrix_df.reindex(sorted(final_variant_matrix_df.columns), axis=1)

new_column_count = final_variant_matrix_df.shape[1]
columns_merged = original_column_count - new_column_count

# Save outputs
output_file = "normalized_merged_variant_matrix_v2.csv"
final_variant_matrix_df.to_csv(output_file)

alias_dict_df = pd.DataFrame(alias_dict)
alias_dict_filename = "alias_dictionary_v2.csv"
alias_dict_df.to_csv(alias_dict_filename, index=False)

# Final summary
print(f"\nMerged binary matrix saved as '{output_file}'")
print(f"Original column count: {original_column_count}")
print(f"New column count: {new_column_count}")
print(f"Total columns merged: {columns_merged}")
print(f"Alias dictionary saved to: {alias_dict_filename}")

# Normalization #3

## Fetch only once and resume afterwards

In [None]:
# Connect to ClinVar for HGVS notations
# Loads the merged CIViC variant / molecular-profile / gene table that the
# rsID extraction and the ClinVar / Ensembl lookups below operate on.
file_name = "CIVIC_merged_variants_molprofiles_genes.csv"
CIVIC_df = pd.read_csv(file_name)

# Extract rsID from "Variant Name" and "Aliases"
def extract_rsID(variant_name, aliases):
    """Return the first rsID found in the variant name, else in the aliases.

    An rsID is recognised as "rs" followed by digits, matched
    case-insensitively anywhere in the text (e.g. rs121434568).

    Args:
        variant_name: Variant name; ignored unless it is a string.
        aliases: Alias text; ignored unless it is a string.

    Returns:
        The lower-cased rsID, or "" when neither field contains one.
    """
    # The variant name is searched first, so it wins over the aliases.
    for text in (variant_name, aliases):
        if not isinstance(text, str):
            continue
        hit = re.search(r"rs\d+", text, re.IGNORECASE)
        if hit is not None:
            return hit.group(0).lower()
    return ""

# Apply function to extract rsID
# Adds an "rsID" column (empty string when no rsID is found) and stores the
# enriched table as CIVIC_with_rsID.csv.
def _row_rsid(row):
    """Run extract_rsID on one row's "Variant Name" and "Aliases"."""
    return extract_rsID(row["Variant Name"], row["Aliases"])


tqdm.pandas()  # registers DataFrame.progress_apply
CIVIC_df["rsID"] = CIVIC_df.progress_apply(_row_rsid, axis=1)

# Save updated file with rsID column
updated_file = "CIVIC_with_rsID.csv"
CIVIC_df.to_csv(updated_file, index=False)

print("\n--- Updated Data Sample with rsID (Lowercase) ---")
print(CIVIC_df.head())

# Get ClinVar ID
def get_clinvar_id(rsid):
    """Look up the first ClinVar ID that the NCBI esearch endpoint returns for an rsID.

    Args:
        rsid: rsID as a string; missing, blank and the literal "nan" are
            treated as "no rsID".

    Returns:
        A 2-tuple: (None, None) when no rsID was given, otherwise
        (rsid, clinvar_id) where clinvar_id is the first entry of the search
        result's "idlist", or None when the search is empty or failed.
    """
    if pd.isna(rsid) or rsid.strip() == "" or rsid.lower() == "nan":
        return None, None  # Keep empty

    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
            params={"db": "clinvar", "term": rsid, "retmode": "json"},
            timeout=30,
        )
        if response.status_code != 200:
            return rsid, None
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # A single failed lookup is handled like a non-200 answer rather than
        # aborting the surrounding apply() over the whole table.
        logging.warning("ClinVar search failed for %s: %s", rsid, exc)
        return rsid, None

    id_list = (data.get("esearchresult") or {}).get("idlist") or []
    return rsid, (id_list[0] if id_list else None)

# Get HGVS from ClinVar
def get_hgvs_from_clinvar_id(clinvar_id):
    """Retrieve an HGVS-style description for a ClinVar ID from the NCBI esummary endpoint.

    Args:
        clinvar_id: ClinVar ID. None, NaN, blank strings and the stringified
            placeholders "None" / "nan" (what str() makes of a missing ID)
            all mean "no ID" and cause no request.

    Returns:
        The first space-delimited token of the record's "title", followed by
        " | <variation_name>" when the record has a non-empty
        "variation_name"; None when there is no ID, the request fails, the
        record is absent, or neither field has content.
    """
    if clinvar_id is None or pd.isna(clinvar_id):
        return None
    clinvar_id = str(clinvar_id).strip()
    if clinvar_id.lower() in {"", "none", "nan"}:
        return None

    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
            params={"db": "clinvar", "id": clinvar_id, "retmode": "json"},
            timeout=30,
        )
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        logging.warning("ClinVar summary failed for %s: %s", clinvar_id, exc)
        return None

    record = (data.get("result") or {}).get(clinvar_id)
    if not isinstance(record, dict):
        return None

    # Extract HGVS notation
    hgvs_transcript = (record.get("title") or "").split(" ")[0]
    hgvs_protein = record.get("variation_name") or ""

    # Join only the parts that exist, so the result never reads "None | ...".
    present_parts = [part for part in (hgvs_transcript, hgvs_protein) if part]
    return " | ".join(present_parts) if present_parts else None

# Get HGVS from Ensembl (Backup)
def _collect_hgvsg(payload):
    """Gather the "hgvsg" strings of a Variant Recoder payload, de-duplicated in first-seen order."""
    ordered = {}
    entries = payload if isinstance(payload, list) else [payload]
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        # "hgvsg" may sit on the entry itself or one level down inside
        # per-allele sub-records, so both places are inspected.
        records = [entry] + [value for value in entry.values() if isinstance(value, dict)]
        for record in records:
            notations = record.get("hgvsg") or []
            if isinstance(notations, str):
                notations = [notations]
            for notation in notations:
                ordered[notation] = None
    return list(ordered)


def get_hgvs_from_ensembl(rsid):
    """Fetch genomic HGVS notation(s) for an rsID from the Ensembl Variant Recoder.

    Args:
        rsid: rsID as a string; missing, blank and the literal "nan" are
            treated as "no rsID".

    Returns:
        The distinct "hgvsg" notations joined by ", " in the order they appear
        in the response, or None when no rsID was given, the request failed,
        or the response carries no "hgvsg" entry.
    """
    if pd.isna(rsid) or rsid.strip() == "" or rsid.lower() == "nan":
        return None  # Keep empty

    url = f"https://rest.ensembl.org/variant_recoder/human/{rsid}?content-type=application/json"
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        logging.warning("Ensembl lookup failed for %s: %s", rsid, exc)
        return None

    hgvs_list = _collect_hgvsg(data)
    return ", ".join(hgvs_list) if hgvs_list else None

# Fetch HGVS from ClinVar & Ensembl
def fetch_hgvs(rsid, clinvar_id):
    """Return HGVS notation from ClinVar, falling back to Ensembl.

    Args:
        rsid: rsID as a string; missing, blank and the literal "nan" mean
            "no rsID", in which case nothing is looked up.
        clinvar_id: ClinVar ID handed to get_hgvs_from_clinvar_id.

    Returns:
        None without an rsID; otherwise the ClinVar result when it is truthy,
        else whatever get_hgvs_from_ensembl returns for the rsID.
    """
    rsid_missing = pd.isna(rsid) or rsid.strip() == "" or rsid.lower() == "nan"
    if rsid_missing:
        return None
    # `or` only evaluates the Ensembl lookup when ClinVar yielded nothing.
    return get_hgvs_from_clinvar_id(clinvar_id) or get_hgvs_from_ensembl(rsid)

In [None]:
# Apply API calls to dataset
# Step 1 resolves each rsID to a ClinVar ID; step 2 uses both to fetch HGVS.
print("Fetching ClinVar IDs...")
tqdm.pandas()
CIVIC_df[["rsID_Confirmed", "ClinVar_ID"]] = \
    CIVIC_df["rsID"].progress_apply(lambda r: pd.Series(get_clinvar_id(str(r))))


def _clinvar_id_or_none(value):
    """Return the ClinVar ID as a string, or None when it is missing."""
    # str(None) is the truthy string "None", which would slip past an
    # emptiness check and be sent to ClinVar as if it were a real ID.
    return None if pd.isna(value) else str(value)


print("Fetching HGVS Notation from ClinVar and Ensembl...")
CIVIC_df["HGVS_Notation"] = \
    CIVIC_df.apply(lambda row: fetch_hgvs(str(row["rsID"]), _clinvar_id_or_none(row["ClinVar_ID"])), axis=1)

# Extract variant description
def extract_variant_description(hgvs_notation):
    """Return the variant part (e.g., c.944C>T) of an HGVS string.

    The text before the first "|" is taken, and from it everything after the
    last ":" with surrounding whitespace removed.

    Args:
        hgvs_notation: HGVS string, or a missing value (None / NaN).

    Returns:
        The extracted variant text, or None for a missing value.
    """
    if pd.isna(hgvs_notation):
        return None
    # partition / rpartition always return a usable field, even when the
    # separator is absent, so no further emptiness check is needed.
    before_pipe = hgvs_notation.partition("|")[0]
    return before_pipe.rpartition(":")[2].strip()
# Derive the short variant description from the raw HGVS text before the
# placeholder clean-up below rewrites that column.
CIVIC_df["Variant_Description"] = CIVIC_df["HGVS_Notation"].apply(extract_variant_description)

# Clean data: strip the " | None" tail and blank out placeholder values
CIVIC_df["HGVS_Notation"] = CIVIC_df["HGVS_Notation"].str.replace(" | None", "", regex=False)
CIVIC_df["HGVS_Notation"] = CIVIC_df["HGVS_Notation"].replace({"None": "", "No HGVS available": ""})
CIVIC_ClinVar_df=CIVIC_df.copy()
final_file = "CIVIC_with_ClinVar_HGVS.csv"
CIVIC_ClinVar_df.to_csv(final_file, index=False)
print("\nProcessing Complete! Final File Saved:", final_file)
print("\n--- Final Updated Data Sample with HGVS & Variant Description ---")
# The header above announces a sample, so show one (as the rsID step does).
print(CIVIC_ClinVar_df.head())

In [None]:
# Merge aliases, HGVS and variant description for matching of columns
# Builds "Aliases_Merged": the non-missing values of Aliases, HGVS_Notation
# and Variant_Description joined by ", ".
file_name = "CIVIC_with_ClinVar_HGVS.csv"
CIVIC_ClinVar_df = pd.read_csv(file_name)
required_columns = ["Aliases", "HGVS_Notation", "Variant_Description"]
existing_columns = [name for name in required_columns if name in CIVIC_ClinVar_df.columns]


def _join_present(values):
    """Join the non-missing entries of one row with ", "."""
    return ", ".join(values.dropna().astype(str))


if len(existing_columns) != len(required_columns):
    print(" Error: Some required columns are missing in the dataset.")
else:
    CIVIC_ClinVar_df["Aliases_Merged"] = CIVIC_ClinVar_df[required_columns].apply(_join_present, axis=1)

    final_file = "CIVIC_ClinVar_merged.csv"
    CIVIC_ClinVar_df.to_csv(final_file, index=False)
    print("\n Processing Complete! Final File Saved:", final_file)
    print(f"\n Length of dataset: {len(CIVIC_ClinVar_df):,}")
    print("\n--- Sample of Updated Data with 'Aliases_Merged' ---")
    print(CIVIC_ClinVar_df[required_columns + ["Aliases_Merged"]].head())

# Resume here!

In [None]:
# Reload the step-2 matrix and the CIViC/ClinVar alias table, so that
# normalization step 3 can start here without repeating the API calls.
matrix_file = "normalized_merged_variant_matrix_v2.csv"
matches_file = "CIVIC_ClinVar_merged.csv"

# Stop right away when an input is missing: carrying on would either fail on
# undefined names or silently report frames left over from earlier cells.
for required_path in (matrix_file, matches_file):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"The file '{required_path}' was not found.")

print("Loading datasets...")
final_variant_matrix_df = pd.read_csv(matrix_file, index_col=0, low_memory=False)
matched_variants_df = pd.read_csv(matches_file)

print("Length of dataset variant matrix",len(final_variant_matrix_df))
print("Length of datasets with matches",len(matched_variants_df))
print("Success!")

In [None]:
# Normalize and merge variant columns based on aliases and suffixes
# Folds matrix columns whose variant part appears in a CIViC row's
# "Aliases_Merged" into the column named after that row's variant (row-wise
# max). Two columns are only merged when they share the same suffix (the text
# after the last underscore).

# Extract prefix and suffix
extracted_column_variants = {}
column_suffixes = {}

for col in final_variant_matrix_df.columns:
    parts = col.rsplit('_', 1)
    if len(parts) == 2:
        extracted_column_variants[col] = parts[0]
        column_suffixes[col] = parts[1]
    else:
        extracted_column_variants[col] = col
        column_suffixes[col] = ""

# Index the columns by lower-cased prefix once, so each CIViC row is resolved
# by dictionary lookup instead of rescanning every column.
columns_by_prefix = defaultdict(list)
for col, short_col in extracted_column_variants.items():
    columns_by_prefix[short_col.lower()].append(col)
column_position = {col: pos for pos, col in enumerate(extracted_column_variants)}

# Prepare for merging aliases
merge_dict = {}
alias_dict = []

for _, row in matched_variants_df.iterrows():
    variant_name = str(row["Variant Name"]).strip() if pd.notna(row["Variant Name"]) else ""
    merged_aliases = row["Aliases_Merged"]
    if pd.isna(merged_aliases):
        matched_aliases = []
    else:
        # Empty tokens (e.g. from ", ,") are discarded so they cannot match a
        # column whose prefix is empty.
        matched_aliases = [alias.strip() for alias in str(merged_aliases).split(",") if alias.strip()]

    adjusted_variant_name = variant_name.lower()
    adjusted_matched_aliases = [alias.lower() for alias in matched_aliases]

    actual_variant_col = columns_by_prefix.get(adjusted_variant_name, [])
    if not actual_variant_col:
        continue
    keep_col = actual_variant_col[0]

    # Alias columns in matrix order, each once. The kept column is excluded:
    # if the variant name is also listed among its own aliases, the kept
    # column would otherwise land in the drop list and its data be lost.
    actual_alias_cols = sorted(
        {
            col
            for alias in adjusted_matched_aliases
            for col in columns_by_prefix.get(alias, ())
            if col != keep_col
        },
        key=column_position.__getitem__,
    )
    if not actual_alias_cols:
        continue

    merge_dict.setdefault(keep_col, []).extend(actual_alias_cols)
    for alias_col in actual_alias_cols:
        alias_dict.append({"Kept Name": keep_col, "Dropped Alias": alias_col})

# Merge alias columns
merged_columns = set()

for keep_col, merge_cols in merge_dict.items():
    # A column that was already folded into another one is dropped at the end;
    # merging into it as well would discard both sides of a reciprocal alias.
    if keep_col in merged_columns:
        print(f"Skipping merge for {keep_col}: already merged into another column")
        continue

    # De-duplicate: the same pair can come from several CIViC rows.
    merge_cols = [
        col for col in dict.fromkeys(merge_cols)
        if col in final_variant_matrix_df.columns and col != keep_col
    ]

    if not merge_cols:
        continue

    keep_col_suffix = column_suffixes.get(keep_col, "")
    valid_merge_cols = [col for col in merge_cols if column_suffixes.get(col, "") == keep_col_suffix]

    if not valid_merge_cols:
        print(f"Skipping merge for {keep_col}: No alias matches suffix '{keep_col_suffix}'")
        continue

    final_variant_matrix_df[keep_col] = final_variant_matrix_df[[keep_col] + valid_merge_cols].max(axis=1)
    merged_columns.update(valid_merge_cols)
    print(f"Merged {valid_merge_cols} into {keep_col}")

# Finalize the merged dataset
final_variant_matrix_df.drop(columns=list(merged_columns), inplace=True)
output_file = "normalized_merged_variant_matrix_v3.csv"
final_variant_matrix_df.to_csv(output_file)

alias_dict_df = pd.DataFrame(alias_dict)
alias_dict_filename = "alias_dictionary_v3.csv"
alias_dict_df.to_csv(alias_dict_filename, index=False)

# Print summary
print(f"\nMerged binary matrix saved as '{output_file}'!")

In [None]:
# Final output file: normalized_merged_variant_matrix_v3.csv

# =================================================

# Look at variant mention statistics

In [None]:
# Gene mentions statistics
# Loads the cancer matrix from the output directory and the step-3 variant
# matrix from the LLM directory, then reports the size of both.
os.chdir(output_directory)
cancer_df = pd.read_csv("binary_cancer_matrix_filtered.csv")

os.chdir(LLM_directory)
normalized_merged_variant_matrix_v3 = pd.read_csv("normalized_merged_variant_matrix_v3.csv", index_col=0)
# Same object under the name the following cells work with (not a copy)
final_variant_matrix_df = normalized_merged_variant_matrix_v3

cancer_rows, cancer_cols = cancer_df.shape
print("Length of cancer dataset:", cancer_rows)
print("Columns of cancer dataset:", cancer_cols)

matrix_rows, matrix_cols = final_variant_matrix_df.shape
print("Final_variant_matrix_df loaded")
print("Length of matrix:", matrix_rows)
print("Columns of matrix:", matrix_cols)

In [None]:
# Get the set of valid PaperIds from cancer_df
valid_paper_ids = set(cancer_df['PaperId'])

# Keep only the matrix rows whose index value is one of those PaperIds
initial_length = len(final_variant_matrix_df)
row_is_valid = final_variant_matrix_df.index.isin(valid_paper_ids)
filtered_final_variant_matrix_df = final_variant_matrix_df[row_is_valid]
final_length = len(filtered_final_variant_matrix_df)

print("\nFiltering completed successfully.")
print(f"Initial number of rows in final_variant_matrix_df: {initial_length:,}")
print(f"Number of rows after filtering: {final_length:,}")

filtered_final_variant_matrix_df.to_csv("normalized_merged_variant_matrix_v4.csv")
print("\nFile 'normalized_merged_variant_matrix_v4.csv' has been successfully saved.")

In [None]:
# Work on copies so the filtered matrix itself is left untouched
final_variant_matrix_df = filtered_final_variant_matrix_df.copy()
variant_matrix_with_counts = final_variant_matrix_df.copy()

count_column = 'total_variant_count'

# A stale count column would otherwise be included in the new row sums
if count_column in variant_matrix_with_counts.columns:
    variant_matrix_with_counts = variant_matrix_with_counts.drop(columns=[count_column])
    print("Existing 'total_variant_count' column removed from the new DataFrame.")

# Coerce every column to numeric; cells that cannot be parsed become NaN
variant_matrix_with_counts_numeric = variant_matrix_with_counts.apply(pd.to_numeric, errors='coerce')

# Row-wise sum over the numeric view, skipping NaN
row_totals = variant_matrix_with_counts_numeric.sum(axis=1, skipna=True)
variant_matrix_with_counts[count_column] = row_totals
print("New 'total_variant_count' column added to the new DataFrame.")

# How many rows have each total, in ascending order of the total
value_counts = variant_matrix_with_counts[count_column].value_counts().sort_index()

print("Counts of rows by 'total_variant_count' value:")
for total, n_rows in value_counts.items():
    print(f"Rows with value {int(total)}: {n_rows}")

In [None]:
# Total number of screened articles and variant hits percentage
screened_articles = len(cancer_df)
variant_hits_percentage = (len(final_variant_matrix_df) / screened_articles) * 100

print("Final_variant_matrix_df loaded")
print(f"Length of matrix: {len(final_variant_matrix_df):,}")
print(f"Total screened articles: {screened_articles:,}")
print(f"Percentage of variant hits: {variant_hits_percentage:.2f}%")

# Prepare data by excluding metadata columns and processing binary columns
metadata_columns = [
    "PaperId", "PaperTitle", "Abstract", "LLM_Prompt", "LLM_Response",
    "Variant_Gene_Pairs", "Cleaned_Variant_Gene_Pairs", "total_variant_count"
]
binary_columns = [col for col in final_variant_matrix_df.columns if col not in metadata_columns]

# Unparsable cells count as 0
final_variant_matrix_df[binary_columns] = final_variant_matrix_df[binary_columns].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Sum occurrences, remove rows with all '0's, and print stats
binary_matrix_sums = final_variant_matrix_df[binary_columns].sum()
total_binary_matrix_count = binary_matrix_sums.sum()

initial_row_count = len(final_variant_matrix_df)
final_variant_matrix_df = final_variant_matrix_df.loc[(final_variant_matrix_df[binary_columns].sum(axis=1) > 0)]
rows_with_only_zeros = initial_row_count - len(final_variant_matrix_df)

print(f"\nRows removed (containing only '0's across all columns): {rows_with_only_zeros:,}")
print(f"Remaining rows after removal: {len(final_variant_matrix_df):,}")

# Sort variants, display statistics, and show top 20 columns
sorted_binary_matrix_sums = binary_matrix_sums.sort_index(key=lambda x: x.str.len().astype(str) + x)

print("\n----------------- Binary Matrix Statistics -----------------")
print(f"Total binary matrix columns: {len(binary_columns):,}")
print(f"Total sum of all binary matrix counts (total 1s): {total_binary_matrix_count:,}")

# Percentages are relative to the rows left after the all-zero rows were removed
top_20_columns = binary_matrix_sums.sort_values(ascending=False).head(20)
top_20_columns_percentage = (top_20_columns / len(final_variant_matrix_df)) * 100

print("\nTop 20 binary matrix columns by count and percentage:")
for column, count, percentage in zip(top_20_columns.index, top_20_columns, top_20_columns_percentage):
    print(f"{column}: {count} mentions, {percentage:.2f}%")

# List top 20 column names and save the data
print("\nTop 20 column names with the most counts:")
print(top_20_columns.index.tolist())

# Tabular form of the ranking printed above; this frame was previously never
# built, so saving it failed with a NameError.
top_20_df = pd.DataFrame({
    "Variant": top_20_columns.index,
    "Mentions": top_20_columns.to_numpy(),
    "Percentage": top_20_columns_percentage.to_numpy(),
})

top_20_csv_path = f"{output_directory}/Top_20_Variants.csv"
top_20_df.to_csv(top_20_csv_path, index=False)
print(f"\nTop 20 Variants saved to: {top_20_csv_path}")

# Create figure

In [None]:
# Create a bar chart for the top 20 binary matrix variant mentions
fig, ax = plt.subplots(figsize=(12, 8))
top_20_columns.plot(kind='bar', color='#1f20b4', ax=ax)

ax.set_title('Top 20 Most Frequent Variants in the Dataset', fontsize=16)
ax.set_xlabel('Variants', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=12)

# Headroom above the tallest bar for the value labels
ax.set_ylim(0, top_20_columns.max() + 500)

# Write each count just above its bar
for position, mentions in enumerate(top_20_columns):
    ax.text(position, mentions + 5, f"{mentions:,}", color='#505050', ha='center', rotation=45, fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Plot top 20 most frequent variant-associated genes in the dataset
def _gene_part(column_name):
    """Return the last underscore-delimited token of a column name."""
    return re.split(r'_(?=[^_]+$)', column_name)[-1]


# Per-column mention counts are summed per gene token
extracted_genes = binary_matrix_sums.index.to_series().apply(_gene_part)
extracted_gene_counts = binary_matrix_sums.groupby(extracted_genes).sum().sort_values(ascending=False)
top_20_genes = extracted_gene_counts.head(20)

fig, ax = plt.subplots(figsize=(12, 8))
top_20_genes.plot(kind='bar', color='#1f77b4', ax=ax)

ax.set_title('Top 20 Most Frequent Variant-Associated Genes in the Dataset', fontsize=16)
ax.set_xlabel('Genes', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=12)

# Headroom above the tallest bar for the value labels
ax.set_ylim(0, top_20_genes.max() + 500)

# Write each count above its bar
for position, mentions in enumerate(top_20_genes):
    ax.text(position, mentions + 100, f"{mentions:,}", color='#505050', ha='center', rotation=45, fontsize=10)

fig.tight_layout()
plt.show()

# ============================================

# Oncomine gene comparison

In [None]:
# Switch to the LLM directory and reload the PaperId-filtered (v4) matrix
print("Setting working directory...")
os.chdir(LLM_directory)
print("Current Working Directory:", os.getcwd())

print("Loading CSV data into DataFrame...")
matrix_file = "normalized_merged_variant_matrix_v4.csv"
final_variant_matrix_df = pd.read_csv(
    matrix_file,
    index_col=0,
    low_memory=False,
)
print("All datasets loaded!")

In [None]:
##### ========= Oncomine matching ========= #####
# Sums the matrix columns per gene, where a column's gene is the text after
# its last underscore.

# Identify relevant columns
print("Identifying relevant columns...")
non_binary_fields = {
    "PaperId", "PaperTitle", "Abstract",
    "LLM_Prompt", "LLM_Response",
    "Variant_Gene_Pairs", "Cleaned_Variant_Gene_Pairs",
    "total_variant_count"
}

# Get all columns that are NOT in non_binary_fields
relevant_columns = [name for name in final_variant_matrix_df.columns if name not in non_binary_fields]

print("\n----------------- Relevant Columns -----------------")
print(f"Total relevant columns: {len(relevant_columns):,}")
print(relevant_columns[:10])

# Extract filtered matrix
print("Extracting relevant matrix columns...")
filtered_matrix = final_variant_matrix_df[relevant_columns]

# Extract gene names from column names
print("Extracting gene names from column names...")
extracted_gene_list = [name.rsplit('_', 1)[-1] for name in relevant_columns]
extracted_genes = pd.Series(extracted_gene_list, index=relevant_columns)

print(" Success: Gene names extracted and columns filtered.")
print("Summing extracted gene counts (optimized)...")

# Accumulate per-gene totals; a column that fails to sum is reported and skipped
gene_counts = defaultdict(float)
for name, gene in extracted_genes.items():
    try:
        gene_counts[gene] += pd.to_numeric(filtered_matrix[name], errors="coerce").sum()
    except Exception as e:
        print(f" Skipping column {name} due to error: {e}")

extracted_gene_counts = pd.Series(gene_counts).fillna(0).sort_values(ascending=False)
total_extracted_gene_mentions = extracted_gene_counts.sum()

print("\n----------------- Extracted Gene Counts -----------------")
print(f"Total unique extracted genes: {len(extracted_gene_counts):,}")
print(f"Total extracted gene mentions: {total_extracted_gene_mentions:,}\n")
print(extracted_gene_counts.to_string())

In [None]:
# Gene matching and data integrity check with oncomine list
# Reads the gene list from the first column of a header-less CSV in the input
# directory, then returns to the LLM directory.
os.chdir(input_directory)
genes_file = "oncomine_ngs_panel.csv"
genes = pd.read_csv(genes_file, header=None)
gene_list = set(genes[0].tolist())
len_gene_list = len(gene_list)
print("Genes import successful!")
print(f"Number of Oncomine genes: {len(gene_list):,}")
os.chdir(LLM_directory)


def _percent(part, whole):
    """Return part as a percentage of whole, or 0 when whole is not positive."""
    return (part / whole) * 100 if whole > 0 else 0


# Compare extracted genes with oncomine
in_oncomine = extracted_gene_counts.index.isin(gene_list)
matching_genes = extracted_gene_counts[in_oncomine]
non_matching_genes = extracted_gene_counts[~in_oncomine]

matching_gene_names = list(matching_genes.index)
non_matching_gene_names = list(non_matching_genes.index)

num_matching_genes = len(matching_genes)
num_non_matching_genes = len(non_matching_genes)
total_extracted_genes = num_matching_genes + num_non_matching_genes

total_matching_mentions = matching_genes.sum().sum()
total_non_matching_mentions = non_matching_genes.sum().sum()
total_extracted_mentions = total_matching_mentions + total_non_matching_mentions

proportion_matching_genes = _percent(num_matching_genes, total_extracted_genes)
proportion_non_matching_genes = _percent(num_non_matching_genes, total_extracted_genes)
proportion_matching_mentions = _percent(total_matching_mentions, total_extracted_mentions)
proportion_non_matching_mentions = _percent(total_non_matching_mentions, total_extracted_mentions)

# Oncomine gene matching
print("\n----------------- Oncomine Gene Matching -----------------")
print(f"Total extracted genes: {total_extracted_genes:,}")
print(f" - Matching genes in Oncomine: {num_matching_genes:,} ({proportion_matching_genes:.2f}%)")
print(f" - Non-matching genes (not in Oncomine): {num_non_matching_genes:,} ({proportion_non_matching_genes:.2f}%)")
print(f"\nTotal extracted mentions: {total_extracted_mentions:,}")
print(f" - Total mentions for Oncomine genes: {total_matching_mentions:,} ({proportion_matching_mentions:.2f}%)")
print(f" - Total mentions for non-Oncomine genes: {total_non_matching_mentions:,} ({proportion_non_matching_mentions:.2f}%)")

# Data integrity check
# NOTE: both totals are defined above as exactly these sums, so this check
# compares each total with itself.
genes_sum = num_matching_genes + num_non_matching_genes
mentions_sum = total_matching_mentions + total_non_matching_mentions
gene_check = "Match!" if total_extracted_genes == genes_sum else "Warning: Mismatch!"
mention_check = "Match!" if total_extracted_mentions == mentions_sum else "Warning: Mismatch!"
print("\n----------------- Data Integrity Check -----------------")
print(f"Sum of matching + non-matching genes: {genes_sum:,} (should be {total_extracted_genes:,}) {gene_check}")
print(f"Sum of matching + non-matching mentions: {mentions_sum:,} (should be {total_extracted_mentions:,}) {mention_check}")


In [None]:
# Create bar charts: unique genes, then total mentions.
def _draw_share_bars(labels, values, total, title):
    """Draw one annotated two-bar chart, show it, and return (figure, axis).

    Each bar is labelled with its value and its percentage of `total`
    (0 when `total` is not positive).
    """
    figure, axis = plt.subplots(figsize=(8, 6))
    axis.bar(labels, values, color=["green", "blue"])
    axis.set_title(title, fontsize=14, pad=20)

    tallest = max(values)
    # Leave headroom above the tallest bar for the text labels.
    axis.set_ylim(0, tallest * 1.2 if tallest > 0 else 1)
    label_gap = tallest * 0.05 if tallest > 0 else 0.5

    for position, value in enumerate(values):
        share = (value / total) * 100 if total > 0 else 0
        axis.text(position, value + label_gap,
                  f"{value:,} ({share:.2f}%)", ha='center', fontsize=10)

    plt.show()
    return figure, axis


categories = ["Matching (Oncomine)", "Non-Matching"]

# Chart 1: number of unique genes in each group.
gene_counts = [num_matching_genes, num_non_matching_genes]
fig, ax1 = _draw_share_bars(
    categories,
    gene_counts,
    total_extracted_genes,
    f"Oncomine matching of unique variant-associated genes\n"
    f"Total Oncomine genes: {len(gene_list):,}\n"
    f"Total extracted genes: {total_extracted_genes:,}",
)

# Chart 2: total mentions in each group.
mention_counts = [total_matching_mentions, total_non_matching_mentions]
fig, ax2 = _draw_share_bars(
    categories,
    mention_counts,
    total_extracted_mentions,
    f"Total mentions of variant-associated genes in articles\n"
    f"Total extracted mentions: {total_extracted_mentions:,}",
)

In [None]:
# Summary statistics and CSV creation for waterfall chart.

# Shares rounded to two decimals; each falls back to 0 when its denominator is 0.
# len_gene_list is defined earlier in the notebook.
percent_oncomine_extracted = round((num_matching_genes / len_gene_list) * 100, 2) if len_gene_list > 0 else 0
percent_oncomine_extracted_of_all_extracted = round((num_matching_genes / total_extracted_genes) * 100, 2) if total_extracted_genes > 0 else 0
percent_other_extracted = round((num_non_matching_genes / total_extracted_genes) * 100, 2) if total_extracted_genes > 0 else 0
percent_oncomine_mentions = round((total_matching_mentions / total_extracted_mentions) * 100, 2) if total_extracted_mentions > 0 else 0
percent_other_mentions = round((total_non_matching_mentions / total_extracted_mentions) * 100, 2) if total_extracted_mentions > 0 else 0

# "Not extracted" share computed from the counts, so the first check compares
# two independently derived numbers instead of x + (100 - x).
percent_oncomine_not_extracted = round(((len_gene_list - num_matching_genes) / len_gene_list) * 100, 2) if len_gene_list > 0 else 0

sum_extraction_check = round(percent_oncomine_extracted + percent_oncomine_not_extracted, 2)
sum_extracted_total_check = round(percent_oncomine_extracted_of_all_extracted + percent_other_extracted, 2)
sum_mentions_check = round(percent_oncomine_mentions + percent_other_mentions, 2)

# Two shares rounded separately may legitimately add up to 99.99 or 100.01,
# so allow one hundredth of slack (plus float noise) instead of exact equality.
rounding_tolerance = 0.015

# (report label, error-message label, observed sum, denominator of the breakdown)
breakdown_checks = [
    ("Oncomine extraction", "Extraction", sum_extraction_check, len_gene_list),
    ("Total extracted genes", "Extracted", sum_extracted_total_check, total_extracted_genes),
    ("Total gene mentions", "Mentions", sum_mentions_check, total_extracted_mentions),
]

print("\nValidation Checks:")
for report_label, error_label, observed_sum, denominator in breakdown_checks:
    if denominator > 0:
        print(f"{report_label} breakdown is {observed_sum:,.2f}%. This should be 100.00%.")
    else:
        # With an empty denominator every share is 0; there is nothing to validate.
        print(f"{report_label} breakdown skipped (nothing to break down).")

for report_label, error_label, observed_sum, denominator in breakdown_checks:
    if denominator > 0:
        assert abs(observed_sum - 100.00) <= rounding_tolerance, f"{error_label} sum mismatch. Found {observed_sum:,.2f}%."

print("All percentage calculations are correct.")

# Alphabetically sorted gene names for each category.
gene_list_sorted = sorted(gene_list)
matching_genes_sorted = sorted(matching_genes.index)
non_matching_genes_sorted = sorted(non_matching_genes.index)
# Oncomine genes that never appear among the extracted (matching) genes.
oncomine_not_extracted_sorted = sorted(set(gene_list).difference(matching_genes.index))

# Comma-separated versions of the same lists, used in the CSV and the printout.
name_separator = ", "
oncomine_genes_str = name_separator.join(gene_list_sorted)
oncomine_not_extracted_str = name_separator.join(oncomine_not_extracted_sorted)
oncomine_extracted_str = name_separator.join(matching_genes_sorted)
other_extracted_str = name_separator.join(non_matching_genes_sorted)

# Names of ALL extracted genes (Oncomine and other), for the
# "Extracted unique genes" row. That row previously repeated the full Oncomine
# gene list, which did not correspond to its count (total_extracted_genes).
all_extracted_str = ", ".join(sorted([*matching_genes.index, *non_matching_genes.index]))

# The table is written row by row so each category stays aligned with its
# count, percentage and gene list. Empty rows separate the three sections.
summary_columns = ["Category", "Count", "Percentage", "Genes"]
blank_row = ("", "", "", "")
summary_rows = [
    # Section 1: breakdown of the Oncomine gene list.
    ("Oncomine genes total", f"{len_gene_list:,}", "100.00", oncomine_genes_str),
    ("Oncomine genes not extracted", f"{len_gene_list - num_matching_genes:,}",
     f"{round(100 - percent_oncomine_extracted, 2):,.2f}", oncomine_not_extracted_str),
    ("Oncomine genes extracted", f"{num_matching_genes:,}",
     f"{percent_oncomine_extracted:,.2f}", oncomine_extracted_str),
    blank_row,
    # Section 2: breakdown of all extracted unique genes.
    ("Other genes extracted", f"{num_non_matching_genes:,}",
     f"{percent_other_extracted:,.2f}", other_extracted_str),
    ("Oncomine genes extracted", f"{num_matching_genes:,}",
     f"{percent_oncomine_extracted_of_all_extracted:,.2f}", oncomine_extracted_str),
    ("Extracted unique genes", f"{total_extracted_genes:,}", "100.00", all_extracted_str),
    blank_row,
    # Section 3: breakdown of gene mentions (no gene lists for these rows).
    ("Oncomine gene mentions", f"{total_matching_mentions:,}",
     f"{percent_oncomine_mentions:,.2f}", ""),
    ("Other gene mentions", f"{total_non_matching_mentions:,}",
     f"{percent_other_mentions:,.2f}", ""),
    ("Total gene mentions", f"{total_extracted_mentions:,}", "100.00", ""),
]

# Column-oriented dict, same shape as before: one list per column.
data = {column: list(values) for column, values in zip(summary_columns, zip(*summary_rows))}

df = pd.DataFrame(data)
csv_path = "oncomine_matched_gene_counts_summary.csv"
df.to_csv(csv_path, index=False)

# Print the summary table; rows with an empty category act as blank separators.
print("\n----------------- Gene Counts Summary -----------------")
for category, count, pct in zip(df["Category"], df["Count"], df["Percentage"]):
    if category != "":
        print(f"{category:>35} {str(count):>10} {str(pct):>10}")
    else:
        print("")

print(f"\nCSV file has been saved as {csv_path}.")

# Print the sorted gene names per category; empty lists are shown as "None."
print("\n\n\n----------- Print sorted gene names per category -------------")
print("\nOncomine genes in total.")
print(oncomine_genes_str)
gene_name_sections = (
    ("\nOncomine genes not extracted.", oncomine_not_extracted_str),
    ("\nOncomine genes extracted.", oncomine_extracted_str),
    ("\nOther extracted genes (non-Oncomine).", other_extracted_str),
)
for heading, names in gene_name_sections:
    print(heading)
    print(names or "None.")

In [None]:
# Define the data for the summary table saved for the figure.
# Counts and percentages are kept numeric (not pre-formatted strings).
data_summary = {
    "Category": [
        "Oncomine genes total",
        "Oncomine genes not extracted",
        "Oncomine genes extracted",
        "Oncomine gene mentions",
        "Other gene mentions",
        "Total gene mentions"
    ],
    "Count": [
        len_gene_list,
        len_gene_list - num_matching_genes,
        num_matching_genes,
        total_matching_mentions,
        total_non_matching_mentions,
        total_extracted_mentions
    ],
    "Percentage": [
        100.00,
        # Rounded so float subtraction noise (e.g. 100 - 99.99 ->
        # 0.010000000000005116) does not end up in the CSV.
        round(100 - percent_oncomine_extracted, 2),
        percent_oncomine_extracted,
        percent_oncomine_mentions,
        percent_other_mentions,
        100.00
    ]
}

df_summary = pd.DataFrame(data_summary)
csv_summary_path = "oncomine_gene_summary_stats_forfigure.csv"
df_summary.to_csv(csv_summary_path, index=False)
print(f"CSV file successfully saved as {csv_summary_path}")

In [None]:
# Investigate final dataset one more time
df = pd.read_csv("normalized_merged_variant_matrix_v4.csv")
variant_counts = df["total_variant_count"]

# Expected number of rows. It stays the denominator so the reported figures
# are unchanged; it is verified against the loaded file below.
# NOTE(review): presumably the size of the full dataset - confirm.
total_rows = 199726
count_ge_1 = (variant_counts >= 1).sum()
count_0 = total_rows - count_ge_1
percentage_ge_1 = (count_ge_1 / total_rows) * 100
percentage_0 = (count_0 / total_rows) * 100
print(f"Total number of rows: {total_rows:,}")
print(f"Rows where total_variant_count >= 1: {count_ge_1:,} ({percentage_ge_1:.2f}%)")
print(f"Rows where total_variant_count == 0: {count_0:,} ({percentage_0:.2f}%)")

# Verify the derived figures against the data. count_0 is derived from
# total_rows, so comparing total_rows with count_ge_1 + count_0 can never fail.
loaded_rows = len(df)
observed_0 = (variant_counts == 0).sum()
# Rows that are neither >= 1 nor == 0: NaN, negative or fractional values.
unclassified_rows = loaded_rows - count_ge_1 - observed_0
if loaded_rows == total_rows and observed_0 == count_0:
    print("The counts add up correctly.")
else:
    print("The counts do NOT add up. There might be missing or NaN values in 'total_variant_count'.")
    print(f" - Rows loaded from file: {loaded_rows:,} (expected {total_rows:,})")
    print(f" - Rows with total_variant_count == 0 in the data: {observed_0:,} (derived: {count_0:,})")
    print(f" - Rows that are neither >= 1 nor == 0 (e.g. NaN): {unclassified_rows:,}")