# String matching and NLP for variant extraction

# 1) Set up libraries and datasets

In [None]:
# Import libraries
import os
import re
import sys
import time
import requests
import numpy as np
import pandas as pd
import logging
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
from functools import reduce
from collections import Counter
from datetime import datetime, timedelta
print("Success!")

In [None]:
# Folder locations and the name of the article table to load
working_directory = "WORKING_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
articles_file = "BioBERT_file.csv"

# The CSV path is resolved relative to the output directory. If a global
# named `full_articles` already exists it is reused instead of re-reading
# the file.
os.chdir(output_directory)
if "full_articles" in globals():
    print("Using preloaded full_articles from memory.")
else:
    full_articles = pd.read_csv(articles_file)
    print(f"Loaded {len(full_articles)} articles from CSV.")
articles = full_articles  # same object, not a copy
print("Article import successful!")
print(f"\nImported {len(articles):,} articles with {len(articles.columns):,} selected columns.")

# Record the table dimensions, then switch to the working directory
num_rows, num_columns = articles.shape
os.chdir(working_directory)
print("\nCurrent Working Directory:", os.getcwd())

# 2) Run string-matching for variant extraction

In [None]:
# CIViC GraphQL endpoint
url = "https://civicdb.org/api/graphql"

def run_query(query, variables=None, timeout=30):
    """Send a GraphQL request to the CIViC endpoint and return the decoded JSON.

    Args:
        query: GraphQL document, as a string.
        variables: Optional dict of variable values for the query. When
            omitted, ``None`` is sent (serialised as JSON null).
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises its timeout exception. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        RuntimeError: If the HTTP status is not 200, or if the response
            carries GraphQL ``errors`` and no usable ``data``.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    response = requests.post(
        url,
        json={'query': query, 'variables': variables},
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"Query failed with status code {response.status_code}")

    payload = response.json()
    # A GraphQL server can answer HTTP 200 and still report a failed request
    # through an "errors" member with missing/null "data". Surface that here
    # rather than letting callers fail later on payload['data'][...].
    if isinstance(payload, dict) and payload.get("errors") and not payload.get("data"):
        raise RuntimeError(f"Query returned GraphQL errors: {payload['errors']}")
    return payload

# Paginated query listing every feature, 100 per page; `$after` is the
# cursor returned by the previous page (null for the first page).
browse_features_query = """
query ($after: String) {
  browseFeatures(first: 100, after: $after) {
    edges {
      node {
        id
        name
        fullName
        featureInstanceType
        featureAliases
        deprecated
      }
    }
    pageInfo {
      hasNextPage
      endCursor
    }
  }
}
"""

# Walk the pages until the server reports there are no more, accumulating
# the `node` of every edge.
all_features = []
end_cursor = None
has_more_pages = True

while has_more_pages:
    variables = {"after": end_cursor}
    result = run_query(browse_features_query, variables)
    connection = result['data']['browseFeatures']

    edges = connection['edges']
    for edge in edges:
        all_features.append(edge['node'])

    page_info = connection['pageInfo']
    has_more_pages = page_info['hasNextPage']
    if has_more_pages:
        # Only advance the cursor when another page will be requested
        end_cursor = page_info['endCursor']

# Select the features whose name contains the search term (substring match)
# or that list the term as one of their aliases (exact match), ignoring case.
search_term = "ATM"
search_term_lower = search_term.lower()  # loop-invariant, computed once
matching_features = []

for feature in all_features:
    # The query requests these fields explicitly, so the keys are present
    # even when the value is JSON null. dict.get's default only applies to
    # *absent* keys, hence the `or` fallbacks to tolerate None values.
    name = (feature.get("name") or "").lower()
    aliases = [
        alias.lower()
        for alias in (feature.get("featureAliases") or [])
        if alias
    ]
    if search_term_lower in name or search_term_lower in aliases:
        matching_features.append(feature)

# Dump every field of each matching feature, one block per feature
print(f"\nFound {len(matching_features)} matching feature(s) for '{search_term}':\n")
for f in matching_features:
    for key, val in f.items():
        print(f"{key}: {val}")
    print("-" * 40)

# Query returning the variants attached to a single feature, looked up by id
variant_query = """
query ($id: Int!) {
  feature(id: $id) {
    id
    name
    variants {
      id
      name
      description
      hgvsDescriptions
    }
  }
}
"""

# For every matching feature, fetch its variants and print them
for feature in matching_features:
    feature_id = feature["id"]
    feature_name = feature["name"]
    print(f"\nFetching variants for feature '{feature_name}' (ID: {feature_id})...\n")

    result = run_query(variant_query, {"id": feature_id})
    # Any level of the response may be JSON null (decoded as None), e.g. when
    # no feature exists for the id. dict.get's default only covers absent
    # keys, so each level is normalised with `or` before descending.
    variants = (
        ((result.get("data") or {}).get("feature") or {}).get("variants") or []
    )

    if not variants:
        print("No variants found.")
        continue

    for v in variants:
        print(f"- Variant ID: {v['id']}")
        print(f"  Name: {v['name']}")
        # `or` so that a null/empty value also falls back to the placeholder
        print(f"  Description: {v.get('description') or 'N/A'}")
        print(f"  HGVS Descriptions: {v.get('hgvsDescriptions') or []}")
        print("-" * 40)

In [None]:
# Read the table stored in civic_gene_variants.csv (first row is the header)
# and display it alongside the articles table for a quick visual check.
variants_csv = "civic_gene_variants.csv"
variants_df = pd.read_csv(variants_csv, header=0)
for frame in (variants_df, articles):
    print(frame)

# Hand-picked search terms, stored lower-cased so matching is case-insensitive
custom_variant_terms = list(map(str.lower, ["BRCA1"]))


def has_custom_variant_match(row):
    """Return True if any custom term occurs in the row's title or abstract.

    `row` must offer a dict-style ``get``; the "PaperTitle" and "Abstract"
    values are converted with ``str`` and lower-cased before a plain
    substring search for each entry of ``custom_variant_terms``.
    """
    texts = [str(row.get(column, "")).lower() for column in ("PaperTitle", "Abstract")]
    for term in custom_variant_terms:
        if any(term in text for text in texts):
            return True
    return False

# Evaluate the custom-term predicate on every row, then keep the rows it flags
custom_match_mask = articles.apply(has_custom_variant_match, axis=1)
articles_stringmatching_custom = articles[custom_match_mask]

# Report the size of the filtered table
num_matched_custom = len(articles_stringmatching_custom)
print(f"Number of matched articles (custom terms): {num_matched_custom}")

In [None]:
# Build the set of distinct variant terms: each non-missing "Variant" cell is
# split on commas, and every non-blank piece is trimmed and lower-cased.
variant_terms = {
    piece.strip().lower()
    for cell in variants_df["Variant"].dropna()
    for piece in cell.split(",")
    if piece.strip()
}

# Row-level predicate: does any known variant term occur in the title or abstract?
def has_variant_match(row):
    """Return True if any entry of ``variant_terms`` is a substring of the row's text.

    The "PaperTitle" and "Abstract" values are fetched with ``row.get``,
    converted with ``str`` and lower-cased; each term is then looked for in
    both strings.
    """
    fields = tuple(
        str(row.get(column, "")).lower() for column in ("PaperTitle", "Abstract")
    )
    for variant in variant_terms:
        for text in fields:
            if variant in text:
                return True
    return False

# Evaluate the variant-term predicate on every row, then keep the rows it flags
variant_match_mask = articles.apply(has_variant_match, axis=1)
articles_stringmatching_variants = articles[variant_match_mask]

# Report how many articles matched
num_matched = len(articles_stringmatching_variants)
print(f"Number of matched articles: {num_matched}")