# String matching and NLP for variant extraction

# 1) Set up libraries and datasets

In [None]:
# Import libraries
import os
import re
import sys
import time
import requests
import numpy as np
import pandas as pd
import logging
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
from functools import reduce
from collections import Counter
from datetime import datetime, timedelta
print("Success!")

In [None]:
# Directory and file configuration (the directory values look like placeholders
# that are meant to be replaced with real paths before running).
working_directory = "WORKING_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
articles_file = "BioBERT_file.csv"

# The articles CSV is read relative to the output directory.
os.chdir(output_directory)
if "full_articles" in globals():
    # The cell was run before in this session: reuse the frame held in memory.
    print("Using preloaded full_articles from memory.")
else:
    full_articles = pd.read_csv(articles_file)
    print(f"Loaded {len(full_articles)} articles from CSV.")
# `articles` is another name for the same DataFrame object, not a copy.
articles = full_articles
print("Article import successful!")
print(f"\nImported {len(articles):,} articles with {len(articles.columns):,} selected columns.")

# Record the frame dimensions, then switch to the working directory.
num_rows, num_columns = articles.shape
os.chdir(working_directory)
print("\nCurrent Working Directory:", os.getcwd())

# 2) Run NLP approaches for variant extraction

In [None]:
# Load the NER pipeline and sanity-check it on a single example sentence.
import spacy
# NOTE(review): "en_ner_bionlp13cg_md" is presumably the scispaCy BioNLP13CG
# model, which has to be installed separately from spaCy - confirm.
nlp = spacy.load("en_ner_bionlp13cg_md")
text = "The variant R175H in TP53 and rs5643728 were observed in breast cancer patients."
doc = nlp(text)
# Print every recognised entity span together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Define relevant labels
# tqdm.pandas() registers the `progress_apply` method used on `articles` below.
tqdm.pandas()
# Entity labels kept by extract_relevant_entities; entities with any other
# label are discarded.
relevant_labels = {"AMINO_ACID", "GENE_OR_GENE_PRODUCT"}
def extract_relevant_entities(text):
    """Return the text of every entity in *text* that has a relevant label.

    The text is run through the module-level ``nlp`` pipeline and only
    entities whose label is in ``relevant_labels`` are kept.

    Args:
        text: The value to analyse. Non-string values are converted with
            ``str()``; missing values (``None`` or NaN) are not analysed.

    Returns:
        A list of entity strings in document order; empty when nothing
        relevant is found or when *text* is missing.
    """
    # A missing cell would otherwise be stringified to "None"/"nan" and sent
    # through the model as if it were real text. NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    doc = nlp(str(text))
    return [ent.text for ent in doc.ents if ent.label_ in relevant_labels]

# Apply NER with progress bar
# Each new column holds, per article, the list returned by
# extract_relevant_entities for the title and the abstract respectively.
articles["TitleEntities"] = articles["PaperTitle"].progress_apply(extract_relevant_entities)
articles["AbstractEntities"] = articles["Abstract"].progress_apply(extract_relevant_entities)
def has_any_entity(row):
    """Tell whether *row* has a non-empty title or abstract entity list.

    Args:
        row: A mapping-like row exposing "TitleEntities" and
            "AbstractEntities".

    Returns:
        True if either value is truthy, otherwise False.
    """
    # Title is checked first; the abstract is only inspected if the title
    # entity list is empty.
    for column in ("TitleEntities", "AbstractEntities"):
        if row[column]:
            return True
    return False
# Keep only the rows for which has_any_entity is true (evaluated row by row,
# with a progress bar).
# NOTE(review): boolean indexing returns a new DataFrame, so columns added to
# `articles` after this line do not appear in `articles_with_entities`.
articles_with_entities = articles[articles.progress_apply(has_any_entity, axis=1)]
print(f"Number of articles with relevant entities: {len(articles_with_entities)}")

def extract_real_variants(text):
    """Return the variant identifiers mentioned in *text*.

    Two patterns are recognised:

    * ``rs`` followed by digits, e.g. ``rs5643728``
    * an uppercase letter, digits, an uppercase letter, e.g. ``R175H``

    Args:
        text: The value to scan. It is converted with ``str()``, so
            non-string cells (for example NaN) are accepted and simply
            produce no matches.

    Returns:
        A list of unique matches: ``rs`` identifiers first, then the
        letter-digits-letter tokens, each group in order of first appearance.
    """
    text = str(text)
    # The leading \b stops "rs" + digits from matching inside a longer word
    # such as "hours24".
    matches = re.findall(r'\brs\d+', text)
    matches += re.findall(r'\b[A-Z]\d+[A-Z]\b', text)
    # dict.fromkeys removes duplicates while keeping insertion order; set()
    # ordering of strings can differ from one interpreter run to the next,
    # which made the saved output non-reproducible.
    return list(dict.fromkeys(matches))
# Regex-based variant extraction over titles and abstracts; each new column
# holds one list of matches per article.
articles["TitleVariants"] = articles["PaperTitle"].progress_apply(extract_real_variants)
articles["AbstractVariants"] = articles["Abstract"].progress_apply(extract_real_variants)

# Merge the TitleVariants and AbstractVariants lists of a row into one string,
# using "No variant" when both lists are empty.
def clean_variants(row):
    """Combine a row's title and abstract variants into a single string.

    Args:
        row: A mapping-like row exposing the lists "TitleVariants" and
            "AbstractVariants".

    Returns:
        ``"No variant"`` when both lists are empty; otherwise the unique
        variants joined with ``", "``, title variants first.
    """
    combined = row["TitleVariants"] + row["AbstractVariants"]
    if not combined:
        return "No variant"
    # A variant reported in both the title and the abstract is listed once;
    # dict.fromkeys keeps the first occurrence and the original order.
    return ", ".join(dict.fromkeys(combined))

# Build one variant string per article directly in the "NLP" column. Assigning
# the final name (instead of adding "AllVariants" and renaming it) keeps the
# cell re-runnable without producing a second column called "NLP".
articles["NLP"] = articles.apply(clean_variants, axis=1)

# Re-select the articles that have entities from the updated frame. The
# earlier `articles_with_entities` was filtered before the variant columns
# were added, so it has no "NLP" column and to_csv(columns=...) fails on it.
has_entities = articles["TitleEntities"].map(bool) | articles["AbstractEntities"].map(bool)
articles_with_entities = articles[has_entities]

columns_to_save = ["PaperId", "PaperTitle", "TitleEntities", "Abstract", "AbstractEntities", "NLP"]
# Written relative to the current working directory.
articles_with_entities.to_csv("articles_with_extracted_entities_and_variants.csv", index=False, columns=columns_to_save)
print("Saved to 'articles_with_extracted_entities_and_variants.csv'")

In [None]:
# Show the articles that have at least one relevant entity.
print(articles_with_entities)