# String matching-based gene extraction

# 1) Import libraries and load dataset

In [None]:
import os
import re
import time
import pandas as pd
from tqdm import tqdm
from mygene import MyGeneInfo
import matplotlib.pyplot as plt
from fuzzywuzzy import process, fuzz
print("Import successful!")

In [None]:
# Set the working directory and file paths.
# The three directory values below are literal placeholder strings; the later
# cells pass them straight to os.chdir(), so they presumably need to be
# replaced with real (ideally absolute) paths before running -- TODO confirm.
working_directory = "WORKING_DIRECTORY"  # outputs of the analysis are written here
input_directory = "INPUT_DIRECTORY"      # directory the genes file is read from
output_directory = "OUTPUT_DIRECTORY"    # directory the articles file is read from
articles_file = "articles.csv"           # article table, loaded with a header row
genes_file = "genes.csv"                 # reference gene list, loaded without a header row

# Change the working directory (process-wide side effect: every relative path
# used after this point is resolved against output_directory).
os.chdir(output_directory)
print("Current Working Directory:", os.getcwd())

In [None]:
# Load the reference genes file (single column, no header row).
# The column is read as text and missing/blank entries are dropped so that
# every element of gene_list is a non-empty string: a NaN or numeric entry
# would otherwise break the later `gene.upper()` normalisation, and a blank
# symbol would turn into a regex that matches every article.
# NOTE(review): pandas' default NA parsing still treats tokens such as "NA"
# or "NULL" as missing, so symbols spelled that way are dropped here.
os.chdir(input_directory)
genes = pd.read_csv(genes_file, header=None, dtype=str)
gene_list = [symbol.strip() for symbol in genes[0].dropna() if symbol.strip()]
print("Genes import successful!")

# Load the articles file, reusing `full_articles` when an earlier run of this
# cell (or another cell) has already placed it in the notebook namespace.
os.chdir(output_directory)
if "full_articles" not in globals():
    full_articles = pd.read_csv(articles_file)
    print(f"Loaded {len(full_articles)} articles from CSV.")
else:
    print("Using preloaded full_articles from memory.")

# Only the first 100 articles are analysed.
articles = full_articles.head(100)
print("Article import successful!")
print(f"\nImported {len(articles)} articles with {len(articles.columns)} selected columns.")
print(f"Imported {len(gene_list):,} oncomine genes.")

# Get the number of rows and columns
num_rows, num_columns = articles.shape

# Everything from here on is written relative to the working directory.
os.chdir(working_directory)
print("\nCurrent directory:", os.getcwd())

# Save the analysed subset for reference (the DataFrame index is written too).
ICIMTH_output = "ICIMTH_subset_data_file_for_analysis.csv"
articles.to_csv(ICIMTH_output)
print(f"\n Subset dataset for analysis saved as '{ICIMTH_output}'")

# 2) Run string matching

In [None]:
# Upper-case every reference gene symbol. The matcher defined below searches
# an upper-cased copy of the article text, so the symbols are normalised to
# the same case here. Every entry must be a string for `.upper()` to work.
gene_list = [gene.upper() for gene in gene_list]

def extract_genes_exact_match(text, gene_list):
    """Return the gene symbols from *gene_list* that occur in *text* as whole tokens.

    Matching is case-insensitive: both the text and each symbol are
    upper-cased before comparison. A symbol only counts when it is not
    directly preceded or followed by a word character, so ``TP53`` is not
    reported for ``TP53BP1``.

    Args:
        text: Text to scan. Anything that is not a non-blank string
            (``None``, NaN, numbers, ``""``, whitespace) yields an empty set.
        gene_list: Iterable of gene symbols. Entries that are not strings or
            are blank are ignored.

    Returns:
        A set holding the matching symbols exactly as they appear in
        *gene_list*.
    """
    # Missing cells arrive as None/NaN (or other non-string values); treat
    # them like empty text instead of failing on `.strip()`.
    if not isinstance(text, str) or not text.strip():
        return set()

    matched_genes = set()
    text_upper = text.upper()

    for gene in gene_list:
        # A blank symbol would compile to a pattern that matches any text.
        if not isinstance(gene, str) or not gene.strip():
            continue

        gene_upper = gene.upper()

        # Cheap substring test first: the regex can only succeed if the
        # symbol occurs at all, and most symbols are absent from most texts.
        if gene_upper not in text_upper:
            continue

        # Look-arounds instead of \b: identical for symbols that begin and
        # end with a word character, but they also work for symbols with a
        # leading/trailing non-word character (e.g. "IGH@"), where \b fails.
        if re.search(rf"(?<!\w){re.escape(gene_upper)}(?!\w)", text_upper):
            matched_genes.add(gene)

    return matched_genes

# Record when the run started: a human-readable stamp for the log line and
# the epoch seconds for any later elapsed-time calculation.
start_time = time.strftime("%Y-%m-%d %H:%M:%S")
start_timestamp = time.time()

print(f"Processing {len(articles)} articles using exact string matching. Started at {start_time}")

# One comma-separated string of matched genes per article, in row order.
all_results = []

for index, row in tqdm(articles.iterrows(), total=len(articles), desc="Processing Articles"):
    # A missing column falls back to "", which the matcher treats as no text.
    title = row.get("PaperTitle", "")
    abstract = row.get("Abstract", "")

    # A gene counts once per article, whether it is in the title, the abstract or both.
    extracted_entities = extract_genes_exact_match(title, gene_list) | extract_genes_exact_match(abstract, gene_list)

    # Sort before joining: the iteration order of a set of strings changes
    # between interpreter runs, which made the saved CSV differ run to run.
    all_results.append(", ".join(sorted(extracted_entities)))

df_results = articles.copy()
df_results["String_Matching"] = all_results

# Split the joined string back into a list per article. An article without
# matches has "", which must become [] rather than the phantom entry [""].
df_results["Extracted_Entities"] = df_results["String_Matching"].apply(
    lambda x: x.split(", ") if isinstance(x, str) and x else []
)

# Binary article x gene matrix: 1 when the gene was found in the article.
# Per-row sets give constant-time membership tests, and the explicit index
# keeps the matrix aligned with df_results for the concat below.
entity_sets = [set(entities) for entities in df_results["Extracted_Entities"]]
binary_entity_data = {gene: [int(gene in found) for found in entity_sets] for gene in gene_list}
binary_entity_df = pd.DataFrame(binary_entity_data, index=df_results.index)
df_results = pd.concat([df_results, binary_entity_df], axis=1)
df_results["Sum_Entity_Mentions"] = binary_entity_df.sum(axis=1)

# The helper list column is kept in memory but left out of the exported files.
export_df = df_results.drop(columns=["Extracted_Entities"], errors="ignore")

binary_output_file = "string_matching_entity_binary_matrix.csv"
export_df.to_csv(binary_output_file, index=False)
print(f"Binary matrix saved as: {binary_output_file}")

# Total number of (article, gene) hits across the whole subset.
summary_results = df_results["Sum_Entity_Mentions"].sum()
print(f"\n### Extraction Summary ###\nTotal Gene Mentions: {summary_results}")

summary_file = "string_matching_entity_extraction_summary.txt"
with open(summary_file, "w", encoding="utf-8") as f:
    f.write(f"### Extraction Summary ###\nTotal Gene Mentions: {summary_results}\n")

print(f"Extraction summary saved in: {summary_file}")

# Same table as the binary matrix file, written under the name the
# evaluation step expects.
evaluation_results_file = "string_matching_evaluation_results.csv"
export_df.to_csv(evaluation_results_file, index=False)
print(f"Evaluation results saved in: {evaluation_results_file}")