# String matching of genes against a predefined list (i.e., Oncomine v3)

## 1) Install libraries and load datasets

In [None]:
import pandas as pd
import os
import re
import time
from tqdm import tqdm
import time
# Reaching this line means every import above completed without raising
print("Success!")

In [None]:
# Names of the input files (read later by bare file name)
articles_file = "ARTICLE_FILE_OPENALEX.csv"
genes_file = "GENE_FILE.csv"

# Directories for reading the inputs and writing the results
input_directory = "INPUT_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"

# Switch into the input directory and confirm where we ended up
os.chdir(input_directory)
print("Current Working Directory:", os.getcwd())

In [None]:
# Load the articles file
articles = pd.read_csv(articles_file)

# Load the gene file (no header row; gene symbols are taken from column 0)
genes = pd.read_csv(genes_file, header=None)

# Clean the symbols before they are used as regex patterns and column names:
#   - drop missing cells (NaN is a float and would make re.escape() raise),
#   - strip surrounding whitespace (it would otherwise become part of the
#     search pattern and cause matches to be missed),
#   - discard blank entries,
#   - de-duplicate while keeping file order, so each gene maps to exactly one
#     column (duplicate column labels make row[gene] return a Series).
gene_symbols = genes[0].dropna().astype(str).str.strip()
gene_list = list(dict.fromkeys(symbol for symbol in gene_symbols if symbol))

In [None]:
# Move to the output directory; later relative writes land here
os.chdir(output_directory)
print("Current Working Directory:", os.getcwd())

# Report the dimensions of the articles table
num_rows, num_columns = articles.shape
print(f"The articles df contains {num_rows:,} rows and {num_columns:,} columns.")

# List every column name, one per line
column_names = articles.columns.tolist()
print("The column names in the articles df are:")
for col in column_names:
    print(col)

## 2) Perform string matching

In [None]:
######## String matching of genes ########
start_time = time.time()

# Compile each whole-word, case-sensitive gene pattern once up front instead
# of rebuilding the same pattern string for every (article, gene) pair.
gene_patterns = [re.compile(r'\b' + re.escape(gene) + r'\b') for gene in gene_list]


def _text_or_empty(value):
    """Return *value* as a string, mapping missing values (None/NaN) to ""."""
    # Without this, a missing abstract would be formatted as the text "nan".
    return "" if pd.isna(value) else str(value)


# If a column is absent, search an empty string for every row (same fallback
# as row.get(column, "")).
n_articles = len(articles)
titles = articles["PaperTitle"] if "PaperTitle" in articles.columns else [""] * n_articles
abstracts = articles["Abstract"] if "Abstract" in articles.columns else [""] * n_articles

# Perform string matching on "PaperTitle" and "Abstract".
# Each entry of match_rows is one article's list of 0/1 flags, in gene_list order.
match_rows = []
for title, abstract in tqdm(zip(titles, abstracts), total=n_articles, desc="Processing rows"):
    text = f"{_text_or_empty(title)} {_text_or_empty(abstract)}"
    match_rows.append([1 if pattern.search(text) else 0 for pattern in gene_patterns])

# Build the gene flag columns in one step rather than writing cell by cell
# into the wide articles frame. With no articles or no genes, fall back to an
# all-zero frame of the right shape.
if match_rows and gene_list:
    gene_columns_df = pd.DataFrame(
        match_rows, index=articles.index, columns=gene_list, dtype="int64"
    )
else:
    gene_columns_df = pd.DataFrame(0, index=articles.index, columns=gene_list)

# Concatenate the gene columns to the original DataFrame
articles = pd.concat([articles, gene_columns_df], axis=1)

# Add a column to store gene names where matches are found
def find_matching_genes(row):
    """Return the names of the genes flagged in *row* as one string.

    A gene counts as flagged when ``row[gene] == 1``. Names appear in
    ``gene_list`` order, separated by ", "; the result is an empty string
    when no gene is flagged.
    """
    hits = []
    for name in gene_list:
        if row[name] == 1:
            hits.append(name)
    return ", ".join(hits)

# Build the MatchedGenes column by applying the helper to every row
articles['MatchedGenes'] = articles.apply(find_matching_genes, axis=1)

# Per-article tally: row-wise sum over the gene columns
gene_flags = articles[gene_list]
articles['MatchedGeneCount'] = gene_flags.sum(axis=1)

# Keep only the articles in which at least one gene was found
has_match = articles['MatchedGeneCount'] > 0
filtered_articles = articles[has_match]

# Elapsed wall-clock time since start_time was recorded
end_time = time.time()
runtime = end_time - start_time

# Show the key columns of the matching articles, then the timing
preview_columns = ['PaperTitle', 'Abstract', 'MatchedGeneCount', 'MatchedGenes']
print(filtered_articles[preview_columns])
print(f"Total runtime: {runtime:.2f} seconds")

# Write the matching articles to CSV (relative path, no index column)
filtered_articles.to_csv("filtered_output_with_gene_names_and_counts_full_dataset.csv", index=False)
print("Filtered output with matched gene names and counts saved to 'filtered_output_with_gene_names_and_counts_full_dataset.csv'.")

In [None]:
# Size of the full (unfiltered) articles table
total_articles = articles.shape[0]

# Number of articles with a positive MatchedGeneCount
matched_articles = int((articles['MatchedGeneCount'] > 0).sum())

# How many articles have each MatchedGeneCount value, ordered by count value
count_frequencies = articles['MatchedGeneCount'].value_counts()
gene_count_summary = count_frequencies.sort_index()

# Restrict the distribution to counts from 1 through 40 (label slice, inclusive)
summary_limited = gene_count_summary.loc[1:40]

print(f"Total number of articles in the original dataset: {total_articles}")
print(f"Number of articles with at least one gene match: {matched_articles}")
print("\nDistribution of MatchedGeneCount (1-40 genes per article):")
print(summary_limited)