# String matching of Oncomine Genes

## 1) Install libraries and load datasets

In [7]:
import pandas as pd
import os
import re
import time
from tqdm import tqdm
import time

print("Success!")

Success!


In [13]:
# Set the working directory and file paths
input_directory = "INPUT_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
articles_file = "ARTICLE_FILE_OPENALEX.csv"
genes_file = "GENE_FILE.csv"

# Change the working directory
os.chdir(input_directory)
print("Current Working Directory:", os.getcwd())

In [9]:
# Load the articles file
articles = pd.read_csv(articles_file)

# Load the genes file
genes = pd.read_csv(genes_file, header=None)
gene_list = genes[0].tolist()

In [10]:
# Change the working directory
os.chdir(output_directory)
print("Current Working Directory:", os.getcwd())

# Get the number of rows and columns
num_rows = articles.shape[0]
num_columns = articles.shape[1]

print(f"The articles df contains {num_rows:,} rows and {num_columns:,} columns.")
column_names = articles.columns.tolist()

print("The column names in the articles df are:")
for col in column_names:
    print(col)

Current Working Directory: /data/JH/marie/TrendyVariants/Output
The articles df contains 2,775,913 rows and 9 columns.
The column names in the articles df are:
PaperId
PaperTitle
Citations
c
Authors
Abstract
Language
PubYear
PubDate


## 2) Perform string matching

In [43]:
######## String matching of genes ########

# Start the timer
start_time = time.time()

# Create a DataFrame with all gene columns set to 0
gene_columns_df = pd.DataFrame(0, index=articles.index, columns=gene_list)

# Concatenate the gene columns to the original DataFrame
articles = pd.concat([articles, gene_columns_df], axis=1)

# Perform string matching on "PaperTitle" and "Abstract"
for index, row in tqdm(articles.iterrows(), total=len(articles), desc="Processing rows"):
    title = row.get("PaperTitle", "")
    abstract = row.get("Abstract", "")
    text = f"{title} {abstract}"

    for gene in gene_list:
        pattern = r'\b' + re.escape(gene) + r'\b' 
        if re.search(pattern, text): 
            articles.at[index, gene] = 1

# Add a column to store gene names where matches are found
def find_matching_genes(row):
    matched_genes = [gene for gene in gene_list if row[gene] == 1]
    return ", ".join(matched_genes)

articles['MatchedGenes'] = articles.apply(find_matching_genes, axis=1)

# Add a column to count the total number of matched genes
articles['MatchedGeneCount'] = articles[gene_list].sum(axis=1)

# Filter rows where at least one gene match was found
filtered_articles = articles[articles['MatchedGeneCount'] > 0]

# Stop the timer and calculate total runtime
end_time = time.time()
runtime = end_time - start_time

print(filtered_articles[['PaperTitle', 'Abstract', 'MatchedGeneCount', 'MatchedGenes']])
print(f"Total runtime: {runtime:.2f} seconds")

# Save the filtered result to a file
filtered_articles.to_csv("filtered_output_with_gene_names_and_counts_full_dataset.csv", index=False)
print("Filtered output with matched gene names and counts saved to 'filtered_output_with_gene_names_and_counts_full_dataset.csv'.")

Processing rows: 100%|█████████████████████████████████████████████████████████████████| 2775913/2775913 [2:47:26<00:00, 276.31it/s]


                                                PaperTitle  \
13       Tissue Prior to the Initial Hematoxylin-Eosin ...   
14       Detection of urine circulating tumor DNA using...   
16       In vitro characterization of some of the anti-...   
17       Evaluation of a Novel PLGA-HA-Based Drug Deliv...   
30       Network Pharmacology and Molecular Docking Stu...   
...                                                    ...   
2775813  MYCN-dependent expression of sulfatase-2 regul...   
2775836                                         EML4-ALK--   
2775837  GLI-IKBKE Requirement In KRAS-InducedPancreati...   
2775856        Novel Therapeutic Strategies in Lung Cancer   
2775879  Functional analysis of mutations in isocitrate...   

                                                  Abstract  MatchedGeneCount  \
13       Small biopsies are used for histologic, immuno...                 1   
14       Aim: This study aims to evaluate the feasibili...                 2   
16       Worldw

In [46]:
# Total number of rows in the original dataset
total_articles = len(articles)

# Count rows where at least one gene was matched
matched_articles = len(articles[articles['MatchedGeneCount'] > 0])

# Summarize the distribution of MatchedGeneCount
gene_count_summary = articles['MatchedGeneCount'].value_counts().sort_index()

# Filter for rows where MatchedGeneCount is between 1 and 20
summary_limited = gene_count_summary.loc[1:40]

print(f"Total number of articles in the original dataset: {total_articles}")
print(f"Number of articles with at least one gene match: {matched_articles}")
print("\nDistribution of MatchedGeneCount (1-40 genes per article):")
print(summary_limited)

Total number of articles in the original dataset: 2775913
Number of articles with at least one gene match: 285722

Distribution of MatchedGeneCount (1-40 genes per article):
MatchedGeneCount
1     192248
2      51841
3      18424
4       9300
5       4823
6       2764
7       1888
8       1317
9        851
10       624
11       441
12       298
13       240
14       183
15       112
16       106
17        79
18        51
19        32
20        13
21        17
22        17
23        11
24         6
25         3
26         2
27         2
28         6
29         4
30         1
31         4
32         1
33         4
36         1
38         1
Name: count, dtype: int64
