In [32]:
import requests

def download_gene_symbols(output_filepath = "data/raw/gene_info.gz", url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", verbose=0):
    """Download the gene_info archive at ``url`` and save it to ``output_filepath``.

    Args:
        output_filepath: Destination path of the downloaded file.
        url: Location of the archive to download.
        verbose: Verbosity level forwarded to ``msginfo``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    import os

    # Write to a side file first so an interrupted transfer never leaves a
    # truncated archive at the final path.
    partial_path = f"{output_filepath}.part"
    try:
        # stream=True keeps the archive out of memory; the timeout stops a
        # stalled connection from hanging the call forever.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
        os.replace(partial_path, output_filepath)
    finally:
        # Only still present when the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    msginfo(verbose, "Download completed.")

def extract_gene_data(filepath = "data/raw/gene_info.gz"):
    """Collect gene symbols, dbXrefs and synonyms from a gzip'd gene_info file.

    The file is read as tab-separated text. Lines starting with '#' are
    skipped. For every remaining row, column index 2 is taken as the gene
    symbol, index 4 as a '|'-separated synonym list and index 5 as a
    '|'-separated dbXref list; a value of "-" in columns 4/5 means "none".

    Args:
        filepath: Path to the gzip-compressed gene_info file.

    Returns:
        A tuple of three sets: ``(gene_symbols, dbxrefs, gene_synonyms)``.
    """
    import gzip

    gene_symbols = set()
    dbxrefs = set()
    gene_synonyms = set()

    with gzip.open(filepath, "rt") as file:
        for line in file:
            if line.startswith("#"):
                continue

            fields = line.strip().split("\t")
            # Blank or truncated rows do not have the columns read below;
            # skip them instead of failing with IndexError.
            if len(fields) < 6:
                continue

            gene_symbols.add(fields[2])

            # Extract dbXrefs
            dbxref_field = fields[5]
            if dbxref_field != "-":
                dbxrefs.update(dbxref_field.split("|"))

            # Extract gene synonyms
            synonym_field = fields[4]
            if synonym_field != "-":
                gene_synonyms.update(synonym_field.split("|"))

    return gene_symbols, dbxrefs, gene_synonyms

In [3]:
def read_search_stop(search_file, stop_words):
    """Read search terms from ``search_file``, one term per line.

    ``stop_words`` is accepted but not used by this function.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace, in file order (blank lines become empty strings).
    """
    with open(search_file, 'r') as handle:
        terms = list(map(str.strip, handle))
    return terms

def filter_search_terms(search_terms, stop_words):
    """Split search terms into stop-word matches and everything else.

    A term matches when its lower-cased form is in ``stop_words``. Matches
    keep their original case (so gene names that are also common English
    words can still be found case-sensitively); all other terms are
    lower-cased.

    Returns:
        A tuple ``(final_terms, filtered_terms, matched_stop_words)`` where
        ``final_terms`` is ``filtered_terms`` followed by
        ``matched_stop_words``.
    """
    # First pass: terms that collide with a stop word, original case kept.
    stop_word_hits = [term for term in search_terms if term.lower() in stop_words]

    # Second pass: the remaining terms, lower-cased.
    kept_lowercase = []
    for term in search_terms:
        lowered = term.lower()
        if lowered not in stop_words:
            kept_lowercase.append(lowered)

    combined = kept_lowercase + stop_word_hits
    return (combined, kept_lowercase, stop_word_hits)


In [5]:
from typing import List, Dict, Set

import nltk
from nltk import FreqDist
import string

def fetch_brown_corpus(category='learned'):
    """Return the words of one category of the NLTK Brown Corpus.

    Args:
        category: Brown Corpus category to read; defaults to 'learned',
            the category this function previously hard-coded.

    Returns:
        The value of ``brown.words(categories=category)``.
    """
    from nltk.corpus import brown

    # Ensure the corpus data is available locally before it is read.
    nltk.download('brown')
    return brown.words(categories=category)

def read_search_stop(search_file, stop_words):
    """Load the search terms stored one per line in ``search_file``.

    The ``stop_words`` argument is part of the signature but is not read
    here.

    Returns:
        List of the file's lines with leading/trailing whitespace removed,
        in the order they appear in the file.
    """
    terms = []
    with open(search_file, 'r') as handle:
        for raw_line in handle:
            terms.append(raw_line.strip())
    return terms

def filter_search_terms(search_terms, stop_words):
    """Partition ``search_terms`` by whether they collide with a stop word.

    Terms whose lower-cased form is in ``stop_words`` are returned unchanged
    (original case preserved); every other term is returned lower-cased.

    Returns:
        ``(final_terms, filtered_terms, matched_stop_words)``; ``final_terms``
        is the lower-cased non-matching terms followed by the matching terms.
    """
    def is_stop_word(term):
        return term.lower() in stop_words

    # Stop-word collisions keep their original spelling.
    matched = list(filter(is_stop_word, search_terms))

    # Everything else is normalised to lower case.
    lowered_rest = [term.lower() for term in search_terms if not is_stop_word(term)]

    return (lowered_rest + matched, lowered_rest, matched)

def create_stop_words(frequency_list_outpath, custom_words) -> Set:
    """Build the stop-word set used to flag common English words.

    The set is the union of NLTK's English stop words, every line of the
    file at ``frequency_list_outpath`` and the entries of ``custom_words``.

    Args:
        frequency_list_outpath: Text file with one word per line.
        custom_words: Iterable of additional words to include.

    Returns:
        The combined set of stop words.
    """
    from nltk.corpus import stopwords

    # Make sure the stopwords corpus is available locally.
    nltk.download('stopwords')

    combined = set(stopwords.words('english'))

    # Words from the corpus frequency list, one per line.
    with open(frequency_list_outpath, 'r') as handle:
        combined |= set(handle.read().splitlines())

    # Caller-supplied extras.
    combined |= set(custom_words)
    return combined


In [93]:
import os
import subprocess
import urllib.request
from urllib.error import URLError
from shutil import rmtree

def check_disk_space(predicted_size, download_dir, verbose):
    """Abort when ``download_dir`` lacks room for ``predicted_size`` bytes.

    Both sizes are formatted for display with the external ``numfmt`` tool
    and reported through ``msg1``.

    Args:
        predicted_size: Number of bytes about to be downloaded.
        download_dir: Directory whose filesystem is checked.
        verbose: Verbosity level forwarded to ``msg1``.

    Raises:
        SystemExit: With code 1 when the predicted size exceeds the space
            available to the current user.
    """
    required_space = predicted_size

    # Query the filesystem once so block size and free-block count come from
    # the same snapshot.
    fs_stats = os.statvfs(download_dir)
    available_space = fs_stats.f_frsize * fs_stats.f_bavail

    required_space_human = subprocess.check_output(['numfmt', '--to=iec-i', '--suffix=B', str(required_space)]).decode().strip()
    available_space_human = subprocess.check_output(['numfmt', '--to=iec-i', '--suffix=B', str(available_space)]).decode().strip()

    msg1(verbose, f"Predicted download size = {required_space_human}, Available space = {available_space_human}")

    if required_space > available_space:
        print(f"Insufficient disk space! Required: {required_space_human}, Available: {available_space_human}")
        # The exit() builtin is an interactive-shell helper; raise the same
        # exception directly.
        raise SystemExit(1)

def download_file(url, file_path, verbose):
    """Download ``url`` to ``file_path``, reporting failures instead of raising.

    This is best-effort: a ``URLError`` is reported through ``msg1`` and
    the function returns normally, so callers must validate the result
    themselves (for example with a checksum).

    Args:
        url: Source URL.
        file_path: Destination path on disk.
        verbose: Verbosity level forwarded to ``msg1``.
    """
    from urllib.error import ContentTooShortError

    try:
        urllib.request.urlretrieve(url, file_path)
    except URLError as e:
        msg1(verbose, f"Error downloading file: {url}")
        msg1(verbose, f"Reason: {str(e.reason)}")
        # A short read leaves a truncated file behind; remove it so a later
        # run does not mistake it for a completed download.
        if isinstance(e, ContentTooShortError) and os.path.isfile(file_path):
            os.remove(file_path)

def verify_md5(file_path, md5_file_path, verbose):
    """Check a download against its checksum file using ``md5sum -c``.

    ``md5sum`` is run from the directory containing ``md5_file_path`` so the
    file name recorded inside the checksum file resolves next to it. The
    outcome is only reported through ``msg1``/``msg2``; nothing is returned.

    Args:
        file_path: Path of the data file. Not read directly here; the file
            to check is the one named inside the checksum file.
        md5_file_path: Path of the checksum file.
        verbose: Verbosity level forwarded to ``msg1``/``msg2``.
    """
    # dirname() is '' for a bare file name, and subprocess cannot chdir to
    # an empty path, so fall back to the current directory.
    checksum_dir = os.path.dirname(md5_file_path) or "."
    try:
        output = subprocess.check_output(['md5sum', '-c', os.path.basename(md5_file_path)], cwd=checksum_dir, stderr=subprocess.DEVNULL).decode()
        if "OK" in output:
            msg2(verbose, f"{md5_file_path}: OK - MD5 checksum verification succeeded.")
        else:
            msg1(verbose, f"ERROR: {md5_file_path}: FAILED - MD5 checksum verification failed.")
    except subprocess.CalledProcessError:
        # md5sum exited non-zero (mismatch, unreadable file, ...).
        msg1(verbose, f"ERROR: {md5_file_path}: FAILED - MD5 checksum verification failed.")


In [6]:
import pandas as pd
import numpy as np
import re

def _joined_text(parent, path, separator=' '):
    """Return the complete text of all elements matching ``path``, or None if none match."""
    matches = parent.findall(path)
    if not matches:
        return None
    # itertext() also yields text nested inside inline markup (<i>, <sup>, ...),
    # which a plain findtext()/.text lookup would cut off.
    return separator.join(''.join(node.itertext()) for node in matches)


def extract_data(element):
    """Flatten one ``PubmedArticle`` XML element into a dict of strings.

    Args:
        element: An ElementTree element for a single ``PubmedArticle``.

    Returns:
        A dict with the keys, in this order: 'PMID', 'Title', 'Abstract',
        'Journal', 'PublicationDate', 'JournalTitle', 'ArticleType',
        'MeshHeadingList', 'PublicationTypeList'. Scalar fields are None when
        the corresponding element is absent; the two list fields are
        comma-joined strings (empty when there are no entries).
    """
    data = {}
    data['PMID'] = element.findtext('MedlineCitation/PMID')
    # Titles and abstracts may contain inline markup, and an abstract may be
    # split into several AbstractText sections; keep all of the text.
    data['Title'] = _joined_text(element, 'MedlineCitation/Article/ArticleTitle')
    data['Abstract'] = _joined_text(element, 'MedlineCitation/Article/Abstract/AbstractText')
    data['Journal'] = element.findtext('MedlineCitation/Article/Journal/Title')
    # Only the <Year> child of PubDate is read.
    data['PublicationDate'] = element.findtext('MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
    # Same source element as 'Journal'; both keys are kept so the column
    # layout of the resulting table stays unchanged.
    data['JournalTitle'] = element.findtext('MedlineCitation/Article/Journal/Title')
    # First publication type only; the full list is stored below.
    data['ArticleType'] = element.findtext('MedlineCitation/Article/PublicationTypeList/PublicationType')

    # Descriptor name followed by its qualifier names, for every MeSH heading.
    mesh_terms = []
    for heading in element.findall('.//MeshHeading'):
        mesh_terms.append(heading.findtext('DescriptorName'))
        mesh_terms.extend(qualifier.text for qualifier in heading.findall('QualifierName'))
    # Missing/empty names would make str.join raise or leave stray commas.
    data['MeshHeadingList'] = ','.join(term for term in mesh_terms if term)

    publication_types = element.findall('MedlineCitation/Article/PublicationTypeList/PublicationType')
    data['PublicationTypeList'] = ",".join(ptype.text for ptype in publication_types if ptype.text)

    return data

def prune_df(df, length_threshold = 405, verbose=2):
    """Drop articles that lack an abstract or date, or whose abstract is short.

    Args:
        df: DataFrame with at least 'Abstract' and 'PublicationDate' columns.
        length_threshold: Minimum abstract length, in characters, to keep.
        verbose: Verbosity level forwarded to ``msg2``.

    Returns:
        The rows of ``df`` that have a non-null abstract and publication date
        and an abstract of at least ``length_threshold`` characters.
    """
    # Step 1: require both an abstract and a publication date.
    has_required_fields = df['Abstract'].notna() & df['PublicationDate'].notna()
    with_required_fields = df[has_required_fields]
    count_before = len(with_required_fields)
    msg2(verbose, f"Number of all abstracts before pruning short articles = {count_before}")

    # Step 2: require a sufficiently long abstract.
    is_long_enough = with_required_fields['Abstract'].str.len() >= length_threshold
    pruned_df = with_required_fields[is_long_enough]
    count_after = len(pruned_df)
    msg2(verbose, f"Number after pruning short articles = {count_after}")
    msg2(verbose, f"Number discarded for being too short: {count_before - count_after}")

    return pruned_df

def get_pub_df(filename, inpath, outpath, length_threshold, prune=True,verbose=0):
    """Load one gzip'd PubMed XML file into a DataFrame of articles.

    Args:
        filename: Name of the ``.xml.gz`` file inside ``inpath``.
        inpath: Directory containing the file.
        outpath: Accepted for interface compatibility; not used here.
        length_threshold: Minimum abstract length passed to ``prune_df``.
        prune: When true, filter the articles with ``prune_df``.
        verbose: Verbosity level forwarded to ``msg2``/``prune_df``.

    Returns:
        A DataFrame with one row per distinct article (columns as produced by
        ``extract_data``). 'PublicationDate' is converted to integers; when
        some rows have no year (possible with ``prune=False``) the nullable
        'Int64' dtype is used instead of failing.
    """
    import gzip
    import xml.etree.ElementTree as ET
    import pandas as pd

    pubmed_filepath = os.path.join(inpath, filename)
    # Parse straight from the decompressed stream instead of first reading
    # the whole document into a bytes object.
    with gzip.open(pubmed_filepath, 'rb') as f:
        root = ET.parse(f).getroot()

    # Extract data from each article and store in a list
    articles = [extract_data(article) for article in root.findall('.//PubmedArticle')]

    # Create a DataFrame from the list of articles
    df = pd.DataFrame(articles)
    df = df.drop_duplicates()
    # A file without articles yields a frame without columns; there is
    # nothing to prune or convert in that case.
    if prune and not df.empty:
        msg2(verbose, f"Number of all articles:{len(df)}")
        df = prune_df(df, length_threshold = length_threshold, verbose=verbose)
        msg2(verbose, f"Number of pruned articles:{len(df)}")

    # convert objects to simple types
    if 'PublicationDate' in df.columns:
        years = pd.to_numeric(df['PublicationDate'])
        # Plain int cannot represent missing years; fall back to nullable Int64.
        years = years.astype('Int64') if years.isna().any() else years.astype(int)
        # assign() returns a new frame, so no write goes into a filtered slice.
        df = df.assign(PublicationDate=years)

    return df


In [95]:
def create_gene_reference_data(m: ReferenceData):
    """Download the gene_info archive and write the gene reference files.

    Paths, URL and verbosity are read from ``m``. The function then:
      1. downloads the archive to the raw-data directory,
      2. writes every gene symbol, one per line, to the gene-symbols file,
      3. writes dbXref values to ``<dbxref_path>/<database>.txt``, one file
         per database prefix (the text before the first ':', with '/'
         replaced by '_'),
      4. writes every gene synonym, one per line, to the gene-synonyms file.

    Every output file is rewritten from scratch, so repeated runs do not
    accumulate duplicate lines. Values come from sets, so line order within
    a file is arbitrary.

    Args:
        m: Configuration object providing the paths, file names, URL and
            verbosity used above.
    """
    raw_gene_info_filepath = os.path.join(m.raw_path(), m.gene_info_filename)
    reference_gene_symbols_filepath = os.path.join(m.reference_path(), m.gene_symbols_filename)
    reference_gene_synonyms_filepath = os.path.join(m.reference_path(), m.gene_synonyms_filename)
    dbxref_path = m.dbxref_path()
    verbose = m.verbose
    url = m.ncbi_gene_info_url

    # Download the gene symbols file
    download_gene_symbols(output_filepath = raw_gene_info_filepath, url = url, verbose = verbose)

    # Extract gene data
    gene_symbols, dbxrefs, gene_synonyms = extract_gene_data(filepath = raw_gene_info_filepath)

    # Save gene symbols to a file
    with open(reference_gene_symbols_filepath, "w") as file:
        file.writelines(symbol + "\n" for symbol in gene_symbols)

    msg2(verbose, f"Gene symbols saved to {reference_gene_symbols_filepath}")

    # Group dbXref values by database so each output file is opened once
    # instead of once per identifier.
    values_by_type = {}
    for identifier in dbxrefs:
        # Split on the first ':' only; the value itself may contain ':'.
        identifier_type, _, identifier_value = identifier.partition(":")
        identifier_type = identifier_type.replace('/','_')
        values_by_type.setdefault(identifier_type, []).append(identifier_value)

    # Save dbXrefs to separate files ("w", matching the symbol and synonym
    # files, so a rerun replaces the content rather than appending to it).
    for identifier_type, identifier_values in values_by_type.items():
        filename = f"{dbxref_path}/{identifier_type}.txt"
        with open(filename, "w") as file:
            file.writelines(value + "\n" for value in identifier_values)

    msg2(verbose, "dbXrefs saved to individual files.")

    # Save gene synonyms to a file
    with open(reference_gene_synonyms_filepath, "w") as file:
        file.writelines(synonym + "\n" for synonym in gene_synonyms)

    msg2(verbose, f"Gene synonyms saved to {reference_gene_synonyms_filepath}")

def create_frequency_list(m: ReferenceData) -> List:
    """Write the most frequent corpus words to the frequency-list file.

    Words come from ``fetch_brown_corpus()``. Tokens that are punctuation or
    not purely alphanumeric are dropped and the rest are lower-cased before
    counting. The ``m.corpus_stop_word_list_length`` most common words are
    written, one per line, to the frequency-list file under the search path.

    Args:
        m: Configuration object providing the output location, list length
            and verbosity.

    Returns:
        The list of ``(word, count)`` pairs returned by
        ``FreqDist.most_common``.
    """
    import nltk
    from nltk import FreqDist
    import string

    frequency_list_outpath = os.path.join(m.search_path(), m.frequency_list_filename)
    stop_word_list_length = m.corpus_stop_word_list_length
    verbose = m.verbose

    # Keep alphanumeric, non-punctuation tokens, normalised to lower case.
    normalized = []
    for token in fetch_brown_corpus():
        if token not in string.punctuation and token.isalnum():
            normalized.append(token.lower())

    # Count the tokens and take the top entries.
    most_common_words = FreqDist(normalized).most_common(stop_word_list_length)

    # One word per line; the counts are not written.
    with open(frequency_list_outpath, 'w') as file:
        file.writelines(word + '\n' for word, _count in most_common_words)
    msg2(verbose, f"Wrote {frequency_list_outpath}")
    return most_common_words

def create_search_terms_file(m: ReferenceData):
    """Merge dbXref values, gene symbols and synonyms into one search-terms file.

    The selected dbXref files (``m.dbxrefs``, or every regular file in the
    dbXref directory when that list is empty), the gene-symbols file and the
    gene-synonyms file are concatenated into ``<search_terms>.unsorted``.
    That file is then passed through ``sort -u | grep -v not`` to produce
    the final search-terms file.

    Args:
        m: Configuration object providing the paths, file names, dbXref
            selection and verbosity.
    """
    import os
    import shlex

    dbxrefs = m.dbxrefs
    dbxrefs_path = m.dbxref_path()
    gene_symbols_filepath = os.path.join(m.reference_path(), m.gene_symbols_filename)
    gene_synonyms_filepath = os.path.join(m.reference_path(), m.gene_synonyms_filename)
    search_terms_filepath = os.path.join(m.search_path(), m.search_terms_filename)
    search_terms_unsorted_filepath = f"{search_terms_filepath}.unsorted"
    verbose = m.verbose

    # No explicit selection: use every regular file in the dbXref directory.
    if not dbxrefs:
        dbxrefs = [f for f in os.listdir(dbxrefs_path) if os.path.isfile(os.path.join(dbxrefs_path, f))]

    # dbXref files first, then symbols, then synonyms.
    source_filepaths = [os.path.join(dbxrefs_path, ref) for ref in dbxrefs]
    source_filepaths += [gene_symbols_filepath, gene_synonyms_filepath]

    with open(search_terms_unsorted_filepath, "w") as outfile:
        for source_filepath in source_filepaths:
            with open(source_filepath) as infile:
                outfile.writelines(sorted(set(infile.readlines())))

    # Sort and remove duplicates from the search terms file.
    # Paths are quoted so spaces or shell metacharacters cannot break (or
    # alter) the command.
    # NOTE(review): `grep -v not` drops every line containing the substring
    # "not", not only the word itself - confirm this is intended.
    os.system(f"sort -u {shlex.quote(search_terms_unsorted_filepath)} | grep -v not > {shlex.quote(search_terms_filepath)}")

    msg2(verbose, f"Created {search_terms_filepath}.")
    msg2(verbose, f"Created {search_terms_unsorted_filepath} - can be removed.")
    # Close the file after counting instead of leaking the handle.
    with open(search_terms_filepath) as result_file:
        line_count = sum(1 for _ in result_file)
    msg2(verbose, f"Number of lines in {search_terms_filepath}: {line_count}")

def create_filtered_search_terms(m: ReferenceData) -> List:
    """Apply the stop-word filter to the search terms and save the result.

    Builds the stop-word set with ``create_stop_words``, reads the
    search-terms file, runs ``filter_search_terms`` and writes the resulting
    terms to the filtered-terms file, separated by newlines (no trailing
    newline).

    Args:
        m: Configuration object providing the paths, file names, custom
            stop words and verbosity.

    Returns:
        The list of final terms that was written to disk.
    """
    search_file = os.path.join(m.search_path(), m.search_terms_filename)
    frequency_list_outpath = os.path.join(m.search_path(), m.frequency_list_filename)
    custom_words = m.custom_stop_words
    final_file = os.path.join(m.search_path(), m.filtered_terms_filename)
    verbose = m.verbose

    stop_words = create_stop_words(frequency_list_outpath, custom_words)

    search_terms = read_search_stop(search_file = search_file, stop_words = stop_words)
    msg2(verbose, f"Number of original search_terms:{len(search_terms)}")
    final_terms, filtered_terms, matched_stop_words = filter_search_terms(search_terms, stop_words)
    msg2(verbose, f"number of filtered_terms:{len(filtered_terms)}\nfinal number of final_terms:{len(final_terms)}\n number of matched_stop_words:{len(matched_stop_words)}\nmatched_stop_words={matched_stop_words}")
    # os.path.join never returns None, so the file is always written.
    # write() emits the joined text in one call; writelines() on a single
    # string would iterate over it character by character.
    with open(final_file, "w") as f:
        f.write('\n'.join(final_terms))
    msg2(verbose, f"Created {final_file}")
    return final_terms


def fetch_abstracts(m: ReferenceData):
    """Ensure the newest PubMed baseline XML archives are present locally.

    Lists the NCBI baseline FTP directory with ``curl``, picks the
    ``m.num_abstract_xml_files`` highest-numbered ``pubmed*.xml.gz`` files
    (all of them when that value is negative or None), checks that the
    download directory has room for them, then downloads each archive and
    its ``.md5`` file and verifies the checksum. Files already present
    together with their ``.md5`` are skipped unless
    ``m.refresh_abstract_xml_files`` is true.

    Author's note: this can probably be done faster with download_files.sh.

    Args:
        m: Configuration object providing the download directory, file
            count, refresh flag and verbosity.

    Raises:
        SystemExit: With code 1 when no matching files are found on the
            server (or, via ``check_disk_space``, when disk space is short).
    """
    import re

    num_files = m.num_abstract_xml_files
    refresh = m.refresh_abstract_xml_files
    download_dir = m.pub_inpath()
    verbose = m.verbose

    msg2(verbose, f"Download Directory: {download_dir}")
    msg2(verbose, f"Number of abstracts to ensure have been downloaded: {num_files}")
    msg2(verbose, f"Refresh: {refresh}")

    # FTP settings
    ftp_host = "ftp.ncbi.nlm.nih.gov"
    ftp_path = "/pubmed/baseline/"

    # Retrieve file names; the name is the last column of each listing line
    output = subprocess.check_output(['curl', '-s', f"ftp://{ftp_host}{ftp_path}"]).decode()
    file_list = [line.split()[-1] for line in output.splitlines() if line.endswith(".xml.gz")]

    msg2(verbose, f"Total number of NCBI abstract XML files: {len(file_list)}")

    # Names look like pubmed23n1166.xml.gz. The two digits after "pubmed"
    # are presumably the baseline year (assumption), so match any two digits
    # rather than a hard-coded "pubmed23n" prefix.
    baseline_name = re.compile(r"pubmed\d{2}n\d+\.xml\.gz")
    latest_files = [file_name for file_name in file_list if baseline_name.fullmatch(file_name)]
    latest_files.sort(reverse=True)
    # A negative (or missing) count means "all files"; slicing with -1 would
    # silently drop one file instead.
    if num_files is not None and num_files >= 0:
        latest_files = latest_files[:num_files]
    msg2(verbose, f"latest_files {num_files}: {latest_files}")

    # Check if enough files are available
    if len(latest_files) == 0:
        msg1(verbose, "Error: Insufficient number of files available!")
        raise SystemExit(1)

    # Look up each file's size once (curl -I reports it as Content-Length);
    # the values are reused in the download loop below.
    file_sizes = {}
    for file_name in latest_files:
        response = subprocess.check_output(['curl', '-sI', f"ftp://{ftp_host}{ftp_path}{file_name}"]).decode()
        file_sizes[file_name] = int(response.split("Content-Length: ")[1].split("\r")[0])
    total_size = sum(file_sizes.values())

    # Check disk space before downloading
    check_disk_space(total_size, download_dir, verbose=verbose)

    # Download and check files
    for file_name in latest_files:
        md5_file_name = f"{file_name}.md5"
        file_path = os.path.join(download_dir, file_name)
        md5_file_path = os.path.join(download_dir, md5_file_name)

        # Refresh files that were previously downloaded?
        if not refresh:
            # No, so skip downloading those again

            # If one file or the other is missing, you still have to do a download
            # Here, just provide information as to which files are present.
            if os.path.isfile(file_path) and not os.path.isfile(md5_file_path):
                msg1(verbose, f"ERROR: Missing - {md5_file_path}; re-downloading now")
            if not os.path.isfile(file_path) and os.path.isfile(md5_file_path):
                msg1(verbose, f"ERROR: Missing - {file_path}; re-downloading now")

            if os.path.isfile(file_path) and os.path.isfile(md5_file_path):
                msg1(verbose, f"SKIP: {file_path} exists.")
                continue

        file_size = file_sizes[file_name]
        msg2(verbose, f"File: {file_name}, Size: {file_size} bytes")

        # Download file (remove any stale copy first)
        msg2(verbose, f"WARNING: Downloading: {file_name} to {download_dir}")
        if os.path.isfile(file_path):
            os.remove(file_path)
        download_file(f"ftp://{ftp_host}{ftp_path}{file_name}", file_path, verbose)

        # Download MD5 file
        if os.path.isfile(md5_file_path):
            os.remove(md5_file_path)
        download_file(f"ftp://{ftp_host}{ftp_path}{md5_file_name}", md5_file_path, verbose)

        # Check MD5
        verify_md5(file_path, md5_file_path, verbose)

    total_size_human = subprocess.check_output(['numfmt', '--to=iec-i', '--suffix=B', str(total_size)]).decode().strip()
    msg2(verbose, f"Total size of abstract files: {total_size_human}")

def create_pubcsv_dataset(m: ReferenceData) -> List:
    """Convert every downloaded PubMed XML archive into a tab-separated file.

    Each ``pubmed*.xml.gz`` file in the input directory is loaded and pruned
    with ``get_pub_df`` and written to ``<outpath>/<filename>.csv`` without
    header or index, using tab as separator.

    Author's timing note: takes about 14min for 30 (2 per minute).

    Args:
        m: Configuration object providing the input/output directories,
            abstract length threshold and verbosity.

    Returns:
        List of the CSV file paths that were written.
    """
    import os
    import glob

    abstract_length_threshold = m.abstract_length_threshold
    pub_inpath = m.pub_inpath()
    pub_outpath = m.pub_outpath()
    verbose = m.verbose

    csv_list = []
    archive_pattern = os.path.join(pub_inpath, "pubmed*.xml.gz")
    for filepath in glob.glob(archive_pattern):
        msg2(verbose, f"Converting file {filepath}")
        # Ignore anything matching the pattern that is not a regular file.
        if not os.path.isfile(filepath):
            continue

        filename = os.path.basename(filepath)
        df = get_pub_df(
            filename=filename,
            inpath=pub_inpath,
            outpath=pub_outpath,
            prune=True,
            length_threshold=abstract_length_threshold,
            verbose=verbose,
        )
        csv_filepath = os.path.join(pub_outpath, f"{filename}.csv")
        df.to_csv(csv_filepath, header=False, index=False, sep="\t")
        msg2(verbose, f"Wrote file:{csv_filepath}")
        csv_list.append(csv_filepath)

    return csv_list

# Quick Start

## Reload libraries in case they changed

In [80]:
# setup
import importlib

import gpubs
import gpubs.models
import gpubs.api

# Reload the package modules BEFORE binding names from them. A name bound with
# `from module import name` keeps pointing at the old object after a reload,
# so importing first and reloading afterwards would leave stale functions.
importlib.reload(gpubs)
importlib.reload(gpubs.models)
importlib.reload(gpubs.api)

from gpubs.models import ReferenceData
from gpubs.api import create_gene_reference_data, create_frequency_list, create_search_terms_file, create_filtered_search_terms, fetch_abstracts, create_pubcsv_dataset, create_gene_files


## Pipeline

In [89]:
# Create the data model; the same object `m` is passed to every pipeline step below
m = ReferenceData(version = "../../v1",       # make data root above any git repo
                  verbose = 2,                # print all the info messages
                  num_abstract_xml_files = 5, # only fetch 5 files from NCBI
                  dbxrefs = ["AllianceGenome.txt", "Ensembl.txt", "HGNC.txt", "IMGT_GENE-DB.txt"]  # exclude miRNA and MIM

                 )

# Display the model to check its values
m

version_root=/home/krobasky/prompt/repo/gpubs/src/../../v1/data/
Created directory structure.


ReferenceData(ncbi_gene_info_url='https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', data_root='data/', raw_data_path='raw/', reference_data_path='reference/', dbxref_reference_data_path='dbxrefs/', dbxrefs=['AllianceGenome.txt', 'Ensembl.txt', 'HGNC.txt', 'IMGT_GENE-DB.txt'], gene_info_filename='gene_info.gz', gene_symbols_filename='gene_symbols.txt', gene_synonyms_filename='gene_synonyms.txt', search_terms_path='search_terms/', frequency_list_filename='frequency_list.txt', corpus_stop_word_list_length=4000, custom_stop_words=['ago', 'aim', 'amid', 'april', 'arch', 'bed', 'bite', 'bug', 'cage', 'co', 'crop', 'damage', 'danger', 'digit', 'et', 'fast', 'fat', 'fate', 'fire', 'flower', 'gap', 'genesis', 'gov', 'gpa', 'grasp', 'ii', 'inos', 'iv', 'killer', 'lab', 'lamp', 'laser', 'map', 'mask', 'mater', 'melt', 'mice', 'minor', 'miss', 'mv', 'nail', 'net', 'not', 'osf', 'pan', 'par', 'pha', 'rab', 'race', 'rain', 'rank', 'san', 'sand', 'se', 'sink', 'sof

In [90]:
# Fetch data/raw/gene_info.gz and create the human genes lists under data/reference (gene_symbols.txt, gene_synonyms.txt, dbxrefs/*)
# NOTE(review): per the definition shown earlier in this notebook, the archive is downloaded again on every call - confirm against gpubs.api
create_gene_reference_data(m)

Download completed.
Gene symbols saved to /home/krobasky/prompt/repo/gpubs/src/../../v1/data/reference/gene_symbols.txt
dbXrefs saved to individual files.
Gene synonyms saved to /home/krobasky/prompt/repo/gpubs/src/../../v1/data/reference/gene_synonyms.txt


In [91]:
# The goal of the following 3 calls is to
# create data/search_terms/filtered_terms.txt, using an English-language corpus to flag common words

# Create a word frequency list from an English language corpus
# (the returned list is discarded here; the step's output is the frequency_list.txt file it writes)
_ = create_frequency_list(m)

[nltk_data] Downloading package brown to /home/krobasky/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Wrote /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/frequency_list.txt


In [92]:
# Create the file of gene search terms (data/search_terms/search_terms.txt) by merging the
# selected dbxref files with the gene symbols and gene synonyms; stop words are applied in the next step
create_search_terms_file(m)

Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt.
Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt.unsorted - can be removed.
Number of lines in /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt: 338143


In [93]:
# Create the filtered_terms.txt file
# - terms matching a stop word are kept in their original case; all other terms are lower-cased
final_terms = create_filtered_search_terms(m)

Number of original search_terms:338143
number of filtered_terms:337951
final number of final_terms:338143
 number of matched_stop_words:192
matched_stop_words=['ABO', 'ACE', 'ACT', 'AF', 'AGO', 'AID', 'AIM', 'AIR', 'ALL', 'AM', 'AMID', 'AN', 'APRIL', 'APT', 'ARC', 'ARCH', 'ARM', 'ARMS', 'ART', 'AS', 'ASK', 'AT', 'BAD', 'BANK', 'BASE', 'BED', 'BEST', 'BITE', 'BOD', 'BORIS', 'BRIGHT', 'BUG', 'CAGE', 'CALL', 'CAN', 'CAR', 'CAT', 'CELL', 'CHIP', 'CO', 'CROP', 'DAMAGE', 'DANGER', 'DC', 'DIGIT', 'DO', 'END', 'ET', 'ETA', 'FACE', 'FACT', 'FAST', 'FAT', 'FATE', 'FIND', 'FIRE', 'FLOWER', 'FOR', 'GAP', 'GAS', 'Genesis', 'GET', 'GO', 'GOV', 'GPA', 'GRASP', 'GREAT', 'H', 'HAD', 'HAS', 'HE', 'hELD', 'HIS', 'hole', 'HOT', 'HR', 'iCE', 'ICE', 'IF', 'II', 'IMPACT', 'IN', 'INOS', 'IV', 'JET', 'KILLER', 'LAB', 'LAMP', 'LARGE', 'LASER', 'LED', 'LIGHT', 'LIME', 'LIMIT', 'MA', 'MAIL', 'MAP', 'MARCH', 'MARK', 'MARS', 'MASK', 'MASS', 'MATER', 'ME', 'MELT', 'MEN', 'Met', 'MET', 'MG', 'MICE', 'MINOR', 'MISS', 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/krobasky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/filtered_terms.txt


In [94]:
# Check the length of the final terms: it should equal the number of original search terms,
# because stop-word matches are kept (in original case) rather than dropped
len(final_terms)

338143

In [95]:
# Fetch the NCBI article archives (gzip'd XML)
# - There are about 1100 files with about 15000 abstracts each.
# - ~60GB is needed to get all files
# - At about 2 min/file ... ~ 2 days to get 'em all
m.num_abstract_xml_files=3 # set to -1 to get all files
fetch_abstracts(m)

Download Directory: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/
Number of abstracts to ensure have been downloaded: 3
Refresh: False
Total number of NCBI abstract XML files: 1166
latest_files 3: ['pubmed23n1166.xml.gz', 'pubmed23n1165.xml.gz', 'pubmed23n1164.xml.gz']
Predicted download size = 150MiB, Available space = 111GiB
SKIP: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1166.xml.gz exists.
SKIP: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1165.xml.gz exists.
SKIP: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1164.xml.gz exists.
Total size of abstract files: 150MiB


In [96]:
#%%bash
# this would probably be faster, but harder to maintain
#VERSION_ROOT=v1/data
#VERBOSE=1
#./gpubs/scripts/download_pubs.sh -n 5 -d ${VERSION_ROOT}/raw/pubs -v ${VERBOSE} 2> download.err

In [97]:
%%time

# Create CSVs from XMLs
# - This takes about 3 minutes to do 10 files; or about 5 hours to do them all
# - Here we only need about a minute to do the 3 files we downloaded
# - csv_list receives the paths of the files that were written
csv_list = create_pubcsv_dataset(m)

Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1165.xml.gz
Number of all articles:29996
Number of all abstracts before pruning short articles = 25905
Number after pruning short articles = 16511
Number discarded for being too short: 9394
Number of pruned articles:16511
Wrote file:/home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/pubmed23n1165.xml.gz.csv
Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1166.xml.gz
Number of all articles:10710
Number of all abstracts before pruning short articles = 9250
Number after pruning short articles = 5558
Number discarded for being too short: 3692
Number of pruned articles:5558
Wrote file:/home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/pubmed23n1166.xml.gz.csv
Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1164.xml.gz
Number of all articles:29986
Number of all abstracts before pruning short articles = 26739
Number afte

In [98]:
%%time
# Create new CSVs that include GENES column under data/csvpubs/genes
# - Takes about 40s for 10 files, which is much slower than just running the awk script
# - Here, it should only take a few seconds for the 3 files we downloaded
# - With default settings, it filters out about 42% of the abstracts, most of which are 2022
# - create_gene_files is imported from gpubs.api; its definition is not shown in this notebook
create_gene_files(m)

Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
CPU times: user 7.72 ms, sys: 69.7 ms, total: 77.4 ms
Wall time: 5.31 s


In [99]:
#%%bash
# This is SO much faster, but not as sustainable.
#./gpubs/scripts/search.awk \
#  ./v4/data/search_terms/filtered_terms.txt \
#  ./v4/data/csvpubs/pubmed23n1166.xml.gz.csv \
#> ./v4/data/csvpubs/genes/pubmed23n1166.xml.gz.csv 2> ./v4/data/csvpubs/genes/pubmed23n1166.xml.gz.csv.err


In [100]:
# Check your work
# field 10 has the genes
# - first command: number of rows, across all files, whose 10th tab-separated field is non-empty
# - second command: number of rows in each output file
!awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
! wc -l ../../v1/data/csvpubs/genes/*.csv

22932
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
   39412 total


In [101]:
%%bash

# Check your work
# - If there are common words (like 'maps'), check in gene_info.gz if every occurrence is all-caps, and if so, add it to the custom_stop_words array in ReferenceData

# field 10 has the genes
# Count rows with a non-empty gene field, then count rows per file
awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
wc -l ../../v1/data/csvpubs/genes/*.csv
# NOTE(review): the only visible writer of a .csv.err file is the commented-out search.awk cell (which targets ./v4/...), so this file may not exist here - confirm
cat ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv.err
# Show the 81st-120th non-empty gene lists from one file as a spot check
awk -F'\t' '$10 != "" {print $10}' ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv|head -120|tail -40


22932
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
   39412 total
maps
SR
MB
toll
ANOVA
CT
rim
CT
Dkk1
MI
AS,TNC
DM
GDF-15
clock
STAT3,T-bet,IL-17A,TSC1,TSC2,IL-17F,M1,LPS,IL-17,MTOR,TSC,DSS
CD8,EGFR
STING,cGAS
APE1,GAD
CD4,CCl
Mb
IV
CI,HR
tech
MRS,SD
CI
AST
RPE
Cord,SCS,cord
OT,ROM,grip
AIS,DAO
II
CT
TNT
STR
CT
ASA
DM,KSA,SD
MIS
AH,atopy
CI
