# Quick Start

## Reload libraries in case they changed

In [1]:
# setup
import importlib

import gpubs
import gpubs.models
import gpubs.api

# Re-execute the package and its submodules so that source edits made since the
# kernel started are picked up without restarting the kernel.
importlib.reload(gpubs)
importlib.reload(gpubs.models)
importlib.reload(gpubs.api)

# Bind the working names only AFTER the reloads. importlib.reload() does not
# rebind names that were previously pulled in with `from ... import ...`, so
# every function used below has to be re-imported here; otherwise the notebook
# would keep calling the pre-reload objects.
from gpubs.models import ReferenceData
from gpubs.api import (
    create_gene_reference_data,
    create_frequency_list,
    create_search_terms_file,
    create_filtered_search_terms,
    fetch_abstracts,
    create_pubcsv_dataset,
    create_gene_files,
)


## Pipeline

In [2]:
# Build the ReferenceData model that every pipeline step below receives.
m = ReferenceData(
    # Data root two directories up, so generated data stays above any git repo.
    version="../../v1",
    # Verbosity 2 prints all the info messages.
    verbose=2,
    # Only fetch 5 abstract XML files from NCBI.
    num_abstract_xml_files=5,
    # Cross-reference files to use; miRNA and MIM are excluded.
    dbxrefs=[
        "AllianceGenome.txt",
        "Ensembl.txt",
        "HGNC.txt",
        "IMGT_GENE-DB.txt",
    ],
)

# Echo the model so its field values can be inspected.
m

version_root=/home/krobasky/prompt/repo/gpubs/src/../../v1/data/
Created directory structure.


ReferenceData(ncbi_gene_info_url='https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', data_root='data/', raw_data_path='raw/', reference_data_path='reference/', dbxref_reference_data_path='dbxrefs/', dbxrefs=['AllianceGenome.txt', 'Ensembl.txt', 'HGNC.txt', 'IMGT_GENE-DB.txt'], gene_info_filename='gene_info.gz', gene_symbols_filename='gene_symbols.txt', gene_synonyms_filename='gene_synonyms.txt', search_terms_path='search_terms/', frequency_list_filename='frequency_list.txt', corpus_stop_word_list_length=4000, custom_stop_words=['ago', 'aim', 'amid', 'april', 'arch', 'bed', 'bite', 'bug', 'cage', 'co', 'crop', 'damage', 'danger', 'digit', 'et', 'fast', 'fat', 'fate', 'fire', 'flower', 'gap', 'genesis', 'gov', 'gpa', 'grasp', 'ii', 'inos', 'iv', 'killer', 'lab', 'lamp', 'laser', 'map', 'mask', 'mater', 'melt', 'mice', 'minor', 'miss', 'mv', 'nail', 'net', 'not', 'osf', 'pan', 'par', 'pha', 'rab', 'race', 'rain', 'rank', 'san', 'sand', 'se', 'sink', 'sof

In [3]:
# Download the NCBI gene_info file to data/raw/gene_info.gz and derive the human
# gene lists under data/reference/: gene_symbols.txt, gene_synonyms.txt, and the
# individual cross-reference files under dbxrefs/.
# The paths are resolved under the version root configured on `m`.
create_gene_reference_data(m)

Download completed.
Gene symbols saved to /home/krobasky/prompt/repo/gpubs/src/../../v1/data/reference/gene_symbols.txt
dbXrefs saved to individual files.
Gene synonyms saved to /home/krobasky/prompt/repo/gpubs/src/../../v1/data/reference/gene_synonyms.txt


In [4]:
# The goal of the following 3 calls is to
# create data/search_terms/filtered_terms.txt from an English-language corpus

# Create a word frequency list (data/search_terms/frequency_list.txt) from an
# English-language corpus; the run below pulls NLTK's "brown" package.
# Assigning to `_` keeps the return value from being echoed as the cell's output.
_ = create_frequency_list(m)

[nltk_data] Downloading package brown to /home/krobasky/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Wrote /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/frequency_list.txt


In [5]:
# Create the file of gene search terms (data/search_terms/search_terms.txt),
# using stop words taken from the frequency list.
# A search_terms.txt.unsorted intermediate is also written; the log below notes
# that it can be removed.
create_search_terms_file(m)

Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt.
Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt.unsorted - can be removed.
Number of lines in /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt: 338125


In [6]:
# Create data/search_terms/filtered_terms.txt and keep the returned terms for inspection.
# NOTE(review): in the run below the final count equals the original count
# (338125) even though 248 stop words were matched — confirm this is intended.
final_terms = create_filtered_search_terms(m)

Number of original search_terms:338125
number of filtered_terms:337877
final number of final_terms:338125
 number of matched_stop_words:248
matched_stop_words=['ABO', 'ACE', 'ACT', 'AF', 'AGO', 'AID', 'AIM', 'AIR', 'ALL', 'AM', 'AMID', 'AN', 'APEX', 'APP', 'APPS', 'APRIL', 'APT', 'ARC', 'ARCH', 'ARGS', 'ARM', 'ARMS', 'ART', 'AS', 'ASH', 'ASK', 'AT', 'BAD', 'BANK', 'BAR', 'BASE', 'BED', 'BEST', 'BIT', 'BITE', 'BOD', 'BORIS', 'BRIGHT', 'BUG', 'CAGE', 'CALL', 'CAN', 'CAR', 'CAST', 'CAT', 'CATS', 'CAVA', 'CD', 'CELL', 'CHIP', 'CLOCK', 'CO', 'COIL', 'COPE', 'CORD', 'CROP', 'DAMAGE', 'DANGER', 'DC', 'Delta', 'DELTA', 'DIGIT', 'DO', 'EAT', 'EG', 'END', 'ET', 'ETA', 'FACE', 'FACT', 'FAST', 'FAT', 'FATE', 'FATS', 'FIND', 'FIRE', 'FISH', 'FIX', 'FLAME', 'FLAP', 'FLOWER', 'FOR', 'FUSE', 'GAP', 'GAS', 'Genesis', 'GET', 'GO', 'GOV', 'GPA', 'GRASP', 'GREAT', 'GRID', 'GUM', 'H', 'HAD', 'HAS', 'HE', 'HEAL', 'hELD', 'HIP', 'HIS', 'HITS', 'hole', 'HOT', 'HR', 'HUB', 'iCE', 'ICE', 'IF', 'IGM', 'II', 'IMP

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/krobasky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/filtered_terms.txt


In [7]:
# Sanity check: number of terms returned by create_filtered_search_terms
len(final_terms)

338125

In [9]:
# Fetch gzipped NCBI abstract XML files (pubmed*.xml.gz) into data/raw/pubs/
# - There are about 1100 files with about 15000 abstracts each.
# - ~60GB is needed to get all files
# - At about 2 min/file, fetching everything takes roughly 2 days
# - Files already present in the download directory are skipped (see the SKIP lines below)
# NOTE: this overrides the num_abstract_xml_files = 5 passed when `m` was created.
m.num_abstract_xml_files=20 # set to -1 to get all files
fetch_abstracts(m)

Download Directory: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/
Number of abstracts to ensure have been downloaded: 20
Refresh: False
Total number of NCBI abstract XML files: 1166
latest_files 20: ['pubmed23n1166.xml.gz', 'pubmed23n1165.xml.gz', 'pubmed23n1164.xml.gz', 'pubmed23n1163.xml.gz', 'pubmed23n1162.xml.gz', 'pubmed23n1161.xml.gz', 'pubmed23n1160.xml.gz', 'pubmed23n1159.xml.gz', 'pubmed23n1158.xml.gz', 'pubmed23n1157.xml.gz', 'pubmed23n1156.xml.gz', 'pubmed23n1155.xml.gz', 'pubmed23n1154.xml.gz', 'pubmed23n1153.xml.gz', 'pubmed23n1152.xml.gz', 'pubmed23n1151.xml.gz', 'pubmed23n1150.xml.gz', 'pubmed23n1149.xml.gz', 'pubmed23n1148.xml.gz', 'pubmed23n1147.xml.gz']
Predicted download size = 1.3GiB, Available space = 110GiB
SKIP: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1166.xml.gz exists.
SKIP: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1165.xml.gz exists.
SKIP: /home/krobasky/prompt/repo/gpubs/src/../../v1/data

In [10]:
#%%bash
# this would probably be faster, but harder to maintain
#VERSION_ROOT=v1/data
#VERBOSE=1
#./gpubs/scripts/download_pubs.sh -n 5 -d ${VERSION_ROOT}/raw/pubs -v ${VERBOSE} 2> download.err

In [11]:
%%time

# Create CSVs from XMLs: each file under data/raw/pubs/ is converted to a CSV under data/csvpubs/
# - This takes about 3 minutes to do 10 files; or about 5 hours to do them all
# - Expect about a minute when only 3 files were downloaded; the fetch cell above
#   currently requests 20 (m.num_abstract_xml_files), so allow proportionally longer
# - Short abstracts are pruned during conversion (see the counts logged below)
csv_list = create_pubcsv_dataset(m)

Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1160.xml.gz
Number of all articles:29995
Number of all abstracts before pruning short articles = 27392
Number after pruning short articles = 18053
Number discarded for being too short: 9339
Number of pruned articles:18053
Wrote file:/home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/pubmed23n1160.xml.gz.csv
Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1165.xml.gz
Number of all articles:29996
Number of all abstracts before pruning short articles = 25905
Number after pruning short articles = 16511
Number discarded for being too short: 9394
Number of pruned articles:16511
Wrote file:/home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/pubmed23n1165.xml.gz.csv
Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1156.xml.gz
Number of all articles:29997
Number of all abstracts before pruning short articles = 25770
Number a

In [12]:
%%time
# Create new CSVs that include a GENES column under data/csvpubs/genes
# - Takes about 40s for 10 files, which is much slower than just running the awk script
# - Expect a few seconds when only 3 files were downloaded; the fetch cell above
#   currently requests 20 (m.num_abstract_xml_files), so allow proportionally longer
# - With default settings, it filters out about 42% of the abstracts, most of which are 2022
create_gene_files(m)

Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1149.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1162.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1153.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1161.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1158.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1160.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1150.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1157.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1148.xml.gz.csv
Creating /

In [13]:
#%%bash
# This is SO much faster, but not as sustainable.
#./gpubs/scripts/search.awk \
#  ./v4/data/search_terms/filtered_terms.txt \
#  ./v4/data/csvpubs/pubmed23n1166.xml.gz.csv \
#> ./v4/data/csvpubs/genes/pubmed23n1166.xml.gz.csv 2> ./v4/data/csvpubs/genes/pubmed23n1166.xml.gz.csv.err


In [14]:
# Check your work
# - Field 10 (tab-separated) has the genes
# - First command: count the rows, across all gene CSVs, whose gene field is non-empty
# - Second command: line count for each gene CSV, followed by the total
!awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
! wc -l ../../v1/data/csvpubs/genes/*.csv

176107
    16895 ../../v1/data/csvpubs/genes/pubmed23n1147.xml.gz.csv
    15643 ../../v1/data/csvpubs/genes/pubmed23n1148.xml.gz.csv
    16652 ../../v1/data/csvpubs/genes/pubmed23n1149.xml.gz.csv
    16110 ../../v1/data/csvpubs/genes/pubmed23n1150.xml.gz.csv
    13112 ../../v1/data/csvpubs/genes/pubmed23n1151.xml.gz.csv
    16399 ../../v1/data/csvpubs/genes/pubmed23n1152.xml.gz.csv
    17441 ../../v1/data/csvpubs/genes/pubmed23n1153.xml.gz.csv
    15916 ../../v1/data/csvpubs/genes/pubmed23n1154.xml.gz.csv
    17616 ../../v1/data/csvpubs/genes/pubmed23n1155.xml.gz.csv
    16461 ../../v1/data/csvpubs/genes/pubmed23n1156.xml.gz.csv
    15947 ../../v1/data/csvpubs/genes/pubmed23n1157.xml.gz.csv
    17852 ../../v1/data/csvpubs/genes/pubmed23n1158.xml.gz.csv
    16430 ../../v1/data/csvpubs/genes/pubmed23n1159.xml.gz.csv
    18152 ../../v1/data/csvpubs/genes/pubmed23n1160.xml.gz.csv
    15946 ../../v1/data/csvpubs/genes/pubmed23n1161.xml.gz.csv
    17901 ../../v1/data/csvpubs/genes/pubmed23n1

In [17]:
# Fraction of rows with at least one matched gene: non-empty-gene rows / total rows.
# - 176107 and 22932 are the awk counts printed by the two "Check your work" runs
# - 39412 is the `wc -l` total printed by the run that had 3 gene CSVs
# - NOTE(review): 317789 is presumably the `wc -l` total of the larger run — confirm
176107/317789, 22932/39412

(0.5541632970304196, 0.581853242667208)

In [17]:
# Check your work (the output below is from a run with only 3 gene CSVs present)
# - Field 10 (tab-separated) has the genes
# - First command: count the rows, across all gene CSVs, whose gene field is non-empty
# - Second command: line count for each gene CSV, followed by the total
!awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
! wc -l ../../v1/data/csvpubs/genes/*.csv

22932
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
   39412 total


In [18]:
%%bash

# Check your work
# - If there are common words (like 'maps'), check in gene_info.gz if every occurrence is all-caps, and if so, add it to the custom_stop_words array in ReferenceData

# Field 10 (tab-separated) has the genes.
# Count the rows, across all gene CSVs, whose gene field is non-empty.
awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
# Line count for each gene CSV, followed by the total.
wc -l ../../v1/data/csvpubs/genes/*.csv
# Show the error log for the newest file.
# NOTE(review): the only place this notebook writes a .csv.err file is the
# commented-out search.awk invocation; confirm create_gene_files also produces it.
cat ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv.err
# Sample the matches: print non-empty gene fields 81 through 120 of the newest file.
awk -F'\t' '$10 != "" {print $10}' ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv|head -120|tail -40


22932
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
   39412 total
maps
SR
MB
toll
ANOVA
CT
rim
CT
Dkk1
MI
AS,TNC
DM
GDF-15
clock
STAT3,T-bet,IL-17A,TSC1,TSC2,IL-17F,M1,LPS,IL-17,MTOR,TSC,DSS
CD8,EGFR
STING,cGAS
APE1,GAD
CD4,CCl
Mb
IV
CI,HR
tech
MRS,SD
CI
AST
RPE
Cord,SCS,cord
OT,ROM,grip
AIS,DAO
II
CT
TNT
STR
CT
ASA
DM,KSA,SD
MIS
AH,atopy
CI
