# Quick Start

## Reload libraries in case they changed

In [68]:
# setup
# Import the package and its submodules once, reload them so that source edits
# made after the kernel started are picked up, then bind the names used below.
import importlib

import gpubs
import gpubs.parse
import gpubs.models
import gpubs.api

# Reload the package first, then its submodules, with gpubs.api last.
for _gpubs_module in (gpubs, gpubs.models, gpubs.parse, gpubs.api):
    importlib.reload(_gpubs_module)

# Bind names only after reloading: `from ... import` copies a reference, so a
# name bound before the reload would keep pointing at the old object.
from gpubs.parse import parse_pubs
from gpubs.models import ReferenceData
from gpubs.api import (
    create_gene_reference_data,
    create_frequency_list,
    create_search_terms_file,
    create_filtered_search_terms,
    fetch_abstracts,
    create_pubcsv_dataset,
    create_gene_files,
)

## Pipeline

## Data model

In [40]:
# Create the data model; every pipeline call below takes `m` as its argument
m = ReferenceData(
    version="../../v1",  # make data root above any git repo
    verbose=2,  # print all the info messages
    num_abstract_xml_files=5,  # only fetch 5 files from NCBI
    # exclude miRNA and MIM
    dbxrefs=[
        "AllianceGenome.txt",
        "Ensembl.txt",
        "HGNC.txt",
        "IMGT_GENE-DB.txt",
    ],
)

# check the model values
m

version_root=/home/krobasky/prompt/repo/gpubs/src/../../v1/data/
Created directory structure.


ReferenceData(ncbi_gene_info_url='https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', ncbi_ftp_host='ftp.ncbi.nlm.nih.gov', ncbi_ftp_baseline_path='/pubmed/baseline/', ncbi_ftp_updatefiles_path='/pubmed/updatefiles/', data_root='data/', raw_data_path='raw/', reference_data_path='reference/', dbxref_reference_data_path='dbxrefs/', dbxrefs=['AllianceGenome.txt', 'Ensembl.txt', 'HGNC.txt', 'IMGT_GENE-DB.txt'], gene_info_filename='gene_info.gz', gene_symbols_filename='gene_symbols.txt', gene_synonyms_filename='gene_synonyms.txt', search_terms_path='search_terms/', frequency_list_filename='frequency_list.txt', corpus_stop_word_list_length=4000, custom_stop_words=['ago', 'aim', 'amid', 'april', 'arch', 'bed', 'bite', 'bug', 'cage', 'co', 'crop', 'damage', 'danger', 'digit', 'et', 'fast', 'fat', 'fate', 'fire', 'flower', 'gap', 'genesis', 'gov', 'gpa', 'grasp', 'ii', 'inos', 'iv', 'killer', 'lab', 'lamp', 'laser', 'map', 'mask', 'mater', 'melt', 'mice', 'mino

## Gene terms: fetch and parse

In [73]:
# Fetch data/raw/gene_info.gz and create the human gene lists under
# data/reference: gene_symbols.txt, gene_synonyms.txt and dbxrefs/*
create_gene_reference_data(m)

Download completed.
Gene symbols saved to /home/krobasky/prompt/repo/gpubs/src/../../v1/data/reference/gene_symbols.txt
dbXrefs saved to individual files.
Gene synonyms saved to /home/krobasky/prompt/repo/gpubs/src/../../v1/data/reference/gene_synonyms.txt


In [74]:
# The goal of this call and the two that follow is to
# create data/search_terms/filtered_terms.txt from an English language corpus

# Create a word frequency list from an English language corpus.
# The trailing semicolon suppresses the cell's Out[] display without rebinding
# `_`, which IPython uses for the most recent output.
create_frequency_list(m);

[nltk_data] Downloading package brown to /home/krobasky/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Wrote /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/frequency_list.txt


In [75]:
# Create the file of gene search terms (data/search_terms/search_terms.txt),
# using stop words from the frequency list written by the previous cell
create_search_terms_file(m)

Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt.
Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt.unsorted - can be removed.
Number of lines in /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/search_terms.txt: 338099


In [76]:
# Create the filtered_terms.txt file and keep the returned terms for inspection
# NOTE(review): the recorded output reports 338099 final_terms, the same as the
# number of original search_terms, although 248 stop words matched and 337851
# filtered_terms are reported -- confirm this is the intended result.
final_terms = create_filtered_search_terms(m)

Number of original search_terms:338099
number of filtered_terms:337851
final number of final_terms:338099
 number of matched_stop_words:248
matched_stop_words=['ABO', 'ACE', 'ACT', 'AF', 'AGO', 'AID', 'AIM', 'AIR', 'ALL', 'AM', 'AMID', 'AN', 'APEX', 'APP', 'APPS', 'APRIL', 'APT', 'ARC', 'ARCH', 'ARGS', 'ARM', 'ARMS', 'ART', 'AS', 'ASH', 'ASK', 'AT', 'BAD', 'BANK', 'BAR', 'BASE', 'BED', 'BEST', 'BIT', 'BITE', 'BOD', 'BORIS', 'BRIGHT', 'BUG', 'CAGE', 'CALL', 'CAN', 'CAR', 'CAST', 'CAT', 'CATS', 'CAVA', 'CD', 'CELL', 'CHIP', 'CLOCK', 'CO', 'COIL', 'COPE', 'CORD', 'CROP', 'DAMAGE', 'DANGER', 'DC', 'Delta', 'DELTA', 'DIGIT', 'DO', 'EAT', 'EG', 'END', 'ET', 'ETA', 'FACE', 'FACT', 'FAST', 'FAT', 'FATE', 'FATS', 'FIND', 'FIRE', 'FISH', 'FIX', 'FLAME', 'FLAP', 'FLOWER', 'FOR', 'FUSE', 'GAP', 'GAS', 'Genesis', 'GET', 'GO', 'GOV', 'GPA', 'GRASP', 'GREAT', 'GRID', 'GUM', 'H', 'HAD', 'HAS', 'HE', 'HEAL', 'hELD', 'HIP', 'HIS', 'HITS', 'hole', 'HOT', 'HR', 'HUB', 'iCE', 'ICE', 'IF', 'IGM', 'II', 'IMP

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/krobasky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Created /home/krobasky/prompt/repo/gpubs/src/../../v1/data/search_terms/filtered_terms.txt


In [77]:
# Check the length of final_terms (the value returned by create_filtered_search_terms)
len(final_terms)

338099

## Pubs: fetch and parse

In [41]:
# Fetch baseline NCBI article zips
# - There are about 1100 files in baseline (2023) with about 15000 abstracts each.
# - ~60GB is needed to get all files
# - Under an hour on AWS

# Override the file count chosen when the model was created;
# set to -1 to get all files
m.num_abstract_xml_files = 3

fetch_abstracts(m)

Download Directory: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/
Number of abstracts to ensure have been downloaded: 3
Refresh: False
file_list curl: curl -s ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/
Total number of NCBI abstract XML files: 1166
latest_files 3: ['pubmed23n1166.xml.gz', 'pubmed23n1165.xml.gz', 'pubmed23n1164.xml.gz']
Predicted download size = 150MiB, Available space = 108GiB
File: pubmed23n1166.xml.gz, Size: 23756263 bytes
/home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1166.xml.gz.md5: OK - MD5 checksum verification succeeded.
File: pubmed23n1165.xml.gz, Size: 68303781 bytes
/home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1165.xml.gz.md5: OK - MD5 checksum verification succeeded.
File: pubmed23n1164.xml.gz, Size: 64435332 bytes
/home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1164.xml.gz.md5: OK - MD5 checksum verification succeeded.
Total size of abstract files: 150MiB


In [42]:
# Fetch the NCBI updatefiles (same call as for the baseline, with get_updates switched on)
# - 222 updatefiles in June 2023
fetch_abstracts(m, get_updates=True)

! Getting update files, download_dir=/home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/updates/.
Download Directory: /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/updates/
Number of abstracts to ensure have been downloaded: 3
Refresh: False
file_list curl: curl -s ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
Total number of NCBI abstract XML files: 222
latest_files 3: ['pubmed23n1388.xml.gz', 'pubmed23n1387.xml.gz', 'pubmed23n1386.xml.gz']
Predicted download size = 86MiB, Available space = 108GiB
File: pubmed23n1388.xml.gz, Size: 16939468 bytes
/home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/updates/pubmed23n1388.xml.gz.md5: OK - MD5 checksum verification succeeded.
File: pubmed23n1387.xml.gz, Size: 35175073 bytes
/home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/updates/pubmed23n1387.xml.gz.md5: OK - MD5 checksum verification succeeded.
File: pubmed23n1386.xml.gz, Size: 37986545 bytes
/home/krobasky/prompt/repo/gpubs/src/../../v1/data/ra

In [43]:
#%%bash
# Disabled alternative to the fetch_abstracts cells above: download with the
# shell script instead of the Python API.
# NOTE: the values below (-n 5, VERSION_ROOT=v1/data) differ from the live cells,
# which fetch 3 files under ../../v1 -- adjust before enabling.
# this would probably be faster, but harder to maintain
#VERSION_ROOT=v1/data
#VERBOSE=1
#./gpubs/scripts/download_pubs.sh -n 5 -d ${VERSION_ROOT}/raw/pubs -v ${VERBOSE} 2> download.err

In [69]:
%%time

# Create CSVs from XMLs
# - This takes about 3 minutes to do 10 files; or about 5 hours to do all 1165 baseline files plus the update files
# - Here we only need about a minute to do the few files we downloaded
csv_list = create_pubcsv_dataset(m)

Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1165.xml.gz
Number of all articles:29996
Number of all abstracts before pruning short articles = 25905
Number after pruning short articles = 16511
Number discarded for being too short: 9394
Number of pruned articles:16511
Wrote file:/home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/pubmed23n1165.xml.gz.csv
Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1166.xml.gz
Number of all articles:10710
Number of all abstracts before pruning short articles = 9250
Number after pruning short articles = 5558
Number discarded for being too short: 3692
Number of pruned articles:5558
Wrote file:/home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/pubmed23n1166.xml.gz.csv
Converting file /home/krobasky/prompt/repo/gpubs/src/../../v1/data/raw/pubs/pubmed23n1164.xml.gz
Number of all articles:29986
Number of all abstracts before pruning short articles = 26739
Number afte

## Annotate with genes

In [78]:
%%time
# Create new CSVs that include GENES column under data/csvpubs/genes
# - Takes about 40s for 10 files, which is much slower than just running the awk script
# - Here, it should only take a few seconds: the recorded run lists 6 files
#   (3 baseline + 3 update) and a wall time of about 8 s
# - With default settings, it filters out about 42% of the abstracts, most of which are 2022
create_gene_files(m)

Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1387.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1388.xml.gz.csv
Creating /home/krobasky/prompt/repo/gpubs/src/../../v1/data/csvpubs/genes/pubmed23n1386.xml.gz.csv
CPU times: user 10.1 ms, sys: 29 ms, total: 39.1 ms
Wall time: 8.29 s


In [79]:
#%%bash
# Disabled alternative to create_gene_files: annotate one CSV with the awk script.
# NOTE: the paths below point at ./v4/data, while this notebook's model uses
# ../../v1 -- adjust before enabling.
# This is SO much faster, but not as sustainable.
#./gpubs/scripts/search.awk \
#  ./v4/data/search_terms/filtered_terms.txt \
#  ./v4/data/csvpubs/pubmed23n1166.xml.gz.csv \
#> ./v4/data/csvpubs/genes/pubmed23n1166.xml.gz.csv 2> ./v4/data/csvpubs/genes/pubmed23n1166.xml.gz.csv.err


In [80]:
# Check your work
# field 10 has the genes
# - first command: number of rows, across all gene CSVs, whose field 10 is non-empty
# - second command: line count of each gene CSV, plus the total
!awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
! wc -l ../../v1/data/csvpubs/genes/*.csv

31680
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
    7843 ../../v1/data/csvpubs/genes/pubmed23n1386.xml.gz.csv
    6584 ../../v1/data/csvpubs/genes/pubmed23n1387.xml.gz.csv
    3275 ../../v1/data/csvpubs/genes/pubmed23n1388.xml.gz.csv
   57114 total


In [83]:
# Check your work
# (this cell repeats the commands of the previous "Check your work" cell)
# field 10 has the genes
!awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
! wc -l ../../v1/data/csvpubs/genes/*.csv

31680
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
    7843 ../../v1/data/csvpubs/genes/pubmed23n1386.xml.gz.csv
    6584 ../../v1/data/csvpubs/genes/pubmed23n1387.xml.gz.csv
    3275 ../../v1/data/csvpubs/genes/pubmed23n1388.xml.gz.csv
   57114 total


In [87]:
# Counts copied from the "Check your work" output above:
# 31680 rows with a non-empty field 10 (genes), out of 57114 total lines (wc -l).
n_tagged_abstracts = 31680
n_total_lines = 57114
print(f"Fraction of abstracts tagged with genes: {n_tagged_abstracts/n_total_lines}")

Fraction of abstracts tagged with genes: 0.554680113457296


In [91]:
%%bash

# Check your work
# - If there are common words (like 'maps'), check in gene_info.gz if every occurrence is all-caps, and if so, add it to the custom_stop_words array in ReferenceData

# field 10 has the genes
# Count rows with a non-empty field 10 across all gene CSVs, then line counts per file
awk -F'\t' '$10 != ""{print $10}' ../../v1/data/csvpubs/genes/*.xml.gz.csv|wc -l
wc -l ../../v1/data/csvpubs/genes/*.csv
# Show the .err file for one gene CSV.
# NOTE(review): in this notebook only the commented-out search.awk cell writes a
# *.csv.err file -- confirm that create_gene_files produces it as well.
cat ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv.err
# Sample of the non-empty gene fields from one file: entries 81-120
awk -F'\t' '$10 != "" {print $10}' ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv|head -120|tail -40


31680
   17326 ../../v1/data/csvpubs/genes/pubmed23n1164.xml.gz.csv
   16528 ../../v1/data/csvpubs/genes/pubmed23n1165.xml.gz.csv
    5558 ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv
    7843 ../../v1/data/csvpubs/genes/pubmed23n1386.xml.gz.csv
    6584 ../../v1/data/csvpubs/genes/pubmed23n1387.xml.gz.csv
    3275 ../../v1/data/csvpubs/genes/pubmed23n1388.xml.gz.csv
   57114 total
rim
CT
Dkk1
MI
AS,TNC
DM
GDF-15
STAT3,T-bet,IL-17A,TSC1,TSC2,IL-17F,M1,LPS,IL-17,MTOR,TSC,DSS
CD8,EGFR
STING,cGAS
APE1,GAD
CD4,CCl
Mb
IV
CI,HR
tech
MRS,SD
CI
AST
RPE
SCS
OT,ROM,grip
AIS,DAO
II
CT
TNT
STR
CT
ASA
DM,KSA,SD
MIS
AH,atopy
CI
UK
TTM
ADAMTS13,TTP
Iceberg,spurt
IV,II,ASA
PM,UAP
ML


In [95]:
%%bash
# Tally how often each term occurs in field 10 of one gene CSV:
# split comma-separated lists onto separate lines, count duplicates with
# uniq -c, sort by count (descending) and show the 100 most frequent terms.
awk -F'\t' '$10 != "" {print $10}' ../../v1/data/csvpubs/genes/pubmed23n1166.xml.gz.csv|sed 's/,/\n/g'|sort|uniq -c|sort -g -r|head -100

    101 T
     79 CI
     78 P
     70 II
     63 CT
     57 MRI
     55 ROS
     47 OA
     38 UK
     34 polymerase
     31 H
     30 MS
     29 HCC
     28 SD
     27 IL-6
     26 OS
     25 GO
     25 era
     24 NMR
     22 Fe
     21 HR
     20 PA
     20 IV
     20 DM
     19 ML
     19 LPS
     19 HE
     19 ER
     17 ATP
     17 ASD
     16 TNF
     16 DEGs
     16 Cox
     16 CO
     16 BP
     15 PAH
     15 mTOR
     15 lobe
     14 Pb
     14 CRP
     14 CD8
     13 VEGF
     13 SA
     13 RT
     13 PD-1
     13 NPs
     13 ECM
     13 CSF
     13 Cr
     13 CD
     13 AS
     13 AR
     12 TEM
     12 PE
     12 p53
     12 NLRP3
     12 MR
     12 lag
     12 EMT
     12 EGFR
     12 CP
     12 bp
     11 VIP
     11 STAT3
     11 protease
     11 M1
     11 GC
     11 AST
     11 ANOVA
     10 zeta
     10 T1
     10 SOD
     10 SCD
     10 PD-L1
     10 PCA
     10 NIR
     10 MOF
     10 GI
     10 FDG
     10 DSC
      9 PM
      9 PH
      9 PC
      9 Nrf2
      