## Download and parse all Pubmed abstracts with the search term 'taste'

The following functions are defined, both of which use Biopython's `Entrez` module:
- `search`: searches the PubMed database with a query term and a maximum number of hits. A list of PubMed IDs is returned.
- `pullarticles`: takes the ID list (and the name of a SQLite database already set up with an `articles` table), pulls all PubMed articles that have an abstract, and stores them in the database.

The functions are run with the search term `taste` and the database `taste.db`. 

In [14]:
import sys, numpy, math, sqlite3, re
sys.path.append('/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages')
from Bio import Entrez

In [64]:
def search(query,maxhits):
    """Search PubMed for ``query`` and return the matching PubMed IDs.

    Parameters
    ----------
    query : str
        Search term passed to ``Entrez.esearch`` as ``term``.
    maxhits : int or str
        Maximum number of hits to request; converted to a string and
        passed as ``retmax``.

    Returns
    -------
    The ``'IdList'`` entry of the parsed esearch result, sorted by relevance
    as requested from the service.
    """
    Entrez.email = 'lvonbuchholtz@mail.nih.gov'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=str(maxhits),
                            retmode='xml',
                            term=query)

    ## always release the network handle, even when parsing fails
    try:
        result = Entrez.read(handle)
    finally:
        handle.close()
    return result['IdList']


In [78]:
def pullarticles(idlist,database,chunksize=9975):
    """Fetch PubMed records for ``idlist`` and store those with an abstract.

    The ids are requested from ``Entrez.efetch`` in consecutive chunks of at
    most ``chunksize`` ids. Every returned article that has an ``Abstract``
    element is written to the ``articles`` table of ``database`` as
    ``(pmid, title, abstracttext, journal, year, month, day)`` using
    ``INSERT OR REPLACE``. Missing title/abstract text/journal are stored as
    the string ``'None'``; missing date parts are stored as ``0``. A summary
    of what was found is printed before the transaction is committed.

    Parameters
    ----------
    idlist : sequence of str
        PubMed ids (strings, as they are joined with ``','``).
    database : str
        Path of a SQLite database that already contains an ``articles`` table.
    chunksize : int, optional
        Maximum number of ids per efetch request. The default of 9975 is the
        limit the original author observed for the PubMed eutils.

    Returns
    -------
    list of int
        PMIDs of the articles that were written to the database.

    Raises
    ------
    ValueError
        If ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError("chunksize must be a positive integer")

    ## bookkeeping lists of PMIDs, summarised in the printout below
    failed = []
    succeded = []
    notitle = []
    noabstract = []
    noabstracttext = []
    nojournal = []
    nodate = []
    noyear = []
    nomonth = []
    noday = []

    ## get database ready
    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()

        ## pull full information from pubmed in chunks of at most `chunksize` ids;
        ## an empty id list yields no chunks at all, so nothing is requested
        for start in range(0, len(idlist), chunksize):
            ids = ','.join(idlist[start:start + chunksize])
            Entrez.email = 'lvonbuchholtz@mail.nih.gov'
            handle = Entrez.efetch(db='pubmed',
                                   retmode='xml',
                                   id=ids)
            try:
                papers = Entrez.read(handle)
            finally:
                handle.close()

            for paper in papers['PubmedArticle']:
                citation = paper['MedlineCitation']
                pmid = int(citation['PMID'])
                try:
                    article = citation['Article']

                    if 'Abstract' not in article:
                        noabstract.append(pmid)
                        continue
                    abstract = article['Abstract']

                    if 'ArticleTitle' in article:
                        title = article['ArticleTitle']
                    else:
                        title = 'None'
                        notitle.append(pmid)

                    ## NOTE(review): only the first AbstractText element is stored;
                    ## if an abstract is split into several elements the rest is
                    ## dropped -- confirm this is intended
                    if 'AbstractText' in abstract:
                        abstracttext = abstract['AbstractText'][0]
                    else:
                        abstracttext = 'None'
                        noabstracttext.append(pmid)

                    if 'Journal' in article:
                        journal = article['Journal']['Title']
                    else:
                        journal = 'None'
                        nojournal.append(pmid)

                    ## NOTE(review): presumably not every PubMed record carries
                    ## DateCreated; such records end up with date 0/0/0 -- verify
                    if 'DateCreated' in citation:
                        date = citation['DateCreated']
                        dateparts = []
                        for key, missing in (('Year', noyear),
                                             ('Month', nomonth),
                                             ('Day', noday)):
                            if key in date:
                                dateparts.append(int(date[key]))
                            else:
                                dateparts.append(0)
                                missing.append(pmid)
                        year, month, day = dateparts
                    else:
                        year = month = day = 0
                        nodate.append(pmid)

                    c.execute("INSERT OR REPLACE INTO articles VALUES (?,?,?,?,?,?,?)",
                              (pmid, title, abstracttext, journal,year,month,day))
                    succeded.append(pmid)

                except Exception:
                    ## deliberate best effort: one malformed record must not abort
                    ## the whole download, but Ctrl-C / SystemExit still propagate
                    failed.append(pmid)

        print(len(failed), " attempts failed", )
        print(len(succeded), " attempts succeded", )
        print(len(noabstract), " attempts had no abstract", )
        print(len(noabstracttext), " attempts had abstract but no abstract text", )
        print(len(nojournal), " attempts had no journal", )
        print(len(notitle), " attempts had no title", )
        print(len(nodate), " attempts had no date", )
        print(len(noyear), " attempts had no year", )
        print(len(nomonth), " attempts had no month", )
        print(len(noday), " attempts had no day", )

        conn.commit()
    finally:
        ## closing without a commit discards a partially written transaction
        conn.close()
    return succeded



In [79]:
if __name__ == '__main__':
    ## step 1: ask PubMed for up to 40000 ids matching the term 'taste'
    idlist = search('taste', 40000)
    n_ids = len(idlist)
    print(n_ids, "ids received")

    ## step 2: download the records and store those with an abstract in taste.db
    succeded = pullarticles(idlist, 'taste.db')
    print("finished")

34448 ids received
0  attempts failed
28694  attempts succeded
5700  attempts had no abstract
0  attempts had abstract but no abstract text
0  attempts had no journal
0  attempts had no title
0  attempts had no date
0  attempts had no year
0  attempts had no month
0  attempts had no day
finished


Some more functions to write and read all PubMed IDs to/from a text file:

In [81]:
## get all pmids from database
def getallpmids(database):
    """Return every ``pmid`` stored in the ``articles`` table of ``database``.

    Parameters
    ----------
    database : str
        Path of the SQLite database file.

    Returns
    -------
    list of tuple
        The rows as delivered by ``fetchall()``, i.e. one 1-tuple ``(pmid,)``
        per article.
    """
    conn = sqlite3.connect(database)
    ## close the connection even if the query fails (e.g. missing table)
    try:
        c = conn.cursor()
        c.execute('''SELECT pmid FROM articles''')
        return c.fetchall()
    finally:
        conn.close()

## get all pmids from database and write them to textfile
def writeallpmidstofile(database,filename):
    """Dump all PMIDs of ``database`` into the text file ``filename``.

    The rows come from ``getallpmids``; each one is formatted with ``%s``
    and written on its own line. An existing file is overwritten.
    """
    rows = getallpmids(database)
    with open(filename, 'w') as out:
        out.writelines("%s\n" % row for row in rows)

def readpmidsfromfile(filename):
    """Read ``filename`` and return its lines as a list of strings.

    Line terminators are removed; an empty file gives an empty list.
    """
    with open(filename) as idfile:
        return idfile.read().splitlines()


In [83]:
## export every PMID stored in taste.db to allpmids.txt, one id per line
writeallpmidstofile('taste.db','allpmids.txt')

## Make Gene List from Synonyms in public databases

Mouse gene synonym data was downloaded from MGI (http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt) and parsed into a table `synonympairs` in the 'taste.db' SQLite database. Human genes could be found here:
http://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&status=Approved&status=Entry+Withdrawn&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit

In [19]:
## From the list of mouse genetic markers downloaded from the MGI repository at jax,
## http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt,
## extract all synonyms for each protein coding gene: the gene symbol, the gene name
## and the alternative synonyms. Each synonym/gene pair is then written to the table
## 'synonympairs' in the database.

import re  ## no longer needed by this cell; kept so the name stays bound as before

with open('mouse.txt') as f:
    ## get database ready
    conn = sqlite3.connect('taste.db')
    try:
        c = conn.cursor()

        ## stream the report line by line instead of loading it completely
        for rawline in f:
            ## rstrip() also removes trailing tabs, so empty trailing columns vanish
            line = rawline.rstrip().split('\t')

            ## blank or truncated lines have no column 10; skip them
            if len(line) <= 10 or line[10] != 'protein coding gene':
                continue

            genesymbol = line[6]
            genename = line[8]
            names = [genesymbol, genename]

            ## column 11 (when present) holds '|'-separated synonyms;
            ## empty pieces are ignored
            if len(line) > 11:
                names.extend(synonym for synonym in line[11].split('|') if synonym)

            c.executemany("INSERT OR REPLACE INTO synonympairs VALUES (?,?)",
                          [(name, genesymbol) for name in names])

        conn.commit()
    finally:
        ## release the connection even when parsing or an insert fails
        conn.close()