# In [1]:
import entrezpy
from Bio import Entrez
from Bio import Medline
import json
import io
from tqdm.notebook import tqdm
import pandas as pd
import requests
from collections import defaultdict
import xml.etree.ElementTree as ET
# Define XML to excel columns.
# Each code is a key of the per-record dict built in parse_handle:
#   PMID = PubMed id, TI = article title, AB = abstract, AU = author names,
#   AF = author affiliations, PD = publication date, KW = keywords
COLUMN = ["PMID", "TI", "AB", "AU", "AF", "PD", "KW"]

# In [3]:
# The default return format of Biopython is XML. In order to get specific information,
# the returned records have to be parsed accordingly.

# https://github.com/mengqian989/vzlib/blob/master/pmc_extract/extract_pmc_7.py

# Print nested dictionary
def print_nestdict(d, indent=0):
    '''
    Print a (possibly nested) dictionary as a tab-indented outline.

    Every key is printed on its own line at the current depth. A value that
    is itself a dict is printed recursively one level deeper; any other value
    is printed on the following line, one tab deeper than its key.

    arg:
        d: dictionary to print
        indent: number of leading tabs used for the top-level keys
    '''
    # Explicit stack of (item iterator, depth) pairs instead of recursion;
    # the traversal order is still depth-first in insertion order.
    pending = [(iter(d.items()), indent)]
    while pending:
        entries, depth = pending[-1]
        for key, value in entries:
            print('\t' * depth + str(key))
            if isinstance(value, dict):
                # Descend now; the partially consumed iterator stays on the
                # stack and is resumed once the nested dict is finished.
                pending.append((iter(value.items()), depth + 1))
                break
            print('\t' * (depth + 1) + str(value))
        else:
            # Iterator exhausted without descending: this level is done.
            pending.pop()

            
def parse_handle(handle):
    '''
    Parse the handle returned by Entrez.efetch (XML format) into a table.

    One row is produced per entry of the parsed result's 'PubmedArticle'
    list. Missing fields are replaced by "... NA" placeholder strings, and
    multi-valued fields (authors, affiliations, keywords) are joined with
    "; ". The abstract sections are joined with a single space so that the
    "AB" column holds a string like every other column.

    arg: handle, entrez handle object (any io stream; text or binary)
    return: df, dataframe having publication data, columns given by COLUMN
    raise: TypeError if handle is not a stream object,
           ValueError if the parsed result holds no PubMed article
    '''
    import io

    # Accept any stream: recent Biopython hands out binary handles for XML,
    # which the former io.TextIOWrapper-only check rejected.
    if not isinstance(handle, io.IOBase):
        raise TypeError("Input should be handle object")

    # Third-party imports are deferred until the argument has been validated.
    from Bio import Entrez
    import pandas as pd

    # The handle has to be read before its content can be checked.
    records = Entrez.read(handle)
    articles = records.get('PubmedArticle')
    if not articles:
        raise ValueError("File is empty")

    rows = []
    for entry in tqdm(articles):

        rec = entry['MedlineCitation']
        rec_art = rec['Article']

        # Authors and their first listed affiliation, kept in the same order.
        au_list = []
        aff_list = []
        for author in rec_art.get('AuthorList', []):
            fore_name = author.get('ForeName')
            last_name = author.get('LastName')
            if fore_name and last_name:
                au_list.append(fore_name + " " + last_name)
            else:
                # e.g. collective authors that carry no personal name
                au_list.append("name NA")

            aff_info = author.get('AffiliationInfo')
            affiliation = aff_info[0].get('Affiliation') if aff_info else None
            aff_list.append(affiliation or "affiliation NA")

        pmid = rec.get('PMID') or "PMID NA"
        title = rec_art.get('ArticleTitle') or "title NA"

        date = "Date NA"
        article_dates = rec_art.get('ArticleDate')
        if article_dates:
            first_date = article_dates[0]
            date = "/".join(
                [first_date['Month'], first_date['Day'], first_date['Year']]
            )

        abstract = "Abstract NA"
        abstract_parts = rec_art.get('Abstract', {}).get('AbstractText')
        if abstract_parts:
            abstract = " ".join(str(part) for part in abstract_parts)

        keywords = "Keywords NA"
        keyword_lists = rec.get('KeywordList')
        if keyword_lists:
            keywords = "; ".join(str(kw) for kw in keyword_lists[0])

        rows.append({
            "PMID": pmid,
            "TI": title,
            "AB": abstract,
            "PD": date,
            "AU": "; ".join(au_list),
            "AF": "; ".join(aff_list),
            "KW": keywords,
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    df = pd.DataFrame(rows, columns=COLUMN)
    print("Data are ready to export")
    return df
    

    
def parse_pmc_xmlfile(xmlfile_name):
    '''
    Work with single xml file output from pmc.
    Collect ids, title, abstract, electronic publication date, authors and
    author affiliations of every article found in the file.

    Progress information is printed while parsing, and the collected data
    are returned.

    arg:
        xmlfile_name: path of a downloaded PMC xml file
    return:
        list of dict, one per article, with the keys
            "pmid", "pmc", "doi": article ids ("... NA" when absent)
            "title", "abstract": flattened text ("... NA" when absent)
            "pub_date": "month/day/year" of the epub date, or "Date NA"
            "authors": list of "surname, given-names" strings
            "affiliations": dict mapping an author string to a list of
                            (institution, address) tuples

    data example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=212403
    '''
    import xml.etree.ElementTree as ET

    # ET.parse opens and closes the file itself and honours the encoding
    # declared in the XML prolog.
    root = ET.parse(xmlfile_name).getroot()

    # Articles are normally children of a wrapping root element; also accept
    # a file whose root is a single <article>.
    if root.tag == "article":
        fronts = root.findall("front")
    else:
        fronts = root.findall("article/front")

    articles = []
    for front in fronts:
        # All per-article values start from their defaults so that nothing
        # leaks over from the previous article.
        ids = {"pmid": "pmid NA", "pmc": "pmc NA", "doi": "doi NA"}
        id_elements = front.findall("article-meta/article-id")
        for id_element in id_elements:
            id_type = id_element.attrib.get("pub-id-type")
            if id_type in ids and id_element.text:
                ids[id_type] = id_element.text
        if id_elements:
            print(ids["pmid"])

        title = _pmc_flat_text(
            front.find("article-meta/title-group/article-title"), "Title NA"
        )
        print(title)

        abstract = _pmc_abstract_text(front.find("article-meta/abstract"))
        print(abstract)

        pub_date = _pmc_epub_date(front)
        if pub_date != "Date NA":
            print(pub_date)

        aff_map = _pmc_affiliation_map(front)
        print("affiliation map: {}".format(aff_map))

        author_list, au_af_dict = _pmc_authors(front, aff_map)

        articles.append({
            "pmid": ids["pmid"],
            "pmc": ids["pmc"],
            "doi": ids["doi"],
            "title": title,
            "abstract": abstract,
            "pub_date": pub_date,
            "authors": author_list,
            "affiliations": au_af_dict,
        })

    return articles


def _pmc_flat_text(element, default):
    # Return the text of element with inline markup (e.g. <italic>) flattened
    # and whitespace runs collapsed, or default if it is missing or empty.
    if element is None:
        return default
    text = " ".join("".join(element.itertext()).split())
    return text or default


def _pmc_abstract_text(abstract):
    # Join the text of every <title> and <p> inside the abstract with spaces.
    if abstract is None:
        return "Abstract NA"
    parts = [
        _pmc_flat_text(node, "")
        for node in abstract.iter()
        if node.tag in ("title", "p")
    ]
    parts = [part for part in parts if part]
    if parts:
        return " ".join(parts)
    # No paragraph markup at all: fall back to the raw text of the element.
    return _pmc_flat_text(abstract, "Abstract NA")


def _pmc_epub_date(front):
    # Return the first epub publication date as "month/day/year".
    pub_date = front.find("article-meta/pub-date[@pub-type='epub']")
    if pub_date is None:
        return "Date NA"
    return "/".join(
        _pmc_flat_text(pub_date.find(part), part + " NA")
        for part in ("month", "day", "year")
    )


def _pmc_affiliation_map(front):
    # Map each <aff> id to an (institution, address) tuple.
    aff_map = {}
    aff_elements = front.findall("article-meta/contrib-group/aff")
    if not aff_elements:
        print("No affiliation information available")
        return aff_map

    for aff in aff_elements:
        aff_id = aff.attrib.get("id")
        institution = "; ".join(
            _pmc_flat_text(inst, "")
            for inst in aff.findall("institution-wrap/institution")
        )
        wrap = aff.find("institution-wrap")
        if wrap is not None:
            # The address is the free text that follows </institution-wrap>.
            address = (wrap.tail or "").strip().lstrip(",; ")
        else:
            # No structured institution: keep the whole text of the <aff>.
            address = _pmc_flat_text(aff, "")
        print("institution is " + institution)
        print("address is " + address)
        aff_map[aff_id] = (institution, address)
    return aff_map


def _pmc_authors(front, aff_map):
    # Return (author names, author -> list of affiliation tuples).
    contribs = front.findall("article-meta/contrib-group/contrib")
    if not contribs:
        print("No author information")
        return [], {}

    author_list = []
    au_af_dict = {}
    for contrib in contribs:
        # Skip editors etc. so their affiliations are not credited to the
        # previously listed author.
        if contrib.attrib.get("contrib-type") != "author":
            continue

        surname = _pmc_flat_text(contrib.find("name/surname"), "surname NA")
        givenname = _pmc_flat_text(
            contrib.find("name/given-names"), "givenname NA"
        )
        author = surname + ", " + givenname
        author_list.append(author)

        # A single rid attribute may list several space-separated ids.
        aff_ids = []
        for xref in contrib.findall("xref[@ref-type='aff']"):
            aff_ids.extend(xref.attrib.get("rid", "").split())

        print(author)
        print("Temporal aff list of the author: {}".format(aff_ids))

        matched = [aff_map[aff_id] for aff_id in aff_ids if aff_id in aff_map]
        au_af_dict.setdefault(author, []).extend(matched)
        if not matched:
            print("Cannot map author to affiliation, not enough information")

    print(author_list)
    return author_list, au_af_dict


            

# In [None]:


            

            
            
        
        
        
        
#         elif e.tag == 'pub-date' :
#             e_date = []
#             for attr in e.attrib:

#                 # Find attribute with epub
#                 if 'epub' in e.attrib[attr]:
#                     e_date = e
#                 if e_date:
#                     pub_year = "year NA"
#                     pub_month = "month NA"
#                     pub_day = "date NA"
#                     for dy in e_date:
#                         if dy.tag == 'year' and dy.text != None:
#                             pub_year = dy.text
#                         elif dy.tag == 'month' and dy.text != None:
#                             pub_month = dy.text                                
#                         elif dy.tag == 'day' and dy.text != None:
#                             pub_day = dy.text
#                         pub_date = pub_month +"/"+pub_day+"/"+pub_year
#             print(pub_date)          
#         # Get author and affiliation 
#         elif e.tag == 'contrib-group':
#             for e_au in e.findall('contrib'):

#                 # author-affiliation Tuple
#                 au_af = []  
#                 au_aff_list = []
#                 name = ''
#                 if 'contrib-type' in e_au.attrib and e_au.attrib.get('contrib-type') == "author": 
#                     for au in e_au:
#                         if au.tag == "surname":
#                             if au.text:
#                                 name += au.text
#                             else:
#                                 name += 'Surname NA'
#                         elif au.tag == 'given-names':
#                             if au.text:
#                                 name += ', ' + au.text
#                         elif au.tag == 'xref':
#                             if 'ref-type' in au.attrib and au.attrib['ref-type'] == 'aff':
#                                 if 'rid' in au.attrib:
#                                     au_aff_list.append(au.attrib['rid'])

#                     # add auther and affiliation
#                     au_af.append((name, au_aff_list))
#                     author_list.append(name)
#                     aff_list.append(au_aff_list)
#             print(aff_list)
#             print(author_list)
#             print(au_af)