In [1]:
import bs4 as bs
import os
from multiprocessing import Pool

In [20]:
def write_paper(html):
    """Build the tab-separated metadata row for one article.

    Parameters
    ----------
    html : parsed article document
        Queried through ``find``/``find_all`` for the DOI, the heading,
        the three dates, the title and the "Discipline-v3" subject groups.

    Returns
    -------
    list of str
        A one-element list holding the row
        ``doi, type_, published, received, accepted, title, cats``.
        Each date is the values of the date element's children joined with
        "/"; categories are joined with " ::: " and de-duplicated in
        document order.

    Raises
    ------
    AttributeError, TypeError
        When one of the required elements is missing (``find`` then gives
        ``None``).
    """
    def clean(text):
        # A tab or newline inside a value would split the row once it is
        # written as a single TSV line, so collapse every whitespace run.
        return " ".join(text.split())

    def child_values(node):
        # Values of the direct children, in document order.
        values = []
        for child in node:
            if isinstance(child, str):
                # Text node sitting directly between the child elements
                # (Beautiful Soup text nodes are str subclasses); pure
                # layout whitespace carries no value.
                if child.strip():
                    values.append(clean(child))
                continue
            values.append(clean(child.text))
        return values

    #ID
    doi = clean(html.find("article-id", {"pub-id-type": "doi"}).text)
    #Type
    type_ = clean(html.find("subj-group", {"subj-group-type": "heading"}).text)
    #Dates
    published = "/".join(child_values(html.find("pub-date", {"pub-type": "epub"})))
    received = "/".join(child_values(html.find("date", {"date-type": "received"})))
    accepted = "/".join(child_values(html.find("date", {"date-type": "accepted"})))

    #Title
    title = clean(html.find("article-title").text)

    #Categories: keep the first occurrence of each subject. A set would
    #de-duplicate too, but its iteration order is not stable between runs.
    cats = []
    seen = set()
    for group in html.find_all("subj-group", {"subj-group-type": "Discipline-v3"}):
        subject = clean(group.find("subject").text)
        if subject not in seen:
            seen.add(subject)
            cats.append(subject)

    return ["\t".join([doi, type_, published, received, accepted, title, " ::: ".join(cats)])]
    

def write_authors(html):
    """Build one tab-separated row per named author of an article.

    Parameters
    ----------
    html : parsed article document
        Queried through ``find``/``find_all`` for the DOI, the author
        ``contrib`` elements and the ``aff`` elements they point to.

    Returns
    -------
    list of str
        One row ``doi, name, orcid, roles, add, corr`` per author.
        Multi-valued fields (name parts, roles, addresses) are joined with
        " ::: "; ``corr`` is "1" when any of the author's ``xref`` targets
        contains "cor", otherwise "0". Contributors without a ``name``
        element (research groups) produce no row.

    Raises
    ------
    AttributeError
        When the article has no DOI element.
    """
    def clean(text):
        # A tab or newline inside a value would split the row once it is
        # written as a single TSV line, so collapse every whitespace run.
        return " ".join(text.split())

    def child_values(node):
        # Values of the direct children, in document order.
        values = []
        for child in node:
            if isinstance(child, str):
                # Text node between the child elements (Beautiful Soup text
                # nodes are str subclasses); skip pure layout whitespace.
                if child.strip():
                    values.append(clean(child))
                continue
            values.append(clean(child.text))
        return values

    doi = clean(html.find("article-id", {"pub-id-type": "doi"}).text)

    authors = []
    for author in html.find_all("contrib", {"contrib-type": "author"}):

        #Author name
        name_tag = author.find("name")
        if name_tag is None:
            continue #no author, a research group
        name = " ::: ".join(child_values(name_tag))

        #Author ID (if available)
        orcid_tag = author.find("contrib-id", {"contrib-id-type": "orcid"})
        orcid = clean(orcid_tag.text) if orcid_tag is not None else ""

        #Roles
        roles = " ::: ".join(clean(role.text) for role in author.find_all("role"))

        #Addresses
        addresses = []
        corr = "0"
        for xref in author.find_all("xref"):
            rid = xref.get("rid")
            if rid is None:
                # No target to resolve. Testing `"cor" in None` would raise
                # and make the caller discard the whole paper.
                continue
            if "cor" in rid:
                corr = "1"
                continue
            # Any other xref may point at an affiliation; references to
            # other kinds of element simply resolve to nothing here.
            aff = html.find("aff", {"id": rid})
            addr_line = aff.find("addr-line") if aff is not None else None
            if addr_line is not None:
                addresses.append(clean(addr_line.text))

        authors.append("\t".join([doi, name, orcid, roles, " ::: ".join(addresses), corr]))

    return authors
        

def write_editor(html):
    """Build one tab-separated row per named editor of an article.

    Parameters
    ----------
    html : parsed article document
        Queried through ``find``/``find_all`` for the DOI, the editor
        ``contrib`` elements and the ``aff`` elements they point to.

    Returns
    -------
    list of str
        One row ``doi, name, add`` per editor. Name parts and addresses are
        joined with " ::: ". Editors without a ``name`` element produce no
        row, matching how ``write_authors`` treats unnamed contributors.

    Raises
    ------
    AttributeError
        When the article has no DOI element.
    """
    def clean(text):
        # A tab or newline inside a value would split the row once it is
        # written as a single TSV line, so collapse every whitespace run.
        return " ".join(text.split())

    def child_values(node):
        # Values of the direct children, in document order.
        values = []
        for child in node:
            if isinstance(child, str):
                # Text node between the child elements (Beautiful Soup text
                # nodes are str subclasses); skip pure layout whitespace.
                if child.strip():
                    values.append(clean(child))
                continue
            values.append(clean(child.text))
        return values

    doi = clean(html.find("article-id", {"pub-id-type": "doi"}).text)

    editors = []
    for editor in html.find_all("contrib", {"contrib-type": "editor"}):
        #Editor name
        name_tag = editor.find("name")
        if name_tag is None:
            continue #nothing to report for an unnamed editor entry
        name = " ::: ".join(child_values(name_tag))

        #Addresses
        addresses = []
        for xref in editor.find_all("xref"):
            rid = xref.get("rid")
            if rid is None:
                # Never use a missing rid as a lookup key: searching for
                # id=None could select an unrelated <aff> that has no id.
                continue
            aff = html.find("aff", {"id": rid})
            addr_line = aff.find("addr-line") if aff is not None else None
            if addr_line is not None:
                addresses.append(clean(addr_line.text))

        editors.append("\t".join([doi, name, " ::: ".join(addresses)]))

    return editors

In [24]:
def process_results(results,f):
    """
    Write every row of `results` to the open file `f`, each one followed
    by a newline. This is the cheap step run in the parent process.
    """
    f.writelines(row + "\n" for row in results)
            
def create_results(paper, data_dir="./data/PLoS_One/"):
    """
    Parse one article file and extract its paper, editor and author rows.
    This is the slow step that runs inside the worker processes.

    Parameters
    ----------
    paper : str
        File name of the article inside `data_dir`.
    data_dir : str, optional
        Directory holding the article files. The default is the location
        that used to be hard-coded here.

    Returns
    -------
    list or int
        ``[paper_rows, editor_rows, author_rows]`` for a research article.
        ``0`` when the article type is not 'research-article' (ignored, not
        counted). ``1`` when the document has no ``article`` element or the
        extraction raised (counted as skipped by the caller).
    """
    # Read bytes so the parser can work out the encoding from the document
    # itself rather than from the platform default; the context manager
    # closes the handle, which the bare open(...).read() never did.
    with open(os.path.join(data_dir, paper), "rb") as handle:
        html = bs.BeautifulSoup(handle.read(), "xml")

    article = html.find("article")
    if article is None:
        # Not an article document at all. Report it as skipped instead of
        # raising, which would abort the whole pool loop in the parent.
        return 1
    if article.get("article-type") != 'research-article':
        return 0

    try:
        return [write_paper(html),write_editor(html),write_authors(html)]
    except Exception:
        # Best effort: an article missing an expected element is counted as
        # skipped. KeyboardInterrupt/SystemExit are deliberately not caught.
        return 1


In [25]:
# Directory scanned for article files (the location create_results reads from)
data_dir = "./data/PLoS_One/"

# Keep regular files only, so a stray sub-directory is never handed to a
# worker (opening it would raise and abort the run), and sort the names so
# tasks are submitted in a stable order.
papers = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, entry))
)
skipped = 0

# Column names, in the order used by the rows the write_* functions build
paper_header = ["doi", "type_", "published", "received", "accepted", "title", "cats"]
author_header = ["doi", "name", "orcid", "roles", "add", "corr"]
editor_header = ["doi", "name", "add"]

# Explicit UTF-8 keeps the output independent of the platform's default
# encoding. Plain "w" is enough because the files are never read back here.
with open("./data/papers.csv", "w", encoding="utf-8") as f_paper, \
     open("./data/editors.csv", "w", encoding="utf-8") as f_editor, \
     open("./data/authors.csv", "w", encoding="utf-8") as f_author:

    f_paper.write("\t".join(paper_header) + "\n")
    f_author.write("\t".join(author_header) + "\n")
    f_editor.write("\t".join(editor_header) + "\n")

    with Pool() as pool:
        # Results arrive in completion order, so the order of papers within
        # the output files is not fixed.
        for results in pool.imap_unordered(create_results, papers):
            if isinstance(results, int):
                # 0 = not a research article, 1 = extraction failed
                skipped += results
            else:
                paper_rows, editor_rows, author_rows = results
                process_results(paper_rows, f_paper)
                process_results(editor_rows, f_editor)
                process_results(author_rows, f_author)


In [26]:
# Number of research articles whose extraction raised in create_results
# (non-research articles contribute 0 and are therefore not counted).
skipped

53