In [8]:
import bs4 as bs
import os
from multiprocessing import Pool
import pandas as pd

In [2]:
def write_paper(html):
    """Build the TSV row that describes one article.

    Parameters
    ----------
    html : parsed article document
        Must offer ``find``/``find_all`` lookups whose results expose
        ``.text`` (the notebook passes a BeautifulSoup tree).

    Returns
    -------
    list of str
        A one-element list holding the tab-separated fields
        doi, type_, published, received, accepted, title, cats.

    Raises
    ------
    AttributeError, TypeError
        When a looked-up element is missing (the lookup result is ``None``).
    """
    # ID
    doi = html.find("article-id", {"pub-id-type": "doi"}).text
    # Type
    type_ = html.find("subj-group", {"subj-group-type": "heading"}).text
    # Dates: the children of each date element are joined with "/" in document order
    published = "/".join([part.text for part in html.find("pub-date", {"pub-type": "epub"})])
    received = "/".join([part.text for part in html.find("date", {"date-type": "received"})])
    accepted = "/".join([part.text for part in html.find("date", {"date-type": "accepted"})])

    # Title: tabs and line breaks would corrupt the TSV row, so flatten them to spaces
    title = html.find("article-title").text.replace("\n", " ").replace("\t", " ").replace("\r", " ")

    # Categories
    cats = []
    for cat in html.find_all("subj-group", {"subj-group-type": "Discipline-v3"}):
        cats.append(cat.find("subject").text)

    # De-duplicate while keeping first-seen order: a plain set() would make the
    # column order depend on string hashing and differ from run to run.
    cats = " ::: ".join(dict.fromkeys(cats))
    return ["\t".join([doi, type_, published, received, accepted, title, cats])]
    

def write_authors(html):
    """Build one TSV row per named author of an article.

    Parameters
    ----------
    html : parsed article document
        Must offer ``find``/``find_all`` lookups (the notebook passes a
        BeautifulSoup tree).

    Returns
    -------
    list of str
        One tab-separated row per author with the fields
        doi, name, orcid, roles, add, corr.  ``name``, ``roles`` and ``add``
        hold their parts joined by " ::: "; ``corr`` is "1" when one of the
        author's cross-references contains "cor", otherwise "0".

    Raises
    ------
    AttributeError
        When the article has no DOI element.
    """
    doi = html.find("article-id", {"pub-id-type": "doi"}).text

    authors = []
    for author in html.find_all("contrib", {"contrib-type": "author"}):

        # Author name: contributors without a name element (a research group) are skipped
        name_tag = author.find("name")
        if name_tag is None:
            continue
        name = " ::: ".join([part.text for part in name_tag])

        # Author ID (if available)
        orcid_tag = author.find("contrib-id", {"contrib-id-type": "orcid"})
        orcid = orcid_tag.text if orcid_tag is not None else ""

        # Roles
        roles = " ::: ".join([role.text for role in author.find_all("role")])

        # Addresses
        add = []
        corr = "0"
        for xref in author.find_all("xref"):
            rid = xref.get("rid")
            if rid is None:
                # A cross-reference without a target cannot be resolved; without
                # this guard the "cor" test below raises TypeError.
                continue
            if "cor" in rid:
                corr = "1"
                continue

            # The reference may point at something other than an affiliation,
            # or at an affiliation without an address line: ignore those.
            aff = html.find("aff", {"id": rid})
            addr_line = aff.find("addr-line") if aff is not None else None
            if addr_line is not None:
                add.append(addr_line.text)

        add = " ::: ".join(add)

        authors.append("\t".join([doi, name, orcid, roles, add, corr]))

    return authors
        

def write_editor(html):
    """Build one TSV row per named editor of an article.

    Parameters
    ----------
    html : parsed article document
        Must offer ``find``/``find_all`` lookups (the notebook passes a
        BeautifulSoup tree).

    Returns
    -------
    list of str
        One tab-separated row per editor with the fields doi, name, add.
        ``name`` and ``add`` hold their parts joined by " ::: ".

    Raises
    ------
    AttributeError
        When the article has no DOI element.
    """
    doi = html.find("article-id", {"pub-id-type": "doi"}).text

    editors = []
    for editor in html.find_all("contrib", {"contrib-type": "editor"}):
        # Editor name: skip contributors without a name element, the same way
        # write_authors does, instead of failing the whole article.
        name_tag = editor.find("name")
        if name_tag is None:
            continue
        name = " ::: ".join([part.text for part in name_tag])

        # Addresses
        add = []
        for xref in editor.find_all("xref"):
            rid = xref.get("rid")
            if rid is None:
                continue  # cross-reference without a target
            # The reference may point at something other than an affiliation,
            # or at an affiliation without an address line: ignore those.
            aff = html.find("aff", {"id": rid})
            addr_line = aff.find("addr-line") if aff is not None else None
            if addr_line is not None:
                add.append(addr_line.text)

        add = " ::: ".join(add)

        editors.append("\t".join([doi, name, add]))

    return editors

In [3]:
def process_results(results,f):
    """
    Cheap step: write every entry of `results` to the open file `f`,
    terminating each one with a newline.
    """
    f.writelines(row + "\n" for row in results)
            
def create_results(paper):
    """
    Slow step: parse one article file and extract its TSV rows.

    `paper` is a file name inside the module-level `path` directory.

    Returns
    -------
    0
        The document is not a 'research-article' (not counted as skipped).
    1
        The document has no <article> element or a field could not be
        extracted (counted as skipped by the caller).
    list
        [paper rows, editor rows, author rows] on success.
    """
    # Context manager so the handle is closed even when parsing fails.
    with open(os.path.join(path, paper)) as handle:
        html = bs.BeautifulSoup(handle.read(), "xml")

    article = html.find("article")
    if article is None:
        # Malformed document: report it as skipped rather than crashing the pool.
        return 1
    if article.get("article-type") != 'research-article':
        return 0

    try:
        return [write_paper(html), write_editor(html), write_authors(html)]
    except Exception:
        # Any missing field makes the paper unusable; Exception (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit stop the worker.
        return 1


In [4]:
# Extraction driver: parse every file under `path` in a worker pool and stream
# the resulting rows into the three TSV files.
path = "./data_raw/PLoS_One/"
papers = os.listdir(path)

# Running total of the integer codes returned by create_results.
skipped = 0

with open("./data/papers.tsv","w+") as f_paper, \
     open("./data/editors.tsv","w+") as f_editor, \
     open("./data/authors.tsv","w+") as f_author:

    # Header lines, one per output file.
    f_paper.write("\t".join(["doi","type_","published","received","accepted","title","cats"]) + "\n")
    f_author.write("\t".join(["doi","name","orcid","roles","add","corr"]) + "\n")
    f_editor.write("\t".join(["doi","name","add"]) + "\n")

    with Pool() as pool:
        for results in pool.imap_unordered(create_results, papers):
            if isinstance(results,int):
                skipped += results
                continue
            # results is [paper rows, editor rows, author rows]
            for rows, handle in zip(results, (f_paper, f_editor, f_author)):
                process_results(rows, handle)


In [5]:
skipped

53

## Add gender and ethnicity

In [2]:
import pandas as pd
import time
import requests
import json

In [21]:
def get_gender(name):
    """Look up the gender and ethnicity predicted for a name by the Ethnea service.

    Parameters
    ----------
    name : str
        Name parts joined by " ::: ".  The first part is sent as ``Lname``
        and the second as ``Fname``; any further parts are ignored.

    Returns
    -------
    tuple
        ``(gender, ethnic)``: the 'Genni' and 'Ethnea' entries of the response.

    Raises
    ------
    ValueError
        ``name`` has fewer than two parts, or the response cannot be parsed.
    KeyError
        The response lacks the 'Genni' or 'Ethnea' entry.
    requests.RequestException
        Network failure, timeout, or an HTTP error status.
    """
    last, first, *_ = name.split(" ::: ")
    # Pass the name parts through `params` so spaces, "&" and non-ASCII
    # characters are URL-encoded instead of corrupting the query string.
    # The timeout keeps one stuck request from hanging the whole run.
    r = requests.get(
        "http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py",
        params={"Fname": first, "Lname": last, "format": "json"},
        timeout=30,
    )
    r.raise_for_status()
    # NOTE(review): the quote swap presumably compensates for a single-quoted
    # response body; it would corrupt values containing an apostrophe - confirm
    # against the service output.
    text = r.text.strip().replace('\'', '"')
    x = json.loads(text)
    gender, ethnic = x['Genni'], x["Ethnea"]

    return gender, ethnic

In [6]:
def gender_main(type_):
    """Look up gender and ethnicity for every unique name in one TSV file.

    Parameters
    ----------
    type_ : str
        Base name of the table; the file read is ``./data/<type_>.tsv`` and
        must contain a ``name`` column.

    Returns
    -------
    tuple
        ``(df, gender_map, ethnic_map)``: the loaded frame and two dicts
        mapping each successfully looked-up name to its gender / ethnicity.
        Names whose lookup failed are printed and left out of both dicts.
    """
    df = pd.read_csv("./data/{}.tsv".format(type_), sep="\t")
    gender_map = dict()
    ethnic_map = dict()

    unique_names = df["name"].unique()
    for i, name in enumerate(unique_names):
        if isinstance(name, float):
            continue  # float entries (NaN for a missing name) cannot be looked up
        if (i % 1000) == 0:
            # Progress indicator: percentage of unique names visited so far.
            print("{:2.2f}".format(i/len(unique_names)*100), end="-")

        try:
            gender_map[name], ethnic_map[name] = get_gender(name)
        except Exception:
            # Best effort: report the failing name and carry on.  Catching
            # Exception instead of using a bare except keeps the loop
            # interruptible (KeyboardInterrupt is no longer swallowed).
            print(name)

    return df, gender_map, ethnic_map
    

In [None]:
# Load ./data/editors.tsv and look up gender/ethnicity for each unique editor
# name (gender_main calls get_gender once per name).
df,gender_map,ethnic_map = gender_main("editors")

In [22]:
# Second lookup pass: every line of the text `d` that contains ":::" is treated
# as a name and looked up again, overwriting/adding its entries in both maps.
# NOTE(review): `d` is not defined anywhere in the visible notebook - presumably
# the names printed by the failed lookups, pasted into a string; confirm and
# define it explicitly, otherwise this cell fails on a fresh kernel.
for i in [a for  _ in d.split(" : ") for a in _.split("\n") if ":::" in a]:
    gender_map[i],ethnic_map[i] = get_gender(i)

In [23]:
# Attach the looked-up gender and ethnicity to each editor row (names without a
# lookup result get NaN from .map) and save the enriched table.
editor_names = df["name"]
df["gender"] = editor_names.map(gender_map)
df["ethnicity"] = editor_names.map(ethnic_map)
df.to_csv("./data/editors_gender.tsv", sep="\t", index=None)

In [12]:
# Build the author sample: join every author row with the gender/ethnicity
# table, keep only rows whose gender is not "-", and save the result.
authors = pd.read_csv("data/authors.tsv",sep="\t")
ethnicity = pd.read_csv("data/gender_corr.tsv",sep="\t",header=None,names=["name","gender","eth"])
# Name the join key explicitly: the implicit default joins on whatever columns
# the two frames happen to share, which changes silently if a column is added.
sample = (
    pd.merge(authors, ethnicity, on="name")
    .loc[lambda merged: merged["gender"] != "-"]
)
sample.to_csv("data/sample.tsv",sep="\t",index=False)
sample.head()

Unnamed: 0,doi,name,orcid,roles,add,corr,gender,eth
29,10.1371/journal.pone.0089948,Childress ::: Anna Rose,,,"Department of Psychiatry, Perelman School of M...",0.0,F,ENGLISH-ITALIAN
30,10.1371/journal.pone.0001506,Childress ::: Anna Rose,,,"Department of Psychiatry, University of Pennsy...",1.0,F,ENGLISH-ITALIAN
31,10.1371/journal.pone.0044556,Childress ::: Anna Rose,,,"Department of Psychiatry, Perelman School of M...",0.0,F,ENGLISH-ITALIAN
32,10.1371/journal.pone.0113256,Childress ::: Anna Rose,,,"Department of Psychiatry, School of Medicine, ...",0.0,F,ENGLISH-ITALIAN
33,10.1371/journal.pone.0104102,Childress ::: Anna Rose,,,Perelman School of Medicine at the University ...,0.0,F,ENGLISH-ITALIAN
