In [1]:
import os
import codecs
import json
import string
import re

from bs4 import BeautifulSoup, NavigableString, Tag

import gender
from gender import getGenders

In [2]:
! pwd

/Users/mariak/Documents/GitHub/geoscience-first-authorship_master


In [3]:
# Configure local paths

root = ! pwd
root = root[0]

print("using root directory:", root)

RAW_PAGES_DIR=root+"/pages/"
PARSED_PAGES_DIR=root+"/parsed/"
GUESSED_NAMES_DIR=root+"/guessed/"
NAME_GENDER_DIR=root+"/name_genders/"

# create directories if they do not exist

for d in [RAW_PAGES_DIR,
         PARSED_PAGES_DIR,
         GUESSED_NAMES_DIR,
         NAME_GENDER_DIR]:
    try:
        os.mkdir(d)
    except FileExistsError:
        pass

using root directory: /Users/mariak/Documents/GitHub/geoscience-first-authorship_master


### Article class, Guessed class, and function definitions

In [4]:
# Initialize and define useful functions and
# data structures.

def is_initialed_name(name):
    first_term = name.split(" ")[0]
    if len(first_term) == 0:
        # print("first term len zero. name:", name)
        return False
    return first_term[-1] == "." and first_term[:-1].isupper()

print("test is_initialed_name- True:", is_initialed_name("J. Smith"), ", False:", is_initialed_name("Joe Smith"))

def contains_initialed_name(names):
    for name in names:
        if is_initialed_name(name):
            return True
    return False

print("test contains_initialed_name- True", contains_initialed_name(["J. Smith", "Cat Meowins"]), ", False:", contains_initialed_name(["Joe Smith", "Cat Meowins"]))

def clean_name(name):
    name = name.strip()
    name = re.sub(r'\s', ' ', name)
    if not is_initialed_name(name):
        terms = name.split(" ")
        terms[0] = terms[0].strip(".")
        name = " ".join(terms)
    return name.strip('.')

print("test clean name- Colin. J. Cats:", clean_name("Colin. J. Cats"), "W. B. Easy:", clean_name("W. B. Easy"))

class Article:
    def __init__(self, first_author, all_names, year, month, title, journal):
        # clean the author names
        # - remove non-ascii whitespace
        # - strip bookend whitespace
        # - strip periods from first names if not an initialed name
        
        
        self.first_author = clean_name(first_author)
        self.names = [clean_name(name) for name in all_names]
        self.year = year
        self.month = month
        self.title = title
        self.journal = journal
        
        # create a unique identifier for this article
        self.id = "_".join([journal, year, month, "_".join(title.translate(str.maketrans('', '', string.punctuation)).split(" "))])
        
        # determine if article has initialed names
        self.has_initials = is_initialed_name(first_author)
    
    def last_name_set(self):
        # return a set() of all the last names
        name_set = set()
        for name in self.names:
            name_set.add(name.split(" ")[-1])
        return name_set
    
    def to_map(self):
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["has_initials"] = self.has_initials
        return m
        
        
def article_from_map(article_map):
    return Article(
        first_author=article_map["first_author"],
        all_names=article_map["all_names"],
        year=article_map["year"],
        month=article_map["month"],
        title=article_map["title"],
        journal=article_map["journal"])

test_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)

print("last name set:", test_article.last_name_set())
print("article id:", test_article.id)
print("article has initial:", test_article.has_initials)
print("article map:", test_article.to_map())


class Guessed:
    def __init__(self, primary_article, guessed_name, match_article_id):
        self.first_author = primary_article.first_author
        self.names = primary_article.names
        self.year = primary_article.year
        self.month = primary_article.month
        self.title = primary_article.title
        self.journal = primary_article.journal
        
        # create a unique identifier for this article
        self.id = "_".join([self.journal, self.year, self.month, "_".join(self.title.translate(str.maketrans('', '', string.punctuation)).split(" "))])
        
        # determine if article has initialed names
        self.has_initials = is_initialed_name(self.first_author)
        
        self.guessed_name = guessed_name
        self.match_article_id = match_article_id
    
    def last_name_set(self):
        # return a set() of all the last names
        name_set = set()
        for name in self.names:
            name_set.add(name.split(" ")[-1])
        return name_set
    
    def to_map(self):
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["has_initials"] = self.has_initials
        m["guessed_name"] = self.guessed_name
        m["match_article_id"] = self.match_article_id
        return m
        
        
def guessed_from_map(guessed_map):
    g = Guessed(
        primary_article=article_from_map(guessed_map),
        guessed_name=guessed_map["guessed_name"],
        match_article_id=guessed_map["match_article_id"])
    return g

test_primary_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)
test_match_article = Article(
    first_author="cat",
    all_names=["Arthur Cat", "Dog", "Another Cat", "More Cats"],
    year="2018",
    month="05",
    title="existence of cool/cats",
    journal="GeoCatography"
)
test_guessed = Guessed(test_primary_article, "Arthur Cat", test_match_article.id)

print(test_guessed.to_map())
print(guessed_from_map(test_guessed.to_map()).to_map() == test_guessed.to_map())

test is_initialed_name- True: True , False: False
test contains_initialed_name- True True , False: False
test clean name- Colin. J. Cats: Colin J. Cats W. B. Easy: W. B. Easy
last name set: {'Cat', 'Cats', 'Dog'}
article id: GeoCatography_2019_02_a_story_of_coolcats
article has initial: False
article map: {'first_author': 'cat', 'all_names': ['A. Cat', 'Dog', 'Another Cat', 'More Cats'], 'year': '2019', 'month': '02', 'title': 'a story of cool/cats', 'journal': 'GeoCatography', 'id': 'GeoCatography_2019_02_a_story_of_coolcats', 'has_initials': False}
{'first_author': 'cat', 'all_names': ['A. Cat', 'Dog', 'Another Cat', 'More Cats'], 'year': '2019', 'month': '02', 'title': 'a story of cool/cats', 'journal': 'GeoCatography', 'id': 'GeoCatography_2019_02_a_story_of_coolcats', 'has_initials': False, 'guessed_name': 'Arthur Cat', 'match_article_id': 'GeoCatography_2018_05_existence_of_coolcats'}
True


In [7]:
# Page parse functions

def get_parse_function(filename):
    if filename.startswith("JGR") or filename.startswith("GRL") or filename.startswith("G3"):
        return parse_agu_page
    if filename.startswith("Seismological+Research+Letters") or filename.startswith("Bulletin+of+the+Seismological+Society+of+America"):
        return parse_gsw_page
    if filename.startswith("NatureGeoscience") or filename.startswith("Nature"):
        return parse_ng_page
    if filename.startswith("Earth+and+Planetary+Science+Letters") or filename.startswith("Physics+of+the+Earth+and+Planetary+Interiors") or filename.startswith("Tectonophysics"):
        return parse_sd_page
    if filename.startswith("GJI"):
        return parse_gji_page
    if filename.startswith("SolidEarth"):
        return parse_solidearth_page
    if filename.startswith("GEOPHYSICS"):
        return parse_geophysics_page
    if filename.startswith("Science"):
        return parse_science_page
    return None




def parse_agu_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="item__body")
    for a in articles:
        meta_title = a.find_all(class_="meta__title")
        title = meta_title[0].find_all("a", class_="publication_title")
        title = title[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())
       
        if title == "Issue Information":
            print('hit "Issue Information"')
            continue
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", a)
            continue

        authors = []
        author_spans = a.find_all("a", class_="publication_contrib_author")
        for p in author_spans:
            author_span = p.span
            if author_span.i is not None:
                author_span.i.decompose()
            authors.append(str(author_span.string))
   
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
                first_author=authors[0],
                all_names=authors,
                year=year,
                month=month,
                title=title,
                journal=journal)
        parsed_articles.append(article)
    return parsed_articles



def parse_gsw_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="al-article-box")
    for a in articles:
        title = a.find_all(class_="al-title")
        title = title[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())
        
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", a)
            continue
        
        authors = []
        author_spans = a.find_all("span", class_="wi-fullname")
        for author in author_spans:
            author = str(author.find_all("a")[0].get_text())
            authors.append(author)
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_ng_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("li", class_="pb20")

    for a in articles:
        title_section = a.find_all("h2", class_="h3")
        if title_section[0] is None:
            print("title section is none:", a)
            continue
        title = title_section[0].a.contents[0]
        
        if title == None:
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", a)
            continue
        title = title.strip()
        
        
        authors = []
        author_span = a.find_all("ul", class_="js-list-authors-3")
        for auths in author_span:
            for author in auths.find_all("a", class_="js-no-scroll"):
                author = author.contents[0]
                
                if author == "[…]" or "Show fewer authors" in author:
                    continue
                    
                authors.append(author)
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)
        
    return parsed_articles

        
def parse_sd_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="result-item-container")
    for a in articles:
        title = a.find_all("a", class_="result-list-title-link")
        title = title[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            print("title:", a.find_all("a", class_="result-list-title-link"))
            continue
        if len(title) == 0:
            print("hit empty title")
            continue
        
        authors = []
        author_spans = a.find_all("span", class_="author")
        for author in author_spans:
            author = author.string
            authors.append(author)
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_solidearth_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="paperlist-object")
    for a in articles:
        title = a.find_all("a", class_="article-title") 
        title = title[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())
        
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", a)
            continue

        
        authors = []
        try:
            author_spans = a.find_all("div", class_="authors")[0]
        except IndexError:
            print("hit empty authors.")
            continue
    
        for author in author_spans.string.split(","):
            author = str(author).strip()
            if author[0:3] == "and":
                author = author[3:].strip()            
            authors.append(author)
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_geophysics_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="issue-item__body")
    for a in articles:
        title = a.find_all(class_="issue-item__title")
        title = title[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())
            
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", a)
            continue
        
        authors = []
        author_spans = a.find("div", class_="issue-item__authors")
        a = author_spans.find_all("a")
        for auth in a:
            author = str(auth.string)
            authors.append(author)
        
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_gji_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="al-article-box")
    for a in articles:
        title = a.find_all(class_="al-title")
        title = title[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())
        
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", a)
            continue        
        
        authors = []
        author_spans = a.find_all("div", class_="sri-authors al-authors-list")
        for au_span in author_spans:
            aus = au_span.find_all("a")
            for au in aus:
                authors.append(str(au.get_text()))
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_science_page(soup, year, month, journal):
    parsed_articles = []
    
    articles = soup.find_all("div", class_="results-cit")

    for a in articles:
        title_span = a.find("span",  class_="cit-first-element")
        title = str(title_span.text)
        
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", a)
            continue
        
        authors = []
        author_spans = a.find_all("span", class_="cit-auth")
        for au_span in author_spans:
            au = au_span.text.strip()
            if au != "and":
                authors.append(au)
            
        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue
    
        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)


    return parsed_articles

#### Create one json per article with article info, stored in parsed/ 

In [8]:
# Walk directory, parse, and save parsed articles

for _, _, files in os.walk(RAW_PAGES_DIR):
    for file in files:
        
#         # CHANGE THIS TO FILTER FOR SPECIFIC JOURNALS
        #if (file.startswith("JGR") or file.startswith("GRL") or file.startswith("G3")): 
        #    continue
        
        parser = get_parse_function(file)
        if parser is None:
            print("got None parse function, skipping file:", file)
            continue
        #print("processing file:", file)
        print(file[0:3], end=",")
        journal, year, month, _ = file.split("_")

        html = ""
        with codecs.open(RAW_PAGES_DIR+file, "r", "utf8") as infile:
            html = infile.read()
        
        soup = BeautifulSoup(html, "html.parser")
        
        for article in parser(soup, year, month, journal):
            outfile_name = article.id[:80]+".json"
            with codecs.open(PARSED_PAGES_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(article.to_map()))
            
print("Done!")

GRL,Phy,Phy,JGR,G3_,GRL,Bul,G3_,Sei,GEO,Phy,Sei,Bul,JGR,GJI,Bul,GJI,G3_,GRL,JGR,JGR,Bul,GJI,GRL,G3_,Sei,G3_,JGR,Bul,GRL,JGR,Sei,GEO,Sei,Sei,G3_,G3_,Ear,Ear,JGR,JGR,JGR,Bul,GJI,GJI,GJI,Sei,GEO,GJI,Tec,G3_,Sei,Tec,GJI,JGR,GRL,Sol,JGR,Bul,GJI,JGR,Bul,G3_,GJI,JGR,GRL,G3_,GEO,JGR,Nat,GRL,Nat,JGR,GEO,GJI,JGR,Sol,Sol,Sei,JGR,GJI,Ear,Ear,GJI,G3_,GRL,JGR,GEO,JGR,Sei,JGR,Bul,GRL,GEO,JGR,GJI,Bul,JGR,GEO,JGR,Tec,JGR,GJI,GRL,GJI,G3_,Bul,Sei,JGR,GJI,JGR,GRL,G3_,GJI,Sei,hit empty authors. title: New Publications
JGR,Sol,Bul,GEO,GRL,G3_,Sei,GRL,Sol,Bul,GJI,JGR,Nat,Nat,Bul,GRL,JGR,GRL,GJI,G3_,Sei,G3_,GRL,GEO,Sei,GRL,JGR,G3_,Sei,Bul,GJI,Bul,JGR,G3_,Bul,Sei,GRL,GJI,JGR,GRL,Bul,G3_,JGR,GJI,GEO,Sei,hit empty authors. title: New Publications
GRL,GEO,Bul,Sci,Sci,JGR,GJI,JGR,GJI,JGR,JGR,GJI,GEO,GJI,Sei,GEO,JGR,JGR,Bul,GRL,Sei,Bul,GJI,GJI,G3_,JGR,GEO,G3_,GRL,GJI,Bul,GJI,Bul,Sei,GJI,JGR,JGR,GRL,Sei,JGR,GRL,G3_,GJI,Tec,Tec,JGR,GRL,GRL,Sei,Sei,Bul,GEO,GEO,Bul,GRL,GRL,JGR,Bul,Sei,G3_,GRL,GEO,G3_,Bul,Sei,Bul,Sei,GR

G3_,GRL,Sci,Bul,Sei,Sci,GEO,JGR,GEO,JGR,GJI,Nat,Bul,Nat,Sol,GJI,Bul,Sei,GRL,GEO,JGR,GRL,G3_,JGR,Sei,GEO,GRL,Sei,Bul,JGR,G3_,GJI,GRL,GJI,JGR,Sei,JGR,JGR,JGR,GJI,G3_,GJI,Bul,GJI,GJI,Sei,Tec,Tec,Bul,JGR,Bul,GEO,GEO,GJI,G3_,Bul,GJI,JGR,JGR,JGR,GJI,JGR,JGR,Bul,GRL,GEO,GEO,GRL,G3_,GJI,JGR,Sol,Sol,Done!


#### Start with the name --> gender: test genderize.io api

In [9]:
# class for gender result and test
class GenderResult(object):
    def __init__(self, name, result):
        self.name = name
        self.binary = result[0]
        self.percent = result[1]
        self.count = result[2]
        
    def to_map(self):
        m = {}
        m["name"] = self.name
        m["binary"] = self.binary
        m["percent"] = self.percent
        m["count"] = self.count
        return m
    

def gender_result_from_map(m):
    return GenderResult(m["name"], ( m["binary"], m["percent"], m["count"]))

name = "Taylor"
g = getGenders(name)[0]
gr = GenderResult(name, g)
gender_result_from_map(json.loads(json.dumps(gr.to_map()))).to_map()

{'name': 'Taylor', 'binary': 'male', 'percent': 0.72, 'count': 3370}

#### read all the previously determined and stored name --> gender pairs

In [10]:
# read the Gender API gender guesses into memory

gender_results = []

for _, _, files in os.walk(NAME_GENDER_DIR):
    for file in files:
        with codecs.open(NAME_GENDER_DIR+file, "r", "utf8") as infile:
            gr = gender_result_from_map(json.loads(infile.read()))
            gender_results.append(gr)
            
print("We have *{}* stored name informations in\n{}\nfrom previous runs.".format(len(gender_results), NAME_GENDER_DIR))
        
    
# Create a map of first name to gender result
name_to_gr = {}
for gr in gender_results:
    name_to_gr[gr.name] = gr
    
print("Length of current name-gender map", len(name_to_gr))

# test
print("Colin." in name_to_gr)
print("First name in current gender map, gender: ", \
      list(name_to_gr)[0], name_to_gr[list(name_to_gr)[0]].binary)
print("Last name in current gender map, gender: ", \
      list(name_to_gr)[-1], name_to_gr[list(name_to_gr)[-1]].binary)

We have *3938* stored name informations in
/Users/mariak/Documents/GitHub/geoscience-first-authorship_master/name_genders/
from previous runs.
Length of current name-gender map 3938
False
First name in current gender map, gender:  Jian‐Zhi None
Last name in current gender map, gender:  Junhyeon male


#### Determine how many papers have initialed authors, and try to guess their names by comparing to non-initialed authors

In [11]:
# How many documents have initialed first authors
count = 0
for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if not art.has_initials:
            continue
        count += 1
        
print("{} papers have initialed author names.".format(count))

# Create in-memory map for name guessing

# This map is generated from the articles where names are not initialed.
# The keys on the map are last names. The values are arrays of articles
# where at least one author on the article has the keyed last name.


## TODO: This could be improved. last_names_to_articles dictionary should also
# contain the last names of the coauthors.

last_names_to_articles = {}

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            continue
        #last_name = art.first_author.split(" ")[-1]
        #if last_name not in last_names_to_articles.keys():
        #    last_names_to_articles[last_name] = []
        #last_names_to_articles[last_name].append(art)
        for name in art.names:
            last_name = name.split(" ")[-1]
            if last_name not in last_names_to_articles.keys():
                last_names_to_articles[last_name] = []
            last_names_to_articles[last_name].append(art)
        
print("All last names, map size", len(last_names_to_articles.keys()))

3169 papers have initialed author names.
All last names, map size 12355


In [11]:
# Create guesses for first author names and save to files
def extract_name_guess(initial_name, possible_names):
    last_name = initial_name.split(" ")[-1]
    for pn in possible_names:
        if last_name != pn.split(" ")[-1]:
            continue  # not the name we're looking for
        if initial_name[0] != pn[0]:
            continue  # first letters do not match
        if len(initial_name.split()) > 2 and len(pn.split()) > 2:
            # both names have middle initial
            if initial_name.split()[1] != pn.split()[1]:
                # middle initial does not fit
                continue
        if len(initial_name.split("-")) > 1 and len(pn.split("-")) > 1:
            # both names have hyphen
            if initial_name.split("-")[1][0] != pn.split("-")[1][0]:
                # second part of hyphenated name does not fit
                continue
        if is_initialed_name(pn):
            continue
        return pn, True
    return "", False

# create an output text file to quickly check if any bullshit occurs
check_file = open("output_checklist_guessednames.txt", "w")

# create a list that keeps track of guesses to flag ambiguity
all_guesses = {}

# unsupported edge cases
# Check that the same initialized name
# is not mapped to multiple different
# complete names
# J. M. Li => Jia Li
# J. M. Li => Jiaxun Li
# J. M. Li => Jingyuan Li


unmapped_names = set()

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if not art.has_initials:
            continue
        last_name = art.first_author.split(" ")[-1]
        if last_name not in last_names_to_articles.keys():
            unmapped_names.add(art.first_author)
            continue
        articles = last_names_to_articles[last_name]
        
        # gather all the guesses, paired with the size of overlap.
        # guesses with largest overlap will be written to file. 
        guesses = []
        for article in articles:
            overlap = article.last_name_set() & art.last_name_set()
            if len(overlap) < 2:
                continue  # skip articles without enough overlap
            
            name, ok = extract_name_guess(art.first_author, article.names)
            if not ok:
                continue
            
            try:
                previous_guess = all_guesses[art.first_author]
                if previous_guess != name:
                    print("Detected ambiguous guess: {} - {} - {}".format(
                         art.first_author, previous_guess, name))
                    print(art.title)
                    # we could continue at this point, but we already have one of these names
                    # written in a json (previous_guess). Not sure what to do, let's discuss
            except KeyError:
                pass
            check_file.write("{}\t\t{}\n".format(name, art.first_author))
            guessed = Guessed(art, name, article.id)
            guesses.append((len(overlap), guessed))
            all_guesses[art.first_author] = name
            
        if len(guesses) == 0:
            continue
        
        guesses = sorted(guesses, key=lambda x: x[0], reverse=True) #This does not change guesses if we do not assign it to variable guesses
        the_guess = guesses[0][1]
             
        outfile_name = the_guess.id[:80]+".json"
        with codecs.open(GUESSED_NAMES_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(the_guess.to_map()))
check_file.close()
len(unmapped_names)

Detected ambiguous guess: L. Gualtieri - L Gualtieri - Lucia Gualtieri
On the shaping factors of the secondary microseismic wavefield
Detected ambiguous guess: G. T. Schuster - Gerard T. Schuster - Gerard Schuster
Theory of multisource crosstalk reduction by phase-encoded statics
Detected ambiguous guess: G. T. Schuster - Gerard Schuster - Gerard T. Schuster
Theory of multisource crosstalk reduction by phase-encoded statics
Detected ambiguous guess: G. T. Schuster - Gerard T. Schuster - Gerard Schuster
Theory of multisource crosstalk reduction by phase-encoded statics
Detected ambiguous guess: D.A. Rhoades - David Rhoades - David A. Rhoades
Long-range predictability in physics-based synthetic earthquake catalogues
Detected ambiguous guess: D.A. Rhoades - David A. Rhoades - David Rhoades
Long-range predictability in physics-based synthetic earthquake catalogues
Detected ambiguous guess: D.A. Rhoades - David Rhoades - David A. Rhoades
Long-range predictability in physics-based synthetic 

Detected ambiguous guess: J. Kinscher - J L Kinscher - Jannes Kinscher
Resolving source mechanisms of microseismic swarms induced by solution mining
Detected ambiguous guess: C. W. D. Milliner - Christopher Milliner - Christopher W. D. Milliner
Comparison of coseismic near‐field and off‐fault surface deformation patterns of the 1992 Mw 7.3 Landers and 1999 Mw 7.1 Hector Mine earthquakes: Implications for controls on the distribution of surface strain
Detected ambiguous guess: V. K. Karastathis - Vassilios Karastathis - Vassilis Karastathis
High-precision relocation of seismic sequences above a dipping Moho:  the case of the January–February 2014 seismic sequence on Cephalonia island (Greece)
Detected ambiguous guess: V. K. Karastathis - Vassilis Karastathis - Vassilios Karastathis
High-precision relocation of seismic sequences above a dipping Moho:  the case of the January–February 2014 seismic sequence on Cephalonia island (Greece)
Detected ambiguous guess: M. M. Mendoza - Manuel Mend

Detected ambiguous guess: N. A. Ruppert - Natalia A. Ruppert - Natalia Ruppert
Complex Faulting and Triggered Rupture During the 2018 MW 7.9 Offshore Kodiak, Alaska, Earthquake
Detected ambiguous guess: N. A. Ruppert - Natalia Ruppert - Natalia A. Ruppert
Complex Faulting and Triggered Rupture During the 2018 MW 7.9 Offshore Kodiak, Alaska, Earthquake
Detected ambiguous guess: N. A. Ruppert - Natalia A. Ruppert - Natalia Ruppert
Complex Faulting and Triggered Rupture During the 2018 MW 7.9 Offshore Kodiak, Alaska, Earthquake
Detected ambiguous guess: M. Guidarelli - M Guidarelli - Mariangela Guidarelli
Ambient noise tomography of the Cameroon Volcanic Line and Northern Congo craton: new constraints on the structure of the lithosphere
Detected ambiguous guess: M. K. Young - Mallory K. Young - Mallory Young
Global P wave tomography of Earth's lowermost mantle from partition modeling
Detected ambiguous guess: M. K. Young - Mallory Young - Mallory K. Young
Global P wave tomography of Earth

Detected ambiguous guess: P. Supendi - Pepen Supendi - P Supendi
Fate of Forearc Lithosphere at Arc‐Continent Collision Zones: Evidence From Local Earthquake Tomography of the Sunda‐Banda Arc Transition, Indonesia
Detected ambiguous guess: P. Supendi - P Supendi - Pepen Supendi
Fate of Forearc Lithosphere at Arc‐Continent Collision Zones: Evidence From Local Earthquake Tomography of the Sunda‐Banda Arc Transition, Indonesia
Detected ambiguous guess: J. S. Buehler - Janine Buehler - Janine S. Buehler
T phase observations in global seismogram stacks
Detected ambiguous guess: J. S. Buehler - Janine S. Buehler - Janine Buehler
T phase observations in global seismogram stacks
Detected ambiguous guess: C. Zhang - Chuanlun Zhang - Caihong Zhang
Discovery of Mega‐Sheath Folds Flooring the Liwan Subbasin (South China Sea): Implications for the Rheology of Hyperextended Crust
Detected ambiguous guess: C. Zhang - Caihong Zhang - Chengyuan Zhang
Discovery of Mega‐Sheath Folds Flooring the Liwan Su

Detected ambiguous guess: N. M. Shapiro - Nikolai Shapiro - Nikolaï M. Shapiro
Deep and shallow long-period volcanic seismicity linked by fluid-pressure transfer
Detected ambiguous guess: N. M. Shapiro - Nikolaï M. Shapiro - Nikolai M. Shapiro
Deep and shallow long-period volcanic seismicity linked by fluid-pressure transfer
Detected ambiguous guess: N. M. Shapiro - Nikolai M. Shapiro - Nikolay M. Shapiro
Deep and shallow long-period volcanic seismicity linked by fluid-pressure transfer
Detected ambiguous guess: N. M. Shapiro - Nikolay M. Shapiro - Nikolay Shapiro
Deep and shallow long-period volcanic seismicity linked by fluid-pressure transfer
Detected ambiguous guess: N. M. Shapiro - Nikolay Shapiro - NM Shapiro
Deep and shallow long-period volcanic seismicity linked by fluid-pressure transfer
Detected ambiguous guess: S. E. K. Bennett - Scott E. K. Bennett - Scott Bennett
Paleoseismic Results from the Alpine Site, Wasatch Fault Zone: Timing and Displacement Data for Six Holocene Ea

Detected ambiguous guess: N. Rawlinson - N Rawlinson - Nick Rawlinson
Multipathing, reciprocal traveltime fields and raylets
Detected ambiguous guess: A. Avallone - Antonio Avallone - A Avallone
Very high rate (10 Hz) GPS seismology for moderate‐magnitude earthquakes: The case of the Mw 6.3 L'Aquila (central Italy) event
Detected ambiguous guess: J.-P. Montagner - Jean‐Paul Montagner - Jean-Paul Montagner
Time-reversal method and cross-correlation techniques by normal mode theory: a three-point problem
Detected ambiguous guess: E. Saygin - E Saygin - Erdinc Saygin
Retrieval of Interstation Local Body Waves From Teleseismic Coda Correlations
Detected ambiguous guess: S. C. Singh - Satish Singh - Satish C. Singh
Enhanced reflectivity of backthrusts in the recent great Sumatran earthquake rupture zones
Detected ambiguous guess: P. Gasperini - Paolo Gasperini - P Gasperini
The Location and Sizing of Historical Earthquakes Using the Attenuation of Macroseismic Intensity with Distance
Detect

Detected ambiguous guess: P. Tong - Ping Tong - P Tong
Wave-equation-based travel-time seismic tomography – Part 2: Application to the 1992 Landers earthquake (Mw 7.3) area
Detected ambiguous guess: P. Tong - P Tong - Ping Tong
Wave-equation-based travel-time seismic tomography – Part 2: Application to the 1992 Landers earthquake (Mw 7.3) area
Detected ambiguous guess: V. M. Cruz‐Atienza - Víctor M. Cruz‐Atienza - Victor Cruz‐Atienza
Size of Popocatepetl Volcano explosions (1997–2001) from waveform inversion
Detected ambiguous guess: V. M. Cruz‐Atienza - Victor Cruz‐Atienza - Víctor M. Cruz‐Atienza
Size of Popocatepetl Volcano explosions (1997–2001) from waveform inversion
Detected ambiguous guess: V. M. Cruz‐Atienza - Víctor M. Cruz‐Atienza - Víctor Cruz‐Atienza
Size of Popocatepetl Volcano explosions (1997–2001) from waveform inversion
Detected ambiguous guess: L. G. Evers - Läslo G. Evers - Läslo Evers
Evanescent wave coupling in a geophysical system: Airborne acoustic signals from 

Detected ambiguous guess: S. Henrys - Stuart Henrys - Stuart A. Henrys
SAHKE geophysical transect reveals crustal and subduction zone structure at the southern Hikurangi margin, New Zealand
Detected ambiguous guess: L. M. Wallace - Laura Wallace - Laura M. Wallace
The kinematics of a transition from subduction to strike‐slip: An example from the central New Zealand plate boundary
Detected ambiguous guess: L. M. Wallace - Laura M. Wallace - Laura Wallace
The kinematics of a transition from subduction to strike‐slip: An example from the central New Zealand plate boundary
Detected ambiguous guess: L. M. Wallace - Laura Wallace - Laura M. Wallace
The kinematics of a transition from subduction to strike‐slip: An example from the central New Zealand plate boundary
Detected ambiguous guess: L. M. Wallace - Laura M. Wallace - Laura Wallace
The kinematics of a transition from subduction to strike‐slip: An example from the central New Zealand plate boundary
Detected ambiguous guess: L. M. Wallac

Detected ambiguous guess: J. Sarout - Joël Sarout - Joel Sarout
Laboratory micro-seismic signature of shear faulting and fault slip in shale
Detected ambiguous guess: J. Sarout - Joel Sarout - Joël Sarout
Laboratory micro-seismic signature of shear faulting and fault slip in shale
Detected ambiguous guess: J. Sarout - Joël Sarout - Joel Sarout
Laboratory micro-seismic signature of shear faulting and fault slip in shale
Detected ambiguous guess: T. A. Little - Timothy A. Little - Timothy Little
Kekerengu Fault, New Zealand: Timing and Size of Late Holocene Surface Ruptures
Detected ambiguous guess: T. A. Little - Timothy Little - Timothy A. Little
Kekerengu Fault, New Zealand: Timing and Size of Late Holocene Surface Ruptures
Detected ambiguous guess: T. A. Little - Timothy A. Little - Tim Little
Kekerengu Fault, New Zealand: Timing and Size of Late Holocene Surface Ruptures
Detected ambiguous guess: J. F. Clinton - John F. Clinton - John Clinton
Preparing for InSight: An Invitation to 

471

#### Step 1: Genderize the names GUESSED from initials
#### - collect the list of first names
#### - check if it is in the list already, if not: call genderize io
#### - finally check how many in the guessed initialed names are male / female

In [12]:
# Collect list of all GUESSED names to be gendered

guessed_first_names = set()


## TODO:  Adapt here for all authors instead for first author only

for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.loads(infile.read()))
        first_name = guess.guessed_name.split(" ")[0]
        if len(first_name) == 0:
            print("empty first name?:", guess.guessed_name)
            continue
        guessed_first_names.add(first_name)
        
print("number of unique first names to guess:", len(guessed_first_names))
print("From {} to {}.".format(list(guessed_first_names)[0],
                              list(guessed_first_names)[-1]))        

number of unique first names to guess: 756
From Guiping to Antonio.


In [13]:
# Call the Gender API and save output for all the GUESSED names
# WARN: Makes API calls
# Does nothing if the guessed names were genderized previously


for i, name in enumerate(guessed_first_names):
    # print(i, end=",")
    if name in name_to_gr:
        continue
    else:
        result = getGenders(name)
        if len(result) > 1:
            print("long result:", result)
        r = result[0]
        gr = GenderResult(name, r)
        
        # update gender name map in memory
        name_to_gr[gr.name] = gr
        
        # also save for later use
        file_name = "_".join(name.translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
        with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(gr.to_map()))



In [14]:
# Determine how many male/female/unknown names exist in the guessed names
# NOTE: This ratio should be for all guesses, not the reduced set() of names

binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
#         if not file.startswith("JGROceans"):
#             continue
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.loads(infile.read()))
        first_name = guess.guessed_name.split(" ")[0]
        if len(first_name) == 0:
            print("empty first name?:", guess.guessed_name)
            continue
        try:
            gr = name_to_gr[first_name]
            binary_ratio_first_names[gr.binary] += 1
        except KeyError:
            print("not found in map:", first_name)
        
        
binary_ratio_first_names

{'male': 1099, 'female': 400, 'None': 64}

#### Now genderize the names that were given in full
#### - collect the list of first names
#### - check if name is in the list already, if not: call genderize io

In [15]:
# Create a set of first names from all scraped journals
# For Non-Initialed names

all_first_names = set()
all_names = set()


for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            continue
        first_name = art.first_author.split(" ")[0]
        all_first_names.add(first_name)
        
        for allname in art.names:
            all_names.add(allname.split()[0])
        
        
print("first name set size", len(all_first_names))
print("all author name set size", len(all_names))


first name set size 4261
all author name set size 8508


In [17]:
# Call the gender API for all the non-initialed first names. Deduped against
# existing files in the map name_to_gr. Be sure to run the cell
# that loads that map before running this function. When properly
# loaded, this cell can be re-run until all names are handled.

# TODO : Also run this on ALL author names!

print("starting size:", len(name_to_gr))

handled = 0

for name in all_first_names:
    if name in name_to_gr:
        continue
        
    handled += 1
    
    result = getGenders(name)
    if len(result) > 1:
        print("long result:", result)
    r = result[0]
    gr = GenderResult(name, r)
    
    # update map currently in memory
    name_to_gr[name] = gr
    
    # and save for later use
    file_name = "_".join(name.translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
    with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
        outfile.write(json.dumps(gr.to_map()))
        
    if handled > 970:
        print("hit request limit")
        break
        
    
print("finish size:", len(name_to_gr))

starting size: 2991
hit request limit
finish size: 3962


#### Finally, evaluate for all journals / each journal / journal per year..
#### Here, let's create a json database that can be used in another notebook.

In [18]:
# Go both through the guessed names and the non-initialed names.
# Find genders on the base of the name_to_gr map and save the results

# While we do not yet have a name database large enough to go through all the names,
# just go through a few for testing purposes:
nfile = 100

AUTHOR_GENDER_DIR = root + "/author_genders/"
if not os.path.exists(AUTHOR_GENDER_DIR):
    os.mkdir(AUTHOR_GENDER_DIR)

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files[0:nfile]:       
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            continue
            
        all_names = art.names
        all_genders = []
        all_percent = []

        for n in all_names:
            n = n.split()[0]
            try:
                all_genders.append(name_to_gr[n].binary)
                all_percent.append(name_to_gr[n].percent)
            except KeyError:
                print("Name {} is not gendered yet".format(n))
                all_genders.append(None)
                all_percent.append(None)
        art_out = art.to_map()
        art_out["all_genders"] = all_genders
        art_out["all_percent"] = all_percent
        #print(art_out["all_names"])
        print(art_out["all_genders"])
       
        # save!
        outfile_name = os.path.basename(file)
        with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(art_out))
# now do the guessed names
for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files[0: nfile]:       
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            continue
            
        all_names = art.names
        all_genders = []
        all_percent = []

        for n in all_names:
            n = n.split()[0]
            try:
                all_genders.append(name_to_gr[n].binary)
                all_percent.append(name_to_gr[n].percent)
            except KeyError:
                print("Name {} is not gendered yet".format(n))
                all_genders.append(None)
                all_percent.append(None)
        art_out = art.to_map()
        art_out["all_genders"] = all_genders
        art_out["all_percent"] = all_percent
        print(art_out["all_genders"])
        # save!
        outfile_name = os.path.basename(file)
        with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(art_out))

['male', 'male', 'male', 'male']
Name Wilhelm is not gendered yet
['female', 'None', None, 'male']
Name Shu-Huei is not gendered yet
['male', 'female', None]
['male', 'female', 'male', 'male', 'male']
Name Recai is not gendered yet
['female', 'male', None]
Name Sigurjón is not gendered yet
['male', None, 'male', 'female']
['male', 'male', 'male', 'female', 'male', 'male', 'female', 'male', 'female', 'male']
['male', 'male']
['male', 'female', 'male']
Name Hongbin is not gendered yet
['male', None, 'male', 'male', 'male']
['male', 'male', 'male', 'male']
['male', 'male', 'male']
['male', 'male', 'male', 'male']
Name Constantine is not gendered yet
['male', 'male', None, 'female']
['male', 'male', 'male', 'male']
Name G is not gendered yet
['male', 'male', None, 'male', 'male']
Name R is not gendered yet
['male', 'female', 'male', 'male', None]
['male', 'male', 'male', 'male']
Name Ghislain is not gendered yet
Name Regis is not gendered yet
['male', None, None, 'male']
['male', 'male', '

In [19]:
# the following part may be moved to a new notebook. It is from the analysis of Pico et al. 2020-----

In [20]:
# Determine how many male/female/unknown names exist in the guessed names
# printed out for each journal.


journal_names = [
"Tectonophysics",
"Physics+of+the+Earth+and+Planetary+Interiors",
"Earth+and+Planetary+Science+Letters",
"SolidEarth",
"GEOPHYSICS",
"NatureGeoscience",
"GRL",
"JGRSolidEarth",
"G3",
"GJI",
"Nature",
"Bulletin+of+the+Seismological+Society+of+America",
"Seismological+Research+Letters"]


for journal in journal_names:
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}
    
    for _, _, files in os.walk(GUESSED_NAMES_DIR):
        for file in files:
            
            if not file.startswith(journal):
                continue
                
            with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
                guess = guessed_from_map(json.loads(infile.read()))
            first_name = guess.guessed_name.split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", guess.guessed_name)
                continue
            try:
                gr = name_to_gr[first_name]
                binary_ratio_first_names[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


    print(journal, binary_ratio_first_names)

Tectonophysics {'male': 31, 'female': 11, 'None': 1}
Physics+of+the+Earth+and+Planetary+Interiors {'male': 11, 'female': 6, 'None': 0}
Earth+and+Planetary+Science+Letters {'male': 46, 'female': 10, 'None': 1}
SolidEarth {'male': 27, 'female': 8, 'None': 2}
GEOPHYSICS {'male': 7, 'female': 0, 'None': 0}
NatureGeoscience {'male': 17, 'female': 5, 'None': 2}
GRL {'male': 211, 'female': 77, 'None': 12}
JGRSolidEarth {'male': 319, 'female': 135, 'None': 27}
G3 {'male': 58, 'female': 38, 'None': 1}
GJI {'male': 188, 'female': 62, 'None': 9}
Nature {'male': 22, 'female': 6, 'None': 2}
Bulletin+of+the+Seismological+Society+of+America {'male': 95, 'female': 25, 'None': 6}
Seismological+Research+Letters {'male': 71, 'female': 18, 'None': 2}


In [21]:
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal.


for journal in journal_names:
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}
    
    initialed_name_count = 0
    
    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:
            
            if not file.startswith(journal):
                continue
                
            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))
            first_name = article.first_author.split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                initialed_name_count += 1
                continue
            try:
                gr = name_to_gr[first_name]
                binary_ratio_first_names[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


    print(journal, binary_ratio_first_names, "initialed name count:", initialed_name_count)

not found in map: Rosaria
not found in map: Stathis
not found in map: Zhipeng
not found in map: Zhouchuan
not found in map: Guoyan
not found in map: Yaotian
not found in map: Obi
not found in map: Lu
not found in map: Qingbao
not found in map: Yu-Min
not found in map: Sujit
not found in map: Javed
not found in map: Shashwat
not found in map: Kristy
not found in map: Zhuqi
not found in map: Youichiro
not found in map: Dezheng
not found in map: Osvanny
not found in map: Javed
not found in map: Azad
not found in map: Dayong
not found in map: Guoyan
not found in map: Javed
not found in map: Zhouchuan
not found in map: Ivica
not found in map: Stathis
not found in map: Anil
not found in map: Ruiqing
not found in map: Xiaojie
not found in map: Mor
not found in map: Qingbao
not found in map: Lu
not found in map: Saeko
not found in map: Mirjam
Tectonophysics {'male': 572, 'female': 204, 'None': 63} initialed name count: 197
not found in map: Endra
not found in map: Cigdem
not found in map: Zhou

not found in map: Tadahiro
not found in map: Ye
not found in map: Agata
not found in map: Lu
not found in map: Kangchen
not found in map: Josh
not found in map: Yu‐Min
not found in map: Ziyadin
not found in map: Jintuan
not found in map: Yuanjin
not found in map: Jiahang
not found in map: Lu
not found in map: Ruiqing
not found in map: Yosuke
not found in map: Ching‐Wen
not found in map: Ye
not found in map: Daeyeong
not found in map: Rosaria
not found in map: Doyeon
not found in map: Ekaterina
not found in map: Ye
not found in map: Doyeon
not found in map: Shaoyang
not found in map: Yi‐Rong
not found in map: Zhouchuan
not found in map: Liming
not found in map: Atsuki
not found in map: Hirokazu
not found in map: Kaoru
not found in map: Chuanming
not found in map: Yun
not found in map: Mei
not found in map: Alex
not found in map: Hector
not found in map: Allen
not found in map: Malte
not found in map: Yosuke
not found in map: Yongsheng
not found in map: Rodolfo
not found in map: Michihar

not found in map: Chi-Chia
not found in map: Xu
not found in map: Peidong
not found in map: Paulo
not found in map: Jidong
not found in map: X
not found in map: Philip
not found in map: Mei
not found in map: Ferdinando
not found in map: Weiming
not found in map: Kees
not found in map: Dorianne
not found in map: Özcan
not found in map: Youzuo
not found in map: Jidong
not found in map: Yaotian
not found in map: G
not found in map: Rebekka
not found in map: Rıdvan
not found in map: Jiri
not found in map: Leon
not found in map: Youzuo
not found in map: Stephany
not found in map: G
not found in map: Peidong
GJI {'male': 1561, 'female': 442, 'None': 112} initialed name count: 505
not found in map: Hyunwoo
not found in map: Karin
not found in map: Denghai
not found in map: Youichiro
not found in map: Marcia
Nature {'male': 150, 'female': 44, 'None': 7} initialed name count: 44
not found in map: Guochen
not found in map: Sijia
not found in map: Bei
not found in map: Oluwaseyi
not found in map:

In [22]:
# Same as above but split into year/month bins
#
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal.

journal_tallys = {}

for journal in journal_names:
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0, 'initialed':0}
    
    initialed_name_count = 0
    
    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:
            
            if not file.startswith(journal):
                continue
                
            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))
                
            tally = journal_tallys.setdefault(journal, {}).setdefault(article.year, {}).setdefault(article.month, {'male':0, 'female':0, 'None':0, 'initialed':0})
            
            first_name = article.first_author.strip().split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                tally['initialed'] += 1
                continue
            try:
                gr = name_to_gr[first_name]
                tally[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


for journal, values in journal_tallys.items():
    print(journal, values)

not found in map: Rosaria
not found in map: Stathis
not found in map: Zhipeng
not found in map: Zhouchuan
not found in map: Guoyan
not found in map: Yaotian
not found in map: Obi
not found in map: Lu
not found in map: Qingbao
not found in map: Yu-Min
not found in map: Sujit
not found in map: Javed
not found in map: Shashwat
not found in map: Kristy
not found in map: Zhuqi
not found in map: Youichiro
not found in map: Dezheng
not found in map: Osvanny
not found in map: Javed
not found in map: Azad
not found in map: Dayong
not found in map: Guoyan
not found in map: Javed
not found in map: Zhouchuan
not found in map: Ivica
not found in map: Stathis
not found in map: Anil
not found in map: Ruiqing
not found in map: Xiaojie
not found in map: Mor
not found in map: Qingbao
not found in map: Lu
not found in map: Saeko
not found in map: Mirjam
not found in map: Endra
not found in map: Cigdem
not found in map: Zhouchuan
not found in map: Rakesh
not found in map: Amin
not found in map: Endra
not 

not found in map: Duo
not found in map: Yabo
not found in map: Yihe
not found in map: Changrong
not found in map: Xu
not found in map: Peidong
not found in map: Becky
not found in map: Youyi
not found in map: Tadafumi
not found in map: Rachael
not found in map: Kaoru
not found in map: Alizia
not found in map: Caglar
not found in map: Linlin
not found in map: Noel
not found in map: Jianping
not found in map: Dipankar
not found in map: Xiaoming
not found in map: Long
not found in map: Christelle
not found in map: Victor
not found in map: Qiankun
not found in map: Yihe
not found in map: Zhouchuan
not found in map: Ahyi
not found in map: Niloufar
not found in map: Hanae
not found in map: Maor
not found in map: Jidong
not found in map: Zhenyu
not found in map: Semechah
not found in map: Rodolfo
not found in map: Zhouchuan
not found in map: Hongxiang
not found in map: Ezgi
not found in map: Kaoru
not found in map: Dominic
not found in map: Rashed
not found in map: Shu‐Hao
not found in map: L

not found in map: Ricky
not found in map: Gerardo
not found in map: Javed
not found in map: Masumi
not found in map: Chih‐Hsuan
not found in map: Xiaodan
not found in map: Samira
not found in map: Shuang‐Lan
not found in map: Amin
not found in map: Guo‐jiao
not found in map: Roxane
not found in map: Rakesh
not found in map: Walker
not found in map: Philip
not found in map: Gerardo
not found in map: Shuang‐Lan
not found in map: Margaret
not found in map: Dimitar
not found in map: Samira
not found in map: Semechah
not found in map: Xiaodan
not found in map: Ting
not found in map: Ryota
not found in map: Karin
not found in map: Bei
not found in map: Seok
not found in map: Rui‐Guang
not found in map: Alon
not found in map: Jingwei
not found in map: Endra
not found in map: Victor
not found in map: Xinzheng
not found in map: Gevorg
not found in map: Annabel
not found in map: Rolf
not found in map: Xiao‐Lei
not found in map: Spahr
not found in map: Kuei‐Pao
not found in map: Ting
not found in