In [1]:
import os
import codecs
import json
import string
import re

from bs4 import BeautifulSoup, NavigableString, Tag

import gender
from gender import getGenders

In [2]:
! pwd

/home/naiara/Git_repositories/geoscience-first-authorship


In [3]:
# Configure local paths
#
# Every pipeline directory lives directly under the current working
# directory. The trailing "/" on each constant matters: later cells build
# file paths by plain string concatenation (e.g. RAW_PAGES_DIR + file).

# os.getcwd() replaces the IPython-only `! pwd` shell capture, so this cell
# no longer depends on IPython syntax or on a `pwd` executable.
root = os.getcwd()

print("using root directory:", root)

RAW_PAGES_DIR = root + "/pages/"
PARSED_PAGES_DIR = root + "/parsed/"
GUESSED_NAMES_DIR = root + "/guessed/"
NAME_GENDER_DIR = root + "/name_genders/"

using root directory: /home/naiara/Git_repositories/geoscience-first-authorship


In [4]:
# Create the output directories written to by the later cells.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pairs: there is
# no gap between the check and the creation, and missing parent directories
# are created as well.
for output_dir in (PARSED_PAGES_DIR, GUESSED_NAMES_DIR, NAME_GENDER_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [5]:
# Initialize and define useful functions and
# data structures.

def is_initialed_name(name):
    """Return True when the first term of ``name`` is an initial such as "J.".

    A first term counts as an initial when it ends with a period and the
    characters before that period are upper case (as judged by
    ``str.isupper``).

    Args:
        name: An author name, e.g. "J. Smith" or "Joe Smith".

    Returns:
        bool: True for an initialed first term; False otherwise, including
        for empty or whitespace-only names.
    """
    # split() with no argument skips leading whitespace and splits on any
    # whitespace character (non-breaking spaces included), so a raw scraped
    # name such as " J. Smith" is judged by "J." instead of an empty string.
    terms = name.split()
    if not terms:
        return False
    first_term = terms[0]
    return first_term.endswith(".") and first_term[:-1].isupper()

# Smoke check: the first call should print True, the second False.
print(
    "test is_initialed_name- True:",
    is_initialed_name("J. Smith"),
    ", False:",
    is_initialed_name("Joe Smith"),
)

def contains_initialed_name(names):
    """Report whether at least one entry of ``names`` is an initialed name.

    Args:
        names: Iterable of author-name strings.

    Returns:
        bool: True as soon as one name satisfies ``is_initialed_name``;
        False when none do (including for an empty iterable).
    """
    return any(is_initialed_name(candidate) for candidate in names)

# Smoke check: one list with an initialed author, one without.
print(
    "test contains_initialed_name- True",
    contains_initialed_name(["J. Smith", "Cat Meowins"]),
    ", False:",
    contains_initialed_name(["Joe Smith", "Cat Meowins"]),
)

def clean_name(name):
    """Normalise a scraped author name.

    Surrounding whitespace is stripped and every remaining whitespace
    character is replaced by a plain space. When the name is not initialed,
    periods around its first term are removed (e.g. "Colin." becomes
    "Colin"). Finally, periods at either end of the whole name are stripped.

    Args:
        name: Raw author name string.

    Returns:
        str: The cleaned name.
    """
    normalized = re.sub(r'\s', ' ', name.strip())
    if is_initialed_name(normalized):
        # Initialed names keep the periods on their initials.
        return normalized.strip('.')
    first_term, *remaining_terms = normalized.split(" ")
    rebuilt = " ".join([first_term.strip("."), *remaining_terms])
    return rebuilt.strip('.')

# Smoke check: a non-initialed name loses the period on its first term,
# an initialed name is left as it is.
print(
    "test clean name- Colin. J. Cats:",
    clean_name("Colin. J. Cats"),
    "W. B. Easy:",
    clean_name("W. B. Easy"),
)

class Article:
    """A scraped journal article with cleaned author names.

    Attributes (all set in ``__init__``):
        first_author: Cleaned name of the first author.
        names: Cleaned names of all authors, in the order given.
        year, month, title, journal: Stored exactly as passed in.
        id: ``journal_year_month_title`` identifier; the title part has ASCII
            punctuation removed and spaces replaced by underscores.
        has_initials: Whether the cleaned first-author name is initialed.
    """

    def __init__(self, first_author, all_names, year, month, title, journal):
        """Build an article record.

        Args:
            first_author: Raw first-author name.
            all_names: Iterable of raw author names.
            year: Publication year as a string (it is joined into the id).
            month: Publication month as a string (it is joined into the id).
            title: Article title.
            journal: Journal name.
        """
        # Clean the author names (see clean_name):
        # - replace non-ASCII whitespace with plain spaces
        # - strip bookend whitespace
        # - strip periods from first names if not an initialed name
        self.first_author = clean_name(first_author)
        self.names = [clean_name(name) for name in all_names]
        self.year = year
        self.month = month
        self.title = title
        self.journal = journal

        # create a unique identifier for this article
        self.id = "_".join([journal, year, month, "_".join(title.translate(str.maketrans('', '', string.punctuation)).split(" "))])

        # Determine if the article has an initialed first author. This uses
        # the cleaned name: the raw name may carry leading or non-breaking
        # whitespace, which made an initialed author look non-initialed.
        self.has_initials = is_initialed_name(self.first_author)

    def last_name_set(self):
        """Return the set of last terms (split on spaces) of all author names."""
        return {name.split(" ")[-1] for name in self.names}

    def to_map(self):
        """Return the article as a plain dict suitable for ``json.dumps``."""
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["has_initials"] = self.has_initials
        return m
        
        
def article_from_map(article_map):
    """Rebuild an ``Article`` from a dict shaped like ``Article.to_map()`` output.

    Only the six constructor fields are read; "id" and "has_initials" are
    recomputed by the ``Article`` constructor.

    Args:
        article_map: Dict with keys "first_author", "all_names", "year",
            "month", "title" and "journal".

    Returns:
        Article: A new instance built from those fields.
    """
    constructor_fields = ("first_author", "all_names", "year", "month", "title", "journal")
    return Article(**{field: article_map[field] for field in constructor_fields})

# Exercise Article on a small made-up record and print its derived fields.
test_article = Article(
    "cat",
    ["A. Cat", "Dog", "Another Cat", "More Cats"],
    "2019",
    "02",
    "a story of cool/cats",
    "GeoCatography",
)

for label, value in (
    ("last name set:", test_article.last_name_set()),
    ("article id:", test_article.id),
    ("article has initial:", test_article.has_initials),
    ("article map:", test_article.to_map()),
):
    print(label, value)

test is_initialed_name- True: True , False: False
test contains_initialed_name- True True , False: False
test clean name- Colin. J. Cats: Colin J. Cats W. B. Easy: W. B. Easy
last name set: {'Cats', 'Dog', 'Cat'}
article id: GeoCatography_2019_02_a_story_of_coolcats
article has initial: False
article map: {'first_author': 'cat', 'all_names': ['A. Cat', 'Dog', 'Another Cat', 'More Cats'], 'year': '2019', 'month': '02', 'title': 'a story of cool/cats', 'journal': 'GeoCatography', 'id': 'GeoCatography_2019_02_a_story_of_coolcats', 'has_initials': False}


In [6]:
# Page parse functions

def get_parse_function(filename):
    """Pick the page parser for a raw page file based on its name prefix.

    Args:
        filename: Raw page file name, e.g. "JGRSolidEarth_2019_10_1.html".

    Returns:
        The matching ``parse_*_page`` function, or None when no parser is
        registered for the file's prefix.
    """
    if filename.startswith(("JGR", "GRL")):
        return parse_agu_page
    if filename.startswith(("Geology", "GSA")):
        return parse_gsw_page
    if filename.startswith("NatureGeoscience"):
        return parse_ng_page
    if filename.startswith(("Quaternary", "Geochimica")):
        return parse_sd_page
    # A stray `return None` used to sit here, which made the GJI branch
    # unreachable and caused every GJI page to be skipped.
    if filename.startswith("GJI"):
        return parse_gji_page
    return None

def parse_agu_page(soup, year, month, journal):
    """Parse an issue page handled by the JGR*/GRL parser into Articles.

    Each ``div.item__body`` is treated as one article. The title is the text
    of the first ``a.publication_title`` inside the first ``.meta__title``
    element. Each author is the text of the ``span`` inside an
    ``a.publication_contrib_author`` link, after removing its first ``<i>``
    descendant if there is one.

    Args:
        soup: Parsed page (a BeautifulSoup object).
        year, month, journal: Passed through unchanged to ``Article``.

    Returns:
        list: One ``Article`` per item that has a usable title and at least
        one author. Skipped items are reported with ``print``.
    """
    parsed_articles = []

    for item in soup.find_all("div", class_="item__body"):
        meta_titles = item.find_all(class_="meta__title")
        title_links = meta_titles[0].find_all("a", class_="publication_title") if meta_titles else []
        if len(title_links) == 0:
            # Indexing an empty result used to raise IndexError and abort
            # the whole page; skip just this item instead.
            print("hit missing title element")
            continue
        # get_text() rather than .string: .string does not work well with
        # special characters / nested markup in the title.
        title = title_links[0].get_text()

        if title == "Issue Information":
            print('hit "Issue Information"')
            continue
        if len(title) == 0:
            print("hit empty title")
            continue

        authors = []
        for author_link in item.find_all("a", class_="publication_contrib_author"):
            # Fall back to the link itself when it has no <span> child, so a
            # layout variation does not raise AttributeError.
            name_node = author_link.span if author_link.span is not None else author_link
            if name_node.i is not None:
                name_node.i.decompose()
            # get_text() always returns a string; .string can be None, which
            # would crash name cleaning inside Article.
            authors.append(name_node.get_text())

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
                first_author=authors[0],
                all_names=authors,
                year=year,
                month=month,
                title=title,
                journal=journal)
        parsed_articles.append(article)
    return parsed_articles

def parse_gsw_page(soup, year, month, journal):
    """Parse an issue page handled by the Geology/GSA parser into Articles.

    Each ``div.al-article-box`` is treated as one article. The title is the
    text of the first ``<a>`` inside the first ``.al-title`` element, and
    each author is the text of the first ``<a>`` inside a
    ``span.wi-fullname``.

    Args:
        soup: Parsed page (a BeautifulSoup object).
        year, month, journal: Passed through unchanged to ``Article``.

    Returns:
        list: One ``Article`` per box that has a usable title and at least
        one author. Skipped boxes are reported with ``print``.
    """
    parsed_articles = []

    for box in soup.find_all("div", class_="al-article-box"):
        title_nodes = box.find_all(class_="al-title")
        title_links = title_nodes[0].find_all("a") if title_nodes else []
        if len(title_links) == 0:
            # Previously an IndexError that aborted the whole page.
            print("hit missing title element")
            continue
        # get_text() instead of .string: .string is None whenever the link
        # holds more than one child (e.g. a title with nested markup), which
        # silently dropped such articles.
        title = title_links[0].get_text()

        if len(title) == 0:
            print("hit empty title")
            continue

        authors = []
        for name_span in box.find_all("span", class_="wi-fullname"):
            name_links = name_span.find_all("a")
            # Fall back to the span's own text when it contains no link.
            name_node = name_links[0] if name_links else name_span
            # get_text() never returns None, so Article's name cleaning
            # cannot crash on a missing string.
            authors.append(name_node.get_text())

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_ng_page(soup, year, month, journal):
    """Parse an issue page handled by the NatureGeoscience parser into Articles.

    Each ``li.pb20`` is treated as one article. The title is the stripped
    text of the ``<a>`` inside the first ``h2.h3``. Authors come from
    ``a.js-no-scroll`` links inside ``ul.js-list-authors-3``; the "[…]" and
    "Show fewer authors" toggle links are ignored.

    Args:
        soup: Parsed page (a BeautifulSoup object).
        year, month, journal: Passed through unchanged to ``Article``.

    Returns:
        list: One ``Article`` per item that has a usable title and at least
        one author. Skipped items are reported with ``print``.
    """
    parsed_articles = []

    articles = soup.find_all("li", class_="pb20")
    print("num articles:", len(articles))
    for a in articles:
        title_section = a.find_all("h2", class_="h3")
        # find_all returns a (possibly empty) list, so the old
        # `title_section[0] is None` test could never fire and an item
        # without a heading raised IndexError instead of being skipped.
        if len(title_section) == 0 or title_section[0].a is None:
            print("title section is none:", a)
            continue
        # Use the link's full text; taking only its first child truncated
        # titles that contain nested markup.
        title = title_section[0].a.get_text().strip()

        if len(title) == 0:
            print("hit empty title")
            continue

        authors = []
        for auths in a.find_all("ul", class_="js-list-authors-3"):
            for author_link in auths.find_all("a", class_="js-no-scroll"):
                children = author_link.contents
                # NavigableString subclasses str. Keep the first text child
                # as before, and fall back to the link's text when the first
                # child is a tag or the link is empty.
                if len(children) > 0 and isinstance(children[0], str):
                    author = children[0]
                else:
                    author = author_link.get_text()

                if author == "[…]" or "Show fewer authors" in author:
                    continue

                authors.append(author)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

        
def parse_sd_page(soup, year, month, journal):
    """Parse a result page handled by the Quaternary/Geochimica parser.

    Each ``li.ResultItem`` is treated as one article. The title is taken
    from the first child of the first ``a.result-list-title-link``, and each
    author is the text of a ``span.author``.

    Args:
        soup: Parsed page (a BeautifulSoup object).
        year, month, journal: Passed through unchanged to ``Article``.

    Returns:
        list: One ``Article`` per item that has a usable title and at least
        one author. Skipped items are reported with ``print``.
    """
    parsed_articles = []

    for a in soup.find_all("li", class_="ResultItem"):
        title_links = a.find_all("a", class_="result-list-title-link")
        # Guard both index operations; either one used to raise IndexError
        # and abort the whole page when the link was missing or empty.
        if len(title_links) == 0 or len(title_links[0].contents) == 0:
            print("hit NoneType title")
            print("title:", title_links)
            continue

        # NOTE(review): only the first child of the link is used, so a title
        # with markup in the middle may be truncated — confirm against pages.
        title = title_links[0].contents[0]
        if isinstance(title, NavigableString):
            title = title.string
        if isinstance(title, Tag):
            title = title.get_text()

        if title is None:
            print("hit NoneType title")
            print("title:", title_links)
            continue
        if len(title) == 0:
            print("hit empty title")
            continue

        # get_text() always returns a string; .string can be None, which
        # would crash name cleaning inside Article.
        authors = [author_span.get_text() for author_span in a.find_all("span", class_="author")]

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles


def parse_gji_page(soup, year, month, journal):
    """Parse a GJI issue page into Articles.

    This parser was a line-for-line copy of ``parse_gsw_page`` (same
    ``al-article-box`` / ``al-title`` / ``wi-fullname`` selectors), so it now
    delegates to it and the two cannot drift apart.

    Args:
        soup: Parsed page (a BeautifulSoup object).
        year, month, journal: Passed through unchanged.

    Returns:
        list: The ``Article`` objects produced by ``parse_gsw_page``.
    """
    return parse_gsw_page(soup, year, month, journal)




In [8]:
# Walk directory, parse, and save parsed articles
#
# Raw page file names are expected to have four "_"-separated parts, the
# first three being journal, year and month (e.g.
# "JGRSolidEarth_2019_10_1.html").

for dirpath, _, files in os.walk(RAW_PAGES_DIR):
    for file in files:

        # CHANGE THIS TO FILTER FOR SPECIFIC JOURNALS
        # if not (file.startswith("Quaternary") or file.startswith("Geochimica")):
        #     continue

        print("processing file:", file)

        # Resolve the parser first so unsupported files are skipped before
        # any reading or HTML parsing is done for them.
        parser = get_parse_function(file)
        if parser is None:
            print("got None parse function, skipping file:", file)
            continue

        name_parts = file.split("_")
        if len(name_parts) != 4:
            # The bare tuple unpack used to raise ValueError here.
            print("unexpected file name, skipping file:", file)
            continue
        journal, year, month = name_parts[:3]

        # Join with dirpath (not RAW_PAGES_DIR) so files that os.walk finds
        # in subdirectories are opened at their real location.
        with codecs.open(os.path.join(dirpath, file), "r", "utf8") as infile:
            html = infile.read()

        soup = BeautifulSoup(html, "html.parser")

        for article in parser(soup, year, month, journal):
            # NOTE(review): ids are cut to 80 characters, so two articles in
            # one issue whose titles share a long prefix would overwrite each
            # other — confirm whether that occurs in the data.
            outfile_name = article.id[:80]+".json"
            with codecs.open(PARSED_PAGES_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(article.to_map()))

print("Done!")

processing file: JGRSolidEarth_2019_10_1.html
Done!


In [50]:
# Create in-memory map for name guessing
#
# Built only from parsed articles whose first author is NOT initialed.
# Keys are first-author last names; each value is the list of those articles
# whose first author has that last name.
# NOTE(review): only the first author's last name is used as a key, not the
# last names of the co-authors — confirm this is the intended index.
last_names_to_articles = {}

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.load(infile))
        if not art.has_initials:
            last_name = art.first_author.split(" ")[-1]
            last_names_to_articles.setdefault(last_name, []).append(art)

print("last name map size", len(last_names_to_articles))

last name map size 90


In [51]:
class Guessed:
    """An initialed article paired with a guessed full first-author name.

    Carries the same fields as the primary article, plus the guessed name
    and the id of the article the guess was taken from.
    """

    def __init__(self, primary_article, guessed_name, match_article_id):
        """Copy the primary article's fields and attach the guess.

        Args:
            primary_article: Object exposing first_author, names, year,
                month, title and journal attributes.
            guessed_name: Full name guessed for the first author.
            match_article_id: Id of the article that supplied the guess.
        """
        for field in ("first_author", "names", "year", "month", "title", "journal"):
            setattr(self, field, getattr(primary_article, field))

        # Unique identifier, rebuilt from the copied fields: ASCII punctuation
        # is removed from the title and its spaces become underscores.
        punctuation_stripper = str.maketrans('', '', string.punctuation)
        title_slug = "_".join(self.title.translate(punctuation_stripper).split(" "))
        self.id = "_".join([self.journal, self.year, self.month, title_slug])

        # Whether the first author is given with initials.
        self.has_initials = is_initialed_name(self.first_author)

        self.guessed_name = guessed_name
        self.match_article_id = match_article_id

    def last_name_set(self):
        """Return the set of last terms (split on spaces) of all author names."""
        return {name.split(" ")[-1] for name in self.names}

    def to_map(self):
        """Return the record as a plain dict suitable for ``json.dumps``."""
        return {
            "first_author": self.first_author,
            "all_names": self.names,
            "year": self.year,
            "month": self.month,
            "title": self.title,
            "journal": self.journal,
            "id": self.id,
            "has_initials": self.has_initials,
            "guessed_name": self.guessed_name,
            "match_article_id": self.match_article_id,
        }
        
        
def guessed_from_map(guessed_map):
    """Rebuild a ``Guessed`` from a dict shaped like ``Guessed.to_map()`` output.

    Args:
        guessed_map: Dict holding the article fields read by
            ``article_from_map`` plus "guessed_name" and "match_article_id".

    Returns:
        Guessed: A new instance built from the map.
    """
    primary = article_from_map(guessed_map)
    return Guessed(primary, guessed_map["guessed_name"], guessed_map["match_article_id"])

# Build two made-up articles and check that a Guessed record survives a
# to_map() / guessed_from_map() round trip.
test_primary_article = Article(
    "cat",
    ["A. Cat", "Dog", "Another Cat", "More Cats"],
    "2019",
    "02",
    "a story of cool/cats",
    "GeoCatography",
)
test_match_article = Article(
    "cat",
    ["Arthur Cat", "Dog", "Another Cat", "More Cats"],
    "2018",
    "05",
    "existence of cool/cats",
    "GeoCatography",
)
test_guessed = Guessed(
    primary_article=test_primary_article,
    guessed_name="Arthur Cat",
    match_article_id=test_match_article.id,
)

print(test_guessed.to_map())
# Should print True when the round trip preserves every field.
print(guessed_from_map(test_guessed.to_map()).to_map() == test_guessed.to_map())

{'first_author': 'cat', 'all_names': ['A. Cat', 'Dog', 'Another Cat', 'More Cats'], 'year': '2019', 'month': '02', 'title': 'a story of cool/cats', 'journal': 'GeoCatography', 'id': 'GeoCatography_2019_02_a_story_of_coolcats', 'has_initials': False, 'guessed_name': 'Arthur Cat', 'match_article_id': 'GeoCatography_2018_05_existence_of_coolcats'}
True


In [52]:
# How many documents have initialed first authors
#
# Re-reads every parsed article file and tallies those flagged has_initials.
count = 0
for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.load(infile))
        if art.has_initials:
            count += 1

count

45

In [53]:
# Create guesses for first author names and save to files

def extract_name_guess(initial_name, possible_names):
    """Find a full name in ``possible_names`` that could expand ``initial_name``.

    A candidate matches when its last term equals the last term of
    ``initial_name``, its first character equals the first character of
    ``initial_name``, and it is not itself an initialed name.

    Args:
        initial_name: Initialed name, e.g. "J. Smith".
        possible_names: Iterable of candidate author names.

    Returns:
        tuple: ``(name, True)`` for the first matching candidate, or
        ``("", False)`` when none matches.
    """
    wanted_last_name = initial_name.split(" ")[-1]
    for candidate in possible_names:
        if (
            candidate.split(" ")[-1] == wanted_last_name
            and initial_name[0] == candidate[0]
            and not is_initialed_name(candidate)
        ):
            return candidate, True
    return "", False


# unsupported edge cases
#
# Check for hyphenated name
# X.‐J. Zhang => Xuanze Zhang
# X.‐J. Zhang => Xiao‐Jia Zhang
# X.‐J. Zhang => Xu Zhang
# X.‐J. Zhang => Xiaojia Zhang
# X.‐J. Zhang => Xiaohe Zhang
#
# Check that the same initialed name
# is not mapped to multiple different
# complete names
# J. M. Li => Jia Li
# J. M. Li => Jiaxun Li
# J. M. Li => Jingyuan Li


# Initialed first-author names whose last name has no entry in
# last_names_to_articles, so no guess can be attempted for them.
unmapped_names = set()

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if not art.has_initials:
            continue
        last_name = art.first_author.split(" ")[-1]
        if last_name not in last_names_to_articles:
            unmapped_names.add(art.first_author)
            continue
        articles = last_names_to_articles[last_name]

        # The initialed article's last names do not change per candidate,
        # so compute them once.
        art_last_names = art.last_name_set()

        # gather all the guesses, paired with the size of overlap.
        # the guess with the largest overlap will be written to file.
        guesses = []
        for article in articles:
            overlap = article.last_name_set() & art_last_names
            if len(overlap) < 2:
                continue  # skip articles without enough overlap
            name, ok = extract_name_guess(art.first_author, article.names)
            if not ok:
                continue

            guessed = Guessed(art, name, article.id)
            guesses.append((len(overlap), guessed))

        if len(guesses) == 0:
            continue

        # The previous `sorted(guesses, ...)` call discarded its result, so
        # guesses[0] was simply the first guess found. max() selects the
        # largest overlap; ties go to the earliest guess.
        the_guess = max(guesses, key=lambda x: x[0])[1]

        outfile_name = the_guess.id[:80]+".json"
        with codecs.open(GUESSED_NAMES_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(the_guess.to_map()))

In [54]:
# Number of distinct initialed first-author names whose last name had no
# entry in last_names_to_articles.
len(unmapped_names)

42

In [55]:
# Collect list of all names to be gendered
#
# Reads every saved guess and keeps the first term of each guessed name.
guessed_first_names = set()

for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.load(infile))
        first_name = guess.guessed_name.split(" ")[0]
        if first_name:
            guessed_first_names.add(first_name)
        else:
            print("empty first name?:", guess.guessed_name)

print("number of unique first names to guess:", len(guessed_first_names))
guessed_first_names
        

number of unique first names to guess: 1


{'Janet'}

In [56]:
# Test the gender api

class GenderResult(object):
    """One gender lookup for a first name.

    ``result`` is indexed positionally: element 0 is stored as ``binary``,
    element 1 as ``percent`` and element 2 as ``count``.
    """

    def __init__(self, name, result):
        """Store the name and the first three elements of ``result``."""
        self.name = name
        self.binary, self.percent, self.count = result[0], result[1], result[2]

    def to_map(self):
        """Return the result as a plain dict suitable for ``json.dumps``."""
        return {
            "name": self.name,
            "binary": self.binary,
            "percent": self.percent,
            "count": self.count,
        }
    

def gender_result_from_map(m):
    """Rebuild a ``GenderResult`` from a dict shaped like its ``to_map()`` output.

    Args:
        m: Dict with keys "name", "binary", "percent" and "count".

    Returns:
        GenderResult: A new instance built from those four values.
    """
    result_triple = (m["binary"], m["percent"], m["count"])
    return GenderResult(m["name"], result_triple)

# Look up one name through getGenders and round-trip the result through
# JSON to check that GenderResult serialises and deserialises cleanly.
name = "Taylor"
g = getGenders(name)[0]
gr = GenderResult(name, g)
round_tripped = gender_result_from_map(json.loads(json.dumps(gr.to_map())))
round_tripped.to_map()

{'name': 'Taylor', 'binary': 'male', 'percent': 0.72, 'count': 3370}

In [57]:
# Call the Gender API and save output for all the names
# WARN: Makes API calls
#
# One JSON file is written per guessed first name; the file name is the
# name with ASCII punctuation removed and spaces turned into underscores.

for name in guessed_first_names:
    result = getGenders(name)
    if len(result) > 1:
        print("long result:", result)
    r = result[0]
    gr = GenderResult(name, r)

    name_without_punctuation = name.translate(str.maketrans('', '', string.punctuation))
    file_name = "_".join(name_without_punctuation.split(" ")) + ".json"
    serialized = json.dumps(gr.to_map())
    with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
        outfile.write(serialized)


In [58]:
# Read the Gender API gender guesses into memory
#
# Loads every saved file under NAME_GENDER_DIR back into a GenderResult.
gender_results = []

for _, _, files in os.walk(NAME_GENDER_DIR):
    for file in files:
        with codecs.open(NAME_GENDER_DIR+file, "r", "utf8") as infile:
            gr = gender_result_from_map(json.load(infile))
            gender_results += [gr]

len(gender_results)
        

1

In [59]:
# Create a map of first name to gender result
#
# Later entries win if the same name appears more than once.
name_to_gr = dict((gr.name, gr) for gr in gender_results)

len(name_to_gr)


1

In [60]:
# Ad-hoc membership check for a first name that still carries a trailing
# period (presumably verifying that such names were cleaned before lookup —
# TODO confirm).
"Colin." in name_to_gr

False

In [61]:
# Determine how many male/female/unknown names exist in the guessed names
# NOTE: This ratio should be for all guesses, not the reduced set() of names
#
# Every saved guess is counted once, so a first name shared by several
# guesses contributes several times.

binary_ratio_first_names = dict.fromkeys(('male', 'female', 'None'), 0)

for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
        # To restrict to one journal, filter on the file name prefix here,
        # e.g. skip files that do not start with "JGROceans".
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.load(infile))
        first_name = guess.guessed_name.split(" ")[0]
        if not first_name:
            print("empty first name?:", guess.guessed_name)
            continue
        try:
            gr = name_to_gr[first_name]
            binary_ratio_first_names[gr.binary] = binary_ratio_first_names[gr.binary] + 1
        except KeyError:
            # Raised for an unknown first name or an unexpected binary label.
            print("not found in map:", first_name)


binary_ratio_first_names

{'male': 0, 'female': 1, 'None': 0}

In [62]:
# Create a set of first names from all scraped journals
#
# Only articles whose first author is not initialed contribute a name.

all_first_names = set()

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.load(infile))
        if not art.has_initials:
            first_name = art.first_author.split(" ")[0]
            all_first_names.add(first_name)


print("first name set size", len(all_first_names))

first name set size 88


In [63]:
# Call the gender API for all the first names. Names already present in
# name_to_gr are skipped, so run the cell that loads that map first. Once
# it is loaded, this cell can be re-run until every name is handled.
#
# Each new result is written to NAME_GENDER_DIR and added to name_to_gr.
# The loop stops after 971 lookups in a single run.

print("starting size:", len(name_to_gr))

handled = 0

for name in all_first_names:
    if name not in name_to_gr:
        handled += 1

        result = getGenders(name)
        if len(result) > 1:
            print("long result:", result)
        r = result[0]
        gr = GenderResult(name, r)

        name_without_punctuation = name.translate(str.maketrans('', '', string.punctuation))
        file_name = "_".join(name_without_punctuation.split(" ")) + ".json"
        with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(gr.to_map()))

        name_to_gr[name] = gr
        if handled > 970:
            print("hit request limit")
            break


print("finish size:", len(name_to_gr))


starting size: 1
finish size: 88


In [64]:
# Number of first names currently held in name_to_gr.
len(name_to_gr)

88

In [68]:
# Determine how many male/female/unknown names exist in the guessed names
# printed out for each journal.


journal_names = [
    "Tectonophysics",
    "Physics of the Earth and Planetary Interiors",
    "Earth and Planetary Science Letters",
    "SolidEarth",
    # The comma after "GEOPHYSICS" was missing, which silently fused it with
    # the next entry into "GEOPHYSICSNatureGeoscience".
    "GEOPHYSICS",
    "NatureGeoscience",
    "Geology",
    "GSA+Bulletin",
    "JGRAtmosphere",
    "JGREarthSurface",
    "GRL",
    "JGROceans",
    "JGRSolidEarth",
    "JGRSpacePhysics",
    "JGRBioGeoSciences",
    "JGRPlanets",
    "GJI",
    "Nature",
    "Bulletin+of+the+Seismological+Society+of+America",
    "Seismological+Research+Letters",
]


for journal in journal_names:
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

    # Guess files are named from ids of the form "<journal>_<year>_...".
    # Matching on the journal name plus its "_" separator keeps a short name
    # such as "Nature" from also counting "NatureGeoscience" files.
    file_prefix = journal + "_"

    for _, _, files in os.walk(GUESSED_NAMES_DIR):
        for file in files:

            if not file.startswith(file_prefix):
                continue

            with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
                guess = guessed_from_map(json.loads(infile.read()))
            first_name = guess.guessed_name.split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", guess.guessed_name)
                continue
            try:
                gr = name_to_gr[first_name]
                binary_ratio_first_names[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


    print(journal, binary_ratio_first_names)

Quaternary {'male': 0, 'female': 0, 'None': 0}
Geochimica {'male': 0, 'female': 0, 'None': 0}
NatureGeoscience {'male': 0, 'female': 0, 'None': 0}
Geology {'male': 0, 'female': 0, 'None': 0}
GSA+Bulletin {'male': 0, 'female': 0, 'None': 0}
JGRAtmosphere {'male': 0, 'female': 0, 'None': 0}
JGREarthSurface {'male': 0, 'female': 0, 'None': 0}
GRL {'male': 0, 'female': 1, 'None': 0}
JGROceans {'male': 0, 'female': 0, 'None': 0}
JGRSolidEarth {'male': 0, 'female': 0, 'None': 0}
JGRSpacePhysics {'male': 0, 'female': 0, 'None': 0}
JGRBioGeoSciences {'male': 0, 'female': 0, 'None': 0}
JGRPlanets {'male': 0, 'female': 0, 'None': 0}
GJI {'male': 0, 'female': 0, 'None': 0}


In [69]:
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal. Initialed first authors are counted separately.


for journal in journal_names:
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

    initialed_name_count = 0

    # Parsed files are named from ids of the form "<journal>_<year>_...".
    # Matching on the journal name plus its "_" separator keeps a short name
    # such as "Nature" from also counting "NatureGeoscience" files.
    file_prefix = journal + "_"

    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:

            if not file.startswith(file_prefix):
                continue

            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))
            first_name = article.first_author.split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                initialed_name_count += 1
                continue
            try:
                gr = name_to_gr[first_name]
                binary_ratio_first_names[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


    print(journal, binary_ratio_first_names, "initialed name count:", initialed_name_count)

Quaternary {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
Geochimica {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
NatureGeoscience {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
Geology {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
GSA+Bulletin {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
JGRAtmosphere {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
JGREarthSurface {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
GRL {'male': 64, 'female': 24, 'None': 8} initialed name count: 45
JGROceans {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
JGRSolidEarth {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
JGRSpacePhysics {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
JGRBioGeoSciences {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
JGRPlanets {'male': 0, 'female': 0, 'None': 0} initialed name count: 0
GJI {'male': 0, 'female': 0, 'None': 0} initialed name

In [70]:
# Same as above but split into year/month bins
#
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal.
#
# journal_tallys maps journal -> year -> month -> counts, where counts has the
# keys 'male', 'female', 'None' and 'initialed'.

journal_tallys = {}

for journal in journal_names:
    # Parsed files are named from ids of the form "<journal>_<year>_...".
    # Matching on the journal name plus its "_" separator keeps a short name
    # such as "Nature" from also counting "NatureGeoscience" files.
    file_prefix = journal + "_"

    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:

            if not file.startswith(file_prefix):
                continue

            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))

            # Fetch, creating on first use, the bin for this article's
            # journal / year / month.
            tally = journal_tallys.setdefault(journal, {}).setdefault(article.year, {}).setdefault(article.month, {'male':0, 'female':0, 'None':0, 'initialed':0})

            first_name = article.first_author.strip().split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                tally['initialed'] += 1
                continue
            try:
                gr = name_to_gr[first_name]
                tally[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


for journal, values in journal_tallys.items():
    print(journal, values)

GRL {'2010': {'05': {'male': 3, 'female': 0, 'None': 0, 'initialed': 2}, '10': {'male': 1, 'female': 2, 'None': 0, 'initialed': 2}, '11': {'male': 2, 'female': 2, 'None': 0, 'initialed': 2}, '09': {'male': 3, 'female': 1, 'None': 0, 'initialed': 2}, '08': {'male': 2, 'female': 0, 'None': 1, 'initialed': 4}, '04': {'male': 2, 'female': 2, 'None': 0, 'initialed': 2}, '03': {'male': 1, 'female': 1, 'None': 1, 'initialed': 2}, '12': {'male': 2, 'female': 5, 'None': 1, 'initialed': 2}, '01': {'male': 2, 'female': 0, 'None': 0, 'initialed': 2}, '06': {'male': 2, 'female': 2, 'None': 0, 'initialed': 4}, '07': {'male': 5, 'female': 0, 'None': 0, 'initialed': 0}, '02': {'male': 1, 'female': 0, 'None': 0, 'initialed': 2}}, '2011': {'09': {'male': 5, 'female': 0, 'None': 0, 'initialed': 2}, '07': {'male': 1, 'female': 4, 'None': 0, 'initialed': 3}, '05': {'male': 8, 'female': 2, 'None': 0, 'initialed': 0}, '08': {'male': 6, 'female': 1, 'None': 1, 'initialed': 1}, '06': {'male': 3, 'female': 0, '