In [None]:
import os
import codecs
import json
import string
import re

from bs4 import BeautifulSoup, NavigableString, Tag

import gender
from gender import getGenders

In [None]:
! pwd

In [None]:
# Configure local paths
#
# All data directories live below the current working directory. The
# directory is read with os.getcwd() rather than a `! pwd` shell call so the
# cell also runs where no `pwd` command exists and stays plain Python.

root = os.getcwd()

print("using root directory:", root)

# NOTE: the trailing "/" matters; other cells build paths as DIR + file name.
RAW_PAGES_DIR = root + "/pages/"
PARSED_PAGES_DIR = root + "/parsed/"
GUESSED_NAMES_DIR = root + "/guessed/"
NAME_GENDER_DIR = root + "/name_genders/"

# create directories if they do not exist

for d in [RAW_PAGES_DIR,
          PARSED_PAGES_DIR,
          GUESSED_NAMES_DIR,
          NAME_GENDER_DIR]:
    os.makedirs(d, exist_ok=True)

### Article class, Guessed class, and function definitions

In [None]:
# Initialize and define useful functions and
# data structures.

def is_initialed_name(name):
    """Return True when the first term of ``name`` is an initial such as "J.".

    A term counts as an initial when it ends with a period and the part
    before that period is upper case ("J.", "J.-P.").

    The name is split on any run of whitespace, so leading blanks and
    non-breaking spaces (common in scraped HTML) no longer hide an initial.
    An empty or whitespace-only name is never initialed.
    """
    terms = name.split()
    if not terms:
        # print("first term len zero. name:", name)
        return False
    first_term = terms[0]
    return first_term.endswith(".") and first_term[:-1].isupper()

# Smoke test: "J. Smith" begins with an initial, "Joe Smith" does not.
print("test is_initialed_name- True:", is_initialed_name("J. Smith"), ", False:", is_initialed_name("Joe Smith"))

def contains_initialed_name(names):
    """Return True if at least one entry of ``names`` is an initialed name."""
    return any(is_initialed_name(candidate) for candidate in names)

# Smoke test: only the first list contains an initialed name ("J. Smith").
print("test contains_initialed_name- True", contains_initialed_name(["J. Smith", "Cat Meowins"]), ", False:", contains_initialed_name(["Joe Smith", "Cat Meowins"]))

def clean_name(name):
    """Normalise an author name scraped from a web page.

    - surrounding whitespace is stripped and every remaining whitespace
      character is replaced by a plain space
    - unless the name starts with an initial, periods around the first term
      are removed ("Colin. J. Cats" -> "Colin J. Cats")
    - periods at either end of the whole name are removed
    """
    normalized = re.sub(r'\s', ' ', name.strip())
    if is_initialed_name(normalized):
        return normalized.strip('.')
    # Only the first term loses its periods; the rest of the name is kept as is.
    first_term, separator, remainder = normalized.partition(" ")
    return (first_term.strip(".") + separator + remainder).strip('.')

# Smoke test: the stray period after "Colin" is removed; the initialed name "W. B. Easy" is left unchanged.
print("test clean name- Colin. J. Cats:", clean_name("Colin. J. Cats"), "W. B. Easy:", clean_name("W. B. Easy"))

class Article:
    """One scraped journal article: authors, publication date, title, journal.

    Attributes:
        first_author: cleaned name of the first author.
        names: cleaned names of all authors.
        year, month, title, journal: stored as given (strings; they are
            joined into ``id``).
        id: identifier built from journal, year, month and the title with
            punctuation removed and spaces replaced by underscores.
        has_initials: True when the cleaned first author starts with an initial.
    """

    def __init__(self, first_author, all_names, year, month, title, journal):
        # clean the author names
        # - remove non-ascii whitespace
        # - strip bookend whitespace
        # - strip periods from first names if not an initialed name
        self.first_author = clean_name(first_author)
        self.names = [clean_name(name) for name in all_names]
        self.year = year
        self.month = month
        self.title = title
        self.journal = journal

        # create a unique identifier for this article
        self.id = "_".join([journal, year, month, "_".join(title.translate(str.maketrans('', '', string.punctuation)).split(" "))])

        # Determine if article has initialed names. The *cleaned* name is
        # tested so that the flag agrees with Guessed and does not change when
        # the article is written with to_map() and rebuilt from that map
        # (which passes the already cleaned name back in).
        self.has_initials = is_initialed_name(self.first_author)

    def last_name_set(self):
        """Return a set() of all the last names (last space-separated term)."""
        name_set = set()
        for name in self.names:
            name_set.add(name.split(" ")[-1])
        return name_set

    def to_map(self):
        """Return the article as a JSON-serialisable dict."""
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["has_initials"] = self.has_initials
        return m
        
        
def article_from_map(article_map):
    """Rebuild an Article from a dict shaped like the output of Article.to_map().

    Only the constructor fields are read; "id" and "has_initials" are
    recomputed by Article itself.
    """
    constructor_fields = ("first_author", "all_names", "year", "month", "title", "journal")
    return Article(**{field: article_map[field] for field in constructor_fields})

# Smoke test for Article. The "/" in the title is punctuation, so it is
# dropped from the id ("GeoCatography_2019_02_a_story_of_coolcats").
test_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)

# Print the derived values for a visual check.
print("last name set:", test_article.last_name_set())
print("article id:", test_article.id)
print("article has initial:", test_article.has_initials)
print("article map:", test_article.to_map())


class Guessed:
    """An article with an initialed first author plus a guess at the full name.

    The article fields are copied from ``primary_article``; ``guessed_name``
    is the proposed full name and ``match_article_id`` the id of the article
    the guess was taken from.
    """

    def __init__(self, primary_article, guessed_name, match_article_id):
        source = primary_article
        self.first_author, self.names = source.first_author, source.names
        self.year, self.month = source.year, source.month
        self.title, self.journal = source.title, source.journal

        # unique identifier: journal, year, month and the punctuation-free
        # title with spaces turned into underscores
        title_slug = "_".join(self.title.translate(str.maketrans('', '', string.punctuation)).split(" "))
        self.id = "_".join([self.journal, self.year, self.month, title_slug])

        # does the first author start with an initial?
        self.has_initials = is_initialed_name(self.first_author)

        self.guessed_name = guessed_name
        self.match_article_id = match_article_id

    def last_name_set(self):
        """Return the set of last names (last space-separated term of each name)."""
        return {full_name.split(" ")[-1] for full_name in self.names}

    def to_map(self):
        """Return the guess as a JSON-serialisable dict."""
        return {
            "first_author": self.first_author,
            "all_names": self.names,
            "year": self.year,
            "month": self.month,
            "title": self.title,
            "journal": self.journal,
            "id": self.id,
            "has_initials": self.has_initials,
            "guessed_name": self.guessed_name,
            "match_article_id": self.match_article_id,
        }
        
        
def guessed_from_map(guessed_map):
    """Rebuild a Guessed from a dict shaped like the output of Guessed.to_map()."""
    primary = article_from_map(guessed_map)
    return Guessed(primary, guessed_map["guessed_name"], guessed_map["match_article_id"])

# Smoke test for Guessed: an article with an initialed author ("A. Cat") ...
test_primary_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)
# ... and a second article that spells the name out ("Arthur Cat").
test_match_article = Article(
    first_author="cat",
    all_names=["Arthur Cat", "Dog", "Another Cat", "More Cats"],
    year="2018",
    month="05",
    title="existence of cool/cats",
    journal="GeoCatography"
)
test_guessed = Guessed(test_primary_article, "Arthur Cat", test_match_article.id)

print(test_guessed.to_map())
# Round trip: rebuilding from the map must give the same map again.
print(guessed_from_map(test_guessed.to_map()).to_map() == test_guessed.to_map())

In [None]:
# Page parse functions

def get_parse_function(filename):
    """Pick the page parser for a raw page file from its file-name prefix.

    Prefix groups are tested in a fixed order and the first hit wins.
    Returns None when no prefix matches.
    """
    if filename.startswith(("JGR", "GRL", "G3")):
        return parse_agu_page
    if filename.startswith(("Seismological+Research+Letters",
                            "Bulletin+of+the+Seismological+Society+of+America")):
        return parse_gsw_page
    if filename.startswith(("NatureGeoscience", "Nature")):
        return parse_ng_page
    if filename.startswith(("Earth+and+Planetary+Science+Letters",
                            "Physics+of+the+Earth+and+Planetary+Interiors",
                            "Tectonophysics")):
        return parse_sd_page
    if filename.startswith("GJI"):
        return parse_gji_page
    if filename.startswith("SolidEarth"):
        return parse_solidearth_page
    if filename.startswith("GEOPHYSICS"):
        return parse_geophysics_page
    if filename.startswith("Science"):
        return parse_science_page
    return None




def parse_agu_page(soup, year, month, journal):
    """Extract the articles listed on an AGU table-of-contents page.

    Every ``div.item__body`` is one entry. The title is the first
    ``a.publication_title`` inside the first ``.meta__title`` element; the
    authors are the ``span`` of each ``a.publication_contrib_author`` (an
    ``<i>`` child of that span is removed first). Entries titled
    "Issue Information", entries with an empty or "None" title and entries
    without authors are reported with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    found = []

    for item in soup.find_all("div", class_="item__body"):
        heading = item.find_all(class_="meta__title")[0]
        title = heading.find_all("a", class_="publication_title")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        elif isinstance(title, Tag):
            title = str(title.get_text())

        if title == "Issue Information":
            print('hit "Issue Information"')
            continue
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", item)
            continue

        authors = []
        for link in item.find_all("a", class_="publication_contrib_author"):
            name_span = link.span
            if name_span.i is not None:
                name_span.i.decompose()
            authors.append(str(name_span.string))

        if not authors:
            print("hit empty authors. title:", title)
            continue

        found.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))
    return found



def parse_gsw_page(soup, year, month, journal):
    """Extract the articles from a page handled by the "gsw" parser.

    Every ``div.al-article-box`` is one entry. The title is the first link
    inside the first ``.al-title`` element; each author is the text of the
    first link inside a ``span.wi-fullname``. Entries with an empty or
    "None" title, or without authors, are reported with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    found = []

    for box in soup.find_all("div", class_="al-article-box"):
        title = box.find_all(class_="al-title")[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        elif isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", box)
            continue

        authors = []
        for name_span in box.find_all("span", class_="wi-fullname"):
            authors.append(str(name_span.find_all("a")[0].get_text()))

        if not authors:
            print("hit empty authors. title:", title)
            continue

        found.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))

    return found

def _ng_node_text(node):
    """Return the text of a parse-tree node as a plain str."""
    return node.get_text() if isinstance(node, Tag) else str(node)


def parse_ng_page(soup, year, month, journal):
    """Extract the articles from a Nature / Nature Geoscience listing page.

    Every ``li.pb20`` is one entry. The title is the first child of the link
    inside the first ``h2.h3``; the authors are the first child of each
    ``a.js-no-scroll`` inside a ``ul.js-list-authors-3``. The "[…]" and
    "Show fewer authors" toggles are not authors and are ignored.

    Entries without a title heading/link, with an empty title or without
    authors are reported with print() and skipped instead of raising.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    parsed_articles = []

    articles = soup.find_all("li", class_="pb20")

    for a in articles:
        title_section = a.find_all("h2", class_="h3")
        # find_all returns a (possibly empty) list, so test for emptiness;
        # indexing an empty result would raise IndexError.
        if not title_section or title_section[0].a is None:
            print("title section is none:", a)
            continue

        title_contents = title_section[0].a.contents
        if not title_contents:
            print("hit NoneType title")
            continue
        title = _ng_node_text(title_contents[0]).strip()
        if len(title) == 0:
            print("hit empty title: ", a)
            continue

        authors = []
        author_span = a.find_all("ul", class_="js-list-authors-3")
        for auths in author_span:
            for author in auths.find_all("a", class_="js-no-scroll"):
                if not author.contents:
                    continue
                # plain str, so the Article does not keep the parse tree alive
                author = _ng_node_text(author.contents[0])

                if author == "[…]" or "Show fewer authors" in author:
                    continue

                authors.append(author)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

        
def parse_sd_page(soup, year, month, journal):
    """Extract the articles from a page handled by the "sd" parser.

    Every ``div.result-item-container`` is one entry. The title is the text
    of the first ``a.result-list-title-link``; each ``span.author`` is one
    author. Entries without a title link, with an empty or "None" title, or
    without authors are reported with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    parsed_articles = []

    articles = soup.find_all("div", class_="result-item-container")
    for a in articles:
        title_links = a.find_all("a", class_="result-list-title-link")
        if not title_links:
            # no title link at all: skip instead of raising IndexError
            print("hit NoneType title")
            print("title:", title_links)
            continue
        title = str(title_links[0].get_text())

        if title == "None":
            print("hit NoneType title")
            print("title:", title_links)
            continue
        if len(title) == 0:
            print("hit empty title")
            continue

        authors = []
        author_spans = a.find_all("span", class_="author")
        for author in author_spans:
            # .string is None when the span wraps nested markup; fall back to
            # the combined text so a None never reaches Article/clean_name.
            name = author.string
            if name is None:
                name = author.get_text()
            name = str(name)
            if name.strip():
                authors.append(name)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_solidearth_page(soup, year, month, journal):
    """Extract the articles from a Solid Earth paper list page.

    Every ``div.paperlist-object`` is one entry. The title is the text of the
    first ``a.article-title``; the authors come from the text of the first
    ``div.authors``, which is one string such as "A. One, B. Two, and C. Three".
    That string is split on commas and on the standalone word "and", so
    "A. One and B. Two" yields two authors and a name that merely starts
    with the letters "and" is left intact.

    Entries with an empty or "None" title, or without authors, are reported
    with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    parsed_articles = []

    articles = soup.find_all("div", class_="paperlist-object")
    for a in articles:
        title = a.find_all("a", class_="article-title")
        title = title[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", a)
            continue

        try:
            author_spans = a.find_all("div", class_="authors")[0]
        except IndexError:
            print("hit empty authors.")
            continue

        # get_text() instead of .string: .string is None as soon as the div
        # contains nested markup, which would make .split() fail.
        author_text = author_spans.get_text()
        authors = []
        for author in re.split(r",|\band\b", author_text):
            author = author.strip()
            if author:
                authors.append(author)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_geophysics_page(soup, year, month, journal):
    """Extract the articles from a GEOPHYSICS issue page.

    Every ``div.issue-item__body`` is one entry. The title is the first link
    inside the first ``.issue-item__title`` element; the authors are the
    links inside ``div.issue-item__authors``. Entries with an empty or "None"
    title, or without authors (including a missing author container), are
    reported with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    parsed_articles = []

    articles = soup.find_all("div", class_="issue-item__body")
    for a in articles:
        title = a.find_all(class_="issue-item__title")
        title = title[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", a)
            continue

        authors = []
        author_block = a.find("div", class_="issue-item__authors")
        # find() returns None when the entry has no author container
        author_links = author_block.find_all("a") if author_block is not None else []
        for auth in author_links:
            # .string is None for links with nested markup; str(None) would
            # otherwise add an author literally called "None"
            name = auth.string
            if name is None:
                name = auth.get_text()
            authors.append(str(name))

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_gji_page(soup, year, month, journal):
    """Extract the articles from a GJI issue page.

    Every ``div.al-article-box`` is one entry. The title is the first link
    inside the first ``.al-title`` element; the authors are the texts of all
    links inside each ``div`` with class "sri-authors al-authors-list".
    Entries with an empty or "None" title, or without authors, are reported
    with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    found = []

    for box in soup.find_all("div", class_="al-article-box"):
        title = box.find_all(class_="al-title")[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        elif isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", box)
            continue

        authors = [
            str(link.get_text())
            for author_list in box.find_all("div", class_="sri-authors al-authors-list")
            for link in author_list.find_all("a")
        ]

        if not authors:
            print("hit empty authors. title:", title)
            continue

        found.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))

    return found

def parse_science_page(soup, year, month, journal):
    """Extract the articles from a Science search-result page.

    Every ``div.results-cit`` is one entry. The title is the text of the
    first ``span.cit-first-element``; the authors are the stripped texts of
    the ``span.cit-auth`` elements, leaving out the separator "and".
    Entries with an empty or "None" title, or without authors, are reported
    with print() and skipped.

    Returns a list of Article objects carrying the given year, month, journal.
    """
    found = []

    for entry in soup.find_all("div", class_="results-cit"):
        title = str(entry.find("span", class_="cit-first-element").text)

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", entry)
            continue

        stripped = (span.text.strip() for span in entry.find_all("span", class_="cit-auth"))
        authors = [text for text in stripped if text != "and"]

        if not authors:
            print("hit empty authors. title:", title)
            continue

        found.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))

    return found

#### Create one json per article with article info, stored in parsed/ 

In [None]:
# Walk directory, parse, and save parsed articles
#
# Raw page files are expected to be named <journal>_<year>_<month>_<rest>.
# One JSON file per article is written to PARSED_PAGES_DIR.

for dirpath, _, files in os.walk(RAW_PAGES_DIR):
    for file in files:

        # CHANGE THIS TO FILTER FOR SPECIFIC JOURNALS
        #if (file.startswith("JGR") or file.startswith("GRL") or file.startswith("G3")):
        #    continue

        parser = get_parse_function(file)
        if parser is None:
            print("got None parse function, skipping file:", file)
            continue
        #print("processing file:", file)
        print(file[0:3], end=",")

        # A stray file with a different number of "_" must not abort the run.
        try:
            journal, year, month, _ = file.split("_")
        except ValueError:
            print("unexpected file name, skipping file:", file)
            continue

        # os.walk also descends into sub-directories, so the path has to be
        # built from the directory currently being walked.
        with codecs.open(os.path.join(dirpath, file), "r", "utf8") as infile:
            html = infile.read()

        soup = BeautifulSoup(html, "html.parser")

        for article in parser(soup, year, month, journal):
            # NOTE: ids are cut to 80 characters; articles sharing the first
            # 80 characters of their id end up in the same file.
            outfile_name = article.id[:80]+".json"
            with codecs.open(PARSED_PAGES_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(article.to_map()))

print("Done!")

#### Start with the name --> gender: test genderize.io api

In [None]:
# class for gender result and test
class GenderResult(object):
    """Gender lookup result for one first name.

    ``result`` is an indexable whose first three items are stored as
    ``binary``, ``percent`` and ``count``.
    """

    def __init__(self, name, result):
        self.name = name
        self.binary, self.percent, self.count = result[0], result[1], result[2]

    def to_map(self):
        """Return the result as a JSON-serialisable dict."""
        return {
            "name": self.name,
            "binary": self.binary,
            "percent": self.percent,
            "count": self.count,
        }
    

def gender_result_from_map(m):
    """Rebuild a GenderResult from a dict shaped like the output of GenderResult.to_map()."""
    name = m["name"]
    result = (m["binary"], m["percent"], m["count"])
    return GenderResult(name, result)

# Round-trip check: look up one name, wrap the first result, and push it
# through to_map() -> JSON -> gender_result_from_map(); the final map is the
# cell output.
# NOTE(review): getGenders comes from the local `gender` module and presumably
# performs a network request to the API - confirm before re-running often.
name = "Taylor"
g = getGenders(name)[0]
gr = GenderResult(name, g)
gender_result_from_map(json.loads(json.dumps(gr.to_map()))).to_map()

#### read all the previously determined and stored name --> gender pairs

In [None]:
# read the Gender API gender guesses into memory

gender_results = []

for _, _, files in os.walk(NAME_GENDER_DIR):
    for file in files:
        with codecs.open(NAME_GENDER_DIR+file, "r", "utf8") as infile:
            gr = gender_result_from_map(json.loads(infile.read()))
            gender_results.append(gr)

print("We have *{}* stored name informations in\n{}\nfrom previous runs.".format(len(gender_results), NAME_GENDER_DIR))


# Create a map of first name to gender result
name_to_gr = {}
for gr in gender_results:
    name_to_gr[gr.name] = gr

print("Length of current name-gender map", len(name_to_gr))

# test
print("Colin." in name_to_gr)
# On a first run nothing has been stored yet; indexing the empty key list
# would raise IndexError, so the samples are only shown for a non-empty map.
if name_to_gr:
    print("First name in current gender map, gender: ",
          list(name_to_gr)[0], name_to_gr[list(name_to_gr)[0]].binary)
    print("Last name in current gender map, gender: ",
          list(name_to_gr)[-1], name_to_gr[list(name_to_gr)[-1]].binary)

#### Determine how many papers have initialed authors, and try to guess their names by comparing to non-initialed authors

In [None]:
# Count the documents with initialed first authors and, in the same pass
# over the parsed files, build the in-memory map used for name guessing.
#
# The map is generated from the articles whose first author is not
# initialed. Its keys are last names (first author and coauthors alike); the
# values are lists of the articles on which at least one author has the
# keyed last name. Each article is listed once per last name, even when
# several of its authors share that name.

count = 0
last_names_to_articles = {}

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            count += 1
            continue
        for last_name in art.last_name_set():
            last_names_to_articles.setdefault(last_name, []).append(art)

print("{} papers have initialed author names.".format(count))
print("All last names, map size", len(last_names_to_articles.keys()))

In [None]:
# Create guesses for first author names and save to files
def extract_name_guess(initial_name, possible_names):
    """Find the full name in ``possible_names`` that fits ``initial_name``.

    A candidate fits when
    - its last term equals the last term of ``initial_name``,
    - both names start with the same character,
    - if both names have more than two terms, their second terms start with
      the same character (so "J. M. Li" fits "Jia Ming Li" as well as
      "Jia M. Li", but not "Jia N. Li"),
    - if both names contain a hyphen, the parts following the first hyphen
      start with the same character, and
    - the candidate is not itself an initialed name.

    Returns ``(candidate, True)`` for the first fitting candidate, otherwise
    ``("", False)``. Empty names never match.
    """
    if not initial_name:
        return "", False

    last_name = initial_name.split(" ")[-1]
    initial_terms = initial_name.split()
    initial_parts = initial_name.split("-")

    for pn in possible_names:
        if not pn:
            continue
        if last_name != pn.split(" ")[-1]:
            continue  # not the name we're looking for
        if initial_name[0] != pn[0]:
            continue  # first letters do not match
        pn_terms = pn.split()
        if len(initial_terms) > 2 and len(pn_terms) > 2:
            # both names have a middle term: compare first letters only, the
            # candidate may spell the middle name out
            if initial_terms[1][:1] != pn_terms[1][:1]:
                continue
        pn_parts = pn.split("-")
        if len(initial_parts) > 1 and len(pn_parts) > 1:
            # both names have a hyphen; [:1] tolerates an empty part after it
            if initial_parts[1][:1] != pn_parts[1][:1]:
                # second part of hyphenated name does not fit
                continue
        if is_initialed_name(pn):
            continue
        return pn, True
    return "", False

# Every accepted guess is also written to a text file
# ("<guessed name>\t\t<initialed name>" per line) so the guesses can be
# checked by eye. The file is opened further down with an explicit encoding
# and a `with` block: names may contain non-ASCII characters, and the file
# must be closed even if the loop fails.

# create a dict that keeps track of guesses to flag ambiguity
all_guesses = {}

# unsupported edge cases
# Check that the same initialized name
# is not mapped to multiple different
# complete names
# J. M. Li => Jia Li
# J. M. Li => Jiaxun Li
# J. M. Li => Jingyuan Li


unmapped_names = set()

with open("output_checklist_guessednames.txt", "w", encoding="utf8") as check_file:
    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:
            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                art = article_from_map(json.loads(infile.read()))
            if not art.has_initials:
                continue
            last_name = art.first_author.split(" ")[-1]
            if last_name not in last_names_to_articles:
                unmapped_names.add(art.first_author)
                continue
            articles = last_names_to_articles[last_name]

            # gather all the guesses, paired with the size of overlap.
            # the guess with the largest overlap will be written to file.
            guesses = []
            for article in articles:
                overlap = article.last_name_set() & art.last_name_set()
                if len(overlap) < 2:
                    continue  # skip articles without enough overlap

                name, ok = extract_name_guess(art.first_author, article.names)
                if not ok:
                    continue

                previous_guess = all_guesses.get(art.first_author)
                if previous_guess is not None and previous_guess != name:
                    print("Detected ambiguous guess: {} - {} - {}".format(
                         art.first_author, previous_guess, name))
                    print(art.title)
                    # we could continue at this point, but we already have one of these names
                    # written in a json (previous_guess). Not sure what to do, let's discuss
                check_file.write("{}\t\t{}\n".format(name, art.first_author))
                guessed = Guessed(art, name, article.id)
                guesses.append((len(overlap), guessed))
                all_guesses[art.first_author] = name

            if len(guesses) == 0:
                continue

            # max() returns the first guess with the largest overlap, the same
            # element a stable descending sort would put first.
            the_guess = max(guesses, key=lambda x: x[0])[1]

            outfile_name = the_guess.id[:80]+".json"
            with codecs.open(GUESSED_NAMES_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(the_guess.to_map()))
len(unmapped_names)

#### Step 1: Genderize the names GUESSED from initials
#### - collect the list of first names
#### - check if it is in the list already, if not: call genderize io
#### - finally check how many in the guessed initialed names are male / female

In [None]:
# Collect list of all GUESSED names to be gendered

guessed_first_names = set()

for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.loads(infile.read()))
        first_name = guess.guessed_name.split(" ")[0]
        if len(first_name) == 0:
            print("empty first name?:", guess.guessed_name)
            continue
        guessed_first_names.add(first_name)

print("number of unique first names to guess:", len(guessed_first_names))
# Show two sample names; skipped when nothing was guessed, because indexing
# an empty list would raise IndexError. (A set has no meaningful order, so
# these are arbitrary samples.)
if guessed_first_names:
    print("From {} to {}.".format(list(guessed_first_names)[0],
                                  list(guessed_first_names)[-1]))

In [None]:
# Call the Gender API and save output for all the GUESSED names
# WARN: Makes API calls
# Does nothing if the guessed names were genderized previously
# (names already present in name_to_gr are skipped).
for i, name in enumerate(guessed_first_names):
    # print(i, end=",")
    if name in name_to_gr:
        continue
    else:
        result = getGenders(name)
        # Only the first entry of the result is used; longer results are reported.
        # NOTE(review): assumes getGenders never returns an empty list - confirm in gender.py
        if len(result) > 1:
            print("long result:", result)
        r = result[0]
        gr = GenderResult(name, r)
        
        # update gender name map in memory
        name_to_gr[gr.name] = gr
        
        # also save for later use
        # File name: the name without punctuation, spaces replaced by "_".
        file_name = "_".join(name.translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
        with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(gr.to_map()))



In [None]:
# Tally male / female / 'None' results over the guessed first-author names.
# NOTE: every guess file counts once, i.e. the ratio is over all guesses,
# not over the reduced set() of distinct names.

binary_ratio_first_names = dict.fromkeys(('male', 'female', 'None'), 0)

for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
        # optional journal filter:
        # if not file.startswith("JGROceans"):
        #     continue
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.load(infile))
        first_name = guess.guessed_name.partition(" ")[0]
        if not first_name:
            print("empty first name?:", guess.guessed_name)
            continue
        # A KeyError from either lookup (unknown name, or a gender label that
        # is not one of the three tally keys) is reported the same way.
        try:
            gr = name_to_gr[first_name]
            binary_ratio_first_names[gr.binary] += 1
        except KeyError:
            print("not found in map:", first_name)


binary_ratio_first_names

#### Now genderize the names that were given in full
#### - collect the list of first names
#### - check if name is in the list already, if not: call genderize io

In [None]:
# Create a set of first names from all scraped journals
# For Non-Initialed names
#
# all_first_names: first names of first authors
# all_names:       first names of every author

all_first_names = set()
all_names = set()


for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            continue
        first_name = art.first_author.split(" ")[0]
        if len(first_name) == 0:
            # an empty name must not be queued for a gender lookup
            print("empty first name?:", art.first_author)
        else:
            all_first_names.add(first_name)

        for allname in art.names:
            terms = allname.split()
            if terms:  # a blank author name has no first term to index
                all_names.add(terms[0])


print("first name set size", len(all_first_names))
print("all author name set size", len(all_names))


In [None]:
# Call the gender API for all the non-initialed first names. Deduped against
# existing files in the map name_to_gr. Be sure to run the cell
# that loads that map before running this function. When properly
# loaded, this cell can be re-run until all names are handled.

# TODO : Also run this on ALL author names!

print("starting size:", len(name_to_gr))

# number of names looked up during this run of the cell
handled = 0

for name in all_first_names:
    if name in name_to_gr:
        continue
        
    handled += 1
    
    result = getGenders(name)
    # Only the first entry of the result is used; longer results are reported.
    # NOTE(review): assumes getGenders never returns an empty list - confirm in gender.py
    if len(result) > 1:
        print("long result:", result)
    r = result[0]
    gr = GenderResult(name, r)
    
    # update map currently in memory
    name_to_gr[name] = gr
    
    # and save for later use
    # File name: the name without punctuation, spaces replaced by "_".
    file_name = "_".join(name.translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
    with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
        outfile.write(json.dumps(gr.to_map()))
        
    # Stop once more than 970 names were looked up in this run.
    # NOTE(review): 970 is presumably chosen to stay below the API's request quota - confirm
    if handled > 970:
        print("hit request limit")
        break
        
    
print("finish size:", len(name_to_gr))

#### Finally, evaluate for all journals / each journal / journal per year..
#### Here, let's create a json database that can be used in another notebook.

In [None]:
# Go both through the guessed names and the non-initialed names.
# Find genders on the base of the name_to_gr map and save the results
# as one JSON file per article in AUTHOR_GENDER_DIR.

# While we do not yet have a name database large enough to go through all the names,
# just go through a few for testing purposes:
nfile = 100

AUTHOR_GENDER_DIR = root + "/author_genders/"
os.makedirs(AUTHOR_GENDER_DIR, exist_ok=True)


def _lookup_genders(names):
    """Return two parallel lists (genders, percents), one entry per name.

    The first whitespace-separated term of each name is looked up in
    name_to_gr; names that are blank or not in the map are reported and
    yield None in both lists.
    """
    genders = []
    percents = []
    for full_name in names:
        terms = full_name.split()
        n = terms[0] if terms else ""
        try:
            result = name_to_gr[n]
        except KeyError:
            print("Name {} is not gendered yet".format(n))
            genders.append(None)
            percents.append(None)
        else:
            genders.append(result.binary)
            percents.append(result.percent)
    return genders, percents


def _save_author_genders(record, lookup_names, file):
    """Add the gender lists for lookup_names to record's map and write it out."""
    all_genders, all_percent = _lookup_genders(lookup_names)
    art_out = record.to_map()
    art_out["all_genders"] = all_genders
    art_out["all_percent"] = all_percent
    print(art_out["all_genders"])

    # save!
    outfile_name = os.path.basename(file)
    with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
        outfile.write(json.dumps(art_out))


# first the articles whose author names were given in full
for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files[0:nfile]:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if art.has_initials:
            continue
        _save_author_genders(art, art.names, file)

# now do the guessed names
# A guess file still stores the initialed first author, so it must not be
# filtered on has_initials (that would skip every guess file). The guessed
# full name is looked up in place of the first entry of the author list -
# the parsers always put the first author first. "all_names" in the output
# keeps the names as published.
for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files[0:nfile]:
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            guess = guessed_from_map(json.loads(infile.read()))
        _save_author_genders(guess, [guess.guessed_name] + guess.names[1:], file)

In [None]:
# the following part may be moved to a new notebook?

In [None]:
# Determine how many male/female/unknown names exist in the guessed names
# printed out for each journal.


journal_names = [
"Tectonophysics",
"Physics+of+the+Earth+and+Planetary+Interiors",
"Earth+and+Planetary+Science+Letters",
"SolidEarth",
"GEOPHYSICS",
"NatureGeoscience",
"GRL",
"JGRSolidEarth",
"G3",
"GJI",
"Nature",
"Bulletin+of+the+Seismological+Society+of+America",
"Seismological+Research+Letters"]


for journal in journal_names:
    # Per-journal tally of the gender label of each guessed first name.
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

    for dirpath, _, files in os.walk(GUESSED_NAMES_DIR):
        for file in files:

            # A plain startswith(journal) test lets "Nature" also claim every
            # "NatureGeoscience" file. Attribute each file to the longest
            # journal name that prefixes it, so it is counted exactly once.
            owner = max((j for j in journal_names if file.startswith(j)), key=len, default=None)
            if owner != journal:
                continue

            # Join with dirpath so files found in subdirectories resolve correctly.
            with codecs.open(os.path.join(dirpath, file), "r", "utf8") as infile:
                guess = guessed_from_map(json.loads(infile.read()))
            # Strip first so leading whitespace does not yield an empty first term.
            first_name = guess.guessed_name.strip().split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", guess.guessed_name)
                continue
            # Keep the try narrow: only a missing name should be reported
            # as "not found in map".
            try:
                gr = name_to_gr[first_name]
            except KeyError:
                print("not found in map:", first_name)
                continue
            if gr.binary not in binary_ratio_first_names:
                # A label outside male/female/None is a data problem, not a
                # missing name; report it as such instead of miscounting.
                print("unexpected gender label for", first_name, ":", gr.binary)
                continue
            binary_ratio_first_names[gr.binary] += 1


    print(journal, binary_ratio_first_names)

In [None]:
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal.


for journal in journal_names:
    # Per-journal tally of the gender label of each first author's first name.
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

    # First authors given only by initials cannot be gendered; count them separately.
    initialed_name_count = 0

    for dirpath, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:

            # A plain startswith(journal) test lets "Nature" also claim every
            # "NatureGeoscience" file. Attribute each file to the longest
            # journal name that prefixes it, so it is counted exactly once.
            owner = max((j for j in journal_names if file.startswith(j)), key=len, default=None)
            if owner != journal:
                continue

            # Join with dirpath so files found in subdirectories resolve correctly.
            with codecs.open(os.path.join(dirpath, file), "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))
            # Strip first so leading whitespace does not yield an empty first term.
            first_name = article.first_author.strip().split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                initialed_name_count += 1
                continue
            # Keep the try narrow: only a missing name should be reported
            # as "not found in map".
            try:
                gr = name_to_gr[first_name]
            except KeyError:
                print("not found in map:", first_name)
                continue
            if gr.binary not in binary_ratio_first_names:
                # A label outside male/female/None is a data problem, not a
                # missing name; report it as such instead of miscounting.
                print("unexpected gender label for", first_name, ":", gr.binary)
                continue
            binary_ratio_first_names[gr.binary] += 1


    print(journal, binary_ratio_first_names, "initialed name count:", initialed_name_count)

In [None]:
# Same as above but split into year/month bins
#
# Determine how many male/female/unknown/initialed first-author names exist
# for each journal, binned by publication year and month.

# journal -> year -> month -> {'male', 'female', 'None', 'initialed'} counts
journal_tallys = {}

for journal in journal_names:

    for dirpath, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:

            # A plain startswith(journal) test lets "Nature" also claim every
            # "NatureGeoscience" file. Attribute each file to the longest
            # journal name that prefixes it, so it is counted exactly once.
            owner = max((j for j in journal_names if file.startswith(j)), key=len, default=None)
            if owner != journal:
                continue

            # Join with dirpath so files found in subdirectories resolve correctly.
            with codecs.open(os.path.join(dirpath, file), "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))

            # Fetch (creating on first use) the counter bin for this article's
            # journal / year / month.
            tally = journal_tallys.setdefault(journal, {}).setdefault(article.year, {}).setdefault(article.month, {'male':0, 'female':0, 'None':0, 'initialed':0})

            first_name = article.first_author.strip().split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                tally['initialed'] += 1
                continue
            # Keep the try narrow: only a missing name should be reported
            # as "not found in map".
            try:
                gr = name_to_gr[first_name]
            except KeyError:
                print("not found in map:", first_name)
                continue
            if gr.binary not in tally:
                # A label outside the known bins is a data problem, not a
                # missing name; report it as such instead of miscounting.
                print("unexpected gender label for", first_name, ":", gr.binary)
                continue
            tally[gr.binary] += 1


for journal, values in journal_tallys.items():
    print(journal, values)