In [None]:
import os
import codecs
import json
import string
import re
from glob import glob
from bs4 import BeautifulSoup, NavigableString, Tag

import gender
from gender import getGenders

from fuzzywuzzy import process, fuzz

import time
import openapi_client # pip install git+https://github.com/namsor/namsor-python-sdk2.git
from openapi_client.rest import ApiException
from pprint import pprint


In [None]:
# Print the current working directory of the notebook kernel (IPython shell escape).
! pwd

In [None]:
# Configure local paths

# Use the kernel's working directory as the project root.  os.getcwd() replaces
# the `! pwd` shell escape so the cell does not depend on IPython magics or on
# a `pwd` binary being available on the platform.
root = os.getcwd()

print("using root directory:", root)

RAW_PAGES_DIR=root+"/pages/"
PARSED_PAGES_DIR=root+"/parsed/"
GUESSED_NAMES_DIR=root+"/guessed/"
NAME_GENDER_DIR=root+"/name_genders/"

# create directories if they do not exist (re-running the cell is a no-op)
for d in [RAW_PAGES_DIR,
         PARSED_PAGES_DIR,
         GUESSED_NAMES_DIR,
         NAME_GENDER_DIR]:
    os.makedirs(d, exist_ok=True)

In [None]:
#! rm $GUESSED_NAMES_DIR*

### Article, Guessed, and Gendered classes, and helper function definitions

In [None]:
# Initialize and define useful functions and
# data structures.

def is_initialed_name(name):
    """Return True when the first space-separated term of ``name`` looks like an initial.

    The first term counts as an initial when it
      * ends in a period and everything before that period is upper case
        (e.g. "J." or "J.-P."),
      * is a single character (e.g. "J"), or
      * has a single character before its first hyphen (e.g. "J-P").

    A name whose first term is empty (empty string or leading space) is
    reported as not initialed.
    """
    leading = name.split(" ")[0]
    if not leading:
        return False
    if leading.endswith(".") and leading[:-1].isupper():
        return True
    if len(leading) == 1:
        return True
    return len(leading.split("-")[0]) == 1


# Smoke test: each label states the value expected next to it.
print(
    "test is_initialed_name- True:", is_initialed_name("J. Smith"),
    ", False:", is_initialed_name("Joe Smith"),
    ", True:", is_initialed_name("J Smith"),
    ", True:", is_initialed_name("J-P Ampuero"),
)

def contains_initialed_name(names):
    """Return True if at least one entry of ``names`` is an initialed name.

    An empty collection yields False.
    """
    return any(is_initialed_name(candidate) for candidate in names)

# Smoke test: each label states the value expected next to it.
print(
    "test contains_initialed_name- True",
    contains_initialed_name(["J. Smith", "Cat Meowins"]),
    ", False:",
    contains_initialed_name(["Joe Smith", "Cat Meowins"]),
)

def clean_name(name):
    """Normalise an author name string.

    Surrounding whitespace is stripped and every remaining whitespace
    character is replaced by a plain space.  When the name is not initialed,
    periods around its first term are removed ("Colin. J. Cats" becomes
    "Colin J. Cats").  Finally, periods at either end of the whole name are
    dropped.
    """
    normalised = re.sub(r'\s', ' ', name.strip())
    if is_initialed_name(normalised):
        return normalised.strip('.')
    first, *rest = normalised.split(" ")
    return " ".join([first.strip(".")] + rest).strip('.')

# Smoke test for clean_name on a spelled-out and an initialed name.
print(
    "test clean name- Colin. J. Cats:", clean_name("Colin. J. Cats"),
    "W. B. Easy:", clean_name("W. B. Easy"),
)

class Article:
    """A single parsed article: cleaned author names plus bibliographic metadata.

    Attributes:
        first_author: cleaned name of the first author.
        names: cleaned names of all authors, in the order given.
        year, month, title, journal: stored as passed in (they are joined
            into ``id``, so they must be strings).
        id: identifier built from journal, year, month and the title with
            punctuation removed and spaces replaced by underscores.
        has_initials: True when the cleaned first author name is initialed.
    """

    def __init__(self, first_author, all_names, year, month, title, journal):
        # clean the author names (see clean_name)
        # - replace whitespace characters by plain spaces
        # - strip bookend whitespace
        # - strip periods from first names if not an initialed name
        self.first_author = clean_name(first_author)
        self.names = [clean_name(name) for name in all_names]
        self.year = year
        self.month = month
        self.title = title
        self.journal = journal

        # create a unique identifier for this article
        title_tokens = title.translate(str.maketrans('', '', string.punctuation)).split(" ")
        self.id = "_".join([journal, year, month, "_".join(title_tokens)])

        # Test the *cleaned* first author: the raw string may carry leading or
        # non-breaking whitespace, which would hide an initial from
        # is_initialed_name (it splits on plain spaces only).
        # NOTE(review): only the first author is inspected here, whereas
        # Guessed.has_initials looks at every author -- confirm which is intended.
        self.has_initials = is_initialed_name(self.first_author)

    def last_name_set(self):
        """Return the set of last names (final space-separated term of each name)."""
        return {name.split(" ")[-1] for name in self.names}

    def to_map(self):
        """Return a JSON-serialisable dict; ``article_from_map`` reads it back."""
        return {
            "first_author": self.first_author,
            "all_names": self.names,
            "year": self.year,
            "month": self.month,
            "title": self.title,
            "journal": self.journal,
            "id": self.id,
            "has_initials": self.has_initials,
        }
        
        
def article_from_map(article_map):
    """Rebuild an Article from a dict shaped like the output of ``Article.to_map``.

    Only the constructor fields are read; ``id`` and ``has_initials`` are
    recomputed by the Article constructor.  Raises KeyError when a field is
    missing.
    """
    constructor_fields = ("first_author", "all_names", "year", "month", "title", "journal")
    return Article(**{field: article_map[field] for field in constructor_fields})

# Quick sanity check of the Article helpers on a made-up article.
test_article = Article(
    "cat",
    ["A. Cat", "Dog", "Another Cat", "More Cats"],
    "2019",
    "02",
    "a story of cool/cats",
    "GeoCatography",
)

print("last name set:", test_article.last_name_set())
print("article id:", test_article.id)
print("article has initial:", test_article.has_initials)
print("article map:", test_article.to_map())


class Guessed:
    """An article together with guessed full names for its authors.

    The bibliographic fields are copied from ``primary_article``;
    ``guessed_names`` and ``match_article_id`` are stored as given.
    """

    def __init__(self, primary_article, guessed_names, match_article_id):
        self.first_author = primary_article.first_author
        self.names = primary_article.names
        self.year = primary_article.year
        self.month = primary_article.month
        self.title = primary_article.title
        self.journal = primary_article.journal

        # create a unique identifier for this article (same recipe as Article.id)
        stripped_title = self.title.translate(str.maketrans('', '', string.punctuation))
        title_part = "_".join(stripped_title.split(" "))
        self.id = "_".join([self.journal, self.year, self.month, title_part])

        # True when any author name of the article is initialed
        self.has_initials = any(is_initialed_name(n) for n in self.names)

        self.guessed_names = guessed_names
        self.match_article_id = match_article_id

    def last_name_set(self):
        """Return the set of last names (final space-separated term of each name)."""
        return {name.split(" ")[-1] for name in self.names}

    def to_map(self):
        """Return a JSON-serialisable dict; ``guessed_from_map`` reads it back."""
        return {
            "first_author": self.first_author,
            "all_names": self.names,
            "year": self.year,
            "month": self.month,
            "title": self.title,
            "journal": self.journal,
            "id": self.id,
            "has_initials": self.has_initials,
            "guessed_names": self.guessed_names,
            "match_article_id": self.match_article_id,
        }
        
        
def guessed_from_map(guessed_map):
    """Rebuild a Guessed object from a dict shaped like ``Guessed.to_map`` output.

    The article part is reconstructed with ``article_from_map``; the guessed
    names and the matched article id are taken from the map unchanged.
    """
    return Guessed(
        article_from_map(guessed_map),
        guessed_map["guessed_names"],
        guessed_map["match_article_id"],
    )




class Gendered:
    """Article metadata plus per-author gender information.

    Parameters:
        primary_article: object exposing ``first_author``, ``names``, ``year``,
            ``month``, ``title``, ``journal`` and ``id`` (an Article or a
            Guessed).  When it also has ``guessed_names``, those are used as
            the author names.
        all_genders, all_percent: stored as given.  Presumably one entry per
            author (the fixture in this notebook passes lists of that shape)
            -- confirm against the code that produces them.
    """

    def __init__(self, primary_article, all_genders, all_percent):
        self.first_author = primary_article.first_author

        # Guessed objects carry `guessed_names`, plain Articles do not.  Only
        # the missing attribute is handled here; a bare `except:` would also
        # hide unrelated errors (and KeyboardInterrupt).
        try:
            self.names = primary_article.guessed_names
        except AttributeError:
            self.names = primary_article.names

        self.year = primary_article.year
        self.month = primary_article.month
        self.title = primary_article.title
        self.journal = primary_article.journal
        self.id = primary_article.id

        # gendered:
        self.all_genders = all_genders
        self.all_percent = all_percent

    def to_map(self):
        """Return a JSON-serialisable dict; ``gendered_from_map`` reads it back."""
        return {
            "first_author": self.first_author,
            "all_names": self.names,
            "year": self.year,
            "month": self.month,
            "title": self.title,
            "journal": self.journal,
            "id": self.id,
            "all_genders": self.all_genders,
            "all_percent": self.all_percent,
        }
        
        
def gendered_from_map(dictfromjson):
    """Rebuild a Gendered object from a dict loaded from JSON.

    When the dict has a "guessed_names" key the primary article is rebuilt
    with ``guessed_from_map``; otherwise ``article_from_map`` is used.
    """
    if "guessed_names" in dictfromjson:
        build_primary = guessed_from_map
    else:
        build_primary = article_from_map

    return Gendered(
        primary_article=build_primary(dictfromjson),
        all_genders=dictfromjson["all_genders"],
        all_percent=dictfromjson["all_percent"])


# Fixtures exercising Guessed / Gendered and the Guessed map round trip.
test_primary_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)
test_match_article = Article(
    first_author="cat",
    all_names=["Arthur Cat", "Dog", "Another Cat", "More Cats"],
    year="2018",
    month="05",
    title="existence of cool/cats",
    journal="GeoCatography"
)
# guessed_names is a list with one entry per author (the guessing loop builds
# it that way and later cells iterate over it); a bare string would be
# iterated character by character.
test_guessed = Guessed(
    test_primary_article,
    ["Arthur Cat", "Dog", "Another Cat", "More Cats"],
    test_match_article.id)



test_gendered_article = Gendered(
    test_primary_article,
    all_genders=["init", "male", None, "female"],
    all_percent=[0.0, 0.6, None, 0.5]
)
print(test_guessed.to_map())
print(guessed_from_map(test_guessed.to_map()).to_map() == test_guessed.to_map())

In [None]:
# Define page parse functions, specific for each journal

def get_parse_function(filename):
    """Select the page parser from the journal prefix of ``filename``.

    Prefixes are tested in the order listed below.  Returns None when no
    parser is registered for the file name.
    """
    if filename.startswith(("JGR", "GRL", "G3")):
        return parse_agu_page
    if filename.startswith(("Seismological+Research+Letters",
                            "Bulletin+of+the+Seismological+Society+of+America")):
        return parse_gsw_page
    if filename.startswith(("NatureGeoscience", "Nature")):
        return parse_ng_page
    if filename.startswith(("E%26PSL", "PEPI", "Tectp")):
        return parse_sd_page
    if filename.startswith("GJI"):
        return parse_gji_page
    if filename.startswith("SolidEarth"):
        return parse_solidearth_page
    if filename.startswith("GEOPHYSICS"):
        return parse_geophysics_page
    if filename.startswith("Science"):
        return parse_science_page
    return None




def parse_agu_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.item__body`` entries.

    ``get_parse_function`` selects this parser for the JGR, GRL and G3 file
    prefixes.  Entries titled "Issue Information", entries with an empty
    title and entries without authors are reported and skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for item in soup.find_all("div", class_="item__body"):
        title_node = item.find_all(class_="meta__title")[0].find_all("a", class_="publication_title")[0]
        if isinstance(title_node, NavigableString):
            title = str(title_node.string)
        elif isinstance(title_node, Tag):
            title = str(title_node.get_text())
        else:
            title = title_node

        if title == "Issue Information":
            print('hit "Issue Information"')
            continue
        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", item)
            continue

        authors = []
        for author_link in item.find_all("a", class_="publication_contrib_author"):
            name_span = author_link.span
            # drop an <i> child of the name span, if any, before reading the name
            if name_span.i is not None:
                name_span.i.decompose()
            authors.append(str(name_span.string))

        if not authors:
            print("hit empty authors. title:", title)
            continue

        parsed_articles.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))
    return parsed_articles



def parse_gsw_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.al-article-box`` entries.

    ``get_parse_function`` selects this parser for the Seismological Research
    Letters and BSSA file prefixes.  The title is the first link inside the
    ``al-title`` element; each author is the first link inside a
    ``span.wi-fullname``.  Entries with an empty title or no authors are
    reported and skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for box in soup.find_all("div", class_="al-article-box"):
        title_node = box.find_all(class_="al-title")[0].find_all("a")[0]
        if isinstance(title_node, NavigableString):
            title = str(title_node.string)
        elif isinstance(title_node, Tag):
            title = str(title_node.get_text())
        else:
            title = title_node

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", box)
            continue

        authors = [
            str(name_span.find_all("a")[0].get_text())
            for name_span in box.find_all("span", class_="wi-fullname")
        ]

        if not authors:
            print("hit empty authors. title:", title)
            continue

        parsed_articles.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))

    return parsed_articles

def _ng_node_text(node):
    """Return the text of a BeautifulSoup node as a plain ``str``."""
    if isinstance(node, Tag):
        return str(node.get_text())
    return str(node)


def parse_ng_page(soup, year, month, journal):
    """Extract articles from a page made of ``li.pb20`` entries.

    ``get_parse_function`` selects this parser for the NatureGeoscience and
    Nature file prefixes.  Entries without a title link, with an empty title
    or without authors are reported and skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for a in soup.find_all("li", class_="pb20"):
        # find() returns None when the heading is missing.  The previous
        # `find_all(...)[0] is None` test could never be true: an empty result
        # raised IndexError before the comparison ran.
        title_section = a.find("h2", class_="h3")
        title_link = title_section.a if title_section is not None else None
        if title_link is None or len(title_link.contents) == 0:
            print("title section is none:", a)
            continue

        # Only the first child of the link is used, as before.
        # NOTE(review): a title containing inline markup is therefore cut at
        # the first tag -- confirm whether the full link text is wanted.
        title = _ng_node_text(title_link.contents[0])
        if len(title) == 0:
            print("hit empty title: ", a)
            continue
        title = title.strip()

        authors = []
        for auths in a.find_all("ul", class_="js-list-authors-3"):
            for author_link in auths.find_all("a", class_="js-no-scroll"):
                if len(author_link.contents) == 0:
                    continue
                # Convert to str first so a Tag child cannot reach clean_name.
                author = _ng_node_text(author_link.contents[0])

                # skip the expand / collapse controls of the author list
                if author == "[…]" or "Show fewer authors" in author:
                    continue

                authors.append(author)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

        
def parse_sd_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.col-sm-12`` entries.

    ``get_parse_function`` selects this parser for the E%26PSL, PEPI and Tectp
    file prefixes.  Divs without an ``h3.s-results-title`` or without an
    ``ul.all-authors`` are skipped silently.  Author entries are split on
    commas and the parts reversed, so "Last, First" becomes "First Last".

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for a in soup.find_all("div", class_="col-sm-12"):
        # find_all returns a (possibly empty) result list.  The former
        # `title == "None"` comparison of that list against a string was
        # always False and has been dropped.
        title_tags = a.find_all("h3", class_="s-results-title")
        if len(title_tags) == 0:
            continue
        title = str(title_tags[0].get_text())

        author_field = a.find("ul", class_="all-authors")
        if author_field is None:
            continue

        authors = []
        for p in author_field.find_all("li", class_="article-author"):
            author = p.get_text().strip(";")
            name = " ".join(author.split(",")[::-1]).strip()
            authors.append(str(name))

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles


def parse_solidearth_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.paperlist-object`` entries.

    ``get_parse_function`` selects this parser for the SolidEarth file prefix.
    The authors are read from the text of the first ``div.authors``, a
    comma-separated list whose last entries may be joined by "and".  Entries
    with an empty title or without usable authors are reported and skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for a in soup.find_all("div", class_="paperlist-object"):
        title = a.find_all("a", class_="article-title")
        title = title[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title: ", a)
            continue

        try:
            author_div = a.find_all("div", class_="authors")[0]
        except IndexError:
            print("hit empty authors.")
            continue

        # .string is None when the div holds more than one child node; report
        # and skip the entry instead of failing on None.split.
        author_text = author_div.string
        if author_text is None:
            print("hit empty authors. title:", title)
            continue

        authors = []
        for chunk in author_text.split(","):
            # Drop a leading "and" (e.g. "A, B, and C").  The word boundary
            # keeps names that merely start with those letters intact.
            chunk = re.sub(r'^and\b\s*', '', str(chunk).strip())
            # "A and B" holds two names; blank pieces are dropped so that the
            # empty-authors check below can actually trigger.
            for author in chunk.split(" and "):
                author = author.strip()
                if author:
                    authors.append(author)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_geophysics_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.issue-item__body`` entries.

    ``get_parse_function`` selects this parser for the GEOPHYSICS file prefix.
    The title is the first link inside the ``issue-item__title`` element;
    authors are the links inside ``div.issue-item__authors``.  Entries with an
    empty title or without authors are reported and skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for a in soup.find_all("div", class_="issue-item__body"):
        title = a.find_all(class_="issue-item__title")
        title = title[0].find_all("a")[0]
        if isinstance(title, NavigableString):
            title = str(title.string)
        if isinstance(title, Tag):
            title = str(title.get_text())

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", a)
            continue

        authors = []
        # An entry without an authors div used to fail on None.find_all before
        # the empty-authors check below could run; it is now skipped like any
        # other author-less entry.  A separate name is used for the links so
        # the loop variable `a` is no longer rebound.
        author_block = a.find("div", class_="issue-item__authors")
        if author_block is not None:
            for author_link in author_block.find_all("a"):
                # get_text() also covers links with several child nodes, for
                # which str(link.string) would give the literal "None".
                authors.append(str(author_link.get_text()))

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

def parse_gji_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.al-article-box`` entries.

    ``get_parse_function`` selects this parser for the GJI file prefix.  The
    title is the first link inside the ``al-title`` element; authors are all
    links inside the ``sri-authors al-authors-list`` divs.  Entries with an
    empty title or without authors are reported and skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for box in soup.find_all("div", class_="al-article-box"):
        title_node = box.find_all(class_="al-title")[0].find_all("a")[0]
        if isinstance(title_node, NavigableString):
            title = str(title_node.string)
        elif isinstance(title_node, Tag):
            title = str(title_node.get_text())
        else:
            title = title_node

        if title == "None":
            print("hit NoneType title")
            continue
        if len(title) == 0:
            print("hit empty title, ", box)
            continue

        authors = [
            str(author_link.get_text())
            for author_group in box.find_all("div", class_="sri-authors al-authors-list")
            for author_link in author_group.find_all("a")
        ]

        if not authors:
            print("hit empty authors. title:", title)
            continue

        parsed_articles.append(Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal))

    return parsed_articles

def parse_science_page(soup, year, month, journal):
    """Extract articles from a page made of ``div.results-cit`` entries.

    ``get_parse_function`` selects this parser for the Science file prefix.
    The title is the text of ``span.cit-first-element``; authors are the
    ``span.cit-auth`` texts, ignoring the separator "and".  Entries without a
    title span, with an empty title or without authors are reported and
    skipped.

    Returns a list of Article objects in page order.
    """
    parsed_articles = []

    for a in soup.find_all("div", class_="results-cit"):
        # find() returns None for a missing span.  Test that directly: the
        # former `title == "None"` check ran only after `.text` had already
        # raised AttributeError on None.
        title_span = a.find("span", class_="cit-first-element")
        if title_span is None:
            print("hit NoneType title")
            continue
        title = str(title_span.text)

        if len(title) == 0:
            print("hit empty title, ", a)
            continue

        authors = []
        for au_span in a.find_all("span", class_="cit-auth"):
            au = au_span.text.strip()
            if au != "and":
                authors.append(au)

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
            first_author=authors[0],
            all_names=authors,
            year=year,
            month=month,
            title=title,
            journal=journal)
        parsed_articles.append(article)

    return parsed_articles

#### Create one json per article with article info, stored in parsed/ 

In [None]:
# Walk directory where pages are saved, parse, and save parsed articles

for _, _, files in os.walk(RAW_PAGES_DIR):
    for file in files:

        # CHANGE THIS TO FILTER FOR SPECIFIC JOURNALS
        #if (file.startswith("JGR") or file.startswith("GRL") or file.startswith("G3")):
        #    continue

        parser = get_parse_function(file)
        if parser is None:
            print("got None parse function, skipping file:", file)
            continue

        # File names are expected to have four "_"-separated parts, the first
        # three being journal, year and month.  Report and skip anything else
        # instead of aborting the whole run on an unpacking error.
        name_parts = file.split("_")
        if len(name_parts) != 4:
            print("unexpected file name, skipping file:", file)
            continue
        journal, year, month, _ = name_parts

        #print("processing file:", file)
        print(file[0:3], end=",")

        with codecs.open(RAW_PAGES_DIR+file, "r", "utf8") as infile:
            html = infile.read()

        soup = BeautifulSoup(html, "html.parser")

        for article in parser(soup, year, month, journal):
            # NOTE(review): ids are cut to 80 characters, so two articles
            # sharing that prefix write to the same file -- confirm this is
            # acceptable.
            outfile_name = article.id[:80]+".json"
            with codecs.open(PARSED_PAGES_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(article.to_map()))

    # Paths above are built from RAW_PAGES_DIR itself, so only its top level
    # can be read correctly; stop before os.walk descends into subdirectories.
    break

print("Done!")

#### Determine how many papers have initialed authors and create a dictionary with all last names from articles and abstracts


In [None]:
# How many articles have authors with initialed names

count = 0
total = 0  # counted explicitly rather than read from the leaked loop variable `files`
for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        total += 1
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if not contains_initialed_name(art.names):
            continue
        #print(art.names)
        count += 1


print("{count} papers from a total of {total} contain initialed author names.".format(count = count,
                                                                                   total = total))

# Create in-memory maps for name guessing

# The maps are generated from the author names that are not initialed.
# The keys are last names. The values are lists of articles where at
# least one author on the article has the keyed last name.

# We create one such dictionary from the articles and one from the EGU abstracts.

def _build_last_name_map(parsed_dir):
    """Map last names to the parsed articles stored as json files in ``parsed_dir``.

    Only non-initialed author names contribute keys.  An article is appended
    once per contributing author, so it can appear several times under one
    key when co-authors share a last name.

    Returns a tuple ``(last_name_map, n_initialed_names, n_total_names)``.
    A directory that does not exist yields an empty map and zero counts.
    """
    last_name_map = {}
    n_initialed = 0
    n_total = 0

    for _, _, files in os.walk(parsed_dir):
        for file in files:
            with codecs.open(parsed_dir+file, "r", "utf8") as infile:
                art = article_from_map(json.loads(infile.read()))

            # considering all authors and all articles:
            for name in art.names:
                n_total += 1
                # only add if it is not initialed
                if is_initialed_name(name):
                    n_initialed += 1
                    continue

                last_name_map.setdefault(name.split(" ")[-1], []).append(art)

    return last_name_map, n_initialed, n_total


last_names_to_articles, count_init, count_total = _build_last_name_map(PARSED_PAGES_DIR)

print("All last names, map size", len(last_names_to_articles.keys()))
print("{count} author names are initialed from a total of {total} author names.".format(count = count_init,
                                                                                      total = count_total))


# We repeat the same for EGU abstracts

PARSED_EGU_PAGES_DIR = root+"/egu_parsed/"

last_names_to_abstracts, count_init, count_total = _build_last_name_map(PARSED_EGU_PAGES_DIR)

print("All last names, abstract map size", len(last_names_to_abstracts.keys()))
print("{count} author names are initialed from a total of {total} author names.".format(count = count_init,
                                                                                      total = count_total))

#### Now try to guess their names by comparing to non-initialed authors

In [None]:
# Create guesses for first author names and save to files
def extract_name_guess(initial_name, possible_names):
    """Find the first full name in ``possible_names`` compatible with ``initial_name``.

    A candidate is accepted when all of the following hold:
      * its last space-separated term equals that of ``initial_name``;
      * its first character equals that of ``initial_name``, or the
        token-sort similarity of the two names is at least 90;
      * if both names have more than two whitespace-separated terms, their
        second terms start with the same character;
      * if both names contain a hyphen, the parts after the first hyphen
        start with the same character;
      * it is not itself an initialed name.

    Returns ``(candidate, True)`` for the first accepted candidate, else
    ``("", False)``.
    """
    wanted_last = initial_name.split(" ")[-1]
    wanted_terms = initial_name.split()
    wanted_hyphen_parts = initial_name.split("-")

    for candidate in possible_names:
        if candidate.split(" ")[-1] != wanted_last:
            continue  # not the name we're looking for

        # TODO: There are cases where this is not correct, e.g.: Martin Mai and P. Martin Mai
        if initial_name[0] != candidate[0] and fuzz.token_sort_ratio(initial_name, candidate) < 90:
            continue  # first letters do not match

        candidate_terms = candidate.split()
        if (len(wanted_terms) > 2 and len(candidate_terms) > 2
                and wanted_terms[1][0] != candidate_terms[1][0]):
            continue  # both names have a middle term and its initial does not fit

        candidate_hyphen_parts = candidate.split("-")
        if (len(wanted_hyphen_parts) > 1 and len(candidate_hyphen_parts) > 1
                and wanted_hyphen_parts[1][0] != candidate_hyphen_parts[1][0]):
            continue  # second part of hyphenated name does not fit

        if is_initialed_name(candidate):
            continue

        return candidate, True

    return "", False

# Guess full names for initialed authors and write one json file per article
# (only articles that contain initialed names) into GUESSED_NAMES_DIR.

# unsupported edge cases
# Check that the same initialized name
# is not mapped to multiple different
# complete names
# J. M. Li => Jia Li
# J. M. Li => Jiaxun Li
# J. M. Li => Jingyuan Li

count = 0
count_guess = 0
unmapped_names = set()

# The checklist is a text file for quickly eyeballing implausible guesses.
# The context manager closes it even when a parsed file cannot be read.
with open("output_checklist_guessednames.txt", "w") as check_file:

    for _, _, files in os.walk(PARSED_PAGES_DIR):

        for file in files:
            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                art = article_from_map(json.loads(infile.read()))
            if not contains_initialed_name(art.names): # take only article that contain initialed names
                continue

            art_last_names = art.last_name_set()
            guessed_authors = []
            # Reset per article: without this, an article for which no author
            # could be guessed reused the guess of a previous article (or
            # raised NameError when it was the first one processed).
            the_guess = None

            for author in art.names: # Loop through all author in the article

                if not is_initialed_name(author): # take only the authors with initialed names
                    guessed_authors.append(author) # save the full name for later writing the json file
                    continue

                last_name = author.split(" ")[-1] # take the last name

                if last_name not in last_names_to_articles and \
                        last_name not in last_names_to_abstracts:
                    # the last name is in neither dictionary
                    guessed_authors.append(author) # we need to save it as it is
                    unmapped_names.add(author) # collect all unmapped names to double-check the performance
                    count += 1
                    continue

                # load the articles and abstracts related to the last name
                articles = last_names_to_articles.get(last_name, [])
                abstracts = last_names_to_abstracts.get(last_name, [])

                # gather all the guesses, paired with the size of overlap.
                # guesses with largest overlap will be written into file.
                guesses = []
                possible_names = set()
                # article id that first produced each possible name
                possible_name_sources = {}

                for article in articles + abstracts:

                    overlap = article.last_name_set() & art_last_names #overlap between coauthors
                    name, ok = extract_name_guess(author, article.names)

                    if (len(art.names) > 1) and (len(overlap) < 2):
                        # Not enough co-author overlap to trust this article on
                        # its own.  Remember the name anyway: if it turns out to
                        # be the only candidate with this last name it is used
                        # further below.
                        if ok:
                            short_name = name.split(" ")[0] + " " + name.split(" ")[-1]
                            possible_names.add(short_name)
                            possible_name_sources.setdefault(short_name, article.id)
                        continue  # skip articles without enough overlap.

                    if not ok:
                        continue

                    check_file.write("{}\t\t{}\n".format(name, author))
                    guessed = Guessed(art, [name], article.id)
                    guesses.append((len(overlap), guessed)) # saved the guessed name and the overlap

                # simplify 'possible_names': drop a name when a later one in the
                # list is a close fuzzy match (e.g. differs only by accents)
                tmp = list(possible_names)
                tmp2 = []

                for i in range(len(tmp)-1):
                    highest = process.extractOne(tmp[i],tmp[i+1:])
                    if highest[1] > 85:
                        continue
                    tmp2.append(tmp[i])

                if len(tmp)!=0:
                    tmp2.append(tmp[-1])

                possible_names = set(tmp2)

                if len(guesses) == 0: #if no guesses, check if there is only one author in the field with the last name
                    if len(possible_names)==1:
                        only_name = next(iter(possible_names))
                        check_file.write("{}\t\t{}\n".format(only_name, author))
                        # Use the article the name came from.  Previously the
                        # variables `article` / `overlap` leaked from the loop
                        # above and referred to whichever candidate was last.
                        guessed = Guessed(art, [only_name], possible_name_sources[only_name])
                        guesses.append((0, guessed)) # sole entry, so its sort key is irrelevant
                    else:
                        guessed_authors.append(author)
                        unmapped_names.add(author)
                        count += 1
                        continue

                # largest overlap first; sorted() is stable, so ties keep their order
                guesses = sorted(guesses, key=lambda x: x[0], reverse=True)
                the_guess = guesses[0][1]
                count_guess += 1
                guessed_authors.append(the_guess.guessed_names[0])

            # match id of the last successful guess in this article, or None
            # when none of its initialed authors could be guessed
            match_article_id = the_guess.match_article_id if the_guess is not None else None
            final_guess = Guessed(art, guessed_authors, match_article_id)
            outfile_name = final_guess.id[:80]+".json"
            with codecs.open(GUESSED_NAMES_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(final_guess.to_map()))

print('Names not guessed:',len(unmapped_names))
print('Number of authors not guessed:', count)

print('Number of authors guessed:', count_guess)



#### Start with the name --> gender: test genderize.io api

In [None]:
# class for gender result and test
class GenderResult(object):
    """Gender lookup result for one name.

    ``result`` must be indexable with at least three entries, stored as
    ``binary``, ``percent`` and ``count``.  Presumably these are the gender
    label, its confidence and a sample count as returned by
    ``gender.getGenders`` -- confirm against that module.
    """

    def __init__(self, name, result):
        self.name = name
        self.binary, self.percent, self.count = result[0], result[1], result[2]

    def to_map(self):
        """Return a JSON-serialisable dict; ``gender_result_from_map`` reads it back."""
        return {
            "name": self.name,
            "binary": self.binary,
            "percent": self.percent,
            "count": self.count,
        }
    

def gender_result_from_map(m):
    """Rebuild a GenderResult from a dict shaped like ``GenderResult.to_map`` output."""
    name = m["name"]
    result = (m["binary"], m["percent"], m["count"])
    return GenderResult(name, result)

# Look up one name and check that the result survives a JSON round trip.
name = "Ismael"
g = getGenders(name)[0]
gr = GenderResult(name, g)
round_tripped = gender_result_from_map(json.loads(json.dumps(gr.to_map())))
round_tripped.to_map()

#### Just read in all the previously determined and stored name --> gender pairs

In [None]:
# read the Gender API gender guesses into memory

gender_results = []

files = glob(os.path.join(NAME_GENDER_DIR, "*.json"))
for file in files:
    with codecs.open(file, "r", "utf8") as infile:
        gr = gender_result_from_map(json.loads(infile.read()))
        gender_results.append(gr)

# Create a map of first name to gender result
name_to_gr = {}
for gr in gender_results:
    name_to_gr[gr.name] = gr

print("Length of current name-gender map", len(name_to_gr))

# test
print("Colin." in name_to_gr)
if name_to_gr:
    known_names = list(name_to_gr)
    print("First name in current gender map, gender: ",
          known_names[0], name_to_gr[known_names[0]].binary)
    print("Last name in current gender map, gender: ",
          known_names[-1], name_to_gr[known_names[-1]].binary)
else:
    # A fresh checkout has no cached results yet; indexing the empty list
    # would raise IndexError and stop a Restart & Run All.
    print("name-gender map is empty, no cached results in", NAME_GENDER_DIR)

#### Genderize all the names
#### - collect the list of first names from PARSED and GUESSED
#### - check if it is in the list already, if not: call genderize io
#### - finally check how many in the guessed initialed names are male / female

In [None]:
# Collect list of all PARSED and GUESSED names to be gendered

def _add_first_names(names, first_names):
    """Add the first term of every usable name in `names` to the set `first_names`.

    Names whose first term is empty, or that is_initialed_name() flags as
    initialed, are reported and skipped: there is nothing to genderize.
    """
    for name in names:
        first_name = name.split(" ")[0]
        if len(first_name) == 0 or is_initialed_name(name):
            print("empty or initialed name?:", name)
            continue
        first_names.add(first_name)


first_names = set()

# All authors of every file are visited (guess.guessed_names and art.names are
# iterated in full), not only the first author.
files_guessed = glob(os.path.join(GUESSED_NAMES_DIR, "*.json"))
files_parsed = glob(os.path.join(PARSED_PAGES_DIR, "*.json"))

for file in files_guessed:
    with codecs.open(file, "r", "utf8") as infile:
        guess = guessed_from_map(json.loads(infile.read()))
    _add_first_names(guess.guessed_names, first_names)

for file in files_parsed:
    with codecs.open(file, "r", "utf8") as infile:
        art = article_from_map(json.loads(infile.read()))
    _add_first_names(art.names, first_names)

print("number of unique first names of all authors to genderize:", len(first_names))
# Only show a sample when there is something to show; indexing an empty list
# would raise IndexError.
if first_names:
    sample = list(first_names)
    print("From {} to {}.".format(sample[0], sample[-1]))

In [None]:
# Call the Gender API and save output for all collected first names
# WARN: Makes API calls
# Does nothing for names that were genderized previously

GENDER_BATCH_SIZE = 10  # guess the gender of this many names per getGenders call


def _genderize_batch(names):
    """Look up `names` with getGenders and store each result.

    Every result is added to the in-memory name_to_gr map and written as a
    JSON file to NAME_GENDER_DIR for later use.
    """
    result = getGenders(names)
    # zip pairs each name with its result and stops at the shorter sequence,
    # so a short reply cannot cause an IndexError.
    for name, pres in zip(names, result):
        gr = GenderResult(name, pres)

        # update gender name map in memory
        name_to_gr[gr.name] = gr
        print(gr.name, gr.binary)

        # also save for later use; punctuation is stripped from the file name
        file_name = "_".join(name.translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
        with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(gr.to_map()))


print("starting size:", len(name_to_gr))

# Only names missing from the map need an API call.
to_genderize = [name for name in first_names if name not in name_to_gr]
for name in to_genderize:
    print("new, ", name)

for start in range(0, len(to_genderize), GENDER_BATCH_SIZE):
    _genderize_batch(to_genderize[start:start + GENDER_BATCH_SIZE])

print("finish size:", len(name_to_gr))

# Spot check of one name; .get avoids a KeyError when it is not in the data set.
print(name_to_gr.get("Weitao"))

In [None]:
# # Determine how many male/female/unknown names exist in the guessed names
# # NOTE: This ratio should be for all guesses, not the reduced set() of names

# binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

# for _, _, files in os.walk(GUESSED_NAMES_DIR):
#     for file in files:

#         with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
#             guess = guessed_from_map(json.loads(infile.read()))
#         first_name = guess.guessed_names[0].split(" ")[0]
#         if len(first_name) == 0:
#             print("empty first name?:", guess.guessed_names[0])
#             continue
#         try:
#             gr = name_to_gr[first_name]
#             binary_ratio_first_names[gr.binary] += 1
#         except KeyError:
#             print("not found in map:", first_name)
        
        
# binary_ratio_first_names


#### Now, let's create a json database that can be used in another notebook.

In [None]:
# Go both through the guessed names and the non-initialed names.
# Find genders on the base of the name_to_gr map and save the results


AUTHOR_GENDER_DIR = root + "/author_genders/"
os.makedirs(AUTHOR_GENDER_DIR, exist_ok=True)

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))

        # articles with initialed names are handled via the guessed names
        if contains_initialed_name(art.names):
            continue

        all_names = art.names
        all_genders = []
        all_percent = []

        for n in all_names:
            n = n.split()[0]
            try:
                gr = name_to_gr[n]
            except KeyError:
                print(n)
                # Keep both lists aligned with all_names: later cells index
                # them by author position. "None" is the label those cells
                # treat as "gender unknown".
                all_genders.append("None")
                all_percent.append(None)
                continue
            all_genders.append(gr.binary)
            all_percent.append(float(gr.percent))

        art_out = art.to_map()
        art_out["all_genders"] = all_genders
        art_out["all_percent"] = all_percent

        # save!
        outfile_name = os.path.basename(file)
        with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(art_out))
# now do the guessed names
for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            art = guessed_from_map(json.loads(infile.read()))

        all_names = art.guessed_names
        all_genders = []
        all_percent = []

        for n in all_names:
            # names that are still initialed cannot be gendered; mark them
            if is_initialed_name(n):
                all_genders.append("init")
                all_percent.append(None)
                continue

            n = n.split()[0]

            try:
                gr = name_to_gr[n]
            except KeyError:
                print(n)
                # Keep both lists aligned with all_names: later cells index
                # them by author position. "None" is the label those cells
                # treat as "gender unknown".
                all_genders.append("None")
                all_percent.append(None)
                continue
            all_genders.append(gr.binary)
            all_percent.append(float(gr.percent))

        art_out = art.to_map()
        art_out["all_genders"] = all_genders
        art_out["all_percent"] = all_percent
        # save!
        outfile_name = os.path.basename(file)
        with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(art_out))

In [None]:

# Tally male / female / everything-else labels over all author_genders files.
files = glob(os.path.join(AUTHOR_GENDER_DIR, "*.json"))
count = count_f = count_m = 0
for file in files:
    outfile_name = os.path.basename(file)

    with codecs.open(file, "r", "utf8") as infile:
        art = gendered_from_map(json.load(infile))

    for i, name in enumerate(art.names):
        label = art.all_genders[i]
        if label == "male":
            count_m += 1
        elif label == "female":
            count_f += 1
        else:
            # "init" and any other label count as not categorized
            count += 1

print('Total of uncathegorized names:', count)
print('Total of female author names:', count_f)
print('Total of male author names:', count_m)

### Re-do the guessing, using the NamSor database (good for Chinese names), for the names that came out as None


In [None]:
# Configure API key authorization: api_key
# The key is read from the NAMSOR_API_KEY environment variable so that the
# credential is not stored in the notebook.
# NOTE(review): a key literal used to be written here; consider revoking it.
namsor_api_key = os.environ.get("NAMSOR_API_KEY")
if not namsor_api_key:
    raise RuntimeError("Set the NAMSOR_API_KEY environment variable before running this cell.")

configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = namsor_api_key

# create an instance of the API class
api_instance = openapi_client.PersonalApi(openapi_client.ApiClient(configuration))
first_name = 'Ray Y.' # str | 
last_name = 'Chuang' # str | 

try:
    # Infer the likely gender of a name.
    api_response = api_instance.gender(first_name, last_name)
    pprint(api_response)
except ApiException as e:
    print("Exception when calling PersonalApi->gender: %s\n" % e)
    

In [None]:
# loop over author_gender
AUTHOR_ALLGENDER_DIR = root + "/author_allgenders"
os.makedirs(AUTHOR_ALLGENDER_DIR, exist_ok=True)

files = glob(os.path.join(AUTHOR_GENDER_DIR, "*.json"))
nmax = 10000  # stop once more than this many API calls were made in this run
ncall = 0
for file in files:
    outfile_name = os.path.basename(file)
    outfile_path = os.path.join(AUTHOR_ALLGENDER_DIR, outfile_name)
    # an existing output file means this article was handled in an earlier run
    if os.path.exists(outfile_path):
        continue

    with codecs.open(file, "r", "utf8") as infile:
        art = gendered_from_map(json.loads(infile.read()))

    # Re-query only the names whose gender label is "None";
    # "init" and already gendered names are left untouched.
    for i, name in enumerate(art.names):
        if art.all_genders[i] != "None":
            continue

        # call namsor
        first_name = name.split()[0]
        last_name = name.split()[-1]
        try:
            # Infer the likely gender of a name.
            api_response = api_instance.gender(first_name, last_name)
            # Named likely_gender (not gender) so the imported `gender`
            # module is not overwritten.
            likely_gender = api_response.likely_gender
            prob = api_response.probability_calibrated

            art.all_genders[i] = likely_gender
            art.all_percent[i] = prob

        except ApiException as e:
            # the label stays "None" for this name
            print("Exception when calling PersonalApi->gender: %s\n" % e)

        ncall += 1
        print(ncall, end=",")

    art_out = art.to_map()

    # save!
    with codecs.open(outfile_path, "w", "utf8") as outfile:
        outfile.write(json.dumps(art_out))

    # Check the call budget only after saving, so the results already fetched
    # for this article are not thrown away.
    if ncall > nmax:
        break




In [None]:
AUTHOR_ALLGENDER_DIR = root + "/author_allgenders"

# Tally male / female / everything-else labels over all author_allgenders
# files, printing every gendered name together with its label and percent.
files = glob(os.path.join(AUTHOR_ALLGENDER_DIR, "*.json"))
count = count_f = count_m = 0
for file in files:
    outfile_name = os.path.basename(file)

    with codecs.open(file, "r", "utf8") as infile:
        art = gendered_from_map(json.load(infile))

    for i, name in enumerate(art.names):
        label = art.all_genders[i]
        if label == "male":
            print(name,art.all_genders[i],art.all_percent[i])
            count_m += 1
        elif label == "female":
            print(name,art.all_genders[i],art.all_percent[i])
            count_f += 1
        else:
            # "init" and any other label count as not categorized
            count += 1

print('Total of uncathegorized names:', count)
print('Total of female author names:', count_f)
print('Total of male author names:', count_m)

In [None]:
# The following part may be moved to a new notebook. It is from the analysis of Pico et al. 2020.

In [None]:
# Determine how many male/female/unknown names exist in the guessed names
# printed out for each journal.


# Journal identifiers, in the order in which the per-journal results are reported.
journal_names = [
    "Tectonophysics",
    "Physics+of+the+Earth+and+Planetary+Interiors",
    "Earth+and+Planetary+Science+Letters",
    "SolidEarth",
    "GEOPHYSICS",
    "NatureGeoscience",
    "GRL",
    "JGRSolidEarth",
    "G3",
    "GJI",
    "Nature",
    "Bulletin+of+the+Seismological+Society+of+America",
    "Seismological+Research+Letters",
]


# Per journal: tally the gender label of the first guessed author of each file.
for journal in journal_names:
    binary_ratio_first_names = {'male':0, 'female':0, 'None':0}

    for _, _, files in os.walk(GUESSED_NAMES_DIR):
        for file in files:

            # NOTE(review): prefix matching lets "Nature" also match files that
            # start with "NatureGeoscience" - confirm against the file naming.
            if not file.startswith(journal):
                continue

            with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
                guess = guessed_from_map(json.loads(infile.read()))

            # The guessed authors are stored as the list `guessed_names`
            # (that is how the rest of this notebook reads them);
            # the first entry is the first author.
            if not guess.guessed_names:
                print("empty first name?:", guess.guessed_names)
                continue
            first_author = guess.guessed_names[0]
            first_name = first_author.split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", first_author)
                continue
            try:
                gr = name_to_gr[first_name]
                binary_ratio_first_names[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


    print(journal, binary_ratio_first_names)

In [None]:
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal.


# Per journal: tally the gender label of each parsed article's first author
# and count how many first authors are initialed.
for journal in journal_names:
    initialed_name_count = 0
    binary_ratio_first_names = dict.fromkeys(('male', 'female', 'None'), 0)

    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:

            if not file.startswith(journal):
                continue

            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                article = article_from_map(json.load(infile))

            first_name = article.first_author.split(" ")[0]
            if not first_name:
                print("empty first name?:", article.first_author)
            elif is_initialed_name(first_name):
                initialed_name_count += 1
            else:
                try:
                    gr = name_to_gr[first_name]
                    binary_ratio_first_names[gr.binary] += 1
                except KeyError:
                    print("not found in map:", first_name)


    print(journal, binary_ratio_first_names, "initialed name count:", initialed_name_count)

In [None]:
# Same as above but split into year/month bins
#
# Determine how many male/female/unknown names exist for the un-initialed names
# printed out for each journal.

# journal -> year -> month -> counts of male / female / None / initialed first authors
journal_tallys = {}

for journal in journal_names:

    for _, _, files in os.walk(PARSED_PAGES_DIR):
        for file in files:

            if not file.startswith(journal):
                continue

            with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
                article = article_from_map(json.loads(infile.read()))

            # Fetch the counter for this journal/year/month, creating it on first use.
            tally = journal_tallys.setdefault(journal, {}).setdefault(article.year, {}).setdefault(article.month, {'male':0, 'female':0, 'None':0, 'initialed':0})

            first_name = article.first_author.strip().split(" ")[0]
            if len(first_name) == 0:
                print("empty first name?:", article.first_author)
                continue
            if is_initialed_name(first_name):
                tally['initialed'] += 1
                continue
            try:
                gr = name_to_gr[first_name]
                tally[gr.binary] += 1
            except KeyError:
                print("not found in map:", first_name)


for journal, values in journal_tallys.items():
    print(journal, values)