In [None]:
import re

from bs4 import BeautifulSoup

import selenium
from selenium import webdriver

import time

import gender
from gender import getGenders

import codecs

import os

import string
import json

In [None]:
# Configure local paths
#
# The working directory is taken from os.getcwd() rather than the IPython
# shell escape `! pwd`, so this cell also works where no `pwd` command exists
# and when the notebook is exported to a plain Python script.
root = os.getcwd()

print("using root directory:", root)

# Location of the chromedriver binary handed to Selenium when the browser starts.
CHROME_DRIVER_PATH = root + "/deps/chromedriver"

In [None]:
# Output directories. Both keep their trailing slash because file names are
# appended to them by plain string concatenation further down.
EGU_DIR = root + "/egu/"
PARSED_EGU_DIR = root + "/egu_parsed/"

# exist_ok=True makes re-running this cell a no-op and removes the gap between
# "does it exist?" and "create it"; it also fails loudly if the path exists
# but is not a directory.
os.makedirs(EGU_DIR, exist_ok=True)
os.makedirs(PARSED_EGU_DIR, exist_ok=True)

In [None]:
# Initialize webdriver
# Chrome is launched through the chromedriver binary at CHROME_DRIVER_PATH,
# with "--incognito" added to its command-line arguments.
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
# NOTE(review): the `executable_path` keyword is believed to be deprecated in
# Selenium 4 — confirm against the Selenium version this notebook is run with.
browser = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=option)

In [None]:
# years to collect: 2010 through 2020 inclusive, kept as strings because they
# are substituted into the search URL and joined into article ids.
# "2021" is currently disabled; raise the upper bound to 2022 to include it.
years = [str(year_number) for year_number in range(2010, 2021)]


In [None]:
# Initialize and define useful functions and
# data structures.

def is_initialed_name(name):
    """Return True when the first term of ``name`` is an initial such as "J.".

    The first whitespace-separated term counts as an initial when it ends with
    a period and the text before that period is upper case ("J.", "J.-P.").

    Args:
        name: author name as a string.

    Returns:
        bool: True for names like "J. Smith", False for names like
        "Joe Smith" and for empty or whitespace-only input.
    """
    # split() with no argument splits on any run of whitespace (tabs,
    # non-breaking spaces, ...) and ignores leading whitespace, so
    # " J. Smith" and "J.\tSmith" are recognised as well.
    terms = name.split()
    if not terms:
        return False
    first_term = terms[0]
    return first_term.endswith(".") and first_term[:-1].isupper()

# Smoke check for is_initialed_name: printed for the reader, and asserted so a
# regression stops the notebook instead of scrolling past unnoticed.
print("test is_initialed_name- True:", is_initialed_name("J. Smith"), ", False:", is_initialed_name("Joe Smith"))
assert is_initialed_name("J. Smith") and not is_initialed_name("Joe Smith")

def contains_initialed_name(names):
    """Return True if at least one entry of ``names`` is an initialed name.

    Each entry is checked with is_initialed_name; an empty iterable gives False.
    """
    return any(is_initialed_name(candidate) for candidate in names)

# Smoke check for contains_initialed_name: printed for the reader, and asserted
# so a regression stops the notebook instead of scrolling past unnoticed.
print("test contains_initialed_name- True", contains_initialed_name(["J. Smith", "Cat Meowins"]), ", False:", contains_initialed_name(["Joe Smith", "Cat Meowins"]))
assert contains_initialed_name(["J. Smith", "Cat Meowins"])
assert not contains_initialed_name(["Joe Smith", "Cat Meowins"])

def clean_name(name):
    """Normalise an author name.

    Steps, in order:
      * strip leading/trailing whitespace;
      * replace every run of whitespace (tabs, newlines, non-breaking
        spaces, ...) with a single ASCII space;
      * if the name does not start with an initial, strip periods from the
        ends of its first term ("Colin. J. Cats" -> "Colin J. Cats");
      * strip periods from both ends of the whole name.

    Args:
        name: raw author name as a string.

    Returns:
        str: the cleaned name.
    """
    # r'\s+' (not r'\s') so that "John \t Smith" becomes "John Smith" instead
    # of keeping one space per original whitespace character.
    name = re.sub(r'\s+', ' ', name.strip())
    if not is_initialed_name(name):
        terms = name.split(" ")
        terms[0] = terms[0].strip(".")
        name = " ".join(terms)
    return name.strip('.')

# Smoke check for clean_name: printed for the reader, and asserted so a
# regression stops the notebook instead of scrolling past unnoticed.
print("test clean name- Colin. J. Cats:", clean_name("Colin. J. Cats"), "W. B. Easy:", clean_name("W. B. Easy"))
assert clean_name("Colin. J. Cats") == "Colin J. Cats"
assert clean_name("W. B. Easy") == "W. B. Easy"


class Article:
    """A single parsed article/abstract record.

    Attributes:
        first_author: cleaned name of the first author.
        names: cleaned names of all authors, in the order given.
        year, month, title, journal: stored exactly as passed in.
        id: identifier built from journal, year, month and the title with
            ASCII punctuation removed and spaces replaced by underscores.
        has_initials: True when the cleaned first author starts with an
            initial (see is_initialed_name).
    """

    def __init__(self, first_author, all_names, year, month, title, journal):
        """Clean the author names and derive ``id`` and ``has_initials``.

        Args:
            first_author: raw name of the first author.
            all_names: iterable of raw author names.
            year: publication year (string or anything str() accepts).
            month: publication month (string or anything str() accepts).
            title: article title as a string.
            journal: journal / venue label.
        """
        # clean the author names
        # - remove non-ascii whitespace
        # - strip bookend whitespace
        # - strip periods from first names if not an initialed name
        self.first_author = clean_name(first_author)
        self.names = [clean_name(name) for name in all_names]
        self.year = year
        self.month = month
        self.title = title
        self.journal = journal

        # create a unique identifier for this article; str() lets callers pass
        # numeric year/month without "_".join raising TypeError
        title_terms = title.translate(str.maketrans('', '', string.punctuation)).split(" ")
        self.id = "_".join([str(journal), str(year), str(month), "_".join(title_terms)])

        # determine if the first author is given by initials; the cleaned name
        # is used so the flag agrees with the stored first_author even when the
        # raw input carried stray or non-ASCII whitespace
        self.has_initials = is_initialed_name(self.first_author)

    def last_name_set(self):
        """Return a set holding the last space-separated term of every author name."""
        return {name.split(" ")[-1] for name in self.names}

    def to_map(self):
        """Return the article as a plain dict suitable for json.dumps."""
        return {
            "first_author": self.first_author,
            "all_names": self.names,
            "year": self.year,
            "month": self.month,
            "title": self.title,
            "journal": self.journal,
            "id": self.id,
            "has_initials": self.has_initials,
        }

In [None]:
def fetch_egu_page(browser, url, wait_seconds=2):
    """Load ``url`` in ``browser`` and return its HTML plus a fingerprint.

    Args:
        browser: Selenium WebDriver instance.
        url: address of the results page to load.
        wait_seconds: pause after navigation, before the page is read, so the
            results have time to render. Defaults to 2.

    Returns:
        tuple[str, str]: ``(page_source, fingerprint)`` where the fingerprint
        is the text of every "s-results-title" element joined by spaces.
        ``("", "")`` is returned when the browser ends up on a URL other than
        the one requested.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    time.sleep(wait_seconds)

    # create a fingerprint for this page
    # find_elements(by, value) is used instead of find_elements_by_class_name,
    # which was removed in Selenium 4.3; "class name" is the value of
    # selenium.webdriver.common.by.By.CLASS_NAME, spelled out so that no extra
    # import is required.
    title_elements = browser.find_elements("class name", "s-results-title")
    fingerprint = " ".join(element.text for element in title_elements)

    return browser.page_source, fingerprint

# Smoke-test the fetcher on one results page (year 2013, first page).
#
# The URL template is defined here because this cell runs before the scraping
# cell that also assigns `egu_template`; without it a fresh-kernel
# "Restart & Run All" stops here with a NameError. The scraping cell assigns
# the identical value, so keep the two in sync.
egu_template = "https://ui.adsabs.harvard.edu/search/filter_bibstem_facet_fq_bibstem_facet=AND&filter_bibstem_facet_fq_bibstem_facet=bibstem_facet%3A%22EGUGA%22&fq=%7B!type%3Daqp%20v%3D%24fq_bibstem_facet%7D&fq_bibstem_facet=(bibstem_facet%3A%22EGUGA%22)&q=%20abs%3A(seism%20OR%20earthquake)%20%20year%3A{year}&sort=date%20desc%2C%20bibcode%20desc&p_={page}"

url = egu_template.format(year="2013", page=0)
html, fingerprint = fetch_egu_page(browser, url)

#print(fingerprint)

In [None]:

def parse_egu_page(soup, year):
    """Extract Article objects from one parsed results page.

    Every ``div.col-sm-12`` that contains an ``h3.s-results-title`` is treated
    as one result. Authors are read from ``li.article-author`` items inside
    ``ul.all-authors``; text of the form "Last, First" is turned into
    "First Last".

    Args:
        soup: parsed page (BeautifulSoup object, or any object offering the
            same ``find_all`` / ``find`` / ``get_text`` methods).
        year: year label stored on every Article produced.

    Returns:
        list: one Article per result that has a title and at least one author
        entry. ``month`` is always "0" and ``journal`` is always
        "EGU_General_Assembly".
    """
    parsed_articles = []

    for entry in soup.find_all("div", class_="col-sm-12"):
        title_tags = entry.find_all("h3", class_="s-results-title")
        if len(title_tags) == 0:
            #print("hit empty title")
            continue
        title = str(title_tags[0].get_text())

        # find() returns None when the result has no author list at all;
        # treat that like an empty list instead of crashing on None.find_all.
        author_field = entry.find("ul", class_="all-authors")
        if author_field is None:
            author_items = []
        else:
            author_items = author_field.find_all("li", class_="article-author")

        authors = []
        for item in author_items:
            # Trim whitespace before AND after removing the ";" separator so
            # "Smith, J.; " is handled the same way as "Smith, J.;".
            raw_author = item.get_text().strip().strip(";").strip()
            # "Last, First" -> "First Last"; parts are stripped one by one so
            # the spaces that follow the commas do not pile up in the result.
            parts = [part.strip() for part in raw_author.split(",")]
            name = " ".join(part for part in reversed(parts) if part)
            authors.append(str(name))

        if len(authors) == 0:
            print("hit empty authors. title:", title)
            continue

        article = Article(
                first_author=authors[0],
                all_names=authors,
                year=year,
                month="0",
                title=title,
                journal="EGU_General_Assembly")
        parsed_articles.append(article)
    return parsed_articles


In [None]:

# Search URL on ui.adsabs.harvard.edu restricted to bibstem "EGUGA", with the
# query abs:(seism OR earthquake) and results sorted by date, descending.
# {year} and {page} are filled in per request with str.format.
egu_template = "https://ui.adsabs.harvard.edu/search/filter_bibstem_facet_fq_bibstem_facet=AND&filter_bibstem_facet_fq_bibstem_facet=bibstem_facet%3A%22EGUGA%22&fq=%7B!type%3Daqp%20v%3D%24fq_bibstem_facet%7D&fq_bibstem_facet=(bibstem_facet%3A%22EGUGA%22)&q=%20abs%3A(seism%20OR%20earthquake)%20%20year%3A{year}&sort=date%20desc%2C%20bibcode%20desc&p_={page}"

#page starts from 0


# Fingerprint (joined result titles) of the most recently processed page;
# a page with the same fingerprint ends paging for the current year.
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    # At most 151 pages (0..150) are requested per year.
    # NOTE(review): 151 is an unexplained cap — presumably a safety upper bound; confirm.
    for page in range(151):
        print("page:", page)
        url = egu_template.format(year=year, page=page)
        html, fingerprint = fetch_egu_page(browser, url)

        if len(html) == 0:
            # fetch_egu_page returned the empty pair: the browser did not end
            # up on the requested URL. Skip this page and try the next one.
            print("nothing to save for", url)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next year
            print("done on page", page)
            break
        if len(fingerprint) == 0:
            # no result titles were found on the page; stop paging this year
            print("no articles found")
            break
        
        previous_fingerprint = fingerprint
        
        soup = BeautifulSoup(html, "html.parser")
        
        # Write one JSON file per parsed article into PARSED_EGU_DIR.
        for article in parse_egu_page(soup, year):
            # NOTE(review): the file name is the first 80 characters of the id,
            # so two articles whose ids share that prefix map to the same file
            # and the later one overwrites the earlier — confirm this is acceptable.
            outfile_name = article.id[:80]+".json"
            with codecs.open(PARSED_EGU_DIR+outfile_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(article.to_map()))
                
        # pause between page requests
        time.sleep(2)        
            
print("Done!")
        
       
