In [None]:
import calendar
import codecs
import os
import re
import time

import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

import gender
from gender import getGenders

In [None]:
# Configure local paths

# Resolve the working directory in pure Python instead of shelling out with
# `! pwd`, so this cell does not depend on IPython or on a POSIX shell.
root = os.getcwd()

print("using root directory:", root)

# chromedriver binary is expected at <root>/deps/chromedriver
CHROME_DRIVER_PATH = os.path.join(root, "deps", "chromedriver")

In [None]:
# Shell escape: list the contents of the current working directory.
! ls

In [None]:
class Page:
    """Simple record that groups one page with the search it came from.

    Each constructor argument is stored unchanged on the instance under the
    same name: ``raw``, ``month``, ``year`` and ``journal_id``.
    (``raw`` is presumably the page HTML -- confirm against callers.)
    """

    def __init__(self, raw, month, year, journal_id):
        # Plain attribute copies; no validation or conversion is performed.
        self.raw, self.month = raw, month
        self.year, self.journal_id = year, journal_id
        


In [None]:
# Initialize webdriver

option = webdriver.ChromeOptions()
# Start Chrome with the --incognito command-line switch.
option.add_argument("--incognito")
try:
    # Selenium 4 style: the driver binary is supplied through a Service
    # object (the `executable_path=` keyword was removed in Selenium 4.10).
    browser = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=option)
except TypeError:
    # Selenium 3 does not know the `service=` keyword and raises TypeError
    # before launching anything; fall back to the legacy keyword there.
    browser = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=option)

#### AGU Journals: GRL, JGR: Solid Earth, G3

In [None]:
# Setup to scrape AGU pages, run a test

# RESTRICT TO PAPERS WITH TERMS IN ABSTRACT: seism, earthquake
# Placeholders filled in with str.format: {month}, {year} (used for both the
# "After" and "Before" bounds, i.e. a single calendar month) and {start_page}.
# The publication id is hard-coded to 21699356 in this test template.
template_string = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=Abstract&text1=seism+OR+earthquake&field2=AllField&text2=&field3=AllField&text3=&publication[]=21699356&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"
# All papers (alternative template without the abstract filter; publication id 21698996)
# template_string = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=AllField&text1=&field2=AllField&text2=&field3=AllField&text3=&publication[]=21698996&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"


def fetch_page(browser, url):
    """Load an AGU search-result page and fingerprint its listed titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``publication_title`` joined by single
        spaces (empty string when there are none). Returns ``("", "")`` when,
        after loading, the browser's current URL differs from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "publication_title")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page for May 2013 and keep a copy on disk.
url = template_string.format(year="2013", month="05", start_page=1)
html, fingerprint = fetch_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    



In [None]:
# Journals to scrape. Each entry is [name, publication id]; the id is
# substituted into the {publication} placeholder of agu_search_template.

# RESTRICT TO PAPERS WITH TERMS IN ABSTRACT: seism, earthquake
# Placeholders: {publication}, {month}, {year} (same month/year for both the
# "After" and "Before" bounds) and {start_page}.
agu_search_template = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=Abstract&text1=seism+OR+earthquake&field2=AllField&text2=&field3=AllField&text3=&publication[]={publication}&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"
# All papers (alternative template without the abstract filter)
# agu_search_template = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=AllField&text1=&field2=AllField&text2=&field3=AllField&text3=&publication[]={publication}&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"


# Commented-out entries are journals that are currently not scraped.
journal_templates = [
    #["JGRAtmosphere", "21698996"],
    #["JGREarthSurface","21699011"],
    ["GRL","19448007"],
    #["JGROceans","21699291"],
    ["JGRSolidEarth","21699356"],
    ["G3","15252027"],
    #["JGRSpacePhysics","21699402"],
    #["JGRBioGeoSciences","21698961"],
    #["JGRPlanets","21699100"],
]

In [None]:
# years to collect: "2010" through "2020" inclusive
# (raise the upper bound to 2022 to include "2021")
years = [str(y) for y in range(2010, 2021)]

# months to collect, as zero-padded two-digit strings "01" .. "12"
months = ["{:02d}".format(m) for m in range(1, 13)]

# Last day of each month as a string, index-aligned with `months`.
# February is fixed at "28": leap days are not represented in this table.
days_in_month = [str(d) for d in (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)]

In [None]:
# Collect the pages

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this month".
previous_fingerprint = ""


for journal_name, journal_id in journal_templates:
    print("scraping journal:", journal_name)
    for year in years:
        print("scraping year:", year)
        for month in months:
            print("scraping month:", month)
            for page in range(1,101):
                url = agu_search_template.format(publication=journal_id, year=year, month=month, start_page=page)
                html, fingerprint = fetch_page(browser, url)
                if len(html) == 0:
                    print("nothing to save for", url)
                    # Keep the delay on the failure path as well; otherwise a
                    # run of failed fetches issues requests back to back.
                    time.sleep(2)
                    continue
                if fingerprint == previous_fingerprint:
                    # page already seen, move to the next month
                    break
                previous_fingerprint = fingerprint

                filename = "{name}_{year}_{month}_{page}.html".format(name=journal_name, year=year, month=month, page=page)
                with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                    outfile.write(html)

                # pause between requests
                time.sleep(2)
            

#### Geophysical Journal International 

In [None]:
# template url
# Placeholders: {month}, {year}, {last_day} (the date range runs from day 01
# to {last_day} of the same month) and {start_page}.

gji_template = "https://academic.oup.com/gji/search-results?sort=Date+%e2%80%93+Newest+First&f_TocHeadingTitle=Seismology&f_ArticleTypeDisplayName=Review+ArticleANDResearch+Article&fl_SiteID=5282&rg_ArticleDate={month}/01/{year}%20TO%20{month}/{last_day}/{year}&dateFilterType=range&noDateTypes=true&qb=%7b%22q%22%3a%22%22%7d&page={start_page}"

In [None]:
# Setup to scrape GJI pages, run a test

def fetch_gji_page(browser, url):
    """Load a GJI search-result page and fingerprint its listed titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``article-link`` joined by single spaces.
        Returns ``("", "")`` when, after loading, the browser's current URL
        differs from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "article-link")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page for January 2013 and keep a copy.
url = gji_template.format(year="2013", month="01", last_day="31", start_page=1)
html, fingerprint = fetch_gji_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_gji_page.html", "w", "utf8") as outfile:
    outfile.write(html)

In [None]:
# Scrape GJI

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this month".
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for month in months:
        print("scraping month:", month)
        # Actual last day of this month in this year. A static table with
        # February = 28 would leave 29 February out of the date range in
        # leap years (2012, 2016, 2020).
        last_day = str(calendar.monthrange(int(year), int(month))[1])
        for page in range(1,101):
            url = gji_template.format(
                year=year,
                month=month,
                last_day=last_day,
                start_page=page)
            html, fingerprint = fetch_gji_page(browser, url)
            if len(html) == 0:
                print("nothing to save for", url)
                # Keep the delay on the failure path as well; otherwise a
                # run of failed fetches issues requests back to back.
                time.sleep(2)
                continue
            if fingerprint == previous_fingerprint:
                # page already seen, move to the next month
                print("done on page", page)
                break
            previous_fingerprint = fingerprint

            filename = "GJI_{year}_{month}_{page}.html".format(year=year, month=month, page=page)
            with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                outfile.write(html)

            # pause between requests
            time.sleep(2)
                

#### GEOPHYSICS

In [None]:
# template
# Placeholders: {month}, {year} (same month/year for both the "After" and
# "Before" bounds) and {start_page}.

geophysics_template = "https://library.seg.org/action/doSearch?field1=Abstract&text1=seism*+OR+earthquake&publication=&publication%5B%5D=gpysa7&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest"

# Be careful: page number starts from 0 for this journal

In [None]:
# Setup to scrape Geophysics pages, run a test

def fetch_geophysics_page(browser, url):
    """Load a GEOPHYSICS search-result page and fingerprint its listed titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``issue-item__title`` joined by single
        spaces. Returns ``("", "")`` when, after loading, the browser's
        current URL differs from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "issue-item__title")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page (index 0) for January 2013 and keep a copy.
url = geophysics_template.format(year="2013", month="01", start_page=0)
html, fingerprint = fetch_geophysics_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_geophysics_page.html", "w", "utf8") as outfile:
    outfile.write(html)

In [None]:
# Scrape Geophysics

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this month".
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for month in months:
        print("scraping month:", month)
        # Page numbering starts at 0 for this journal.
        for page in range(100):
            url = geophysics_template.format(
                year=year,
                month=month,
                start_page=page)
            html, fingerprint = fetch_geophysics_page(browser, url)
            if len(html) == 0:
                print("nothing to save for", url)
                # Keep the delay on the failure path as well; otherwise a
                # run of failed fetches issues requests back to back.
                time.sleep(2)
                continue
            if fingerprint == previous_fingerprint:
                # page already seen, move to the next month
                print("done on page", page)
                break
            previous_fingerprint = fingerprint

            filename = "GEOPHYSICS_{year}_{month}_{page}.html".format(year=year, month=month, page=page)
            with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                outfile.write(html)

            # pause between requests
            time.sleep(2)
                

#### GeoScienceWorld journals: BSSA and SRL

In [None]:
# GeoScienceWorld journal values
# Template placeholders: {page_number}, {journal_name}, {publication},
# {month}, {day}, {year} and {last_day} (date range {day}..{last_day} within
# one month).

gsw_template = "https://pubs.geoscienceworld.org/search-results?page={page_number}&f_JournalDisplayName={journal_name}&fl_ContentType=Journal+Article+OR+Journal+OR+Book+OR+Book+Chapter+OR+GeoRef+Record&fl_JournalID={publication}&rg_PublicationDate={month}%2f{day}%2f{year}+TO+{month}%2f{last_day}%2f{year}&restypeid=3&f_ArticleTypeDisplayName=Research+Article" 

# Each entry is [URL-encoded journal display name, journal id].
# Commented-out entries are journals that are currently not scraped.
gsw_journal_values = [
#    ["Geology", "33"],
#    ["GSA+Bulletin", "35"]
    ["Bulletin+of+the+Seismological+Society+of+America","66"],
    ["Seismological+Research+Letters","65"]
]

In [None]:
# Setup to scrape GSW pages

def fetch_gsw_page(browser, url):
    """Load a GeoScienceWorld search-result page and fingerprint its titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``al-title`` joined by single spaces.
        Returns ``("", "")`` when, after loading, the browser's current URL
        differs from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "al-title")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first SRL result page for March 2016 and keep a copy.
url = gsw_template.format(
    journal_name="Seismological+Research+Letters",
    publication="65",
    year="2016",
    month="03",
    day="01",
    last_day="31",
    page_number=1)
html, fingerprint = fetch_gsw_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_gsw_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    



In [None]:
# Scrape GeoScienceWorld

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this month".
previous_fingerprint = ""

for journal_name, journal_id in gsw_journal_values:
    print("scraping journal:", journal_name)
    for year in years:
        print("scraping year:", year)
        for month in months:
            print("scraping month:", month)
            # Actual last day of this month in this year. A static table with
            # February = 28 would leave 29 February out of the date range in
            # leap years (2012, 2016, 2020).
            last_day = str(calendar.monthrange(int(year), int(month))[1])
            for page in range(1,101):
                url = gsw_template.format(
                    journal_name=journal_name,
                    publication=journal_id,
                    year=year,
                    month=month,
                    day="01",
                    last_day=last_day,
                    page_number=page)
                html, fingerprint = fetch_gsw_page(browser, url)
                if len(html) == 0:
                    print("nothing to save for", url)
                    # Keep the delay on the failure path as well; otherwise a
                    # run of failed fetches issues requests back to back.
                    time.sleep(2)
                    continue
                if fingerprint == previous_fingerprint:
                    # page already seen, move to the next month
                    print("done on page", page)
                    break
                previous_fingerprint = fingerprint

                filename = "{name}_{year}_{month}_{page}.html".format(name=journal_name, year=year, month=month, page=page)
                with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                    outfile.write(html)

                # pause between requests
                time.sleep(2)
                

#### Nature Geoscience

In [None]:
# setup to scrape nature geoscience
# Placeholders: {year} (used for both ends of date_range, i.e. one calendar
# year per query) and {page}. Results are requested with order=relevance.

ngs_template = "https://www.nature.com/search?article_type=research%2Creviews&date_range={year}-{year}&journal=ngeo&subject=seismology&order=relevance&page={page}" 


def fetch_ngs_page(browser, url):
    """Load a Nature Geoscience search-result page and fingerprint its titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``h3`` joined by single spaces. Returns
        ``("", "")`` when, after loading, the browser's current URL differs
        from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    # NOTE(review): this matches elements whose CSS class is "h3", not <h3>
    # tags -- confirm that is what the result markup uses.
    elements = browser.find_elements("class name", "h3")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page for 2015 and keep a copy on disk.
url = ngs_template.format(year="2015", page=1)
html, fingerprint = fetch_ngs_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_ngs_page.html", "w", "utf8") as outfile:
    outfile.write(html)

print("fingerprint:", fingerprint)


In [None]:
# scrape nature geoscience

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this year".
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1,101):
        url = ngs_template.format(year=year, page=page)
        html, fingerprint = fetch_ngs_page(browser, url)
        if len(html) == 0:
            print("nothing to save for", url)
            # Keep the delay on the failure path as well; otherwise a run of
            # failed fetches issues requests back to back.
            time.sleep(2)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next year
            print("done on page", page)
            break
        previous_fingerprint = fingerprint

        # This search is per year, so the month slot of the file name is "0".
        filename = "{name}_{year}_{month}_{page}.html".format(name="NatureGeoscience", year=year, month="0", page=page)
        with codecs.open("pages/"+filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)


#### Nature

In [None]:
# setup to scrape nature 
# Placeholders: {year} (used for both ends of date_range, i.e. one calendar
# year per query) and {page}. Results are requested with order=relevance.

nature_template = "https://www.nature.com/search?article_type=research%2Creviews&date_range={year}-{year}&journal=nature&subject=seismology&order=relevance&page={page}" 


def fetch_nature_page(browser, url):
    """Load a Nature search-result page and fingerprint its listed titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``h3`` joined by single spaces. Returns
        ``("", "")`` when, after loading, the browser's current URL differs
        from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    # NOTE(review): this matches elements whose CSS class is "h3", not <h3>
    # tags -- confirm that is what the result markup uses.
    elements = browser.find_elements("class name", "h3")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page for 2015 and keep a copy on disk.
url = nature_template.format(year="2015", page=1)
html, fingerprint = fetch_nature_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_nature_page.html", "w", "utf8") as outfile:
    outfile.write(html)

print("fingerprint:", fingerprint)


In [None]:
# scrape nature 

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this year".
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1,101):
        url = nature_template.format(year=year, page=page)
        html, fingerprint = fetch_nature_page(browser, url)
        if len(html) == 0:
            print("nothing to save for", url)
            # Keep the delay on the failure path as well; otherwise a run of
            # failed fetches issues requests back to back.
            time.sleep(2)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next year
            print("done on page", page)
            break
        previous_fingerprint = fingerprint

        # This search is per year, so the month slot of the file name is "0".
        filename = "{name}_{year}_{month}_{page}.html".format(name="Nature", year=year, month="0", page=page)
        with codecs.open("pages/"+filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)


#### Solid Earth

In [None]:
# Search URL for Solid Earth. Placeholders: {year} (used for both startYear
# and endYear, i.e. one calendar year per query) and {page}.
se_template = "https://editor.copernicus.org/search.php?abstract=earthquake+seism&startYear={year}&endYear={year}&paperVersion=final&journal=431&page={page}"

def fetch_se_page(browser, url):
    """Load a Solid Earth search-result page and fingerprint its titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``article-title`` joined by single
        spaces. Returns ``("", "")`` when, after loading, the browser's
        current URL differs from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "article-title")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch result page 2 for 2015 and keep a copy on disk.
url = se_template.format(year="2015", page=2)
html, fingerprint = fetch_se_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_se_page.html", "w", "utf8") as outfile:
    outfile.write(html)

print("fingerprint:", fingerprint)

In [None]:
# scrape Solid Earth 

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Fingerprint of the most recently saved page. Getting the same fingerprint
# again is treated as "no more new results for this year".
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1,101):
        url = se_template.format(year=year, page=page)
        html, fingerprint = fetch_se_page(browser, url)
        if len(html) == 0:
            print("nothing to save for", url)
            # Keep the delay on the failure path as well; otherwise a run of
            # failed fetches issues requests back to back.
            time.sleep(2)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next year
            print("done on page", page)
            break
        previous_fingerprint = fingerprint

        # This search is per year, so the month slot of the file name is "0".
        filename = "{name}_{year}_{month}_{page}.html".format(name="SolidEarth", year=year, month="0", page=page)
        with codecs.open("pages/"+filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)


#### ScienceDirect journals: Quaternary Science Reviews and Geochimica et Cosmochimica Acta

In [None]:
# setup for science direct scraping

# note: offset is (page-1)*100
# Placeholders: {journal}, {journal_id}, {year} and {offset}; the URL asks
# for 100 results per request (show=100), ordered by relevance.
science_direct_template = "https://www.sciencedirect.com/search/advanced?pub={journal}&cid={journal_id}&date={year}&articleTypes=REV%2CFLA&show=100&sortBy=relevance&offset={offset}"

# Each entry is [URL-encoded journal name, journal id (the `cid` parameter)].
sd_journal_values = [
    ["Quaternary%20Science%20Reviews", "271861"],
    ["Geochimica%20et%20Cosmochimica%20Acta", "271865"]
]


def fetch_sd_page(browser, url):
    """Load a ScienceDirect search-result page and fingerprint its titles.

    Args:
        browser: Selenium WebDriver (any object exposing ``get``,
            ``current_url``, ``find_elements`` and ``page_source``).
        url: Address of the result page to load.

    Returns:
        A ``(page_source, fingerprint)`` tuple. ``fingerprint`` is the text of
        every element with CSS class ``result-list-title-link`` joined by
        single spaces. Returns ``("", "")`` when, after loading, the browser's
        current URL differs from ``url``.
    """
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""

    # create a fingerprint for this page
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call exists in Selenium 3 and 4, while
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "result-list-title-link")
    fingerprint = " ".join(e.text for e in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first batch of 2015 results for the first configured
# journal and keep a copy on disk.
journal, journal_id = sd_journal_values[0]
url = science_direct_template.format(journal=journal, journal_id=journal_id, year="2015", offset=0)
html, fingerprint = fetch_sd_page(browser, url)

# Create the output folder on first use so the write below cannot fail with
# FileNotFoundError when the directory is missing.
os.makedirs("test", exist_ok=True)
with codecs.open("test/test_sd_page.html", "w", "utf8") as outfile:
    outfile.write(html)

print("fingerprint:", fingerprint)

In [None]:
# scrape science direct

# Create the output folder up front so the first write cannot fail.
os.makedirs("pages", exist_ok=True)

# Initialise the fingerprint here so the cell does not depend on a value left
# behind by whichever scraping cell happened to run before it.
previous_fingerprint = ""

for journal, journal_id in sd_journal_values:
    print("scraping journal:", journal)
    for year in years:
        print("scraping year:", year)
        for page in range(1,101):
            # 100 results per request, so page N starts at offset (N-1)*100.
            url = science_direct_template.format(journal=journal, journal_id=journal_id, year=year, offset=((page-1)*100))
            html, fingerprint = fetch_sd_page(browser, url)
            if len(html) == 0:
                print("nothing to save for", url)
                # Keep the delay on the failure path as well; otherwise a run
                # of failed fetches issues requests back to back.
                time.sleep(2)
                continue
            if fingerprint == previous_fingerprint:
                # page already seen, move to the next year
                print("done on page", page)
                break
            previous_fingerprint = fingerprint

            # This search is per year, so the month slot of the file name is "0".
            filename = "{name}_{year}_{month}_{page}.html".format(name=journal, year=year, month="0", page=page)
            with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                outfile.write(html)

            # pause between requests
            time.sleep(2)

In [None]:
# Code copied from scrape_webjournals
# Collects the first listed author name of every result into `names`.

# Placeholders: {after_year}, {before_year} and {start_page}; both month
# bounds are fixed to 05.
# NOTE(review): the loop below queries May of `year` through May of `year+1`;
# if both bounds are inclusive, May is covered by two consecutive windows --
# confirm whether results are double counted.
template_string = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=AllField&text1=&field2=AllField&text2=&field3=AllField&text3=&publication[]=21698996&Ppub=&AfterMonth=05&AfterYear={after_year}&BeforeMonth=05&BeforeYear={before_year}&startPage={start_page}&sortBy=Earliest"

names = []

names_from_one_page = []
# Concatenated names of the previously accepted page; the initial value is a
# sentinel that no real page is expected to produce.
names_string = "the-place-holder"

for year in range(2013,2019):
    print("scraping year", year)
    for i in range(1,101):
        names_from_one_page = []
        # The last result page repeats itself, so the end of the results is
        # detected further down by comparing this page's names with those of
        # the previously accepted page.
        print("page", i)
        browser.get(
            template_string.format(
                after_year=year,
                before_year=year+1,
                start_page=i
            )
        )
        # "class name" is the locator-strategy string behind By.CLASS_NAME.
        # The generic find_element(s)(by, value) calls exist in Selenium 3 and
        # 4, while the find_*_by_class_name() helpers were removed in 4.3.
        author_divs = browser.find_elements("class name", "meta__authors")
        if len(author_divs) == 0:
            print("ending on page", i, "for year", year)
            break
        for c in author_divs:
            try:
                name = c.find_element("class name", "hlFld-ContribAuthor").text
                names_from_one_page.append(name)
            except Exception:
                # Best effort: report the entry and carry on. Catching
                # Exception (not a bare except) keeps Ctrl-C / kernel
                # interrupts working.
                print("failed to find class name:", repr(c.text))

        if len(names_from_one_page) == 0:
            # Keep the delay on this path too before requesting the next page.
            time.sleep(2)
            continue

        new_name_string = "".join(names_from_one_page)
        if names_string == new_name_string:
            print("ending on page", i, "for year", year, "because the page repeated")
            break
        names_string = new_name_string

        # We check that the page was not repeated before saving any of the
        # names in the _names_ list.
        names.extend(names_from_one_page)

        print("found names so far:", len(names))
        time.sleep(2)
        