In [1]:
import re

from bs4 import BeautifulSoup

import selenium
from selenium import webdriver

import time

import gender
from gender import getGenders

import codecs

import os

In [60]:
# Configure local paths

root = ! pwd
root = root[0]

print("using root directory:", root)

CHROME_DRIVER_PATH=root+"/deps/chromedriver"

using root directory: /home/lermert/code/geoscience-first-authorship


In [61]:
! ls

 deps	        parsed			    README.md
 gender.py     'Parse Journal Data.ipynb'   requirements.txt
 guessed        parsejournals.ipynb	   'Scrape Journals Save Pages.ipynb'
 name_genders   picos			    test
 pages	        __pycache__


In [62]:
if not os.path.exists(root+"/pages"):
    os.mkdir(root+"/pages") 

In [63]:
class Page:
    def __init__(self, raw, month, year, journal_id):
        self.raw = raw
        self.month = month
        self.year = year
        self.journal_id = journal_id
        


In [64]:
# Initialize webdriver
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
browser = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=option)

In [56]:
# years to collect
years = [
    "2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
    "2018",
    "2019",
    "2020",
    "2021"
    
]

# months to collect
months = [
    "01",
    "02",
    "03",
    "04",
    "05",
    "06",
    "07",
    "08",
    "09",
    "10",
    "11",
    "12"
]

days_in_month = [
    "31",
    "28",
    "31",
    "30",
    "31",
    "30",
    "31",
    "31",
    "30",
    "31",
    "30",
    "31"
]

#### AGU Journals: GRL, JGR: Solid Earth, G3

In [13]:
# Journals to scrape. [name, template]

# RESTRICT TO PAPERS WITH TERMS IN ABSTRACT: seism, earthquake
agu_search_template = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=Abstract&text1=seism+OR+earthquake&field2=AllField&text2=&field3=AllField&text3=&publication[]={publication}&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"
# All papers
# agu_search_template = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=AllField&text1=&field2=AllField&text2=&field3=AllField&text3=&publication[]={publication}&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"


journal_templates = [
    ["GRL","19448007"],
    ["JGRSolidEarth","21699356"],
    ["G3","15252027"],
]

In [14]:
# Setup to scrape AGU pages, run a test

def fetch_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("publication_title")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = agu_search_template.format(publication="G3", year="2013", month="05", start_page=1)
html, fingerprint = fetch_page(browser, url)

with codecs.open("test/test_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    



In [15]:
# Collect the pages

previous_fingerprint = ""


for journal_name, journal_id in journal_templates:
    print("scraping journal:", journal_name)
    for year in years:
        print("scraping year:", year)
        for month in months:
            print("scraping month:", month)
            for page in range(1,101):
                url = agu_search_template.format(publication=journal_id, year=year, month=month, start_page=page)
                html, fingerprint = fetch_page(browser, url)
                if len(html) == 0:
                    print("nothing to save for", url)
                    continue
                if fingerprint == previous_fingerprint:
                    print("done on page", page)
                    # page already seen, move to the next month
                    break
                if len(fingerprint) == 0:
                    # page already seen, move to the next month
                    print("done on page", page)
                    break
                previous_fingerprint = fingerprint

                filename = "{name}_{year}_{month}_{page}.html".format(name=journal_name, year=year, month=month, page=page)
                with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                    outfile.write(html)

                time.sleep(2)
            

scraping journal: GRL
scraping year: 2021
scraping month: 01
scraping journal: JGRSolidEarth
scraping year: 2021
scraping month: 01
scraping journal: G3
scraping year: 2021
scraping month: 01


In [16]:
! ls -ltr pages | tail -3

-rw-rw-r-- 1 lermert lermert 310141 May  1 14:50 GRL_2021_01_1.html
-rw-rw-r-- 1 lermert lermert 350380 May  1 14:50 JGRSolidEarth_2021_01_1.html
-rw-rw-r-- 1 lermert lermert 330139 May  1 14:50 G3_2021_01_1.html


#### Geophysical Journal International 

In [57]:
# template url

gji_template = "https://academic.oup.com/gji/search-results?sort=Date+%e2%80%93+Newest+First&f_TocHeadingTitle=Seismology&f_ArticleTypeDisplayName=Review+ArticleANDResearch+Article&fl_SiteID=5282&rg_ArticleDate={month}/01/{year}%20TO%20{month}/{last_day}/{year}&dateFilterType=range&noDateTypes=true&qb=%7b%22q%22%3a%22%22%7d&page={start_page}"

In [58]:
# Setup to scrape GJI pages, run a test

def fetch_gji_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("article-link")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = gji_template.format(year="2013", month="01", last_day="31", start_page=1)
html, fingerprint = fetch_gji_page(browser, url)

with codecs.open("test/test_gji_page.html", "w", "utf8") as outfile:
    outfile.write(html)

unexpected page url.
 current: https://academic.oup.com/crawlprevention/governor?content=%2fgji%2fsearch-results%3fsort%3dDate%2b%25E2%2580%2593%2bNewest%2bFirst%26f_TocHeadingTitle%3dSeismology%26f_ArticleTypeDisplayName%3dReview%2bArticleANDResearch%2bArticle%26fl_SiteID%3d5282%26rg_ArticleDate%3d01%2f01%2f2013%2520TO%252001%2f31%2f2013%26dateFilterType%3drange%26noDateTypes%3dtrue%26qb%3d%257B%2522q%2522%253a%2522%2522%257D%26page%3d1 
 expected: https://academic.oup.com/gji/search-results?sort=Date+%e2%80%93+Newest+First&f_TocHeadingTitle=Seismology&f_ArticleTypeDisplayName=Review+ArticleANDResearch+Article&fl_SiteID=5282&rg_ArticleDate=01/01/2013%20TO%2001/31/2013&dateFilterType=range&noDateTypes=true&qb=%7b%22q%22%3a%22%22%7d&page=1


In [65]:
# Scrape GJI


previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for i, month in enumerate(months):
        print("scraping month:", month)
        last_day = days_in_month[i]
        for page in range(1,101):
            url = gji_template.format(
                year=year,
                month=month,
                last_day=last_day,
                start_page=page)
            html, fingerprint = fetch_gji_page(browser, url)
            if len(html) == 0:
                print("nothing to save for", url)
                continue
            if fingerprint == previous_fingerprint:
                # page already seen, move to the next month
                print("done on page", page)
                break
            if len(fingerprint) == 0:
                # page already seen, move to the next month
                print("done on page", page)
                break
            previous_fingerprint = fingerprint

            filename = "GJI_{year}_{month}_{page}.html".format(year=year, month=month, page=page)
            with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                outfile.write(html)

            time.sleep(5.5)
                

scraping year: 2010
scraping month: 01
unexpected page url.
 current: https://academic.oup.com/crawlprevention/governor?content=%2fgji%2fsearch-results%3fsort%3dDate%2b%25E2%2580%2593%2bNewest%2bFirst%26f_TocHeadingTitle%3dSeismology%26f_ArticleTypeDisplayName%3dReview%2bArticleANDResearch%2bArticle%26fl_SiteID%3d5282%26rg_ArticleDate%3d01%2f01%2f2010%2520TO%252001%2f31%2f2010%26dateFilterType%3drange%26noDateTypes%3dtrue%26qb%3d%257B%2522q%2522%253a%2522%2522%257D%26page%3d1 
 expected: https://academic.oup.com/gji/search-results?sort=Date+%e2%80%93+Newest+First&f_TocHeadingTitle=Seismology&f_ArticleTypeDisplayName=Review+ArticleANDResearch+Article&fl_SiteID=5282&rg_ArticleDate=01/01/2010%20TO%2001/31/2010&dateFilterType=range&noDateTypes=true&qb=%7b%22q%22%3a%22%22%7d&page=1
nothing to save for https://academic.oup.com/gji/search-results?sort=Date+%e2%80%93+Newest+First&f_TocHeadingTitle=Seismology&f_ArticleTypeDisplayName=Review+ArticleANDResearch+Article&fl_SiteID=5282&rg_ArticleDa

KeyboardInterrupt: 

#### GEOPHYSICS

In [34]:
# template

geophysics_template = "https://library.seg.org/action/doSearch?field1=Abstract&text1=seism*+OR+earthquake&publication=&publication%5B%5D=gpysa7&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest"

# Be careful: page number starts from 0 for this journal

In [35]:
# Setup to scrape Geophysics pages, run a test

def fetch_geophysics_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("issue-item__title")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = geophysics_template.format(year="2013", month="01", start_page=0)
html, fingerprint = fetch_geophysics_page(browser, url)

with codecs.open("test/test_geophysics_page.html", "w", "utf8") as outfile:
    outfile.write(html)

unexpected page url.
 current: https://library.seg.org/action/doSearch?field1=Abstract&text1=seism*+OR+earthquake&publication=&publication%5B%5D=gpysa7&Ppub=&AfterMonth=01&AfterYear=2013&BeforeMonth=01&BeforeYear=2013&startPage=0&sortBy=Earliest& 
 expected: https://library.seg.org/action/doSearch?field1=Abstract&text1=seism*+OR+earthquake&publication=&publication%5B%5D=gpysa7&Ppub=&AfterMonth=01&AfterYear=2013&BeforeMonth=01&BeforeYear=2013&startPage=0&sortBy=Earliest


In [37]:
# Scrape Geophysics
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for i, month in enumerate(months):
        print("scraping month:", month)
        for page in range(100):
            url = geophysics_template.format(
                year=year,
                month=month,
                start_page=page)
            html, fingerprint = fetch_geophysics_page(browser, url)
            if len(html) == 0:
                print("nothing to save for", url)
                continue
            if fingerprint == previous_fingerprint:
                # page already seen, move to the next month
                print("done on page", page)
                break
            if len(fingerprint) == 0:
                # page already seen, move to the next month
                print("done on page", page)
                break
            previous_fingerprint = fingerprint

            filename = "GEOPHYSICS_{year}_{month}_{page}.html".format(year=year, month=month, page=page)
            with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                outfile.write(html)

            time.sleep(2)
                

scraping year: 2021
scraping month: 01
done on page 3


#### GeoScienceWorld journal: BSSA and SRL

In [38]:
# GeoScienceWorld journal values

gsw_template = "https://pubs.geoscienceworld.org/search-results?page={page_number}&f_JournalDisplayName={journal_name}&fl_ContentType=Journal+Article+OR+Journal+OR+Book+OR+Book+Chapter+OR+GeoRef+Record&fl_JournalID={publication}&rg_PublicationDate={month}%2f{day}%2f{year}+TO+{month}%2f{last_day}%2f{year}&restypeid=3&f_ArticleTypeDisplayName=Research+Article" 

gsw_journal_values = [
    ["Bulletin+of+the+Seismological+Society+of+America","66"],
    ["Seismological+Research+Letters","65"]
]

In [39]:
# Setup to scrape GSW pages

def fetch_gsw_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("al-title")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = gsw_template.format(
                    journal_name="Seismological+Research+Letters",
                    publication="65",
                    year="2016",
                    month="03",
                    day="01",
                    last_day="31",
                    page_number=1)
html, fingerprint = fetch_gsw_page(browser, url)

with codecs.open("test/test_gsw_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    



In [40]:
# Scrape GeoScienceWorld
previous_fingerprint = ""

for journal_name, journal_id in gsw_journal_values:
    print("scraping journal:", journal_name)
    for year in years:
        print("scraping year:", year)
        for i, month in enumerate(months):
            print("scraping month:", month)
            last_day = days_in_month[i]
            for page in range(1,101):
                url = gsw_template.format(
                    journal_name=journal_name,
                    publication=journal_id,
                    year=year,
                    month=month,
                    day="01",
                    last_day=last_day,
                    page_number=page)
                html, fingerprint = fetch_gsw_page(browser, url)
                if len(html) == 0:
                    print("nothing to save for", url)
                    continue
                if fingerprint == previous_fingerprint:
                    # page already seen, move to the next month
                    print("done on page", page)
                    break
                if len(fingerprint) == 0:
                    # page already seen, move to the next month
                    print("done on page", page)
                    break
                previous_fingerprint = fingerprint

                filename = "{name}_{year}_{month}_{page}.html".format(name=journal_name, year=year, month=month, page=page)
                with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                    outfile.write(html)

                time.sleep(2)
                

scraping journal: Bulletin+of+the+Seismological+Society+of+America
scraping year: 2021
scraping month: 01
done on page 3
scraping journal: Seismological+Research+Letters
scraping year: 2021
scraping month: 01
done on page 4


#### Nature geoscience

In [42]:
# setup to scrape nature geoscience

ngs_template = "https://www.nature.com/search?article_type=research%2Creviews&date_range={year}-{year}&journal=ngeo&subject=seismology&order=relevance&page={page}" 


def fetch_ngs_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("h3")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = ngs_template.format(year="2015", page=1)
html, fingerprint = fetch_ngs_page(browser, url)

with codecs.open("test/test_ngs_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    
# print("fingerprint:", fingerprint)


In [43]:
# scrape nature geoscience

previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1,101):
        url = ngs_template.format(year=year, page=page)
        html, fingerprint = fetch_ngs_page(browser, url)
        if len(html) == 0:
            print("nothing to save for", url)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next month
            print("done on page", page)
            break
        if len(fingerprint) == 0:
            # page already seen, move to the next month
            print("done on page", page)
            break
        previous_fingerprint = fingerprint

        filename = "{name}_{year}_{month}_{page}.html".format(name="NatureGeoscience", year=year, month="0", page=page)
        with codecs.open("pages/"+filename, "w", "utf8") as outfile:
            outfile.write(html)

        time.sleep(2)


scraping year: 2021
done on page 3


#### Nature

In [44]:
# setup to scrape nature 

nature_template = "https://www.nature.com/search?article_type=research%2Creviews&date_range={year}-{year}&journal=nature&subject=seismology&order=relevance&page={page}" 


def fetch_nature_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("h3")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = nature_template.format(year="2015", page=1)
html, fingerprint = fetch_nature_page(browser, url)

with codecs.open("test/test_nature_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    
# print("fingerprint:", fingerprint)


fingerprint: Scale dependence of rock friction at high work rate Lithospheric controls on magma composition along Earth’s longest continental hotspot track Broad plumes rooted at the base of the Earth's mantle beneath major hotspots Western US intermountain seismicity caused by changes in upper mantle flow The role of ridges in the formation and longevity of flat slabs A seismic reflection image for the base of a tectonic plate Seismic evidence of effects of water on melt transport in the Lau back-arc mantle


In [45]:
# scrape nature 

previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1,101):
        url = nature_template.format(year=year, page=page)
        html, fingerprint = fetch_nature_page(browser, url)
        if len(html) == 0:
            print("nothing to save for", url)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next month
            print("done on page", page)
            break
        if len(fingerprint) == 0:
            # page already seen, move to the next month
            print("done on page", page)
            break
        previous_fingerprint = fingerprint

        filename = "{name}_{year}_{month}_{page}.html".format(name="Nature", year=year, month="0", page=page)
        with codecs.open("pages/"+filename, "w", "utf8") as outfile:
            outfile.write(html)

        time.sleep(2)


scraping year: 2021
done on page 3


#### Solid Earth

In [47]:
se_template = "https://editor.copernicus.org/search.php?abstract=earthquake+seism&startYear={year}&endYear={year}&paperVersion=final&journal=431&page={page}"

def fetch_se_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("article-title")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = se_template.format(year="2015", page=2)
html, fingerprint = fetch_se_page(browser, url)

with codecs.open("test/test_se_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    
# print("fingerprint:", fingerprint)

In [48]:
# scrape Solid Earth 

previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1,101):
        url = se_template.format(year=year, page=page)
        html, fingerprint = fetch_se_page(browser, url)
        if len(html) == 0:
            print("nothing to save for", url)
            continue
        if fingerprint == previous_fingerprint:
            # page already seen, move to the next month
            print("done on page", page)
            break
        if len(fingerprint) == 0:
            # page already seen, move to the next month
            print("done on page", page)
            break
        previous_fingerprint = fingerprint

        filename = "{name}_{year}_{month}_{page}.html".format(name="SolidEarth", year=year, month="0", page=page)
        with codecs.open("pages/"+filename, "w", "utf8") as outfile:
            outfile.write(html)

        time.sleep(2)


scraping year: 2021
done on page 5


#### Science AAAS

In [49]:
science_template = "http://classic.sciencemag.org/search?journalcode=sci&volume=&firstpage=&submit=yes&doi=&submit=yes&fulltext=&andorexactfulltext=or&titleabstract=seism%2C+earthquake&andorexacttitleabs=or&title=&andorexacttitle=and&author1=&author2=&fmonth=Jan&fyear={year}&tmonth=Dec&tyear={year}&hits=125&sortspec=date&submit=yes&resourcetype=HWCIT&tocsectionid=Original+Research&submit=yes&submit.x=46&submit.y=8"

In [51]:
def fetch_science_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    time.sleep(10)
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("cit-first-element")
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

url = science_template.format(year="2018")
html, fingerprint = fetch_science_page(browser, url)

with codecs.open("test/test_science_page.html", "w", "utf8") as outfile:
    outfile.write(html)
# print("fingerprint:", fingerprint)

In [52]:
print(url)

http://classic.sciencemag.org/search?journalcode=sci&volume=&firstpage=&submit=yes&doi=&submit=yes&fulltext=&andorexactfulltext=or&titleabstract=seism%2C+earthquake&andorexacttitleabs=or&title=&andorexacttitle=and&author1=&author2=&fmonth=Jan&fyear=2018&tmonth=Dec&tyear=2018&hits=125&sortspec=date&submit=yes&resourcetype=HWCIT&tocsectionid=Original+Research&submit=yes&submit.x=46&submit.y=8


In [None]:
# scrape Science and Science Advances 

for year in years:
    print("scraping year:", year)
    url = science_template.format(year=year)
    html, fingerprint = fetch_science_page(browser, url)
    if len(html) == 0:
        print("nothing to save for", url)
        continue
    if len(fingerprint) == 0:
        print("no articles found")
        break
    filename = "{name}_{year}_{month}_{page}.html".format(name="Science", year=year, month="0", page="1")
    with codecs.open("pages/"+filename, "w", "utf8") as outfile:
        outfile.write(html)

    time.sleep(2)


#### Science Direct

In [54]:
# setup for science direct scraping

# note: offset is (page-1)*100
science_direct_template = "https://www.sciencedirect.com/search?pub={journal}&cid={journal_id}&date={year}&articleTypes=REV%2CFLA&show=100&tak=seismo%20OR%20seismi%20OR%20earthquake&sortBy=relevance&offset={offset}"
sd_journal_values = [
    ["Tectonophysics", "271882"],
    ["Physics+of+the+Earth+and+Planetary+Interiors", "271835"],
    ["Earth+and+Planetary+Science+Letters", "271830"]
    
]

def fetch_sd_page(browser, url):
    browser.get(url)
    if browser.current_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(browser.current_url,url))
        return "", ""
    time.sleep(5)
    # create a fingerprint for this page
    titles = []
    elements = browser.find_elements_by_class_name("result-list-title-link")
    #print(elements)
    for e in elements:
        titles.append(e.text)
    fingerprint = " ".join(titles)

    return browser.page_source, fingerprint

journal, journal_id = sd_journal_values[0]
url = science_direct_template.format(journal=journal, journal_id=journal_id, year="2011", offset=0)
html, fingerprint = fetch_sd_page(browser, url)

with codecs.open("test/test_sd_page.html", "w", "utf8") as outfile:
    outfile.write(html)
    
# print("fingerprint:", fingerprint)

In [55]:
# scrape science direct

previous_fingerprint = ""

for journal, journal_id in sd_journal_values:
    for year in years:
        print("scraping year:", year)
        for page in range(1,101):
            url = science_direct_template.format(journal=journal, journal_id=journal_id, year=year, offset=((page-1)*100))
            html, fingerprint = fetch_sd_page(browser, url)
            if len(html) == 0:
                print("nothing to save for", url)
                continue
            if fingerprint == previous_fingerprint:
                # page already seen, move to the next month
                print("done on page", page)
                break
            
            previous_fingerprint = fingerprint

            filename = "{name}_{year}_{month}_{page}.html".format(name=journal, year=year, month="0", page=page)
            with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                outfile.write(html)

            time.sleep(2)

scraping year: 2021
done on page 3
scraping year: 2021
done on page 3
scraping year: 2021
done on page 1
