In [1]:
### THIS SCRIPT COLLECTS DATA FROM SEP and InPhO ### 

In [1]:
# import dependencies
import time
import pymongo
import re
import json
import requests

from tqdm import tqdm
from pprint import pprint
from pathlib import Path
from bs4 import BeautifulSoup

#import local libraries
import funtions_webscraping as web

In [2]:
##### INIT GLOBAL VARIABLES#####
# Opens the MongoDB connection shared by the cells below; `db` is the handle
# that the later cells read their collections from.

#init Mongo
# connection string points at a MongoDB server on this machine (port 27017)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

#connect to database
# attribute access on the client selects the database named "visualizing_sep"
db = client.visualizing_sep

In [31]:
def scrape_sep_toc(sep_edition, search_url, base_url, save_as_html):
    """ Downloads the SEP TOC (or What's New) page for one SEP edition and
        lists the article links found on it.

        The page is fetched and saved through web.scrape_web_page; every
        element whose href starts with 'entries/' is then collected once,
        in the order it first appears on the page.

        Keyword Arguments:
        sep_edition -- Quarter and Year for the SEP archive. Not used inside
                       this function; kept so existing callers keep working.
        search_url -- URL for the specific TOC file to download
        base_url -- base URL for all the articles in the current archive
        save_as_html -- Local file path and name to save the downloaded file to.

        Returns:
        List of dicts {'link_url': '/' + href, 'base_url': base_url}, one per
        unique href in the TOC.
     """

    #scrape TOC and save as local HTML file
    #(assumes web.scrape_web_page returns a BeautifulSoup object -- the helper
    # is defined in a local module that is not shown here)
    soup = web.scrape_web_page(search_url, save_as_html)

    #get only links to articles in the SEP (ie, only those hrefs that begin with 'entries/')
    links_in_toc = soup.find_all(href=re.compile('^entries/'))

    #list to hold all unique TOC links, plus a set for constant-time
    #"already seen?" checks. base_url is the same for every entry, so the
    #href alone decides whether an entry is a duplicate.
    toc_unique_hrefs = []
    seen_hrefs = set()

    for link in links_in_toc:
        href = link['href']
        if href in seen_hrefs:
            continue

        seen_hrefs.add(href)
        toc_unique_hrefs.append({ 'link_url': "/" + href,
                                  'base_url': base_url })

    return toc_unique_hrefs

In [36]:
### SCRAPE TOC FROM SEP AND SAVE TO MONGO ###
### Downloaded Winter 2019 edition: Beginning of January, then again on 3.30.2020 ###
# NOTE(review): each run of this cell calls insert_many() on the same
# collection again, so re-running it presumably adds the TOC entries a second
# time unless the collection was emptied first -- confirm before re-running.

#variables for this download
# collection that receives one document per TOC link
toc_collection = db.toc_2019_winter_all
sep_edition = 'Winter_2019'
# full table of contents of the Winter 2019 archive
search_url = 'https://plato.stanford.edu/archives/win2019/contents.html'
base_url = 'https://plato.stanford.edu/archives/win2019/'
# local copy of the downloaded TOC page
save_as_html = 'data_collection/toc_2019_winter_all.html'

print(f'Starting: SEP {sep_edition} TOC Scrape\n')

# scrape the TOC, then store the resulting list of link dicts in MongoDB
toc_elements = scrape_sep_toc(sep_edition, search_url, base_url, save_as_html)
toc_collection.insert_many(toc_elements)

print(f'Success! Completed SEP {sep_edition} TOC Scrape\n')

Starting: SEP Winter_2019 TOC Scrape

data_collection/toc_2019_winter_all.html
Success! Completed SEP Winter_2019 TOC Scrape



In [37]:
### SCRAPE TOC FROM SEP AND SAVE TO MONGO ###
### Downloaded Spring 2020 updates: completed 4.10.2020 ###
# Same steps as the Winter 2019 cell, but pointed at the archive's new.html
# page instead of contents.html, and stored in a separate collection.
# NOTE(review): each run of this cell calls insert_many() on the same
# collection again -- confirm the collection is empty before re-running.

#variables for this download
# collection that receives one document per link found on new.html
toc_collection = db.toc_2020_spring_new
sep_edition = 'Spring_2020'
search_url = 'https://plato.stanford.edu/archives/spr2020/new.html'
base_url = 'https://plato.stanford.edu/archives/spr2020/'
# local copy of the downloaded page
save_as_html = 'data_collection/toc_2020_spring_new.html'

print(f'Starting: SEP {sep_edition} TOC Scrape\n')

# scrape the page, then store the resulting list of link dicts in MongoDB
toc_elements = scrape_sep_toc(sep_edition, search_url, base_url, save_as_html)
toc_collection.insert_many(toc_elements)

print(f'Success! Completed SEP {sep_edition} TOC Scrape\n')




Starting: SEP Spring_2020 TOC Scrape

data_collection/toc_2020_spring_new.html
Success! Completed SEP Spring_2020 TOC Scrape



In [8]:
def download_sep_pages(toc_collection, save_as_directory):
    """ Downloads each SEP page listed in a SEP archive.

        Every document in toc_collection is read (sorted by 'link_url'), its
        'base_url' and 'link_url' are joined into the page address, and the
        page is fetched and saved through web.scrape_web_page as
        "<save_as_directory><pagename>.html".

        Keyword Arguments:
        toc_collection -- MongoDB collection for specific archive to download files from
        save_as_directory -- Directory to save each file into. The file name is
                             appended directly to this string, so it must
                             end with a path separator.

        Returns:
        Nothing. The function only downloads identified pages
     """

    #list of pages to download
    pages = list(toc_collection.find().sort('link_url'))

    #loop where pages are rendered from
    for page in tqdm(pages, desc="Processing"):

        #get baseurl and linkurl for each entry, then create absolute path reference to SEP file
        base_url = page['base_url']
        link_url = page['link_url']

        #base_url is stored with a trailing "/" and link_url with a leading "/";
        #trim both so the joined address has exactly one "/" between them
        page_url = f"{base_url.rstrip('/')}/{link_url.lstrip('/')}"

        #link_urls are stored in the form "/entries/<pagename>/".
        #We keep only the last path segment so the file is stored locally
        #under just the individual page name. Trimming the trailing "/"
        #first means a link_url without one is handled too, instead of
        #losing its last character.
        page_name = link_url.rstrip("/").rsplit("/", 1)[-1]

        html_save_as = f"{save_as_directory}{page_name}.html"

        web.scrape_web_page(page_url, html_save_as)


In [None]:
### DOWNLOAD ALL PAGES FROM SEP, AND STORE THEM LOCALLY###
# Winter 2019 Archive: Completed 1/30/2020. This took 7 hours, so don't do it again!
# I've updated the code on 4.10.2020 to make it cleaner, but the files were NOT re-downloaded

# source collection (filled by the Winter 2019 TOC scrape) and target folder;
# the folder string ends with "/" because the file name is appended to it
toc_collection = db.toc_2019_winter_all
save_as_directory = 'data_collection/html_files/sep_2019_all/'

# The call is deliberately left commented out so that running this cell does
# not start the long download again; uncomment it only to re-download.
#download_sep_pages(toc_collection,save_as_directory)

In [9]:
#Download the latest SEP pages from the Spring 2020 archive

# source collection (filled by the Spring 2020 TOC scrape) and target folder;
# the folder string ends with "/" because the file name is appended to it
toc_collection = db.toc_2020_spring_new
save_as_directory = 'data_collection/html_files/sep_2020_spring/'

# fetches every page listed in the collection and saves it into the folder
download_sep_pages(toc_collection,save_as_directory)



Processing:   0%|          | 0/67 [00:00<?, ?it/s]data_collection/html_files/sep_2020_spring/abraham-daud.html


Processing:   1%|▏         | 1/67 [00:10<11:57, 10.87s/it]data_collection/html_files/sep_2020_spring/al-kindi.html


Processing:   3%|▎         | 2/67 [00:20<11:27, 10.58s/it]data_collection/html_files/sep_2020_spring/albert-great.html


Processing:   4%|▍         | 3/67 [00:30<11:01, 10.34s/it]data_collection/html_files/sep_2020_spring/albo-joseph.html


Processing:   6%|▌         | 4/67 [00:55<15:29, 14.76s/it]data_collection/html_files/sep_2020_spring/altruism-empirical.html


Processing:   7%|▋         | 5/67 [01:05<13:47, 13.35s/it]data_collection/html_files/sep_2020_spring/arabic-islamic-influence.html


Processing:   9%|▉         | 6/67 [01:16<12:51, 12.65s/it]data_collection/html_files/sep_2020_spring/authenticity.html


Processing:  10%|█         | 7/67 [01:29<12:45, 12.75s/it]data_collection/html_files/sep_2020_spring/biology-developmental.html


Processing:  12%

In [3]:
### FUNCTIONS FOR PARSING THE LOCAL SEP FILES ### 

def return_jsondata(url_to_json, timeout=30):
    """ returns the properly formatted json endpoint and the remote JSON data, for url_to_json

        Keyword Arguments:
        url_to_json -- InPhO link taken from an SEP page
        timeout -- seconds to wait for each HTTP request (default 30), so a
                   stalled server cannot hang a long parsing run forever

        Returns:
        Tuple (inpho_api, inpho_json): the API address that was queried and
        the decoded JSON. When the API request fails, answers with an HTTP
        error status, or does not return JSON, both values are the string
        "Error: No InPhO entry".

        Raises:
        requests.RequestException if the first request (the one that
        resolves url_to_json to its final address) fails.
    """
    no_entry = "Error: No InPhO entry"

    # get API URL in proper format: request the link and keep the address
    # the response finally came from
    inpho_api = requests.get(url_to_json, timeout=timeout).url

    #replace URL and HTML designations with appropriate end point and .json designations
    inpho_api = inpho_api.replace("https://www.inphoproject.org/",
                                  "http://inpho.cogs.indiana.edu/")\
                         .replace("html","json")

    #taxonomy api urls don't include the '.json' file type reference (but don't know why), so we have to add the file type for these URLS.
    if "taxonomy" in inpho_api:
        inpho_api = inpho_api + ".json"

    #not every SEP entry has an InPhO entry. test for this, and return "Error: No InPhO entry"
    try:
        response = requests.get(inpho_api, timeout=timeout)
        # an HTTP error status means there is nothing usable at this address
        response.raise_for_status()
        inpho_json = json.loads(response.text)

    # Only network/HTTP failures and undecodable bodies count as "no entry".
    # A bare except would also swallow KeyboardInterrupt, which makes a long
    # run impossible to stop from the keyboard.
    except (requests.RequestException, ValueError):
        inpho_api = no_entry
        inpho_json = no_entry

    return inpho_api,inpho_json

def process_links(links_in_page):
    """ returns a unique list of all outgoing links listed on the page

        Keyword Arguments:
        links_in_page -- iterable of link elements; each must support
                         link["href"] and link.get_text()

        Returns:
        List of dicts {"link": "/entries/<page>/", "text": <link text>} in
        first-seen order. Two entries are duplicates only when both the link
        and the text are equal, so one target can appear several times with
        different link texts.
    """

    unique_links = []
    seen = set()
    entries_prefix = "/entries/"

    #links within articles take the form of '../<page>', but they have to be transformed
    #into the form of "/entries/<page>/" for the network graphs
    for link in links_in_page:
        href = link["href"].strip()

        #only the leading ".." stands for the entries folder; a ".." further
        #along the path must not be rewritten
        link_url = href.replace("..", "/entries", 1)

        #drop any "#fragment" or "?query" so it cannot end up in the page name
        for marker in ("#", "?"):
            link_url = link_url.partition(marker)[0]

        #/entries/<page>/ -- cut right after the slash that closes <page>
        slash_after_page = link_url.find("/", len(entries_prefix))
        if slash_after_page == -1:
            #href had no trailing slash (e.g. "../kant"): add it, rather than
            #slicing to an empty string
            link_url = link_url + "/"
        else:
            link_url = link_url[:slash_after_page + 1]

        link_text = link.get_text()

        #set lookup keeps the uniqueness check cheap on pages with many links
        key = (link_url, link_text)
        if key not in seen:
            seen.add(key)
            unique_links.append({ "link": link_url, "text": link_text})

    return unique_links

def parse_sep(file_to_parse):
    """ Parse file_to_parse for the necessary information

        Keyword Arguments:
        file_to_parse -- pathlib.Path of a locally saved SEP article

        Returns:
        Dict with the keys page_url, title, pubdate, copyright,
        first_paragraph, pagetext, inpho_href, inpho_api, inpho_json and
        outlinks. When the page has no InPhO link, the three inpho_* values
        are the string "Error: No InPhO entry".

        Raises:
        AttributeError if the page lacks one of the expected elements
        (ids aueditable, pubinfo, article-copyright, preamble, main-text).
    """

    #we need to create the current pageurl in the form needed for the network graph arrays: "/entries/<page>"
    file_name = file_to_parse.name
    page_url = "/entries/" + file_name.replace(".html","").strip() + "/"

    #open file to scrape; the with-block closes the handle once it is read
    with open(str(file_to_parse),'r', encoding='UTF-8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'lxml')

    #the article container is needed for both the title and the outgoing links
    article = soup.find(id="aueditable")

    #get specific page properties
    title = article.find("h1").get_text()
    pubdate = soup.find(id="pubinfo").get_text()
    #kept as an HTML string (markup included), not as plain text
    copyright_html = str(soup.find(id="article-copyright").find("p"))
    preamble = soup.find(id="preamble")
    first_paragraph = preamble.find("p").get_text().replace('\n',' ')
    preamble_text = preamble.get_text()
    main_text = soup.find(id="main-text").get_text()
    page_text = preamble_text + " " + main_text

    #get InPhO Href from page, and then transform it into the needed API format url, and then get the JSON data
    inpho_link = soup.find(href=re.compile(r'^https://www\.inphoproject\.org/'))
    if inpho_link is None:
        #no InPhO link on this page: record the same marker that
        #return_jsondata uses, instead of failing on None["href"]
        inpho_href = inpho_api = inpho_json = "Error: No InPhO entry"
    else:
        inpho_href = inpho_link["href"]
        inpho_api, inpho_json = return_jsondata(inpho_href)

    #get all outgoing links; the dots are escaped so that only hrefs that
    #really start with "../" are picked up
    article_links = article.find_all(href=re.compile(r'^\.\./'))

    #list to hold related links from page
    outgoing_links = process_links(article_links)

    page_object = { "page_url": page_url,
                    "title": title,
                    "pubdate": pubdate,
                    "copyright": copyright_html,
                    "first_paragraph": first_paragraph,
                    "pagetext": page_text,
                    "inpho_href": inpho_href,
                    "inpho_api": inpho_api,
                    "inpho_json": inpho_json,
                    "outlinks": outgoing_links }

    return page_object


In [4]:
### PARSING LOCAL SEP FILES ###
### This code reads through the SEP files that were downloaded, parses them with BeautifulSoup to pull out selected information, and then stores that parsed information in a MongoDB collection.
### Date(s) completed: 3.30.2020(1012 records) and 3.31.2020(659)
# NOTE(review): `entries_collection` is not assigned in any cell visible in
# this notebook, so on a fresh kernel this cell would stop with a NameError.
# Presumably it was set to a collection on `db` in a cell that no longer
# exists -- confirm the collection name and define it before the loop.

#set local search path
# NOTE(review): the download cells save into 'html_files/sep_2019_all/' and
# 'html_files/sep_2020_spring/', while this cell searches 'html_files/sep' --
# confirm that the files were moved there.
sep_searchpath = Path.cwd() / 'data_collection/html_files/sep'

#get list of all files to search through
# rglob also descends into sub-folders of the search path
sep_files = list(Path(sep_searchpath).rglob('*.html'))

# #set a list subset, for debugging 
# sep_files = sep_files[0:2] 

#loop through all the html files, parse each one, and then load the parsed data into MongoDB
for sep_file in tqdm(sep_files):
    sep_object = parse_sep(sep_file)
    entries_collection.insert_one(sep_object)
    # pause two seconds between files; parse_sep makes web requests to InPhO
    # for each file, so this presumably keeps the request rate low
    time.sleep(2)



100%|██████████| 659/659 [34:49<00:00,  3.17s/it]
