In [1]:
### THIS SCRIPT COLLECTS DATA FROM SEP and InPhO ### 


In [None]:

# import dependencies
import time
import pymongo
import re
import json
import requests

from  tqdm import tqdm
from pprint import pprint
from pathlib import Path
from bs4 import BeautifulSoup

#import local libraries
import functions_mongo as mdb
import funtions_webscraping as web

In [2]:
##### INIT GLOBAL VARIABLES #####

# Connection string for the MongoDB server on this machine, and the client
# every later cell goes through.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Handle to the project database.
db = client['visualizing_sep']

# Collection holding the SEP table-of-contents links.
toc_collection = db['sep_toc_win2019']

# Collection holding one parsed object per SEP entry.
entries_collection = db['sep_entries']

In [41]:
### SCRAPE TOC FROM SEP AND SAVE TO MONGO ###
### First version of code: completed 1.26.2020 ###
### Updated code (same results, different architecture): completed 3.30.2020 ###

print('Starting: SEP TOC Scrape\n')


#current TOC edition
toc_edition = 'Winter 2019'

#TOC to scrape
search_url = 'https://plato.stanford.edu/archives/win2019/contents.html'

#Base_URL for current edition archive
base_url = 'https://plato.stanford.edu/archives/win2019/'

#scrape selected webpage and save TOC as local HTML file
soup = web.scrape_web_page(search_url, 'data_collection/win2019_toc.html')

#get only links to articles in the SEP (ie, only those hrefs that begin with 'entries/')
links_in_toc = soup.find_all(href=re.compile('^entries/'))

#list to hold all unique TOC links, in the order they first appear in the TOC
toc_unique_hrefs = []

#base_url is the same for every link, so the link_url alone identifies a duplicate;
#a set makes the membership test constant-time instead of rescanning the list
seen_link_urls = set()

for link in links_in_toc:
    link_url = "/" + link['href']

    if link_url in seen_link_urls:
        continue

    seen_link_urls.add(link_url)
    toc_unique_hrefs.append({ 'link_url': link_url,
                              'base_url': base_url })

#save to Mongo. Each link is upserted rather than blindly inserted, so re-running
#this cell does not store the TOC a second time. The guard skips the write when
#nothing was scraped, because pymongo rejects an empty batch.
if toc_unique_hrefs:
    toc_upserts = [ pymongo.UpdateOne(article_url,
                                      {'$setOnInsert': article_url},
                                      upsert=True)
                    for article_url in toc_unique_hrefs ]
    toc_collection.bulk_write(toc_upserts, ordered=False)
else:
    print('Warning: no "entries/" links found in TOC; nothing saved\n')


print('Completed: SEP TOC Scrape\n')


Starting: SEP TOC Scrape

data_collection/win2019_toc.html
Completed: SEP TOC Scrape



In [None]:
### DOWNLOAD ALL PAGES FROM SEP, AND STORE THEM LOCALLY ###
# Completed 1/30/2020. This took 7 hours, so don't do it again!

# Safety switch. The download call used to be commented out so that this cell
# could not be triggered by accident, but that left the `try:` body empty, which
# is a syntax error. Leave this False to keep the local files untouched; set it
# to True only when a full re-download is really wanted.
DOWNLOAD_SEP_PAGES = False

#get all page links stored in mongo
pages = toc_collection.find().sort('link_url')

#cast mongo cursor into python list for iteration.
pages_list = list(pages)

#download all pages from TOC
for page in tqdm(pages_list, desc="Processing"):

    base_url = page['base_url']
    link_url = page['link_url']

    #absolute URL of the page to download from SEP. base_url ends with "/" and
    #link_url starts with "/", so both are trimmed to avoid a "//" in the path.
    page_url = f"{base_url.rstrip('/')}/{link_url.lstrip('/')}"

    #link_urls are stored in the form "/entries/<pagename>/".
    #We keep only the last path segment (<pagename>) and use it as the local file
    #name. Trimming the trailing slash first also handles a link without one.
    page_name = link_url.rstrip("/").rsplit("/", 1)[-1]

    #local path to save file into
    # NOTE(review): the parsing cell reads from 'data_collection/html_files/sep';
    # confirm this relative path resolves to the same folder.
    html_save_as = f"html_files/sep/{page_name}.html"

    try:
        if DOWNLOAD_SEP_PAGES:
            web.scrape_web_page(page_url, html_save_as)

    except Exception as err:
        #report the failure and carry on with the remaining pages; Exception
        #(rather than a bare except) lets Ctrl-C still stop the loop
        print(f"Scraped failed @ {page_name}: {err!r}")





In [3]:
### FUNCTIONS FOR PARSING THE LOCAL SEP FILES ### 

def return_jsondata(url_to_json, timeout=30):
    """ Return the InPhO JSON endpoint for url_to_json together with its decoded JSON.

    Args:
        url_to_json: URL of an InPhO page, as linked from an SEP entry.
        timeout: seconds to wait on each HTTP request before giving up.

    Returns:
        A tuple (inpho_api, inpho_json): the JSON endpoint URL and the decoded
        JSON payload. If a request fails or the response is not valid JSON,
        both items are the string "Error: No InPhO entry".
    """
    no_entry = "Error: No InPhO entry"

    try:
        # Request the page first and use the URL of the final response.
        inpho_api = requests.get(url_to_json, timeout=timeout).url

        # Point the URL at the API host.
        inpho_api = inpho_api.replace("https://www.inphoproject.org/",
                                      "http://inpho.cogs.indiana.edu/")

        # Swap only the last ".html" for ".json". A blanket replace of "html"
        # would also rewrite a path segment that merely contains those letters.
        head, extension, tail = inpho_api.rpartition(".html")
        if extension:
            inpho_api = head + ".json" + tail

        # Taxonomy URLs come without a file type, so add it (but never twice).
        if "taxonomy" in inpho_api and not inpho_api.endswith(".json"):
            inpho_api = inpho_api + ".json"

        # Not every SEP entry has an InPhO entry; in that case the response
        # is not JSON and decoding raises ValueError.
        inpho_json = json.loads(requests.get(inpho_api, timeout=timeout).text)

    except (requests.RequestException, ValueError):
        # Only network and decoding failures are treated as "no entry";
        # KeyboardInterrupt and programming errors still propagate.
        return no_entry, no_entry

    return inpho_api, inpho_json

def process_links(links_in_page):
    """ Return a unique list of all outgoing links listed on the page.

    Args:
        links_in_page: iterable of link elements; each must support
            link["href"] and link.get_text().

    Returns:
        A list of {"link": "/entries/<page>/", "text": <link text>} dicts in
        first-seen order, with exact (link, text) duplicates removed.
    """

    unique_links = []
    seen = set()

    #links within articles take the form of '../<page>', but they have to be transformed
    #into the form of "/entries/<page>/" for the network graphs
    for link in links_in_page:
        link_url = link["href"].replace("..", "/entries").strip()

        #first slash after the "/entries/" prefix (9 characters long)
        page_end = link_url.find("/", 9)

        if page_end != -1:
            #drop everything after "/entries/<page>/" (anchors, index.html, ...)
            link_url = link_url[:page_end + 1]
        else:
            #href had no trailing slash (e.g. "../<page>" or "../<page>#sec"):
            #cut any fragment or query string and add the slash, instead of
            #collapsing the link to an empty string
            for separator in ("#", "?"):
                link_url = link_url.split(separator, 1)[0]
            link_url = link_url.rstrip("/") + "/"

        link_text = link.get_text()

        #a set of (link, text) pairs gives the same uniqueness test as comparing
        #the dicts, without rescanning the whole result list for every link
        link_key = (link_url, link_text)
        if link_key not in seen:
            seen.add(link_key)
            unique_links.append({ "link": link_url, "text": link_text})

    return unique_links

def parse_sep(file_to_parse):
    """ Parse one locally saved SEP entry and return its data as a dict.

    Args:
        file_to_parse: path object of the saved HTML file; its .name
            ("<page>.html") is used to build the entry's page_url.

    Returns:
        A dict with the keys page_url, title, pubdate, copyright,
        first_paragraph, pagetext, inpho_href, inpho_api, inpho_json and
        outlinks. When the page has no InPhO link, the three inpho_* values
        are the string "Error: No InPhO entry".

    Raises:
        AttributeError: if an expected element (e.g. id="aueditable",
            id="preamble") is missing from the page.
    """

    #we need to create the current pageurl in the form needed for the network graph arrays: "/entries/<page>/"
    file_name = file_to_parse.name
    page_url = "/entries/" + file_name.replace(".html","").strip() + "/"

    #read the file inside a with-block so the handle is closed even if parsing fails
    with open(str(file_to_parse), 'r', encoding='UTF-8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'lxml')

    #get specific page properties
    article = soup.find(id="aueditable")
    title = article.find("h1").get_text()
    pubdate = soup.find(id="pubinfo").get_text()
    #kept as the HTML string of the <p> element, not just its text
    copyright_html = str(soup.find(id="article-copyright").find("p"))
    preamble = soup.find(id="preamble")
    first_paragraph = preamble.find("p").get_text().replace('\n',' ')
    preamble_text = preamble.get_text()
    main_text = soup.find(id="main-text").get_text()
    page_text = preamble_text + " " + main_text

    #get InPhO href from page, transform it into the API url, and get the JSON data.
    #If the page has no InPhO link, store the same marker return_jsondata uses
    #rather than crashing on a None lookup.
    inpho_anchor = soup.find(href=re.compile(r'^https://www\.inphoproject\.org/'))
    if inpho_anchor is None:
        inpho_href = inpho_api = inpho_JSON = "Error: No InPhO entry"
    else:
        inpho_href = inpho_anchor["href"]
        inpho_api, inpho_JSON = return_jsondata(inpho_href)

    #get all outgoing links; the dots are escaped so only hrefs that literally
    #start with "../" match (an unescaped "." matches any character)
    article_links = article.find_all(href=re.compile(r'^\.\./'))

    #list to hold related links from page
    outgoing_links = process_links(article_links)

    page_object = { "page_url": page_url,
                    "title": title,
                    "pubdate": pubdate,
                    "copyright": copyright_html,
                    "first_paragraph": first_paragraph,
                    "pagetext": page_text,
                    "inpho_href": inpho_href,
                    "inpho_api": inpho_api,
                    "inpho_json": inpho_JSON,
                    "outlinks": outgoing_links }

    return page_object


In [4]:
### PARSING LOCAL SEP FILES ###
### This code reads through the SEP files that were downloaded, parses them with BeautifulSoup
### to pull out selected information, and then stores that parsed information in a MongoDB collection.
### Date(s) completed: 3.30.2020 (1012 records) and 3.31.2020 (659)

#set local search path
sep_searchpath = Path.cwd() / 'data_collection/html_files/sep'

#get list of all files to search through; sorted so every run visits them in the same order
sep_files = sorted(sep_searchpath.rglob('*.html'))

# #set a list subset, for debugging
# sep_files = sep_files[0:2]

#pause after each file, in seconds
# NOTE(review): presumably this throttles the InPhO requests made by parse_sep - confirm
PAUSE_BETWEEN_FILES_SECONDS = 2

#files that could not be parsed or stored, with the reason
failed_files = []

#loop through all the html files, parse each one, and then load the parsed data into MongoDB
for sep_file in tqdm(sep_files):
    try:
        sep_object = parse_sep(sep_file)

        #upsert on page_url so that re-running this cell (e.g. to finish an
        #interrupted batch) replaces an entry instead of storing it twice
        entries_collection.replace_one({"page_url": sep_object["page_url"]},
                                       sep_object,
                                       upsert=True)
    except Exception as err:
        #one bad page should not abort the whole batch; record it and move on
        failed_files.append((sep_file.name, repr(err)))

    time.sleep(PAUSE_BETWEEN_FILES_SECONDS)

#report anything that was skipped so it can be fixed and re-run
if failed_files:
    print(f"{len(failed_files)} file(s) failed:")
    for failed_name, failure_reason in failed_files:
        print(f"  {failed_name}: {failure_reason}")



100%|██████████| 659/659 [34:49<00:00,  3.17s/it]


Processing: 100%|██████████| 1671/1671 [00:00<00:00, 186490.39it/s]
