In [1]:
# import dependencies
import pandas as pd 
import time
import pymongo
import re
import json
import requests
from  tqdm import tqdm
from pprint import pprint
from pathlib import Path
from bs4 import BeautifulSoup


#import local libraries
import functions_mongo as mdb
import funtions_webscraping as web

In [2]:
##### INIT GLOBAL VARIABLES#####

#init Mongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

#create database
db = client.visualizing_sep

#create collection to hold SEP TOC
toc_collection = db.sep_win2019_toc
# entries_collection = db.all_entries
entries_collection = db.all_entries_updated



In [None]:
### SCRAPE TOC FROM SEP AND SAVE TO MONGO ###

print("Starting: SEP TOC Scrape\n")

#TOC to scrape
search_url = "https://plato.stanford.edu/archives/win2019/contents.html"

#url to generate absolute paths from
base_url = "https://plato.stanford.edu/archives/win2019/"

#current TOC edition
toc_edition = "Winter 2019"

#hold all individual TOC entries 
toc_list = []

#scrape selected webpage and save HTML file
soup = web.scrape_web_page(search_url, "html_files/win2019_toc.html")

#get all links with an HREF attribute in the content page
links_in_toc = soup.find(id="content").findAll("a", href=True)

#we will write to these two texts files during our TOC scrape. 
#file_tocList logs the object names and links to be stored in mongo
#file_links logs each link that BeautifulSoup identified as valid
file_toclist = open("html_files/sep_toc_list.txt","w",encoding="UTF-8")
file_links = open("html_files/sep_toc_links.txt", "w", encoding="UTF-8")

for link in links_in_toc:
    link_text = link.get_text()
    link_href = link["href"]

    #some links are internal links to other parts of the TOC page, so we exclude those from scraping
    if link_href != '#pagetopright':
        #create dictionary to append to mongo
        entry = {"base_url":base_url, "link_url":link_href}

        #some entries are duplicated in the TOC, so we only store unique links.
        if entry not in toc_list:
            toc_list.append(entry)
            #log results
            file_toclist.write(f"{link_text}:{link_href}\n")
            file_links.write(f"{link}\n")

# #insert into Mongo
toc_collection.insert_many(toc_list)


print("Completed: SEP TOC Scrape\n")


In [None]:
### DOWNLOAD ALL PAGES FROM SEP, AND STORE THEM LOCALLY###
# Completed 1/30/2020. This took 7 hours, so don't do it again!

#get all page links stored in mongo
pages = toc_collection.find().sort('link_url')

#cast mongo cursor into python list for iteration.
pages_list = list(pages)

#download all pages from TOC
for page in tqdm(pages_list, desc="Processing"):

    base_url = page['base_url']
    link_url = page['link_url']

    #absolute path to the file to download from SEP
    page_url = f"{base_url}{link_url}"
    
    #link_urls are all stored with the following structure: "entries/<pagename>.html"
    #We want to remove "entires/" remove all paths from page name, and 
    # store the file locally with just the individual page name
    slash_pos = link_url.rfind("/",0,len(link_url)-1)
    page_name = link_url[slash_pos+1:len(link_url)-1]

    #local path to save file into 
    html_save_as = f"html_files/sep/{page_name}.html"

    try:
        #this is commented out, so that this notebook cell isn't accidently activated 
        #and local files overwritten
        #soup = web.scrape_web_page(page_url, html_save_as)
        
    except:
        print(f"Scraped failed @ {page_name}")





In [18]:
### PARSING LOCAL SEP FILES ###

sep_searchpath = Path.cwd() / 'html_files/sep'
sep_files = list(Path(sep_searchpath).rglob('*.html'))
sep_files = sep_files[0:3]

#list to hold all page objects
page_data = []

for sep_file in tqdm(sep_files, "Processing:"):
    file_name = sep_file.name
    current_page = "/entries/" + file_name.replace(".html","").strip() + "/"
    file_to_read = open(str(sep_file),'r', encoding='UTF-8').read()
    
    soup = BeautifulSoup(file_to_read, 'lxml')

    title = soup.find(id="aueditable").find("h1").get_text()
    pubdate = soup.find(id="pubinfo").get_text()
    inpho = soup.find(href=re.compile('^https://www.inphoproject.org/'))["href"]
    related_entries = soup.find(id="related-entries").find_all("a", href=True)
    copyright_html = soup.find(id="article-copyright").find("p")

    first_paragraph = soup.find(id="preamble").find("p").get_text()
    first_paragraph = first_paragraph.replace('\n',' ').replace('\r', ' ')

    #list to hold related links from page
    related_pages = []
    for entry in related_entries:
        sep_link = entry["href"].replace("..","/entries").strip()
        sep_text = entry.get_text()
        rel = { "link": sep_link, "text": sep_text}
        
        related_pages.append(rel)


    page_object = { "page_url": current_page,
                    "title": title,
                    "pubdate": pubdate,
                    "inpho_link":inpho,
                    "related":related_pages,
                    "copyright":copyright_html,
                    "first_paragraph":first_paragraph}
    
    page_data.append(page_object)

#this line got deleted somehow, so it must be tested before its run again
# entries_collection.insert_many(page_data)
pprint(page_data)




    

Processing:: 100%|██████████| 3/3 [00:00<00:00, 23.87it/s]
[{'copyright':<p>
<a href="../../info.html#c">Copyright © 2014</a> by

<br/>
Brigitte Sassen
&lt;<a href="mailto:sassenb%40mcmaster%2eca"><em>sassenb<abbr title=" at ">@</abbr>mcmaster<abbr title=" dot ">.</abbr>ca</em></a>&gt;
    </p>,
'first_paragraph':'  In Germany, the eighteenth century was the age of '
'enlightenment, the age, that is, that called for the '
'independence of reason. Although the ethos of this age '
'found its clearest (and certainly its most famous) '
'articulation towards the end of the century with '
'Immanuel Kant and his critical philosophy, he was not '
'the first to issue this call. Instead, that task fell to '
'Christian Thomasius (Thomas) at the end of the '
'seventeenth century. It was then taken up and further '
'developed in a theological (pietist) direction by a '
'number of minor figures, the Thomasians, and reissued in '
'a rationalist direction in the early and middle part of '
'the eightee

In [16]:
### get InPhO API URL from the SEP InPhO URL field ###
# Completed 2/12/2020. 

#get all entries stored in mongo
sep_entries = list(entries_collection.find().sort('page_url'))

#loop through sep_entries
for entry in tqdm(sep_entries, desc="Processing:"):
    #get mongoDB ID
    entry_id = entry["_id"]
    #get InPhO URL from entry 
    inpho_url = entry["inpho_link"]
    #open request from InPhO URL and convert mongo URL into proper API call
    r = requests.get(inpho_url)
    api_url = r.url
    api_url = api_url.replace("https://www.inphoproject.org/",\
                              "https://inpho.cogs.indiana.edu/")\
                     .replace("html","json")
    #update mongo
    entries_collection.update_one({"_id":entry_id},{"$set": {"inpho_api": api_url}})


Processing:: 100%|██████████| 1671/1671 [12:17<00:00,  2.27it/s]


In [21]:
### Update all Taxonomy InPhO API links with proper JSON formatting ###
# Completed 2/12/2020. 

#find all InPhO API URLS that are a taxonomy
taxonomy_entries = list(entries_collection.find({'inpho_api': {"$regex":'taxo'}}))

#loop through entries
for entry in tqdm(taxonomy_entries, desc="Processing:"):
    #get mongoDB ID
    entry_id = entry["_id"]
    #add .json file handler to each entry
    inpho_api = f"{entry['inpho_api']}.json"
    #update Mongo
    entries_collection.update_one({"_id":entry_id},{"$set": {"inpho_api": inpho_api}})

Processing:: 100%|██████████| 95/95 [00:00<00:00, 1642.07it/s]


In [24]:
taxonomy_entries = list(entries_collection.find({'inpho_api': {"$regex":'taxo'}}))
for entry in taxonomy_entries:
    print(entry["inpho_api"])

https://inpho.cogs.indiana.edu/taxonomy/2355.json
https://inpho.cogs.indiana.edu/taxonomy/2265.json
https://inpho.cogs.indiana.edu/taxonomy/2444.json
https://inpho.cogs.indiana.edu/taxonomy/2203.json
https://inpho.cogs.indiana.edu/taxonomy/2398.json
https://inpho.cogs.indiana.edu/taxonomy/2392.json
https://inpho.cogs.indiana.edu/taxonomy/2221.json
https://inpho.cogs.indiana.edu/taxonomy/2302.json
https://inpho.cogs.indiana.edu/taxonomy/2372.json
https://inpho.cogs.indiana.edu/taxonomy/2296.json
https://inpho.cogs.indiana.edu/taxonomy/2284.json
https://inpho.cogs.indiana.edu/taxonomy/2212.json
https://inpho.cogs.indiana.edu/taxonomy/2241.json
https://inpho.cogs.indiana.edu/taxonomy/2417.json
https://inpho.cogs.indiana.edu/taxonomy/2301.json
https://inpho.cogs.indiana.edu/taxonomy/2207.json
https://inpho.cogs.indiana.edu/taxonomy/2185.json
https://inpho.cogs.indiana.edu/taxonomy/2434.json
https://inpho.cogs.indiana.edu/taxonomy/2341.json
https://inpho.cogs.indiana.edu/taxonomy/2292.json


In [27]:
### CREATE JSON FROM MONGODB ###
# First Completed 2/10/2020. 
# This code has been updated to get the number of links and the type of article

#get all entries stored in mongo
sep_entries = list(entries_collection.find().sort('title'))

#init empty lists 
nodes_list = []
links_list = []

#loop through all entries
for entry in tqdm(sep_entries, desc="Processing"):
    if "thinker" in entry["inpho_api"]:
        entry_type = "thinker"
    else:
        entry_type = "idea"

    node_object = { 
        "id": entry["page_url"], 
        "title": entry["title"],
        "num_links": len(entry["related"]),
        "entry_type": entry_type
        }

    nodes_list.append(node_object)

    for link in entry["related"]:
        doc = entries_collection.find({"page_"})
        links_list.append({"source":entry["page_url"], "target":link["link"]})

network_object = {"nodes": nodes_list, "links":links_list}

with open('network1.json', 'w', encoding='UTF-8') as f:
    json.dump(network_object,f,ensure_ascii=False, indent=4)

Processing: 100%|██████████| 1671/1671 [00:00<00:00, 186490.39it/s]


In [71]:
### CREATE JSON of thinkers FROM MONGODB ###
# First Completed 2/10/2020. 
# This code has been updated to get the number of links and the type of article

#get all entries stored in mongo
sep_entries = list(entries_collection.find().sort('title'))

#init empty lists 
nodes_list = []
links_list = []

#loop through all entries
for entry in tqdm(sep_entries, desc="Processing"):
    if "thinker" in entry["inpho_api"]:

        node_object = { 
            "id": entry["page_url"], 
            "title": entry["title"],
            "num_links": len(entry["related"]),
            "entry_type": entry["inpho_api"]
            }

        nodes_list.append(node_object)

        for link in entry["related"]:
            if entries_collection.find_one({ "$and": [{'page_url':link["link"]},
                                       {'inpho_api': {'$regex':'thinker'}}]}):  
                links_list.append({"source":entry["page_url"], "target":link["link"]})

network_object = {"nodes": nodes_list, "links":links_list}

with open('network_thinkers.json', 'w', encoding='UTF-8') as f:
    json.dump(network_object,f,ensure_ascii=False, indent=4)

Processing: 100%|██████████| 1671/1671 [00:06<00:00, 263.81it/s]


In [68]:
if entries_collection.find_one({ "$and": [{'page_url':'/entries/bodin/'},
                                       {'inpho_api': {'$regex':'thinker'}}]}):
    print("yes")

yes
