In [2]:
### THIS SCRIPT COLLECTS DATA FROM SEP (Stanford Encyclopedia of Philosophy) AND InPhO (Indiana Philosophy Ontology Project) ###

In [3]:
# import dependencies
import time
import pymongo
import re
import json
import requests

from tqdm import tqdm
from pprint import pprint
from pathlib import Path
from bs4 import BeautifulSoup

#import local libraries
import lib_sepinpho as sep
import lib_fileops as io

In [4]:
##### INIT GLOBAL VARIABLES #####

# MongoDB client for the server at localhost:27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# database handle used by the scraping, download and parsing cells
db = client['visualizing_sep']

In [None]:
### SCRAPE TOC FROM WINTER 2019 SEP ARCHIVE ###
### Date Completed: 3.30.2020 ###

#set variable options for Winter 2019 download
toc_collection = db.toc_2019_winter_all
sep_edition = 'Winter_2019'
search_url = 'https://plato.stanford.edu/archives/win2019/contents.html'
base_url = 'https://plato.stanford.edu/archives/win2019/'
save_as_html = 'data_collection/toc_2019_winter_all.html'

print(f'Starting: SEP {sep_edition} TOC Scrape\n')

#get list of elements to save into toc_collection
#(scrape_sep_toc is reached through lib_sepinpho, imported as `sep`;
# calling it without the prefix raises NameError)
toc_elements = sep.scrape_sep_toc(sep_edition, search_url, base_url, save_as_html)

#update MongoDB collection
toc_collection.insert_many(toc_elements)

print(f'Success! Completed SEP {sep_edition} TOC Scrape\n')

In [5]:
### SCRAPE WHATS NEW FROM SPRING 2020 SEP ARCHIVE ###
### Downloaded Spring 2020 updates: completed 4.10.2020 ###

# edition label and archive URLs for the Spring 2020 run
sep_edition = 'Spring_2020'
base_url = 'https://plato.stanford.edu/archives/spr2020/'
search_url = 'https://plato.stanford.edu/archives/spr2020/new.html'

# output targets: the HTML path handed to the scraper, and the MongoDB collection for the results
save_as_html = 'data_collection/toc_2020_spring_new.html'
toc_collection = db['toc_2020_spring_new']

print(f'Starting: SEP {sep_edition} TOC Scrape\n')

# scrape the listing, then store every returned element in MongoDB
toc_elements = sep.scrape_sep_toc(sep_edition, search_url, base_url, save_as_html)
toc_collection.insert_many(toc_elements)

print(f'Success! Completed SEP {sep_edition} TOC Scrape\n')


Starting: SEP Spring_2020 TOC Scrape

Success! Completed SEP Spring_2020 TOC Scrape



In [6]:
### DOWNLOAD WINTER 2019 SEP ARCHIVE, AND STORE THEM LOCALLY###
# Winter 2019 Archive: Completed 1/30/2020. This took 7 hours, so don't do it again!
# Updated the code on 4.10.2020 to make the download process cleaner, but the files were NOT re-downloaded

# destination folder for the Winter 2019 pages
save_as_directory = 'data_collection/html_files/sep_2019_all/'

# Winter 2019 TOC entries from MongoDB, ordered by link_url
toc_collection = db['toc_2019_winter_all']
toc_pages = [toc_entry for toc_entry in toc_collection.find().sort('link_url')]

# the download call is intentionally left disabled (the archive is already on disk)
#sep.download_sep_pages(toc_pages,save_as_directory)

In [6]:
### DOWNLOAD SPRING 2020 WHATS NEW FILES, AND STORE THEM LOCALLY###
#Spring 2020 Archive: Completed 4.10.2020#

# destination folder for the Spring 2020 pages
save_as_directory = 'data_collection/html_files/sep_2020_spring/'

# Spring 2020 "what's new" entries from MongoDB, ordered by link_url
toc_collection = db['toc_2020_spring_new']
toc_pages = [toc_entry for toc_entry in toc_collection.find().sort('link_url')]

# fetch every listed page into save_as_directory
sep.download_sep_pages(toc_pages, save_as_directory)

Processing: 100%|██████████| 67/67 [14:33<00:00, 13.04s/it]


In [4]:
### PARSING LOCAL SEP FILES ###

### After downloading the Winter 2019 archive and the Spring 2020 updates, I moved the updates into the same folder, and then ran the parsing script on all of the files at once. This replaced the prior version of the parsed data objects in MongoDB###
# NOTE(review): this cell only inserts documents; it does not clear db.sep_data first.
# Confirm how the prior version was removed before re-running, or entries may be duplicated.

#set variable options for parsing
collection_to_update = db.sep_data
file_path='data_collection/html_files/sep_2019_all'
file_type='*.html'
sep_files = io.get_local_files(file_path,file_type)

for sep_file in tqdm(sep_files):
    try:
        sep_object = sep.parse_sep_file(sep_file)
        collection_to_update.insert_one(sep_object)
    except Exception as err:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt/SystemExit
        # can still stop this long loop; report the cause so the file can be diagnosed and retried.
        print(f'Failed at: @{sep_file.name} ({type(err).__name__}: {err})')

    # pause after every file, whether it succeeded or failed
    # (presumably throttles requests made while parsing — TODO confirm)
    time.sleep(3)



ea/5381.json
  7%|▋         | 15/226 [01:03<14:13,  4.05s/it]http://inpho.cogs.indiana.edu/thinker/3887.json
  7%|▋         | 16/226 [01:06<14:02,  4.01s/it]http://inpho.cogs.indiana.edu/thinker/3897.json
  8%|▊         | 17/226 [01:10<13:59,  4.02s/it]http://inpho.cogs.indiana.edu/thinker/3237.json
  8%|▊         | 18/226 [01:15<13:56,  4.02s/it]http://inpho.cogs.indiana.edu/thinker/3903.json
  8%|▊         | 19/226 [01:18<13:48,  4.00s/it]http://inpho.cogs.indiana.edu/thinker/4101.json
  9%|▉         | 20/226 [01:22<13:38,  3.98s/it]http://inpho.cogs.indiana.edu/idea/1257.json
  9%|▉         | 21/226 [01:27<13:54,  4.07s/it]http://inpho.cogs.indiana.edu/idea/5541.json
 10%|▉         | 22/226 [01:31<13:47,  4.06s/it]http://inpho.cogs.indiana.edu/idea/1256.json
 10%|█         | 23/226 [01:35<14:27,  4.27s/it]http://inpho.cogs.indiana.edu/idea/2172.json
 11%|█         | 24/226 [01:40<14:34,  4.33s/it]http://inpho.cogs.indiana.edu/idea/5550.json
 11%|█         | 25/226 [01:45<15:09,  4.5

In [5]:
# index the sep_data collection on its 'title' field; the index name is displayed as the cell output
db['sep_data'].create_index('title')

'title_1'