In [1]:
### THIS SCRIPT COLLECTS DATA FROM SEP and InPhO ### 

In [2]:
# import dependencies
import time
import pymongo
import re
import json
import requests

from tqdm import tqdm
from pprint import pprint
from pathlib import Path
from bs4 import BeautifulSoup

#import local libraries
import lib_sepinpho as sep
import lib_fileops as io

In [3]:
##### INIT GLOBAL VARIABLES #####

# MongoDB connection string for the local server, and the client built from it
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Handle to the project database; every later cell reads/writes through `db`
db = client['visualizing_sep']

In [None]:
### SCRAPE TOC FROM WINTER 2019 SEP ARCHIVE ###
### Date Completed: 1.31.2020, and then again on 3.30.2020 ###
# Scrapes the Winter 2019 table-of-contents page and stores the returned
# elements in the `toc_2019_winter_all` MongoDB collection.
# NOTE(review): insert_many only inserts; re-running this cell presumably adds
# the same entries a second time unless the collection is cleared first -- confirm.

#set variable options for Winter 2019 download
toc_collection = db.toc_2019_winter_all
sep_edition = 'Winter_2019'
search_url = 'https://plato.stanford.edu/archives/win2019/contents.html'
base_url = 'https://plato.stanford.edu/archives/win2019/'
save_as_html = 'data_collection/toc_2019_winter_all.html'

print(f'Starting: SEP {sep_edition} TOC Scrape\n')

#get list of elements to save into toc_collection
# The scraper lives in the local lib_sepinpho module (imported as `sep`), so it
# must be called through the module alias; the bare name is not defined here.
toc_elements = sep.scrape_sep_toc(sep_edition, search_url, base_url, save_as_html)

#update MongoDB collection
toc_collection.insert_many(toc_elements)

print(f'Success! Completed SEP {sep_edition} TOC Scrape\n')

In [5]:
### SCRAPE WHATS NEW FROM SPRING 2020 SEP ARCHIVE ###
### Downloaded Spring 2020 updates: completed 4.10.2020 ###

# Options for the Spring 2020 "What's New" scrape: edition label, source URLs,
# the local path handed to the scraper, and the target MongoDB collection.
sep_edition = 'Spring_2020'
base_url = 'https://plato.stanford.edu/archives/spr2020/'
search_url = 'https://plato.stanford.edu/archives/spr2020/new.html'
save_as_html = 'data_collection/toc_2020_spring_new.html'
toc_collection = db['toc_2020_spring_new']

print(f'Starting: SEP {sep_edition} TOC Scrape\n')

# Scrape the listing, then bulk-insert everything the scraper returned
toc_elements = sep.scrape_sep_toc(sep_edition, search_url, base_url, save_as_html)
toc_collection.insert_many(toc_elements)

print(f'Success! Completed SEP {sep_edition} TOC Scrape\n')


Starting: SEP Spring_2020 TOC Scrape

Success! Completed SEP Spring_2020 TOC Scrape



In [6]:
### DOWNLOAD WINTER 2019 SEP ARCHIVE, AND STORE THEM LOCALLY###
# Winter 2019 Archive: Completed 1/30/2020. This took 7 hours, so don't do it again!
# Updated the code on 4.10.2020 to make the download process cleaner, but the files were NOT re-downloaded

# Where the downloaded pages are stored, and which TOC collection lists them
save_as_directory = 'data_collection/html_files/sep_2019_all/'
toc_collection = db['toc_2019_winter_all']

# Materialise the TOC documents, ordered by their 'link_url' field
toc_pages = [toc_page for toc_page in toc_collection.find().sort('link_url')]

# The download call is intentionally left disabled (see the 7-hour warning above)
#sep.download_sep_pages(toc_pages,save_as_directory)

In [6]:
### DOWNLOAD SPRING 2020 WHATS NEW FILES, AND STORE THEM LOCALLY###
#Spring 2020 Archive: Completed 4.10.2020#

# Where the downloaded pages are stored, and which TOC collection lists them
save_as_directory = 'data_collection/html_files/sep_2020_spring/'
toc_collection = db['toc_2020_spring_new']

# Materialise the TOC documents, ordered by their 'link_url' field
toc_pages = [toc_page for toc_page in toc_collection.find().sort('link_url')]

# Hand the TOC documents and the target directory to the downloader
sep.download_sep_pages(toc_pages, save_as_directory)

Processing: 100%|██████████| 67/67 [14:33<00:00, 13.04s/it]


In [None]:
### PARSING WINTER 2019 LOCAL SEP FILES ###
### Winter 2019 SEP Edition
### Date(s) completed: 3.30.2020(1012 records) and 3.31.2020(659)
# Parses every locally stored Winter 2019 HTML file and inserts one document
# per file into MongoDB.

#set variable options for Winter 2019 parsing
file_path='data_collection/html_files/sep_2019_all'
file_type='*.html'
sep_files = io.get_local_files(file_path,file_type)

# NOTE(review): the double underscore in `sep_entries__all` looks like a typo,
# but it is kept as-is because it may be the name of the existing collection -- confirm.
collection_to_update = db.sep_entries__all

#loop through all the html files in sep_files, parse each one, and then load the parsed data into MongoDB
for sep_file in tqdm(sep_files):
    # Use the same parser entry point as the Spring 2020 parsing cell
    # (`parse_sep_file`), which is the name that cell ran successfully with.
    sep_object = sep.parse_sep_file(sep_file)
    collection_to_update.insert_one(sep_object)
    # NOTE(review): 2 s pause per file; presumably a throttle for something the
    # parser calls -- confirm it is still needed for purely local files.
    time.sleep(2)



In [4]:
### PARSING SEP FILES ###
### Spring 2020 SEP Edition
### Date(s) completed: 4.11.2020(67 files)

# Target collection for the parsed Spring 2020 entries
collection_to_update = db['sep_entries_2020_spring']

# Locate the downloaded Spring 2020 HTML files
file_type='*.html'
file_path='data_collection/html_files/sep_2020_spring'
sep_files = io.get_local_files(file_path, file_type)

# One pass per file: parse it, store the result in MongoDB, then pause 2 seconds
for sep_file in tqdm(sep_files):
    sep_object = sep.parse_sep_file(sep_file)
    collection_to_update.insert_one(sep_object)
    time.sleep(2)

100%|██████████| 67/67 [03:21<00:00,  3.01s/it]
