In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Parsing & Categorizing HTML from `wget` run!

## Initializing

In [253]:
# import necessary libraries
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in CSV files
from glob import glob,iglob # for finding files within nested folders
import json, pickle # For saving a loading dictionaries, etc. from file with JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages
from unicodedata import normalize

# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS
bsparser = "lxml"


In [3]:
# ### Set script options

# Runtime switches for the whole notebook/script.
Debug = True         # True -> print extra progress reports while algorithms run
notebook = True      # True -> running inside Jupyter; False -> running from the shell (changes file paths)
workstation = False  # True -> working from the office PC

# Loading an existing dicts_list from file is confirmed through user input,
# which is only useful on the command line, so it is always off in a notebook.
usefile = False
usefile = usefile and not notebook

# Inline HTML tags; this list helps with eliminating junk tags when parsing HTML.
inline_tags = (
    "b big i small tt abbr acronym cite dfn "
    "em kbd strong samp var bdo map object q "
    "span sub sup"
).split()


In [4]:
# ### Set directories

# Example inputs used by the debugging cells further down.
example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"

if workstation and notebook:
    # Office PC: Windows-style paths. Note that this branch defines neither
    # wget_dataloc nor example_folder.
    dir_prefix = "C:\\Users\\Jaren\\Documents\\Charter-school-identities\\"

    micro_sample13 = dir_prefix + "data\\micro-sample13_coded.csv"  # random micro-sample of 300 US charter schools
    full_schooldata = dir_prefix + "data\\charter_URLs_2014.csv"    # 2014 population of US charter schools
    example_file = dir_prefix + "data\\example_file.html"
    dicts_dir = dir_prefix + "dicts\\"  # where dictionary files are found & saved
    save_dir = dir_prefix + "data\\"    # where data files are saved

else:
    # Jupyter container when in a notebook, otherwise the shell data volume.
    dir_prefix = "/home/jovyan/work/" if notebook else "/vol_b/data/"

    wget_dataloc = dir_prefix + "wget/parll_wget/"  # schools downloaded with wget in parallel (requires server access)
    example_folder = wget_dataloc + "TWENTY-FIRST_CENTURY_NM/"
    example_file = dir_prefix + "wget/example_file.html"

    micro_sample13 = dir_prefix + "Charter-school-identities/data/micro-sample13_coded.csv"  # random micro-sample of 300 US charter schools
    full_schooldata = dir_prefix + "Charter-school-identities/data/charter_URLs_2014.csv"    # 2014 population of US charter schools
    dicts_dir = dir_prefix + "Charter-school-identities/dicts/"  # where dictionary files are found & saved
    save_dir = dir_prefix + "Charter-school-identities/data/"    # where data files are saved
    

In [5]:
# Set input file, if any
# Command-line only: ask whether to extend a previously saved list of dictionaries.
if usefile and not notebook:
    print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
    answer = input()

    if answer == "N":
        print("OK! This script will create a new file for this list of dictionaries.")
        usefile = False

    elif answer == "Y":
        print("Please indicate file path for dictionary list file.")
        answer2 = input()
        if not os.path.exists(answer2):
            print("Invalid file path. Aborting script.")
            sys.exit()
        # Path is valid: remember it for the loading step later on.
        input_file = answer2
        usefile = True

    else:
        # Anything other than an exact "Y" or "N" stops the script.
        print("Response not interpretable. Aborting script.")
        sys.exit()

In [6]:
# ### Define (non-parsing) helper functions

def get_vars(data):
    """Return the CSV column names that go with the data source in use.

    Args:
        data: path of the data file being read; it is compared against the
            module-level `full_schooldata` and `micro_sample13` paths.

    Returns:
        Tuple (URL_variable, NAME_variable, ADDR_variable) holding the column
        names for the school URL, school name and school address.

    Raises:
        ValueError: if `data` is neither of the known data sources. (Previously
            this case printed a message and then crashed with an
            UnboundLocalError at the return statement.)
    """

    if data == full_schooldata:
        return ("TRUE_URL", "SCH_NAME", "ADDRESS")

    if data == micro_sample13:
        return ("URL", "SCHNAM", "ADDRESS")

    raise ValueError("Error processing variables from data file " + str(data) + "! "
                     "No data source established.")


def tag_visible(element):
    """Tell whether a parsed web element is text a site visitor would actually read.

    Returns False when the element's parent tag is one of the non-visible tags
    listed below, or when the element is an HTML comment; returns True otherwise."""

    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    inside_hidden_tag = element.parent.name in hidden_parents
    # `or` short-circuits, so the comment check only runs for elements outside hidden tags
    return not (inside_hidden_tag or isinstance(element, Comment))


def webtext_from_files(datalocation):
    """Concatenate and return a single string from all webtext (.txt files) under datalocation.

    The folder is walked recursively. Each file is opened relative to the folder
    it was found in, so files in subfolders are read correctly and datalocation
    does not need a trailing path separator.

    Args:
        datalocation: path of the folder to search.

    Returns:
        The contents of every ".txt" file found, joined without any separator.
        Folders and files are visited in sorted order so the result is reproducible.
    """

    text_parts = []
    for root, dirs, files in os.walk(datalocation):
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for file in sorted(files):
            if file.endswith(".txt"):
                # Join with root (not datalocation): the file may live in a subfolder
                with open(os.path.join(root, file), "r") as fileloc:
                    text_parts.append(fileloc.read())
    return "".join(text_parts)


def remove_spaces(file_path):
    """Collapse all whitespace in the text file at file_path into single spaces.

    Args:
        file_path: path of the text file to read.

    Returns:
        The file's whitespace-separated words, each followed by one space
        (so a non-empty result ends with a trailing space, as before).
        An empty or whitespace-only file gives "".
    """

    # `with` closes the handle; the original left the file open
    with open(file_path) as text_file:
        words = text_file.read().split()  # split() with no argument never yields empty strings
    return "".join(word + " " for word in words)


def save_to_file(dicts_list, file, mode):
    """Saves dicts_list to file using JSON or pickle format (whichever was specified).

    Args:
        dicts_list: the object to save (e.g. a list of dictionaries).
        file: destination path; the matching extension (".json" or ".pickle")
            is appended if it is missing.
        mode: "JSON" or "pickle".

    Returns:
        None. Failures are reported on stdout rather than raised, so a failed
        save does not abort the rest of the script.
    """

    file = str(file)

    try:
        if mode=="JSON":
            if not file.endswith(".json"):
                file += ".json"
            # json.dump writes str, so the file must be opened in text mode
            with open(file, 'w') as outfile:
                json.dump(dicts_list, outfile)
            print("Data successfully saved to " + file + " in JSON format!\n")

        elif mode=="pickle":
            if not file.endswith(".pickle"):
                file += ".pickle"
            # pickle.dump writes bytes, so binary mode is required here
            with open(file, 'wb') as outfile:
                pickle.dump(dicts_list, outfile)
            print("Data successfully saved to " + file + " in pickle format!\n")

        else:
            print("ERROR! Save failed due to improper arguments. These are: file, object to be saved, and file format to save in.\n\
                  Specify either 'JSON' or 'pickle' as third argument ('mode' or file format) when calling this function.")

    except Exception as e:
        # Best-effort save: report the problem instead of crashing the caller
        print("ERROR! Failed to save to " + file + ":", e)
    

def load_file(file):
    """Loads dicts_list (or whatever) from file, using either JSON or pickle format.
    The created object should be assigned when called.

    Args:
        file: path to load; the format is chosen from its extension
            (".json" or ".pickle").

    Returns:
        The object stored in the file.

    Raises:
        ValueError: if the path has neither a ".json" nor a ".pickle" extension.
            (Previously this case printed a success message and then crashed
            with an UnboundLocalError.)
    """

    file = str(file)

    if file.endswith(".json"):
        with open(file, 'rb') as infile:
            var = json.load(infile)
    elif file.endswith(".pickle"):
        # SECURITY NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself
        with open(file, 'rb') as infile:
            var = pickle.load(infile)
    else:
        raise ValueError("Cannot load " + file + ": expected a file ending in '.json' or '.pickle'.")

    print(file + " successfully loaded!\n")
    return var


def load_dict(custom_dict, file_path):
    """Reads the dictionary file at file_path line by line and adds each entry, stemmed and
    with newline characters removed, to the set custom_dict. Because custom_dict may already
    contain entries, calling this repeatedly builds a combined dictionary.
    Returns custom_dict (the same set that was passed in, updated in place)."""

    with open(file_path) as file_handler:
        for raw_line in file_handler:
            entry = raw_line.replace("\n", "")  # drop the newline before stemming
            custom_dict.add(stemmer.stem(entry))
    return custom_dict


def list_files(folder_path, extension=None):
    """Outputs a list of every file in folder_path or its subdirectories that has a specified extension.
    Prepends specified extension with '.' if it doesn't start with it already.
    If no extension is specified (None or ""), it returns all files under folder_path.

    Args:
        folder_path: folder to search recursively.
        extension: file extension to keep, with or without the leading ".".

    Returns:
        List of file paths, each joined with the folder it was found in.
    """

    matches = []
    if extension:
        extension = str(extension) # Coerce to string, just in case
        if not extension.startswith("."):
            extension = "." + extension

    for dirpath, dirnames, filenames in os.walk(folder_path):
        if extension:
            wanted = fnmatch.filter(filenames, "*" + extension) # Use extension to filter list of files
        else:
            wanted = filenames # If no extension, take every file in this folder
        matches.extend(os.path.join(dirpath, filename) for filename in wanted)
    return matches


In [7]:
# ### Set parsing keywords

# Master list of keywords (repeated entries are harmless: the list is turned into a set below)
keywords = [
    'values', 'academics', 'skills', 'purpose',
    'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
    'our ideals', 'ideals', 'our cause', 'curriculum', 'curricular',
    'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
    'structure', 'philosophy', 'philosophical', 'beliefs', 'believe',
    'principles', 'creed', 'credo', 'values', 'moral', 'history', 'our story',
    'the story', 'school story', 'background', 'founding', 'founded',
    'established', 'establishment', 'our school began', 'we began',
    'doors opened', 'school opened', 'about us', 'our school', 'who we are',
    'our identity', 'profile', 'highlights',
]

# Category-specific keyword sets; every entry is passed through the stemmer.
mission_keywords = {stemmer.stem(word) for word in [
    'mission', 'vision', 'vision:', 'mission:', 'our purpose', 'our ideals',
    'ideals:', 'our cause', 'cause:', 'goals', 'objective']}

curriculum_keywords = {stemmer.stem(word) for word in [
    'curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical',
    'approach', 'model', 'system', 'structure']}

philosophy_keywords = {stemmer.stem(word) for word in [
    'philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed',
    'credo', 'value', 'moral']}

history_keywords = {stemmer.stem(word) for word in [
    'history', 'story', 'our story', 'the story', 'school story', 'background',
    'founding', 'founded', 'established', 'establishment', 'our school began',
    'we began', 'doors opened', 'school opened']}

about_keywords = {stemmer.stem(word) for word in [
    'about us', 'our school', 'who we are', 'overview', 'general information',
    'our identity', 'profile', 'highlights']}

# Stemmed version of the master list, used for filtering parsed page text
keys_dict = {stemmer.stem(key) for key in keywords}

if Debug:
    print("\nList of keywords:\n", list(keys_dict))



List of keywords:
 ['creed', 'valu', 'our stori', 'school stori', 'our school', 'structur', 'our purpos', 'approach', 'philosoph', 'background', 'doors open', 'establish', 'believ', 'curriculum', 'we began', 'academ', 'system', 'histori', 'model', 'curricular', 'school open', 'ideal', 'our id', 'who we ar', 'profil', 'the stori', 'skill', 'about u', 'our school began', 'highlight', 'found', 'pedagog', 'our caus', 'purpos', 'pedagogi', 'moral', 'direct', 'mission', 'vision', 'belief', 'philosophi', 'principl', 'method', 'credo', 'our ident']


In [8]:
# To use with filtering, create combined dictionary for ideologies:

# Start from an empty set and fold in both dictionary files, one after the other
ideol_dict = load_dict(
    load_dict(set(), dicts_dir + "ess_dict.txt"),
    dicts_dir + "prog_dict.txt",
)

if Debug:
    print(len(ideol_dict), "entries loaded into the combined ideology dictionary.")
    list_dict = sorted(ideol_dict, key=str.lower)  # case-insensitive alphabetical order
    print("First 10 elements of combined ideology dictionary are:\n", list_dict[:10])

481 entries loaded into the combined ideology dictionary.
First 10 elements of combined ideology dictionary are:
 ['abstract think', 'abstract thought', 'account', 'achievement gain', 'achievement gap', 'activi', 'adapt', 'agricult', 'anim', "another's sho"]


In [9]:
# ### Compare parsing by newlines vs. by HTML tags

def parseurl_by_newlines(urlstring):
    """Downloads the page at urlstring, parses it with BS, keeps only the visible text nodes,
    joins them with single spaces, and splits the result into chunks wherever a run of
    three or more whitespace characters occurs. Returns the list of chunks."""

    # Fetch the raw HTML
    with urllib.request.urlopen(urlstring) as response:
        raw_html = response.read()

    # Collect the stripped text of every visible node
    page = BeautifulSoup(raw_html, bsparser)
    visible_strings = [node.strip() for node in page.findAll(text=True) if tag_visible(node)]

    return re.split(r'\s{3,}', u" ".join(visible_strings))


def parseurl_by_tags(urlstring):
    """Reads in HTML from the web using a given website address, urlstring, and splits its
    visible text into chunks along tag boundaries.

    Non-visible tags are ripped out, a random delimiter is inserted between the remaining
    text nodes, newlines are removed, and the text is split on the delimiter.

    Args:
        urlstring: address of the page to download.

    Returns:
        List of non-blank text chunks, with tabs removed.
    """

    with urllib.request.urlopen(urlstring) as url:
        HTML_page = url.read()

    # Random hex delimiter: unlike raw random bytes mapped through chr(), it can never
    # contain "\n", which is stripped below and would silently break the split.
    random_string = os.urandom(32).hex()
    soup = BeautifulSoup(HTML_page, bsparser)

    for tag in soup(['style', 'script', 'head', 'title', 'meta', '[document]']): # Remove non-visible tags
        tag.extract()
    for it in inline_tags:
        # NOTE(review): this searches for tags *named* e.g. "</b>", which never exist, so
        # nothing is removed here. Kept as-is to preserve output; if inline tags should
        # stop splitting chunks, unwrapping them is probably what was intended -- confirm.
        for tag in soup("</" + it + ">"):
            tag.extract()

    visible_text = soup.getText(random_string).replace("\n", "") # Put delimiter between text nodes, eliminate newlines
    chunks = [elem.replace("\t", "") for elem in visible_text.split(random_string)] # Split on delimiter while eliminating tabs
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return [chunk for chunk in chunks if chunk.split()] # Eliminate empty/whitespace-only elements


# Text chunking accuracy of parsing by tags is superior to parsing by newlines:
# Compare each of these with the browser-displayed content of example_page:
if Debug:
    # First the newline-based chunks, then the tag-based chunks, for the same page
    print(
        parseurl_by_newlines(example_page),
        "\n\n",
        parseurl_by_tags(example_page)
    )
    

['', 'Contact Us: 2680 Mabry Dr. 95835 (916) 567-5760 | admin@westlakecharter.com', 'Staff Login', 'About', 'Administration & Support Staff  Employment Opportunities  Business Services  Admission & Enrollment Information  Supply Donations  Hot Lunch Payments  Hot Lunch Menu', 'Board', 'Board Policies  Board Committees  Board Documents', 'WCS', 'Infinite Campus Login  School Dismissal Manager  Ways to Westlake', 'Teachers  BASE  WAVE', 'Meetings  Contact Us  Volunteer Opportunities  Volunteer Hours  Log Volunteer Hours  Spirit Store', 'Calendars', '17/18 School Calendar  18/19 School Calendar', 'About', 'About  Aimee Wells  2013-09-18T18:36:35+00:00', 'About Westlake Charter Schools  Key School Features', 'International Focus  Thematic Curriculum  Artistic Development  Foreign Language Instruction  School-Wide Enrichment Model', 'Core Values', 'Respect  Excellence  Responsibility  Reflective  Global Perspective  Stewardship  Perseverance  Inquisitive  Joyful Learning  Gratitude', 'Calen

In [266]:
# ### Define parsing helper functions

def parsefile_by_tags(HTML_file):

    """Reads in HTML from storage using a given filename, HTML_file, and splits its visible
    text into chunks along tag boundaries.

    Non-visible tags are ripped out, a random delimiter is inserted between the remaining
    text nodes, newlines are removed, and the text is split on the delimiter.

    Args:
        HTML_file: path of the .html file to parse.

    Returns:
        List of non-blank text chunks, with tabs removed and non-breaking spaces
        replaced by regular spaces.
    """

    # Random hex delimiter: unlike raw random bytes mapped through chr(), it can never
    # contain "\n", which is stripped below and would silently break the split.
    random_string = os.urandom(32).hex()

    # Binary mode lets BeautifulSoup detect the page's own encoding instead of assuming
    # the platform default; `with` makes sure the handle gets closed.
    with open(HTML_file, "rb") as html_handle:
        soup = BeautifulSoup(html_handle, bsparser)

    for tag in soup(['style', 'script', 'head', 'title', 'meta', '[document]']): # Remove non-visible tags
        tag.extract()
    for it in inline_tags:
        # NOTE(review): this searches for tags *named* e.g. "</b>", which never exist, so
        # nothing is removed here. Kept as-is to preserve output; if inline tags should
        # stop splitting chunks, unwrapping them is probably what was intended -- confirm.
        for tag in soup("</" + it + ">"):
            tag.extract()

    visible_text = soup.getText(random_string).replace("\n", "") # Put delimiter between text nodes, eliminate newlines
    chunks = [elem.replace("\t", "").replace(u'\xa0', u' ') for elem in visible_text.split(random_string)] # Split on delimiter; drop tabs, normalize non-breaking spaces
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return [chunk for chunk in chunks if chunk.split()] # Eliminate empty/whitespace-only elements


In [267]:
if Debug:
    # Parse the stored example page; example_textlist is reused by the filtering demo below
    example_textlist = parsefile_by_tags(example_file)
    print(
        "Output of parsefile_by_tags:\n\n",
        example_textlist,
        "\n\n"
    )


Output of parsefile_by_tags:

 ['About', 'Administration', 'Admissions', 'News', 'Charter School Information', 'Location', 'Frequently Asked Questions', 'Photos/Videos', 'School Facebook Page', 'Financial Reports', 'Nondiscrimination Policy', 'Academics', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Associated Arts', 'Summer Reading >>', '5th Grade Reading List', '6th Grade Reading List', '7th Grade Reading List', '8th Grade Reading List', 'Parents', 'General Information', 'School Calendar >>', 'Download Calendar', 'PlusPortals', 'Before & After School Care', 'Forms >>', 'New Student Registration Packet', 'Free and Reduced Lunch', 'Student Handbook', 'School Uniform Order Form', 'School Supplies >>', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Food Menu', 'PARCC', 'Inclement Weather Schedule', 'West Side Bus Routes', 'Athletics', 'Coach Contact Info', 'Athletics Schedule', 'Sports News', 'Sports Release', 'Physical Form', 'Student Athlete Contract', 'Student Athlete Gr

In [268]:
def filter_dict_page(pagetext_list, keyslist):

    """Filters webtext of a given .html page, which is parsed and in list format, to only those
    strings within pagetext_list containing an element (word or words) of inputted keyslist.

    Matching is case-insensitive and on whole words/phrases: a key matches only when it is not
    directly preceded or followed by a letter, digit or underscore, so "mission" matches
    "Mission:" but not "admissions", and multi-word keys such as "our school" can match.
    Keys are compared literally with the text (the text itself is not stemmed).

    Args:
        pagetext_list: list of text chunks from one page.
        keyslist: iterable of keys (words or phrases); blank keys are ignored.

    Returns:
        List filteredtext holding each matching chunk once, in page order, with its
        original case (not coerced to lower-case).
    """

    # Build one lower-cased, escaped alternation up front instead of once per chunk
    escaped_keys = [re.escape(str(key).lower()) for key in keyslist if str(key).strip()]
    if not escaped_keys:
        return [] # An empty alternation would match every chunk

    # Lookarounds rather than \b so keys ending in punctuation (e.g. "mission:") still match
    key_pattern = re.compile(r"(?<!\w)(?:" + "|".join(escaped_keys) + r")(?!\w)")

    # append-style collection: keep the whole chunk (extend() would add it character by character)
    filteredtext = [string for string in pagetext_list if key_pattern.search(str(string).lower())]

    return filteredtext


In [269]:
if Debug:
    # Labels now name the function that is actually called (filter_dict_page)
    print("Output of filter_dict_page with keywords:\n\n", filter_dict_page(example_textlist, keys_dict), "\n\n")

    print("Output of filter_dict_page with ideology words:\n\n", filter_dict_page(example_textlist, ideol_dict), "\n\n")


Output of filter_keywords_page with keywords:

 ['21st Century is a charter middle school. We have been a school since 2000. We serve a diverse population of nearly 70 students per grade. All staff bring years of teaching experience into our classrooms, and many have worked together in other settings. We emphasize the core curriculum of Math, Science, Social Studies, and Language Arts, as well as learning experiences in the community, city, and state. Two Associated Arts courses are offered to each student every semester, including music and media programs.', 'Mission', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff. Whenever possible, these lessons will take place in the arena in which they are practiced.', 'Vision'] 


Output of filter_keywords_page with ideology words:

 ['School Uniform Order Form', 'School Uniform Order Form', '21st Century is a charter middle school. We have been a school si

In [270]:
def parse_school(school_dict, school_name, school_address, school_URL, datalocation, parsed, numschools):

    """Parses the downloaded webtext of one school and stores the results in school_dict.

    Every .html file in the school's folder is parsed with parsefile_by_tags(); the parsed
    chunks are then filtered with filter_dict_page() against keys_dict and ideol_dict.

    Keys written to school_dict:
        "webtext": all parsed text chunks from every page;
        "keywords_text": chunks containing an entry of keys_dict;
        "ideology_text": chunks containing an entry of ideol_dict;
        "folder_name": name of the school's folder (school name + state, spaces -> "_");
        "wget_fail_flag": 1 if no folder was found for the school;
        "duplicate_flag": 1 if school_URL had already been parsed;
        "parse_error_flag": 1 if parsing raised unexpectedly or no file could be parsed.

    Args:
        school_dict: dictionary for this school; updated in place.
        school_name, school_address, school_URL: identifying info for the school.
        datalocation: folder holding one subfolder per school.
        parsed: list of URLs parsed so far; school_URL is appended to it.
        numschools: total number of schools, used only for the progress message.

    Returns:
        None. Also increments the global counter itervar.
    """

    global itervar # This allows function to access global itervar counter
    itervar+=1

    print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")

    school_dict["webtext"], school_dict["keywords_text"], school_dict["ideology_text"], school_dict["duplicate_flag"], school_dict["parse_error_flag"], school_dict["wget_fail_flag"] = [], [], [], 0, 0, 0

    # Characters [-8:-6] of the address are taken as the state code,
    # e.g. "556 Washington St, Gary, IN 46402" -> "IN" (assumes the address ends in "ST 12345")
    folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
    school_dict["folder_name"] = folder_name

    school_folder = datalocation + folder_name + "/"

    # Check if folder exists (as given, lower-cased or upper-cased). If not, exit function
    existing_folders = [candidate for candidate in (school_folder, school_folder.lower(), school_folder.upper())
                        if os.path.exists(candidate)]
    if not existing_folders:
        print("!! NO DIRECTORY FOUND matching " + str(school_folder) + ".\n  Aborting parsing function...\n\n")
        school_dict['wget_fail_flag'] = 1
        return
    school_folder = existing_folders[0] # Use the variant that actually exists on disk

    # Check if this URL has already been parsed. If so, skip this school to avoid duplication bias
    if school_URL in parsed:
        print("DUPLICATE URL DETECTED. Skipping " + str(school_name) + "...\n\n")
        school_dict["duplicate_flag"] = 1
        return
    parsed.append(school_URL)

    try:
        # Parse file only if it contains HTML. This is easy: use the "*.html" wildcard pattern--
        # also wget gave the ".html" file extension to appropriate files when downloading (`--adjust-extension` option)
        file_list = list_files(school_folder, ".html")

        if not file_list:
            print("  No .html files found.\n  Aborting parser for " + str(school_name) + "...")
            return

        file_count = 0 # count of files parsed successfully

        for file in file_list:

            if Debug:
                print("    Parsing HTML in " + str(file) + "...")

            try:
                parsed_pagetext = parsefile_by_tags(file) # Parse page text
                if Debug:
                    print("      Successfully parsed page text by tags!")

                keywords_text = filter_dict_page(parsed_pagetext, keys_dict) # Filter parsed file using keywords list
                ideology_text = filter_dict_page(parsed_pagetext, ideol_dict) # Filter parsed file using ideology dictionary

            except Exception as e:
                # One unreadable page should not stop the rest of the school from being parsed
                if Debug:
                    print("      ERROR! Failed to parse file...")
                    print("      ",e)
                continue

            # Store results only once the whole page has been processed
            school_dict["webtext"].extend(parsed_pagetext)
            school_dict["keywords_text"].extend(keywords_text)
            school_dict["ideology_text"].extend(ideology_text)
            file_count+=1

            if Debug:
                print("      Successfully parsed and filtered file " + str(file) + "...")

        if file_count == 0:
            # Every page failed: flag it rather than report success with empty text
            print("    ERROR! Failed to parse & categorize webtext of " + str(school_name))
            school_dict["parse_error_flag"] = 1
            return

        if Debug:
            print("  Parsed page text for " + str(file_count) + " .html file(s) belonging to " + str(school_name) + "...")

        print("SUCCESS! Parsed and categorized website text for " + str(school_name) + "...\n\n")
        return

    except Exception as e:
        print("    ERROR! Failed to parse & categorize webtext of " + str(school_name))
        print("    ",e)
        school_dict["parse_error_flag"] = 1


In [271]:
# ### Preparing data to be parsed

itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data

# Set charter school data file (the corresponding varnames are looked up below).
# When loading a saved list from file we also assume the full charter population.
data_loc = full_schooldata # Run at scale using URL list of full charter population
# data_loc = micro_sample13 # This seems nice for debugging--except directories don't match because different data source

# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile and not dicts_list:
    dicts_list = load_file(input_file)

else:
    # Create dict list from CSV on file, with one dict per school.
    # newline='' is what the csv module asks for, so newlines inside quoted fields are read correctly.
    with open(data_loc, 'r', encoding = 'Latin1', newline = '') as csvfile: # open data file
        dicts_list.extend(csv.DictReader(csvfile)) # append each row (one dict per row) to the list

URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
        
# Note on data structures: each element dicts_list[i] is a dictionary for one school (one CSV row),
# with column names as keys and that school's cell values as values.
# This will be translated into a pandas data frame once the (rather messy) website text is parsed into consistent variables.

In [272]:
# ### Run parsing algorithm on schools (requires access to webcrawl output)

test_dicts = dicts_list[:1] # Limit number of schools to analyze, in order to refine methods

# Debug runs parse only the test subset; otherwise every school is parsed.
# The total passed for the progress message is len(dicts_list) in both cases.
for school in (test_dicts if Debug else dicts_list):
    parse_school(school, school[NAME_var], school[ADDR_var], school[URL_var], wget_dataloc, parsed, len(dicts_list))

Parsing 21st Century Charter Sch of Gary, which is school #1 of 6752...
    Parsing HTML in /home/jovyan/work/wget/parll_wget/21st_Century_Charter_Sch_of_Gary_IN/www.21cchartergary.org/default.tmp.html...
      Successfully parsed page text by tags!
      Successfully parsed and filtered file /home/jovyan/work/wget/parll_wget/21st_Century_Charter_Sch_of_Gary_IN/www.21cchartergary.org/default.tmp.html...
  Parsed page text for 1 .html file(s) belonging to 21st Century Charter Sch of Gary...
SUCCESS! Parsed and categorized website text for 21st Century Charter Sch of Gary...




In [273]:
# Check out results:
# Show the first parsed school from whichever list was processed
print(test_dicts[0] if Debug else dicts_list[0])
    

OrderedDict([('SEARCH', '21st Century Charter Sch of Gary 556 Washington St, Gary, IN 46402'), ('MANUAL_URL', ''), ('ADDRESS', '556 Washington St, Gary, IN 46402'), ('TRUE_URL', 'http://www.21cchartergary.org/'), ('CONFIRMED_CLOSED', '0'), ('SCH_NAME', '21st Century Charter Sch of Gary'), ('OLD_URL', 'http://www.21ccharter.org/'), ('NCESSCH', '1.80E+11'), ('STABR', 'IN'), ('webtext', ['Home', 'About Us', 'About Us', 'Principal Welcome', 'Teachers & Staff', 'School Board', 'Board Minutes', 'Careers', 'News', 'Our Approach', 'Academics', 'Free College', 'Technology', 'Student Life', 'Our Graduates', 'Enroll Your Child', 'Current Parents', 'PowerSchool', 'School Calendar', 'Meal Menus', 'Guidance Corner', 'Odyssey Login', 'PAWSS', 'Student Portal', 'Facebook', 'Twitter', 'Tweets by @twitter', 'dual-diagnosis-help.com', 'Latest News', 'NWI TIMES: U.S. Education Secretary Betsy DeVos puts Gary on the map', '2017 Tours to various Colleges for all in grades 7 to 12. Plan to Attend: Click here

In [None]:
# Save output:
# The timestamp is formatted without spaces or colons (str(datetime) contains both),
# so the resulting file name is also valid on Windows.
if Debug:
    dictfile = "testing_dicts_" + datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    save_to_file(test_dicts, save_dir+dictfile, "JSON")
else:
    dictfile = "school_dicts_" + datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    save_to_file(dicts_list, save_dir+dictfile, "JSON")
