In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8

# Parsing & Categorizing HTML from `wget` run!

## Initializing

In [2]:
# import necessary libraries
import os, re # for navigating file trees and working with strings
import csv # for reading in CSV files
from glob import glob # for finding files within nested folders
import json, pickle # For saving a loading dictionaries, etc. from file with JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages

# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS
bsparser = "lxml"


In [3]:
# ### Set script options

Debug = True # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = True # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!

if notebook:
    usefile = False # Prompting user for input file is only useful in command-line

inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
               "em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
               "span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML

"""
# Set parser for BeautifulSoup to use depending on whether code is running in notebook or not 
# (notebooks don't have faster lxml parser installed)
if notebook:
    bsparser = "html.parser"
else:
    bsparser = "lxml"
"""

'\n# Set parser for BeautifulSoup to use depending on whether code is running in notebook or not \n# (notebooks don\'t have faster lxml parser installed)\nif notebook:\n    bsparser = "html.parser"\nelse:\n    bsparser = "lxml"\n'

In [4]:
# ### Set directories

if notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"
    
wget_dataloc = dir_prefix + "wget/parll_wget/" #data location for schools downloaded with wget in parallel
micro_sample13 = dir_prefix + "Charter-school-identities/data/micro-sample13_coded.csv" #data location for random micro-sample of 300 US charter schools
full_schooldata = dir_prefix + "Charter-school-identities/data/charter_URLs_2014.csv" #data location for 2014 population of US charter schools
save_dir = dir_prefix + "Charter-school-identities/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"
example_folder = wget_dataloc + "TWENTY-FIRST_CENTURY_NM/"
example_file = dir_prefix + "wget/example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

In [5]:
# Set input file, if any
if usefile and not notebook:
    print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
    answer = input()
    if answer == "Y":
        print("Please indicate file path for dictionary list file.")
        answer2 = input()
        if os.path.exists(answer2):
            input_file = answer2
            usefile = True
        else:
            print("Invalid file path. Aborting script.")
            sys.exit()

    elif answer == "N":
        print("OK! This script will create a new file for this list of dictionaries.")
        usefile = False
    
    else:
        print("Response not interpretable. Aborting script.")
        sys.exit()

In [6]:
# ### Set parsing keywords

keywords = ['values', 'academics', 'skills', 'purpose',
                       'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
                       'our ideals', 'ideals', 'our cause', 'curriculum','curricular',
                       'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
                       'structure','philosophy', 'philosophical', 'beliefs', 'believe',
                       'principles', 'creed', 'credo', 'values','moral', 'history', 'our story',
                       'the story', 'school story', 'background', 'founding', 'founded',
                       'established','establishment', 'our school began', 'we began',
                       'doors opened', 'school opened', 'about us', 'our school', 'who we are',
                       'our identity', 'profile', 'highlights']

mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value',  'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords =  ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']

mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords =  set(stemmer.stem(word) for word in about_keywords)


In [7]:
# ### Define (non-parsing) helper functions

def get_vars(data):
    """Defines variable names based on the data source called."""
    
    if data==full_schooldata:
        URL_variable = "TRUE_URL"
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS"
    
    elif data==micro_sample13:
        URL_variable = "URL"
        NAME_variable = "SCHNAM"
        ADDR_variable = "ADDRESS"
    
    else:
        try:
            print("Error processing variables from data file " + str(data) + "!")
        except Exception as e:
            print(e)
            print("ERROR: No data source established!\n")
    
    return(URL_variable,NAME_variable,ADDR_variable)


def tag_visible(element):
    """Returns false if a web element has a non-visible tag, 
    i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def webtext_from_files(datalocation):
    """Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
    
    string = ""
    for root, dirs, files in os.walk(datalocation):
        for file in files:
            if file.endswith(".txt"):
                fileloc = open(datalocation+file, "r")
                string = string + (fileloc.read())
    return string


def remove_spaces(file_path):
    """Remove spaces from text file at file_path"""
    
    words = [x for x in open(file_path).read().split() if x != ""]
    text = ""
    for word in words:
        text += word + " "
    return text


def save_to_file(dicts_list, file, mode):
    """Saves dicts_list to file using JSON or pickle format (whichever was specified)."""
    
    file = str(file)
    
    try:
        if mode=="JSON":
            if not file.endswith(".json"):
                file += ".json"
            with open(file, 'wb') as outfile:
                json.dump(dicts_list, outfile)
            #pickle.dump(dicts_list, outfile)
                print(dicts_list + " successfully saved to " + file + "in JSON format!\n")

        elif mode=="pickle":
            if not file.endswith(".pickle"):
                file += ".pickle"
            with open(file, 'wb') as outfile:
                pickle.dump(dicts_list, outfile)
                print(dicts_list + " successfully saved to " + file + " in pickle format!\n")

        else:
            print("ERROR! Save failed due to improper arguments. These are: file, object to be saved, and file format to save in.\n\
                  Specify either 'JSON' or 'pickle' as third argument ('mode' or file format) when calling this function.")
    
    except Exception as e:
        print(e)
    

def load_file(file):
    """Loads dicts_list (or whatever) from file, using either JSON or pickle format. 
    The created object should be assigned when called."""
    
    file = str(file)
    
    with open(file,'rb') as infile:
        if file.endswith(".json"):
            var = json.load(infile)
        if file.endswith(".pickle"):
            var = pickle.load(infile)
        print(file + " successfully loaded!\n")
    return var


In [8]:
# ### Compare parsing by newlines vs. by HTML tags

def parseurl_by_newlines(urlstring):
    """Uses BS to parse HTML from a given URL and looks for three newlines to separate chunks of text."""
    
    # Read HTML from a given url:
    with urllib.request.urlopen(urlstring) as url:
        s = url.read()
    
    # Parse raw text from website body:
    soup = BeautifulSoup(s, bsparser)
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    webtext = u" ".join(t.strip() for t in visible_texts)
    
    return re.split(r'\s{3,}', webtext)


def parseurl_by_tags(urlstring):
    """Cleans HTML by removing inline tags, ripping out non-visible tags, 
    replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
    Reads in HTML from the web using a given website address, urlstring."""
    
    with urllib.request.urlopen(urlstring) as url:
        HTML_page = url.read()

    random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
    soup = BeautifulSoup(HTML_page, bsparser)
    
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.extract() for s in soup("</" + it + ">")] # Remove inline tags
    
    visible_text = soup.getText(random_string).replace("\n", "") # Replace "p" tags with random string, eliminate newlines
    visible_text = list(elem.replace("\t","") for elem in visible_text.split(random_string)) # Split text into list using random string while eliminating tabs
    visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
    # Consider joining list elements together with newline in between by prepending with: "\n".join
    
    return(visible_text)


# Text chunking accuracy of parsing by tags is superior to parsing by newlines:
# Compare each of these with the browser-displayed content of example_page:
if Debug:
    print(parseurl_by_newlines(example_page),"\n\n",parseurl_by_tags(example_page))
    

['', 'Contact Us: 2680 Mabry Dr. 95835 (916) 567-5760 | admin@westlakecharter.com', 'Staff Login', 'About', 'Administration & Support Staff  Employment Opportunities  Business Services  Admission & Enrollment Information  Supply Donations  Hot Lunch Payments  Hot Lunch Menu', 'Board', 'Board Policies  Board Committees  Board Documents', 'WCS', 'Infinite Campus Login  School Dismissal Manager  Ways to Westlake', 'Teachers  BASE  WAVE', 'Meetings  Contact Us  Volunteer Opportunities  Volunteer Hours  Log Volunteer Hours  Spirit Store', 'Calendars', '17/18 School Calendar  18/19 School Calendar', 'About', 'About  Aimee Wells  2013-09-18T18:36:35+00:00', 'About Westlake Charter Schools  Key School Features', 'International Focus  Thematic Curriculum  Artistic Development  Foreign Language Instruction  School-Wide Enrichment Model', 'Core Values', 'Respect  Excellence  Responsibility  Reflective  Global Perspective  Stewardship  Perseverance  Inquisitive  Joyful Learning  Gratitude', 'Calen

In [9]:
# ### Define parsing helper functions

def parsefile_by_tags(HTML_file):
    
    """Cleans HTML by removing inline tags, ripping out non-visible tags, 
    replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
    Reads in HTML from storage using a given filename, HTML_file."""

    random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
    soup = BeautifulSoup(open(HTML_file), bsparser)
    
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.extract() for s in soup("</" + it + ">")] # Remove inline tags
    
    visible_text = soup.getText(random_string).replace("\n", "") # Replace "p" tags with random string, eliminate newlines
    visible_text = list(elem.replace("\t","") for elem in visible_text.split(random_string)) # Split text into list using random string while eliminating tabs
    visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return(visible_text)


In [10]:
if Debug:
    example_textlist = parsefile_by_tags(example_file)
    print("Output of parsefile_by_tags:\n\n", example_textlist, "\n\n")


Output of parsefile_by_tags:

 ['About', 'Administration', 'Admissions', 'News', 'Charter School Information', 'Location', 'Frequently Asked Questions', 'Photos/Videos', 'School Facebook Page', 'Financial Reports', 'Nondiscrimination Policy', 'Academics', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Associated Arts', 'Summer Reading >>', '5th Grade Reading List', '6th Grade Reading List', '7th Grade Reading List', '8th Grade Reading List', 'Parents', 'General Information', 'School Calendar >>', 'Download Calendar', 'PlusPortals', 'Before & After School Care', 'Forms >>', 'New Student Registration Packet', 'Free and Reduced Lunch', 'Student Handbook', 'School Uniform Order Form', 'School Supplies >>', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Food Menu', 'PARCC', 'Inclement Weather Schedule', 'West Side Bus Routes', 'Athletics', 'Coach Contact Info', 'Athletics Schedule', 'Sports News', 'Sports Release', 'Physical Form', 'Student Athlete Contract', 'Student Athlete Gr

In [11]:
def filter_keywords_page(pagetext_list):
    
    """Filters webtext of a given .html page, which is parsed and in list format, to only those strings 
    within pagetext_list containing an element (word or words) of a previously defined list of meaningful keywords."""
    
    validcharacters = [' ', '.', '?']
    filteredtext = []  
    
    for string in pagetext_list:
        lowercasestring = string.lower()
        for key in keywords:
            if key in lowercasestring:
                if key in lowercasestring.split(' '): #check that the word is the whole word not part of another one
                    filteredtext.extend([string.lower()])

    filteredtext = list(set(filteredtext))
    finaltext = []
    for x in filteredtext:
        finaltext.append(x.replace('\xa0', " ")) # Clean up any remaining (non-readable) unicode
    return finaltext


In [12]:
if Debug:
    print("Output of filter_keywords_page:\n\n", filter_keywords_page(example_textlist), "\n\n")


Output of filter_keywords_page:

 ['history', 'vision', 'it is the mission of 21st century public academy to continually search for positive learning experiences that enrich students and staff. whenever possible, these lessons will take place in the arena in which they are practiced.', 'mission', '21st century is a charter middle school. we have been a school since 2000. we serve a diverse population of nearly 70 students per grade. all staff bring years of teaching experience into our classrooms, and many have worked together in other settings. we emphasize the core curriculum of math, science, social studies, and language arts, as well as learning experiences in the community, city, and state. two associated arts courses are offered to each student every semester, including music and media programs.', 'academics'] 




In [13]:
def categorize_page(pagetext_list): 
    
    """Takes in a list of all the relevant (filtered) text from a given webpage. 
    Categorizes each block of text by scoring based on keyword count, using already-defined lists 
    of keywords per category--mission, philosophy, curriculum, history, and "about"/general self-description."""
    
    mission_list = []
    curriculum_list = []
    philosophy_list = []
    history_list = []
    about_list = []
    
    for string in pagetext_list:
        mission_score, curriculum_score, philosophy_score, history_score, about_score = 0, 0, 0, 0, 0
        for word in mission_keywords:
            mission_score+=string.count(word)
            if 'mission' in string.lower():
                mission_score = 2
                
        for word in curriculum_keywords:
            curriculum_score+=string.count(word)
            if 'curriculum' in string.lower():
                curriculum_score = 2
                
        for word in philosophy_keywords:
            philosophy_score+=string.count(word)
            if 'philosophy' in string.lower() or 'value' in string.lower():
                philosophy_score = 2
        
        for word in history_keywords:
            history_score+=string.count(word)
            if 'history' in string.lower():
                history_score = 2
        
        for word in about_keywords:
            about_score+=string.count(word)
            if 'about us' in string.lower() or "about-us" in string.lower():
                about_score = 2
        
        if mission_score>=2:
            mission_list.append(string)
        if curriculum_score>=2:
            curriculum_list.append(string)
        if philosophy_score>=2:
            philosophy_list.append(string)
        if history_score>=2:
            history_list.append(string)
        if about_score>=2:
            about_list.append(string)
        elif (mission_score + curriculum_score + philosophy_score + history_score + about_score >=2):
            about_list.append(string)
        
    #return {'mission': mission_list, 'curriculum' : curriculum_list, 'philosophy': philosophy_list, 'history': history_list, 'about': about_list}
    return mission_list, curriculum_list, philosophy_list, history_list, about_list


In [14]:
if Debug:
    print("Output of categorize_page:\n\n", categorize_page(example_textlist), "\n\n")


Output of categorize_page:

 (['Admissions', 'Mission', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff. Whenever possible, these lessons will take place in the arena in which they are practiced.'], ['21st Century is a charter middle school. We have been a school since 2000. We serve a diverse population of nearly 70 students per grade. All staff bring years of teaching experience into our classrooms, and many have worked together in other settings. We emphasize the core curriculum of Math, Science, Social Studies, and Language Arts, as well as learning experiences in the community, city, and state. Two Associated Arts courses are offered to each student every semester, including music and media programs.'], [], ['History'], ['Admissions', '21st Century is a charter middle school. We have been a school since 2000. We serve a diverse population of nearly 70 students per grade. All staff bring years 

In [37]:
def find_best_categories(folder_path):
    
    """Parse through all HTML files in foldername to find and save best pages for each category: 
    mission, curriculum, philosophy, history, about/general self-description."""
    
    list_pages = [file for file in glob(folder_path + "**", recursive=True) if file.endswith(".html")] # Keep only HTML files
    num_pages = len(list_pages)
    max_page_score = (-1, -1)
    
    for i in range(num_pages):
        page_text = parsefile_by_tags(list_pages[i])

        if len(page_text) != 0:
            page_score = dict_match(page_text, custom_dict) / len(page_text.split())
            if page_score > max_page_score[0]:
                max_page_score = (page_score, i)
    max_text = open(filtered_file_format.format(max_page_score[1])).read()
    
    print("Page with the highest dictionary score:\n\n" + max_text)
    
    return mission_page,curr_page,phil_page,hist_page,about_page

In [38]:
if Debug:
    print("Output of find_best_categories:\n\n", find_best_categories(example_folder), "\n\n" )
    

NameError: name 'dict_match' is not defined

In [51]:
glob("/home/jovyan/work/wget/parll_wget/RICHLAND_TWO_CHARTER_HIGH_MB/" + "**", recursive=True)

['/home/jovyan/work/wget/parll_wget/RICHLAND_TWO_CHARTER_HIGH_MB/']

In [52]:
glob("/home/jovyan/work/wget/parll_wget/RICHLAND_TWO_CHARTER_HIGH_MB/" + "**", recursive=True)

[]

In [56]:
print(example_folder)
print(example_folder + "**")
print()
print(glob(example_folder + "**", recursive=True))

/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/
/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/**

['/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp/default?author=1.tmp.html', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp/default?paged=11.tmp.html', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp/default?p=1154.tmp.html', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp/default?p=1170.tmp.html', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp/default?p=1396.tmp.html', '/home/jovyan/work/wget/parll_wget/TWENTY-FIRST_CENTURY_NM/21stcenturypa.com/wp/default?paged=61.tmp.html', '/home/jovyan

In [44]:
def parse_school(school_dict, school_name, school_address, school_URL, datalocation, parsed, itervar, numschools):
    
    """This core function parses webtext for a given school, using helper functions to run analyses and then saving multiple outputs to school_dict:
    full (partially cleaned) webtext, by parsing webtext of each .html file (removing inline tags, etc.) within school's folder, via parsefile_by_tags();
    filtered webtext, by keeping only those parsed text elements containing a keyword in previously defined keywords list, via filter_keywords_page();
    parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps();
    all text associated with specific categories by filtering webtext according to keywords for 
    mission, curriculum, philosophy, history, and about/general self-description, via categorize_page(); and
    contents of those individual pages best matching each of these categories, via find_best_categories."""
    
    itervar+=1
    print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
    
    school_dict['mission'],school_dict['curriculum'],school_dict['philosophy'],school_dict['history'],school_dict['about'] = "","","","",""
    school_dict["webtext"], school_dict["filtered_text"], school_dict["duplicate_flag"], school_dict["parse_error_flag"] = [], [], 0, 0
    
    folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
    school_folder = datalocation + folder_name + "/"
    
    if school_URL not in parsed: #check if this URL has already been parsed. If so, skip this school to avoid duplication bias
        parsed.append(school_URL)
        
        try:
            for file in glob(school_folder + "**", recursive=True):
                if Debug:
                    print("Testing from within file for loop!")
                    print(file)
                if file.endswith(".html"):
                    # Parse file only if it contains HTML. This is easy: wget gave the ".html" file extension to appropriate files when downloading (`--adjust-extension` option)
                    #if bool(BeautifulSoup(open(fname), bsparser).find())==True: # More inefficient way to check if file contains HTML, for data not downloaded by wget
                    if Debug:
                        print("    Parsing HTML in " + str(file) + "...")
                    try:                    
                        parsed_pagetext = parsefile_by_tags(file) # Parse page text (filter too?)
                        if Debug:
                            print("      Successfully parsed page text by tags!")
                        school_dict["webtext"].extend(parsed_pagetext) # Add new parsed text to long list

                        mission_text,curr_text,phil_text,hist_text,about_text = "","","","","" # Initialize new additions to school's categories
                        mission_text,curr_text,phil_text,hist_text,about_text = categorize_page(parsed_pagetext) # Parse page text into the five categories
                        school_dict['mission'].append(mission_text) # Add new text to categories for school
                        school_dict['curriculum'].append(curr_text)
                        school_dict['philosophy'].append(phil_text)
                        school_dict['history'].append(hist_text)
                        school_dict['about'].append(about_text)

                        school_dict["filtered_text"].extend(filter_keywords_page(parsed_pagetext)) # Filter parsed file using keywords list

                        if Debug:
                            print("      Successfully parsed & categorized file...\n\n")
                        
                        continue

                    except Exception as e:
                        if Debug:
                            print("      ERROR! Failed to parse & categorize file...")
                            print("      ",e)
                            continue
                        else:
                            continue
            
            print("SUCCESS! Parsed and categorized website text for " + str(school_name) + "...\n\n")
            return

        except Exception as e:
            print("    ERROR! Failed to parse & categorize webtext of " + str(school_name))
            print("    ",e)
            school_dict["parse_error_flag"] = 1
    
    else:
        print("DUPLICATE URL DETECTED. Skipping " + str(school_name) + "...\n\n")
        school_dict["duplicate_flag"] = 1
        return


In [49]:
# ### Preparing data to be parsed

itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data

# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile:
    dicts_list = load_file(input_file)
    data_loc = full_schooldata # If loading data, assume we're running on full charter population

else:
    # set charter school data file and corresponding varnames:
    if Debug:
        data_loc = micro_sample13 # Run on micro-sample first, for debugging purposes
    else:
        data_loc = full_schooldata # Run at scale using URL list of full charter population
        
    # Create dict list from CSV on file, with one dict per school
    with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
        reader = csv.DictReader(csvfile) # create a reader
        for row in reader: # loop through rows
            dicts_list.append(row) # append each row to the list
        
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
        
# Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
# This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables

In [50]:
# ### Run parsing algorithm on schools

test_dicts = dicts_list[:1] # Limit number of schools to analyze, in order to refine methods

if Debug:
    for school in test_dicts:
        parse_school(school, school[NAME_var], school[ADDR_var], school[URL_var], wget_dataloc, parsed, itervar, len(dicts_list))
        
else:
    for school in dicts_list:
        parse_school(school, school[NAME_var], school[ADDR_var], school[URL_var], wget_dataloc, parsed, itervar, len(dicts_list))

Parsing RICHLAND TWO CHARTER HIGH, which is school #1 of 300...
Testing from within file for loop!
/home/jovyan/work/wget/parll_wget/RICHLAND_TWO_CHARTER_HIGH_MB/
SUCCESS! Parsed and categorized website text for RICHLAND TWO CHARTER HIGH...




In [36]:
# Check out results:
if Debug:
    print(test_dicts[0])
else:
    print(dicts_list[0])
    

OrderedDict([('MS_ID', '1'), ('LEANM', 'RICHLAND 02'), ('SEARCH', 'RICHLAND TWO CHARTER HIGH 750 OLD CLEMSON RD, COLUMBIA, SC'), ('CER_NAME', 'Richland Two Charter High School'), ('URL', 'https://www.richland2.org/charterhigh/'), ('ADDRESS', '750 OLD CLEMSON RD, COLUMBIA, SC'), ('CUSTOM_ID', 'SC600'), ('LEVEL', '3'), ('YEAR_OPEN_CER', '2010'), ('CER_MS_2012', 'Flexible schedule allows students to work and explore different careers while receiving their high school diploma.'), ('SURVYEAR', '2013'), ('NCESSCH', '4.50E+11'), ('FIPST', '45'), ('LEAID', '4503390'), ('SCHNO', '1554'), ('STID', '4002'), ('SEASCH', '600'), ('SCHNAM', 'RICHLAND TWO CHARTER HIGH'), ('PHONE', '8034191348'), ('MSTREE', '750 OLD CLEMSON ROAD'), ('MCITY', 'COLUMBIA'), ('MSTATE', 'SC'), ('MZIP', '29229'), ('MZIP4', '0'), ('LSTREE', '750 OLD CLEMSON RD'), ('LCITY', 'COLUMBIA'), ('LSTATE', 'SC'), ('LZIP', '29229'), ('LZIP4', '0'), ('TYPE', '1'), ('STATUS', '1'), ('UNION', '0'), ('ULOCAL', '21'), ('LATCOD', '34.1231'), 

In [None]:
# Save output:
if Debug:
    dictfile = "testing_dicts_" + str(datetime.today())
    save_to_file(test_dicts, save_dir+dictfile, "JSON")
else:
    dictfile = "school_dicts_" + str(datetime.today())
    save_to_file(dicts_list, save_dir+dictfile, "JSON")
