In [38]:
#!/usr/bin/env python
# -*- coding: UTF-8

# Dictionary Analysis on HTML from `wget` run!

## Initializing

In [39]:
# import necessary libraries
import os, re # for navigating file trees and working with strings
import csv # for reading in CSV files
#from glob import glob # for finding files within nested folders--compare with os.walk
import json, pickle # For saving a loading dictionaries, etc. from file with JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages
from unicodedata import normalize # for cleaning text by converting unicode character encodings into readable format

# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS, compared to "html.parser"
bsparser = "lxml"


In [40]:
# ### Set script options

Debug = True # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = True # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!
workstation = False # If working from office PC

if notebook:
    usefile = False # Prompting user for input file is only useful in command-line

inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
               "em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
               "span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML


In [41]:
# ### Set directories

if workstation and notebook:
    dir_prefix = "C:\\Users\\Jaren\\Documents\\Charter-school-identities\\" # One level further down than the others
elif notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"

if workstation and notebook:
    micro_sample13 = dir_prefix + "data\\micro-sample13_coded.csv" #data location for random micro-sample of 300 US charter schools
    URL_schooldata = dir_prefix + "data\\charter_URLs_2014.csv" #data location for 2014 population of US charter schools
    full_schooldata = dir_prefix + "data\\charter_merged_2014.csv"
    example_file = dir_prefix + "data\\example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"
    dicts_dir = dir_prefix + "dicts\\" # Directory in which to find & save dictionary files
    save_dir = dir_prefix + "data\\" # Directory in which to save data files

else:
    wget_dataloc = dir_prefix + "wget/parll_wget/" #data location for schools downloaded with wget in parallel (requires server access)
    example_folder = wget_dataloc + "TWENTY-FIRST_CENTURY_NM/"
    example_file = dir_prefix + "wget/example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

    micro_sample13 = dir_prefix + "Charter-school-identities/data/micro-sample13_coded.csv" #data location for random micro-sample of 300 US charter schools
    URL_schooldata = dir_prefix + "Charter-school-identities/data/charter_URLs_2014.csv" #data location for 2014 population of US charter schools
    full_schooldata = dir_prefix + "Charter-school-identities/data/charter_merged_2014.csv"
    dicts_dir = dir_prefix + "Charter-school-identities/dicts/" # Directory in which to find & save dictionary files
    save_dir = dir_prefix + "Charter-school-identities/data/" # Directory in which to save data files

In [42]:
# Set input file, if any
if usefile and not notebook:
    print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
    answer = input()
    if answer == "Y":
        print("Please indicate file path for dictionary list file.")
        answer2 = input()
        if os.path.exists(answer2):
            input_file = answer2
            usefile = True
        else:
            print("Invalid file path. Aborting script.")
            sys.exit()

    elif answer == "N":
        print("OK! This script will create a new file for this list of dictionaries.")
        usefile = False
    
    else:
        print("Response not interpretable. Aborting script.")
        sys.exit()

In [43]:
# ### Define (non-parsing) helper functions

def get_vars(data):
    """Defines variable names based on the data source called."""
    
    if data==full_schooldata:
        URL_variable = "TRUE_URL"
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS"
    
    elif data==micro_sample13:
        URL_variable = "URL"
        NAME_variable = "SCHNAM"
        ADDR_variable = "ADDRESS"
    
    else:
        try:
            print("Error processing variables from data file " + str(data) + "!")
        except Exception as e:
            print(e)
            print("ERROR: No data source established!\n")
    
    return(URL_variable,NAME_variable,ADDR_variable)


def tag_visible(element):
    """Returns false if a web element has a non-visible tag, 
    i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def webtext_from_files(datalocation):
    """Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
    
    string = ""
    for root, dirs, files in os.walk(datalocation):
        for file in files:
            if file.endswith(".txt"):
                fileloc = open(datalocation+file, "r")
                string = string + (fileloc.read())
    return string


def remove_spaces(file_path):
    """Remove spaces from text file at file_path"""
    
    words = [x for x in open(file_path).read().split() if x != ""]
    text = ""
    for word in words:
        text += word + " "
    return text


def save_to_file(dicts_list, file, mode):
    """Saves dicts_list to file using JSON or pickle format (whichever was specified)."""
    
    file = str(file)
    mode = str(mode)
    
    try:
        if mode.upper()=="JSON":
            if not file.endswith(".json"):
                file += ".json"
            with open(file, 'w') as outfile:
                json.dump(dicts_list, outfile)
                print("Dicts saved to " + file + " in JSON format!\n")

        elif mode.lower()=="pickle":
            if not file.endswith(".pickle"):
                file += ".pickle"
            with open(file, 'wb') as outfile:
                pickle.dump(dicts_list, outfile)
                print("Dicts saved to " + file + " in pickle format!\n")

        else:
            print("ERROR! Save failed due to improper arguments. These are: file, object to be saved, and file format to save in.\n\
                  Specify either 'JSON' or 'pickle' as third argument ('mode' or file format) when calling this function.")
    
    except Exception as e:
        print(e)
    

def load_file(file):
    """Loads dicts_list (or whatever) from file, using either JSON or pickle format. 
    The created object should be assigned when called."""
    
    file = str(file)
    
    if file.lower().endswith(".json"):
        with open(file,'r') as infile:
            var = json.load(infile)
    
    if file.lower().endswith(".pickle"):
        with open(file,'rb') as infile:
            var = pickle.load(infile)
        
    print(file + " successfully loaded!\n")
    return var


def load_dict(custom_dict, file_path):
    """Loads in a dictionary. Adds each entry from the dict at file_path to the defined set custom_dict (the input), 
    which can also be an existing dictionary. This allows the creation of combined dictionaries!"""

    with open(file_path) as file_handler:
        line = file_handler.readline()
        while line:
            custom_dict.add(stemmer.stem(line.replace("\n", ""))) # Add line after stemming dictionary entries and eliminating newlines
            line = file_handler.readline() # Look for anything else in that line, add that too
    return custom_dict


def list_files(folder_path, *extension):
    """Outputs a list of every file in folder_path or its subdirectories that has a specified extension.
    Prepends specified extension with '.' if it doesn't start with it already.
    If no extension is specified, it just returns all files in folder_path."""
    
    matches = []
    if extension:
        extension = str(extension) # Coerce to string, just in case
    
    if extension and not extension.startswith("."):
        extension = "." + extension
    
    for dirpath,dirnames,filenames in os.walk(folder_path):
        if extension:
            for filename in fnmatch.filter(filenames, "*" + extension): # Use extension to filter list of files
                matches.append(os.path.join(dirpath,filename))
        else:
            for filename in filenames:
                matches.append(os.path.join(dirpath,filename)) # If no extension, just take all files
    return matches


In [44]:
# ### Set parsing keywords

keywords = ['values', 'academics', 'skills', 'purpose',
                       'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
                       'our ideals', 'ideals', 'our cause', 'curriculum','curricular',
                       'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
                       'structure','philosophy', 'philosophical', 'beliefs', 'believe',
                       'principles', 'creed', 'credo', 'values','moral', 'history', 'our story',
                       'the story', 'school story', 'background', 'founding', 'founded',
                       'established','establishment', 'our school began', 'we began',
                       'doors opened', 'school opened', 'about us', 'our school', 'who we are',
                       'our identity', 'profile', 'highlights']

mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value',  'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords =  ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']

# Create sets for each aspect and one for all keywords
mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords =  set(stemmer.stem(word) for word in about_keywords)
all_keywords = set(stemmer.stem(key) for key in keywords)

if Debug:
    print("\nList of keywords:\n", list(all_keywords))



List of keywords:
 ['ideal', 'belief', 'we began', 'profil', 'method', 'our school', 'curriculum', 'school open', 'doors open', 'curricular', 'pedagog', 'direct', 'believ', 'approach', 'found', 'pedagogi', 'histori', 'purpos', 'structur', 'system', 'our stori', 'philosoph', 'our ident', 'our caus', 'model', 'the stori', 'our school began', 'about u', 'establish', 'vision', 'our purpos', 'philosophi', 'background', 'creed', 'who we ar', 'moral', 'academ', 'principl', 'credo', 'skill', 'mission', 'our id', 'valu', 'highlight', 'school stori']


In [45]:
# ### Create dictionaries for each ideology and one for combined ideologies

ess_dict, prog_dict, rit_dict, all_ideol = set(), set(), set(), set()
all_ideol = load_dict(all_ideol, dicts_dir + "ess_dict.txt")
all_ideol = load_dict(all_ideol, dicts_dir + "prog_dict.txt")
ess_dict = load_dict(ess_dict, dicts_dir + "ess_dict.txt")
prog_dict = load_dict(prog_dict, dicts_dir + "prog_dict.txt")
rit_dict = load_dict(rit_dict, dicts_dir + "rit_dict.txt")

if Debug:
    print(len(all_ideol), "entries loaded into the combined ideology dictionary.")
    list_dict = list(all_ideol)
    list_dict.sort(key = lambda x: x.lower())
    print("First 10 elements of combined ideology dictionary are:\n", list_dict[:10])

481 entries loaded into the combined ideology dictionary.
First 10 elements of combined ideology dictionary are:
 ['abstract think', 'abstract thought', 'account', 'achievement gain', 'achievement gap', 'activi', 'adapt', 'agricult', 'anim', "another's sho"]


In [46]:
# ### Define list of tuples: keywords lists and their titles, for dictionary analyses

titles_list = ("mission","curriculum","philosophy","history","about","ideology","keywords")
keysnames_tupzip = zip((mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,\
                              all_ideol,all_keywords), titles_list)

dictsnames_list = ("ess", "prog", "rit", "all_ideol")
dictsnames_tupzip = zip((ess_dict,prog_dict,rit_dict,all_ideol), dictsnames_list)

if Debug:
    print(list(keysnames_tupzip))
    print()
    print(list(dictsnames_tupzip))

[({'vision:', 'vision', 'ideals:', 'our purpos', 'mission', 'object', 'our id', 'cause:', 'goal', 'mission:', 'our caus'}, 'mission'), ({'model', 'approach', 'pedagogi', 'pedagog', 'structur', 'method', 'curriculum', 'system', 'curricular', 'program'}, 'curriculum'), ({'believ', 'credo', 'belief', 'philosophi', 'creed', 'valu', 'moral', 'philosoph', 'principl'}, 'philosophy'), ({'stori', 'the stori', 'our school began', 'establish', 'found', 'we began', 'background', 'histori', 'school open', 'our stori', 'school stori', 'doors open'}, 'history'), ({'overview', 'about u', 'profil', 'our school', 'who we ar', 'highlight', 'our ident', 'general inform'}, 'about'), ({'free-think', 'suspens', 'well-b', 'gap in incom', 'disabl', 'emot', 'measur', 'jung', 'independen', 'surround', 'craftspeopl', 'spoon f', 'unorthodox', 'behavior cod', 'tame', 'impoverish', 'dedicat', 'penal', 'social act', 'prepared for colleg', 'diagnost', 'at-wil', 'culpab', 'own pac', 'spiritu', 'child-cent', 'college co

In [47]:
if Debug and not workstation:
    print("Output of find_best_categories:\n\n", find_best_categories(example_folder), "\n\n" )
    

NameError: name 'find_best_categories' is not defined

In [48]:
# ### Possible inspiration (from full webparsing script):
"""
        try:
            for file in glob(school_folder + "**", recursive=True) if file.endswith(".html"): 
                # Parse file only if it contains HTML. This is easy: wget gave the ".html" file extension to appropriate files when downloading
                #if bool(BeautifulSoup(open(fname), bsparser).find())==True: # More inefficient way to check if file contains HTML, for data not downloaded by wget
                if Debug:
                    print("    Parsing HTML in " + str(file) + "...")
                try:                    
                    parsed_pagetext = parsefile_by_tags(file) # Parse page text (filter too?)
                    school_dict["webtext"].extend(parsed_pagetext) # Add new parsed text to long list

                    mission_text,curr_text,phil_text,hist_text,about_text = "","","","","" # Initialize new additions to school's categories
                    mission_text,curr_text,phil_text,hist_text,about_text = categorize_page(parsed_pagetext) # Parse page text into the five categories
                    school_dict['mission'].extend(mission_text) # Add new text to categories for school
                    school_dict['curriculum'].extend(curr_text)
                    school_dict['philosophy'].extend(phil_text)
                    school_dict['history'].extend(hist_text)
                    school_dict['about'].extend(about_text)
                        
                    school_dict["filtered_text"].extend(filter_keywords_page(parsed_pagetext)) # Filter parsed file using keywords list
                        
                    if Debug:
                        print("    Successfully parsed & categorized file...\n\n")
                        
                except Exception as e:
                    if Debug:
                        print("      ERROR! Failed to parse & categorize file...")
                        print("      ",e)
                    else:
                        continue
                              
            print("  Successfully parsed & categorized website text...\n\n")

        except Exception as e:
            print("    ERROR! Failed to parse & categorize webtext of " + str(school_name))
            print("    ",e)
            school_dict["parse_error_flag"] = 1
            """

'\n        try:\n            for file in glob(school_folder + "**", recursive=True) if file.endswith(".html"): \n                # Parse file only if it contains HTML. This is easy: wget gave the ".html" file extension to appropriate files when downloading\n                #if bool(BeautifulSoup(open(fname), bsparser).find())==True: # More inefficient way to check if file contains HTML, for data not downloaded by wget\n                if Debug:\n                    print("    Parsing HTML in " + str(file) + "...")\n                try:                    \n                    parsed_pagetext = parsefile_by_tags(file) # Parse page text (filter too?)\n                    school_dict["webtext"].extend(parsed_pagetext) # Add new parsed text to long list\n\n                    mission_text,curr_text,phil_text,hist_text,about_text = "","","","","" # Initialize new additions to school\'s categories\n                    mission_text,curr_text,phil_text,hist_text,about_text = categorize_page(pa

## Existing/ non-dictionary-based parsing helper functions

In [49]:
# ### Define parsing helper functions

def parsefile_by_tags(HTML_file):
    
    """Cleans HTML by removing inline tags, ripping out non-visible tags, 
    replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
    Reads in HTML from storage using a given filename, HTML_file."""

    random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
    soup = BeautifulSoup(open(HTML_file), bsparser)
    
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.extract() for s in soup("</" + it + ">")] # Remove inline tags
    
    visible_text = soup.getText(random_string).replace("\n", "") # Replace "p" tags with random string, eliminate newlines
    # Split text into list using random string while also eliminating tabs and converting unicode to readable text:
    visible_text = list(normalize("NFKC",elem.replace("\t","")) for elem in visible_text.split(random_string))
    visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return(visible_text)


In [50]:
if Debug:
    example_textlist = parsefile_by_tags(example_file)
    print("Output of parsefile_by_tags:\n\n", example_textlist, "\n\n")


Output of parsefile_by_tags:

 ["ÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\x1aö']Zè|òa ̄KÔá1⁄2s{mMÙ\x9fú&kÉÔË\x9cp3p\r\roRáÓ\x83\x08B1< ̄K/}n ̄ymB-\x18\x8ck·μÍgðjÜÍç.\x9cä,Ý4^[\x03\x15°\

In [51]:
def filter_keywords_page(pagetext_list, custom_dict):
    
    """Filters webtext of a given .html page, which is parsed and in list format, to only those strings 
    within pagetext_list containing an element (word or words) of inputted custom_dict (assumed lower-case)."""
    
    validcharacters = [' ', '.', '?']
    filteredtext = []  
    
    for string in pagetext_list:
        lowercasestring = string.lower()
        dict_list = list(custom_dict)
        for key in dict_list:
            if key in lowercasestring:
                if key in lowercasestring.split(' '): #check that the word is the whole word not part of another one
                    filteredtext.extend([string.lower()])

    filteredtext = list(set(filteredtext))
    finaltext = []
    for x in filteredtext:
        finaltext.append(x.replace('\xa0', " ")) # Clean up any remaining (non-readable) unicode
    return finaltext


In [52]:
if Debug:
    print("Output of filter_keywords_page:\n\n", filter_keywords_page(example_textlist, all_keywords), "\n\n")


Output of filter_keywords_page:

 ["ôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x15°\x1aö']zè|òa ̄kôá1⁄2s{mmù\x9fú&kéôë\x9cp3p\r\roráó\x83\x08b1< ̄k/}n ̄ymb-\x18\x8ck·μígðjüíç.\x9cä,ý4^[\x03\x1

In [18]:
def categorize_page(pagetext_list): 
    
    """Takes in a list of all the relevant (filtered) text from a given webpage. 
    Categorizes each block of text by scoring based on keyword count, using already-defined lists of keywords per category:
    mission, philosophy, curriculum, history, "about"/general self-description, combined ideology, and all keywords."""
    
    mission_list = []
    curriculum_list = []
    philosophy_list = []
    history_list = []
    about_list = []
    ideol_list = []
    keys_list = []
    
    for string in pagetext_list:
        mission_score, curriculum_score, philosophy_score, history_score, about_score, ideol_score, keys_score = 0, 0, 0, 0, 0, 0, 0
        for word in mission_keywords:
            mission_score+=string.count(word)
            if 'mission' in string.lower():
                mission_score = 2
                
        for word in curriculum_keywords:
            curriculum_score+=string.count(word)
            if 'curriculum' in string.lower():
                curriculum_score = 2
                
        for word in philosophy_keywords:
            philosophy_score+=string.count(word)
            if 'philosophy' in string.lower() or 'value' in string.lower():
                philosophy_score = 2
        
        for word in history_keywords:
            history_score+=string.count(word)
            if 'history' in string.lower():
                history_score = 2
        
        for word in about_keywords:
            about_score+=string.count(word)
            if 'about us' in string.lower() or "about-us" in string.lower():
                about_score = 2
        
        for word in all_ideol:
            ideol_score+=string.count(word)
        
        #for word in all_keywords:
        #    keys_score+=string.count(word)
        
        if mission_score>=2:
            mission_list.append(string)
        if curriculum_score>=2:
            curriculum_list.append(string)
        if philosophy_score>=2:
            philosophy_list.append(string)
        if history_score>=2:
            history_list.append(string)
        if about_score>=2:
            about_list.append(string)
        if ideol_score>=2:
            ideol_list.append(string)
        #if keys_score>=2:
        #    keys_list.append(string)
        if ((mission_score + curriculum_score + philosophy_score + about_score) >=2): 
            keys_list.append(string) # Impute keywords counting using its ideological constitutent elements--which excludes history_score
        
    #return {'mission': mission_list, 'curriculum' : curriculum_list, 'philosophy': philosophy_list, 'history': history_list, 'about': about_list}
    return mission_list, curriculum_list, philosophy_list, history_list, about_list, ideol_list, keys_list


In [19]:
if Debug:
    print("Output of categorize_page:\n\n", categorize_page(example_textlist), "\n\n")


Output of categorize_page:

 (["¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾jW\x1c\x9dù\x122#\x1fÅ|«\x07\x0f\x81ó\\²F\x15a\x1aW}V¦õ¯½I>;»\x8aéß\x8dnð¹áS#4\xad'\x91°B¬ä®ÑîÅÚê93È\x16³7\x08\x7f]\x83¬_¤\x93\x96Ê¾j

In [9]:
# ### Define dictionary matching helper functions

def dict_match(phrase, custom_dict):
    """Performs dictionary analysis, returning number of dictionary hits found.
    Removes punctuation and stems the phrase being analyzed. Compatible with multiple-word dictionary elements."""
    
    max_entry_length = max([len(entry.split()) for entry in custom_dict]) # Get length in words of longest entry in combined dictionary.
    
    # regex to keep only letters and spaces. Effectively removes punctuation
    phrase = re.sub(r'[^\w\s]', '', phrase)
    
    # Do dictionary analysis for word chunks of lengths 1 to max_entry_length
    counts = 0
    for length in range(max_entry_length, 0, -1):
        phrase, len_counts = dict_match_len(phrase, custom_dict, length)
        counts += len_counts
    return phrase, int(counts)

def dict_match_len(phrase, custom_dict, length):
    """Helper function to dict_match. 
    Returns # dictionary hits and updated copy of phrase with dictionary hits removed. 
    Stems phrases before checking for matches."""
    
    splitted_phrase = phrase.split()
    if len(splitted_phrase) < length:
        return phrase, 0
    hits_indices, counts = [], 0
    for i in range(len(splitted_phrase) - length + 1):
        to_stem = ""
        for j in range(length):
            to_stem += splitted_phrase[i+j] + " " # Builds chunk of 'length' words
        stemmed_word = ps.stem(to_stem[:-1]) # stem chunk
        if stemmed_word in custom_dict:
            hits_indices.append(i) # Store the index of the word that has a dictionary hit
            counts += 1
            if Debug:
                print(stemmed_word)
    # Iterate through list of matching word indices and remove the matches
    for i in range(len(hits_indices)-1, -1, -1):
        splitted_phrase = splitted_phrase[:hits_indices[i]] + \
        splitted_phrase[hits_indices[i] + length:]
    modified_phrase = ""
    for sp in splitted_phrase: # Rebuild the modified phrase, with matches removed
        modified_phrase += sp + " "
    return modified_phrase[:-1], counts

In [10]:
def find_best_categories(folder_path):
    
    """Parse through all HTML files in foldername to find and save best pages for each category: 
    mission, curriculum, philosophy, history, about/general self-description."""
    
    list_pages = [file for file in glob(folder_path + "**", recursive=True) if file.endswith(".html")] # Keep only HTML files
    num_pages = len(files_list)
    max_page_score = (-1, -1)
    
    for i in range(num_pages):
        page_text = parsefile_by_tags(list_pages[i])

        if len(page_text) != 0:
            page_score = dict_match(page_text, custom_dict) / len(page_text.split())
            if page_score > max_page_score[0]:
                max_page_score = (page_score, i)
    max_text = open(filtered_file_format.format(max_page_score[1])).read()
    
    print("Page with the highest dictionary score:\n\n" + max_text)
    
    return mission_page,curr_page,phil_page,hist_page,about_page,ideol_page,keywords_page

In [None]:
def find_best_categories(onedict, orgname, files_list, tuplist_zip):
    """description"""
    
    # TO DO: Remodel find_best_categories to work with single keyword list

    for keylist,title in list(tuplist_zip):
        bestvar_name = title + "_best" # assign varname to use as dict key

        school_dict[bestvar_name] = "" # initialize dict key/value pair as empty string
        
    try:
        file_count = 0
        for file in files_list:
            if Debug:
                print("Testing from within file for loop!")
                print(file)
        
            if file.endswith(".html"):
                # Parse file only if it contains HTML. This is easy: wget gave the ".html" file extension to appropriate files when downloading (`--adjust-extension` option)
                #if bool(BeautifulSoup(open(fname), bsparser).find())==True: # More inefficient way to check if file contains HTML, for data not downloaded by wget
                file_count+=1
                if Debug:
                    print("    Parsing HTML in " + str(file) + "...")
                try:                    
                    parsed_pagetext = parsefile_by_tags(file) # Parse page text (filter too?)
                    if Debug:
                        print("      Successfully parsed page text by tags!")
        if Debug:
            print("  Parsed page text for " + str(file_count) + " .html files while counting matches to find " + str(bestvar_name))
    
        # find max dict_match count!
    
    except Exception as e:
        print("    ERROR! Failed to find best pages while parsing webtext of " + str(orgname))
        print("    ",e)
        school_dict["parse_error_flag"] = 1
    
    
        
        

In [9]:
# ### Define dictionary matching helper functions

def dict_match(phrase, custom_dict):
    """Performs dictionary analysis, returning number of dictionary hits found.
    Removes punctuation and stems the phrase being analyzed. Compatible with multiple-word dictionary elements."""
    
    # TO DO: Remodel dict_match to work with single dict
    
    max_entry_length = max([len(entry.split()) for entry in custom_dict]) # Get length in words of longest entry in combined dictionary.
    
    # regex to keep only letters and spaces. Effectively removes punctuation
    phrase = re.sub(r'[^\w\s]', '', phrase)
    
    # Do dictionary analysis for word chunks of lengths 1 to max_entry_length
    counts = 0
    for length in range(max_entry_length, 0, -1):
        phrase, len_counts = dict_match_len(phrase, custom_dict, length)
        counts += len_counts
    return phrase, int(counts)

def dict_match_len(phrase, custom_dict, length):
    """Helper function to dict_match. 
    Returns # dictionary hits and updated copy of phrase with dictionary hits removed. 
    Stems phrases before checking for matches."""
    
    splitted_phrase = phrase.split()
    if len(splitted_phrase) < length:
        return phrase, 0
    hits_indices, counts = [], 0
    for i in range(len(splitted_phrase) - length + 1):
        to_stem = ""
        for j in range(length):
            to_stem += splitted_phrase[i+j] + " " # Builds chunk of 'length' words
        stemmed_word = ps.stem(to_stem[:-1]) # stem chunk
        if stemmed_word in custom_dict:
            hits_indices.append(i) # Store the index of the word that has a dictionary hit
            counts += 1
            if Debug:
                print(stemmed_word)
    # Iterate through list of matching word indices and remove the matches
    for i in range(len(hits_indices)-1, -1, -1):
        splitted_phrase = splitted_phrase[:hits_indices[i]] + \
        splitted_phrase[hits_indices[i] + length:]
    modified_phrase = ""
    for sp in splitted_phrase: # Rebuild the modified phrase, with matches removed
        modified_phrase += sp + " "
    return modified_phrase[:-1], counts

In [None]:
# Inspiration for dict best matching?

"""
school_dict['mission_text'],school_dict['curriculum_text'],school_dict['philosophy_text'],school_dict['history_text'],school_dict['about_text'],school_dict['ideology_text'],school_dict['keywords_text'] = "","","","",""

mission_text,curr_text,phil_text,hist_text,about_text,ideol_text,keys_text = "","","","","","","" # Initialize new additions to school's categories
                        
mission_text,curr_text,phil_text,hist_text,about_text,ideol_text,keys_text = categorize_page(parsed_pagetext) # Parse page text into the seven categories
                        
school_dict['mission_text'].append(mission_text) # Add new text to categories for school
school_dict['curriculum_text'].append(curr_text)
school_dict['philosophy_text'].append(phil_text)
school_dict['history_text'].append(hist_text)
school_dict['about_text'].append(about_text)
school_dict['ideology_text'].append(ideol_text)
school_dict['keywords_text'].append(keys_text)
"""

In [22]:
def parse_school(school_dict, school_name, school_address, school_URL, datalocation, parsed, itervar, numschools):
    
    """This core function parses webtext for a given school, using helper functions to run analyses and then saving multiple outputs to school_dict:
    counts of the number of matches between all text from a school's html pages and keywords from a defined keyword list, find dict_count();
    and text contents of those individual pages best matching such keywords, via find_best_categories (in development).
    
    For the sake of parsimony and manageable script calls, OTHER similar functions/scripts collect these additional outputs: 
    full (partially cleaned) webtext, by parsing webtext of each .html file (removing inline tags, etc.) within school's folder, via parsefile_by_tags();
    filtered webtext, by keeping only those parsed text elements containing a keyword in previously defined keywords list, via filter_keywords_page();
    and parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps()."""
    
    global itervar # This allows function to access global itervar counter
    itervar+=1
    
    print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
    
    #school_dict['mission_best'],school_dict['curriculum_best'],school_dict['philosophy_best'],school_dict['history_best'],\
    #school_dict['about_best'],school_dict['ideology_best'],school_dict['keywords_best'] = "","","","","","",""
    #school_dict['ess_count'],school_dict['prog_count'],school_dict['rit_count'] = 0,0,0
    school_dict['ess_strength'],school_dict['prog_strength'] = 0,0
    if not usefile:
        school_dict["duplicate_flag"], school_dict["parse_error_flag"] = 0, 0
    
    folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
    school_folder = datalocation + folder_name + "/"
    
    # Check if folder exists. If not, exit function
    if not (os.path.exists(school_folder) or os.path.exists(school_folder.lower()) or os.path.exists(school_folder.upper())):
        print("!! NO DIRECTORY FOUND matching " + str(school_folder) + ".\n  Aborting parsing function...\n\n")
        school_dict['wget_fail_flag'] = 1
        return
    
    if school_URL not in parsed: #check if this URL has already been parsed. If so, skip this school to avoid duplication bias
        parsed.append(school_URL)
    
        find_best_categories(school_dict, school_name, file_list, keysnames_tupzip) # find pages corresponding to best categories for each keyword category in keysnames_tupzip
        
        try:
            file_count = 0 # initialize count of files parsed
            
            # Parse file only if it contains HTML. This is easy: use the "*.html" wildcard pattern--
            # also wget gave the ".html" file extension to appropriate files when downloading (`--adjust-extension` option)
            # Less efficient ways to check if files contain HTML (e.g., for data not downloaded by wget):
            # if bool(BeautifulSoup(open(fname), bsparser).find())==True: # if file.endswith(".html"):
            # Another way to do this, maybe faster but broken: files_iter = iglob(school_folder + "**/*.html", recursive=True)
            
            file_list = list_files(school_folder, ".html")
            
            if file_list==(None or school_folder) or not file_list:
                print("ERROR! File gathering function broken!\n  Aborting parser for " + str(school_name) + "...")
                return
            
            elif file_list==("" or []):
                print("  No .html files found.\n  Aborting parser for " + str(school_name) + "...")
                return
            
            # Find and save best pages for each of the categories:
            #for keywords,title in list(keysnames_tupzip):
            #    keylist_name = title + "_best"
            #    school_dict[keylist_name].extend(find_best_categories(school_folder,keywords))
                
            
            """
            mission_best,curr_best,phil_best,hist_best,about_best = "","","","","" # Initialize new dict elements
            mission_best,curr_best,phil_best,hist_best,about_best,ideol_best,keywords_best = find_best_categories(school_folder) # Parse page text into the five categories
            school_dict['mission_best'].extend(mission_best)
            school_dict['curriculum_best'].extend(curr_best)
            school_dict['philosophy_best'].extend(phil_best)
            school_dict['history_best'].extend(hist_best)
            school_dict['about_best'].extend(about_best)
            school_dict['ideology_best'].extend(ideol_best)
            school_dict['keywords_best'].extend(keywords_best)
            """
            
            #print("SUCCESS! Categorized website text for " + str(school_name) + "...\n")
            
        except Exception as e:
            print("    ERROR! Failed to find best pages while parsing webtext of " + str(school_name))
            print("    ",e)
            school_dict["parse_error_flag"] = 1
        
        try:
            for dict,name in list(dictsnames_tupzip):
                dict_name = name + "_count"
                school_dict[dict_name] = dict_count(school_folder,dict)
                
            """    
            school_dict['ess_count'] = dict_match(school_folder, ess_dict) # TO DO: Run this with phrase or for whole school? Where does phrase come from?
            school_dict['prog_count'] = dict_match(school_folder, prog_dict)
            school_dict['rit_count'] = dict_match(school_folder, rit_dict)
            """
            
            school_dict['ess_strength'] = float(school_dict['ess_count'])/float(school_dict['rit_count'])
            school_dict['prog_strength'] = float(school_dict['prog_count'])/float(school_dict['rit_count'])
        
        except:
            print("    ERROR! Failed to count number of dict matches while parsing webtext of " + str(school_name))
            print("    ",e)
            school_dict["parse_error_flag"] = 1
            return
        
        # TO DO: Build in other dictionary approaches here?
    
    else:
        print("DUPLICATE URL DETECTED. Skipping " + str(school_name) + "...\n\n")
        school_dict["duplicate_flag"] = 1
        return
    

In [None]:
# ### Preparing data to be parsed

itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data

# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile and not dicts_list:
    dicts_list = load_file(input_file)
    data_loc = full_schooldata # If loading data, assume we're running on full charter population

else:
    # set charter school data file and corresponding varnames:
    
    data_loc = full_schooldata # Run at scale using URL list of full charter population
    # data_loc = micro_sample13 # This seems nice for debugging--except directories don't match because different data source
        
    # Create dict list from CSV on file, with one dict per school
    with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
        reader = csv.DictReader(csvfile) # create a reader
        for row in reader: # loop through rows
            dicts_list.append(row) # append each row to the list
        
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
        
# Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
# This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables

In [None]:
# ### Run parsing algorithm on schools (requires access to webcrawl output)

test_dicts = dicts_list[0] # Limit number of schools to analyze, in order to refine methods

if Debug:
    for school in test_dicts:
        parse_school(school, school[NAME_var], school[ADDR_var], school[URL_var], data_loc, parsed, itervar, len(dicts_list))
        
else:
    for school in dicts_list:
        parse_school(school, school[NAME_var], school[ADDR_var], school[URL_var], data_loc, parsed, itervar, len(dicts_list))

In [None]:
# Check out results:
if Debug:
    print(test_dicts[0])
else:
    print(dicts_list[0])
    

In [None]:
# Save output:
if Debug:
    dictfile = "testing_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_to_file(test_dicts, save_dir+dictfile, "JSON")
else:
    dictfile = "school_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_to_file(dicts_list, save_dir+dictfile, "JSON")