In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8

# Removing HTML Overlaps from `wget` run!

## Initializing

In [13]:
# import necessary libraries
import os, re # for navigating file trees and working with strings
import csv # for reading in CSV files
from glob import glob # for finding files within nested folders
import json, pickle # For saving a loading dictionaries, etc. from file with JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages

# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS
bsparser = "lxml"


In [18]:
# ### Set script options

Debug = True # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = True # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!

if notebook:
    usefile = False # Prompting user for input file is only useful in command-line

inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
               "em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
               "span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML

"""
# Set parser for BeautifulSoup to use depending on whether code is running in notebook or not 
# (notebooks don't have faster lxml parser installed)
if notebook:
    bsparser = "html.parser"
else:
    bsparser = "lxml"
"""

'\n# Set parser for BeautifulSoup to use depending on whether code is running in notebook or not \n# (notebooks don\'t have faster lxml parser installed)\nif notebook:\n    bsparser = "html.parser"\nelse:\n    bsparser = "lxml"\n'

In [4]:
# ### Set directories

if notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"
    
wget_dataloc = dir_prefix + "wget/parll_wget/" #data location for schools downloaded with wget in parallel
micro_sample13 = dir_prefix + "Charter-school-identities/data/micro-sample13_coded.csv" #data location for random micro-sample of 300 US charter schools
full_schooldata = dir_prefix + "Charter-school-identities/data/charter_URLs_2014.csv" #data location for 2014 population of US charter schools
save_dir = dir_prefix + "Charter-school-identities/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"
example_folder = wget_dataloc + "TWENTY-FIRST_CENTURY_NM/"
example_file = dir_prefix + "wget/example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"


In [5]:
# ### Set parsing keywords

keywords = ['values', 'academics', 'skills', 'purpose',
                       'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
                       'our ideals', 'ideals', 'our cause', 'curriculum','curricular',
                       'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
                       'structure','philosophy', 'philosophical', 'beliefs', 'believe',
                       'principles', 'creed', 'credo', 'values','moral', 'history', 'our story',
                       'the story', 'school story', 'background', 'founding', 'founded',
                       'established','establishment', 'our school began', 'we began',
                       'doors opened', 'school opened', 'about us', 'our school', 'who we are',
                       'our identity', 'profile', 'highlights']

mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value',  'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords =  ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']

mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords =  set(stemmer.stem(word) for word in about_keywords)


In [6]:
# Set input file, if any
if usefile and not notebook:
    print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
    answer = input()
    if answer == "Y":
        print("Please indicate file path for dictionary list file.")
        answer2 = input()
        if os.path.exists(answer2):
            input_file = answer2
            usefile = True
        else:
            print("Invalid file path. Aborting script.")
            sys.exit()

    elif answer == "N":
        print("OK! This script will create a new file for this list of dictionaries.")
        usefile = False
    
    else:
        print("Response not interpretable. Aborting script.")
        sys.exit()

In [7]:
# ### Define (non-parsing) helper functions

def get_vars(data):
    """Defines variable names based on the data source called."""
    
    if data==full_schooldata:
        URL_variable = "TRUE_URL"
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS"
    
    elif data==micro_sample13:
        URL_variable = "URL"
        NAME_variable = "SCHNAM"
        ADDR_variable = "ADDRESS"
    
    else:
        try:
            print("Error processing variables from data file " + str(data) + "!")
        except Exception as e:
            print(e)
            print("ERROR: No data source established!\n")
    
    return(URL_variable,NAME_variable,ADDR_variable)


def tag_visible(element):
    """Returns false if a web element has a non-visible tag, 
    i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def webtext_from_files(datalocation):
    """Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
    
    string = ""
    for root, dirs, files in os.walk(datalocation):
        for file in files:
            if file.endswith(".txt"):
                fileloc = open(datalocation+file, "r")
                string = string + (fileloc.read())
    return string


def remove_spaces(file_path):
    """Remove spaces from text file at file_path"""
    
    words = [x for x in open(file_path).read().split() if x != ""]
    text = ""
    for word in words:
        text += word + " "
    return text


def save_to_file(dicts_list, file, mode):
    """Saves dicts_list to file using JSON or pickle format (whichever was specified)."""
    
    file = str(file)
    
    try:
        if mode=="JSON":
            if not file.endswith(".json"):
                file += ".json"
            with open(file, 'wb') as outfile:
                json.dump(dicts_list, outfile)
            #pickle.dump(dicts_list, outfile)
                print(dicts_list + " successfully saved to " + file + "in JSON format!\n")

        elif mode=="pickle":
            if not file.endswith(".pickle"):
                file += ".pickle"
            with open(file, 'wb') as outfile:
                pickle.dump(dicts_list, outfile)
                print(dicts_list + " successfully saved to " + file + " in pickle format!\n")

        else:
            print("ERROR! Save failed due to improper arguments. These are: file, object to be saved, and file format to save in.\n\
                  Specify either 'JSON' or 'pickle' as third argument ('mode' or file format) when calling this function.")
    
    except Exception as e:
        print(e)
    

def load_file(file):
    """Loads dicts_list (or whatever) from file, using either JSON or pickle format. 
    The created object should be assigned when called."""
    
    file = str(file)
    
    with open(file,'rb') as infile:
        if file.endswith(".json"):
            var = json.load(infile)
        if file.endswith(".pickle"):
            var = pickle.load(infile)
        print(file + " successfully loaded!\n")
    return var


In [8]:
# ### Define parsing helper functions

def parsefile_by_tags(HTML_file):
    
    """Cleans HTML by removing inline tags, ripping out non-visible tags, 
    replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
    Reads in HTML from storage using a given filename, HTML_file."""

    random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
    soup = BeautifulSoup(open(HTML_file), bsparser)
    
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.extract() for s in soup("</" + it + ">")] # Remove inline tags
    
    visible_text = soup.getText(random_string).replace("\n", "") # Replace "p" tags with random string, eliminate newlines
    visible_text = list(elem.replace("\t","") for elem in visible_text.split(random_string)) # Split text into list using random string while eliminating tabs
    visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return(visible_text)


In [14]:
if Debug:
    example_textlist = parsefile_by_tags(example_file)
    print("Output of parsefile_by_tags:\n\n", example_textlist, "\n\n")


Output of parsefile_by_tags:

 ['þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\x19µÜyI\x1a¤º\x01®/Õxá5\x12\x83\x0c\x183À&\x12\x87÷á;¹§^Ó8¨¦j·þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\x19µÜyI\x1a¤º\x01®/Õxá5\x12\x83\x0c\x183À&\x12\x87÷á;¹§^Ó8¨¦j·þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\x19µÜyI\x1a¤º\x01®/Õxá5\x12\x83\x0c\x183À&\x12\x87÷á;¹§^Ó8¨¦j·þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\x19µÜyI\x1a¤º\x01®/Õxá5\x12\x83\x0c\x183À&\x12\x87÷á;¹§^Ó8¨¦j·þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\x19µÜyI\x1a¤º\x01®/Õxá5\x12\x83\x0c\x183À&\x12\x87÷á;¹§^Ó8¨¦j·þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\x19µÜyI\x1a¤º\x01®/Õxá5\x12\x83\x0c\x183À&\x12\x87÷á;¹§^Ó8¨¦j·þµ\x8f\x10"\x963\x8d\x96¢î\'ù6V\x94ð~~µÐ\x08ÏØZÞH\x1b\x82\x14ÿL\x0b\x1e\x05\x14\x91½\

In [80]:
def parse_remove_overlaps(folder_path, schoolname):
    
    """Filters out overlapping parts of each page, returning combined list of non-redundant text elements from all pages.
    Removes superfluous text elements common to multiple pages at either the beginning or end: namely,
    this eliminates menus, headers, and footers, which are redundant elements not useful for analysis. 
    
    We do this by making one pass through all the pages, in which we store text and eliminate duplicate pages.
    We store the text both forwards (start to end) and backwards (end to start), 
    in order to find duplicate headers (or menus) and footers, respectively.
    Then, we do a second pass through the text to find the longest common headers and footers and remove them from each page.
    
    TO DO: Look for overlap matches only between current page [i] and another page [j] IF page [j] hasn't been parsed yet. 
    (Whereas currently each page i searches for overlap matches with every other page j, even if page j has already been used for this
    and therefore any overlaps would have already been removed. Two-way parsing may be what we're after?)
    Hopefully, this will eliminate redundant comparisons and thereby make function faster!
    Probably best way to do this is by removing pages[i] after it's been used as a comparison. But I'm not sure."""
    
    #num_pages = len([f for f in os.listdir(folder_path) if f[0] != "."])
    list_pages = [file for file in glob(folder_path + "**", recursive=True) if file.endswith(".html")] # Keep only HTML files
    num_pages = len(list_pages)
    pages, reverse_pages, pages_set, filtered_site_text = [], [], set(), []
    
    # First iterate through each page to find overlaps
    for file in list_pages:

        if Debug:
            print("  Now reading this HTML page to use in overlap search: " + file)
            
        page_text = "\n".join(parsefile_by_tags(file)) # Parse HTML in file, join into string via newlines

        # Check if page is a duplicate. If so, store it only as an empty file. 
        # As compared to ignoring these pages completely, storing as empty preserves the num_pages count.
        if page_text in pages_set:
            pages.append("")
            reverse_pages.append("")
            continue

        pages.append(page_text) # Save pages in this list going forwards, from start to end. Use this to eliminate headers
        reverse_pages.append(page_text[::-1]) # Save pages in this list going forwards, from end to start. Use this to eliminate footers
        pages_set.add(page_text) # Save pages to a set. Use this to check for duplicates

    # Iterate again through each page to only keep non-overlapping phrases
    for i in range(len(pages)):
        
        try:
            # Initialize indices: start_index indicates how many characters into a page, 
            #  from start to end, is shared with at least one other page (forwards). 
            # end_index indicates the same, but from end to start (backwards).
            start_index, end_index = -1, -1 
            filtered_page_text = "" # Initialize text string holding the full site text with all overlaps removed from each page

            filtered_page_text = pages[i]

            # Find longest number of characters shared by this file and at least one other by going forwards, i.e. from start to end:
            start_index = max([len(os.path.commonprefix([pages[i], pages[j]])) \
                               for j in range(len(pages)) if i != j]) #? and j not in overlaps_removed_list
            if start_index != 0:
                # Round down start_index to beginning of current word
                while start_index > 0 and filtered_page_text[start_index] != " ":
                    start_index -=1
                if filtered_page_text[start_index] == " ":
                    start_index += 1
                filtered_page_text = filtered_page_text[start_index:]

            # Find longest number of characters shared by this file and at least one other by going backwards, i.e. from end to start:
            end_index = max([len(os.path.commonprefix([reverse_pages[i], reverse_pages[j]]))\
                             for j in range(len(reverse_pages)) if i != j])
            if end_index != 0:
                # Round up end_index to end of current word
                while end_index >= 0 and filtered_page_text[-end_index] != " ":
                    end_index -=1
                filtered_page_text = filtered_page_text[:-end_index]

            if Debug and filtered_page_text != "":
                print("\n    Removed overlaps for HTML page #" + str(i) + " of " + str(len(pages)) + ", which is probably " + \
                "this file:\n    " + str(list_pages[i]) + "\n    and contains the following text:\n")
                print(filtered_page_text)

            filtered_site_text.extend(filtered_page_text + " ") # Add the filtered_page_text (all overlaps removed) to long string of site text
        
        except Exception as e:
            print("    ERROR! Encountered this exception:",e)

    print("Successfully removed menus, headers, and footers and merged all HTML pages for " + str(schoolname) + "!\n")
    return filtered_site_text


In [81]:
if Debug:
    print("Output of remove_overlaps:\n\n", remove_overlaps(example_folder, example_schoolname), "\n\n")
    

KeyboardInterrupt: 

In [82]:
def parse_school_overlaps(school_dict, school_name, school_address, school_URL, datalocation, parsed, itervar, numschools):
    
    """Parses webtext for a given school, using helper functions to run analyses and then saves to school_dict
    parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps()."""
    
    itervar+=1
    print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
    
    school_dict["webtext_nooverlaps"], school_dict["duplicate_flag"], school_dict["parse_error_flag"] = [], 0, 0
    
    folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
    school_folder = datalocation + folder_name + "/"
    
    if school_URL not in parsed: #check if this URL has already been parsed. If so, skip this school to avoid duplication bias
        parsed.append(school_URL)
    
        try:
            # Filter out overlapping parts of each page, save full string in dict element "parsed_text":
            school_dict["webtext_nooverlaps"] = parse_remove_overlaps(school_folder, folder_name)
            
        except Exception as e:
            print("    ERROR! Failed to remove overlaps while parsing webtext of " + str(school_name))
            print("    ",e)
            school_dict["parse_error_flag"] = 1
            return
    
    else:
        print("DUPLICATE URL DETECTED. Skipping " + str(school_name) + "...\n\n")
        school_dict["duplicate_flag"] = 1
        return
    
    print("SUCCESS! Parsed, categorized, and removed overlaps from website text for " + str(school_name) + "...\n\n")
    return


In [83]:
# ### Preparing data to be parsed

itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data

# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile:
    dicts_list = load_file(input_file)
    data_loc = full_schooldata # If loading data, assume we're running on full charter population

else:
    # set charter school data file and corresponding varnames:
    if Debug:
        data_loc = micro_sample13 # Run on micro-sample first, for debugging purposes
    else:
        data_loc = full_schooldata # Run at scale using URL list of full charter population
        
    # Create dict list from CSV on file, with one dict per school
    with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
        reader = csv.DictReader(csvfile) # create a reader
        for row in reader: # loop through rows
            dicts_list.append(row) # append each row to the list
        
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
        
# Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
# This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables

In [84]:
# ### Run parsing algorithm on schools

test_dicts = dicts_list[:1] # Limit number of schools to analyze, in order to refine methods

if Debug:
    for school in test_dicts:
        parse_school_overlaps(school, school[NAME_var], school[ADDR_var], school[URL_var], data_loc, parsed, itervar, len(dicts_list))
        
else:
    for school in dicts_list:
        parse_school_overlaps(school, school[NAME_var], school[ADDR_var], school[URL_var], data_loc, parsed, itervar, len(dicts_list))

Parsing RICHLAND TWO CHARTER HIGH, which is school #1 of 300...
Successfully removed menus, headers, and footers and merged all HTML pages for RICHLAND_TWO_CHARTER_HIGH_MB!

SUCCESS! Parsed, categorized, and removed overlaps from website text for RICHLAND TWO CHARTER HIGH...




In [85]:
# Check out results:
if Debug:
    print(test_dicts[0])
else:
    print(dicts_list[0])
    

OrderedDict([('MS_ID', '1'), ('LEANM', 'RICHLAND 02'), ('SEARCH', 'RICHLAND TWO CHARTER HIGH 750 OLD CLEMSON RD, COLUMBIA, SC'), ('CER_NAME', 'Richland Two Charter High School'), ('URL', 'https://www.richland2.org/charterhigh/'), ('ADDRESS', '750 OLD CLEMSON RD, COLUMBIA, SC'), ('CUSTOM_ID', 'SC600'), ('LEVEL', '3'), ('YEAR_OPEN_CER', '2010'), ('CER_MS_2012', 'Flexible schedule allows students to work and explore different careers while receiving their high school diploma.'), ('SURVYEAR', '2013'), ('NCESSCH', '4.50E+11'), ('FIPST', '45'), ('LEAID', '4503390'), ('SCHNO', '1554'), ('STID', '4002'), ('SEASCH', '600'), ('SCHNAM', 'RICHLAND TWO CHARTER HIGH'), ('PHONE', '8034191348'), ('MSTREE', '750 OLD CLEMSON ROAD'), ('MCITY', 'COLUMBIA'), ('MSTATE', 'SC'), ('MZIP', '29229'), ('MZIP4', '0'), ('LSTREE', '750 OLD CLEMSON RD'), ('LCITY', 'COLUMBIA'), ('LSTATE', 'SC'), ('LZIP', '29229'), ('LZIP4', '0'), ('TYPE', '1'), ('STATUS', '1'), ('UNION', '0'), ('ULOCAL', '21'), ('LATCOD', '34.1231'), 

In [None]:
# Save output:
if Debug:
    dictfile = "testing_dicts_" + str(datetime.today())
    save_to_file(test_dicts, save_dir+dictfile, "JSON")
else:
    dictfile = "school_dicts_" + str(datetime.today())
    save_to_file(dicts_list, save_dir+dictfile, "JSON")
