In [21]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Parsing HTML from `wget` run!

## Initializing

In [86]:
#import necessary libraries
import os, re #for navigating file trees and working with strings
import csv #for reading in CSV files
from glob import glob #for finding files within nested folders
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request #for testing pages

#Import parser
from bs4 import BeautifulSoup #BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment #helps with detecting inline/junk tags when parsing with BS
import lxml #for fast HTML parsing with BS

# Inline (text-level) HTML tags; this list helps with eliminating junk tags when parsing HTML.
inline_tags = [
    "b", "big", "i", "small", "tt",
    "abbr", "acronym", "cite", "dfn", "em",
    "kbd", "strong", "samp", "var", "bdo",
    "map", "object", "q", "span", "sub",
    "sup",
]

In [59]:
# Set directories and data-file locations used by the rest of the notebook.

# Data location for schools downloaded with wget in parallel
wget_dataloc = "/vol_b/data/wget/parll_wget/"

# Data location for the random micro-sample of 300 US charter schools
micro_sample13 = "/vol_b/data/Charter-school-identities/data/micro-sample13_coded.csv"

# Data location for the 2014 population of US charter schools
full_schooldata = "/vol_b/data/Charter-school-identities/data/charter_URLs_2014.csv"

# Web page used to compare the parsing approaches defined below
example_page = "https://westlakecharter.com/about/"

# Keywords (single words and short phrases) used by filterwebsite() to decide
# which chunks of page text are worth keeping. Each entry appears exactly once:
# the earlier version listed 'vision', 'mission' and 'values' twice, which only
# made every chunk get scanned for the same keyword repeatedly.
keywords = [
    'values', 'academics', 'skills', 'purpose', 'direction', 'mission', 'vision',
    'our purpose', 'our ideals', 'ideals', 'our cause',
    'curriculum', 'curricular', 'method', 'pedagogy', 'pedagogical', 'approach',
    'model', 'system', 'structure',
    'philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed',
    'credo', 'moral',
    'history', 'our story', 'the story', 'school story', 'background', 'founding',
    'founded', 'established', 'establishment', 'our school began', 'we began',
    'doors opened', 'school opened',
    'about us', 'our school', 'who we are', 'our identity', 'profile', 'highlights',
]

# Raw keyword lists, one per text category; score_and_categorize() counts how often
# these occur in a block of page text to decide which categories the block belongs to.
mission_keywords = [
    'mission', ' vision ', 'vision:', 'mission:', 'our purpose', 'our ideals',
    'ideals:', 'our cause', 'cause:', 'goals', 'objective',
]
curriculum_keywords = [
    'curriculum', 'curricular', 'program', 'method', 'pedagogy',
    'pedagogical', 'approach', 'model', 'system', 'structure',
]
philosophy_keywords = [
    'philosophy', 'philosophical', 'beliefs', 'believe', 'principles',
    'creed', 'credo', 'value', 'moral',
]
history_keywords = [
    'history', 'story', 'our story', 'the story', 'school story', 'background',
    'founding', 'founded', 'established', 'establishment', 'our school began',
    'we began', 'doors opened', 'school opened',
]
about_keywords = [
    'about us', 'our school', 'who we are', 'overview',
    'general information', 'our identity', 'profile', 'highlights',
]

# Replace each category's keyword list with the set of its Porter stems.
# Every entry (including multi-word phrases) is passed to the stemmer as a single string.
# NOTE(review): some stems may not be substrings of the unstemmed page text that
# score_and_categorize() searches (e.g. stems of words ending in "y") -- confirm
# whether the page text should be stemmed as well.
mission_keywords = {stemmer.stem(word) for word in mission_keywords}
curriculum_keywords = {stemmer.stem(word) for word in curriculum_keywords}
philosophy_keywords = {stemmer.stem(word) for word in philosophy_keywords}
history_keywords = {stemmer.stem(word) for word in history_keywords}
# Fixed: this line used to iterate over the undefined name `self_keywords` (NameError).
about_keywords = {stemmer.stem(word) for word in about_keywords}

In [159]:
# ### Define helper functions
def get_vars(data):
    """Return the CSV column names that hold each school's URL, name and address.

    Args:
        data: Location of the data file being read. Must be equal to either
            `full_schooldata` or `micro_sample13`.

    Returns:
        A tuple `(URL_variable, NAME_variable, ADDR_variable)` of column names.

    Raises:
        ValueError: If `data` is not one of the known data files.
    """

    if data == full_schooldata:
        return ("TRUE_URL", "SCH_NAME", "ADDRESS")

    if data == micro_sample13:
        return ("URL", "SCHNAM", "ADDRESS")

    # The earlier version only printed a message here and then crashed on the
    # return statement with an UnboundLocalError; fail with a clear error instead.
    raise ValueError("Error processing variables from data file " + str(data) + "!")


def tag_visible(element):
    """Tell whether a parsed text node is something a reader would actually see.

    A node counts as hidden when its parent is one of the non-displayed
    containers listed below, or when the node itself is an HTML comment.
    Returns True for visible nodes and False for hidden ones."""

    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first, exactly as before; the comment check only
    # runs for nodes whose parent is a displayed tag.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden



def filterwebsite(string, keyword_list=None):
    """Keep only the chunks of page text that mention at least one keyword.

    The text is split into chunks wherever three or more whitespace characters
    occur in a row. A chunk is kept when any keyword occurs in it as a whole
    word or whole phrase (case-insensitive), i.e. not as part of a longer word.

    Changes from the earlier version:
      * multi-word keywords such as 'our purpose' can now match (they were
        compared against single-word tokens and therefore never matched);
      * a keyword followed by punctuation ("mission:") now matches;
      * results keep the order in which chunks appear instead of the arbitrary
        order produced by `set`.

    Args:
        string: Raw visible text of a web page.
        keyword_list: Optional iterable of keywords/phrases to look for.
            Defaults to the module-level `keywords` list.

    Returns:
        A list of unique, lower-cased chunks, with non-breaking spaces
        replaced by ordinary spaces.
    """
    if keyword_list is None:
        keyword_list = keywords

    cleaned_keys = [key.strip().lower() for key in keyword_list]
    alternatives = [re.escape(key) for key in cleaned_keys if key]
    if not alternatives:
        return []

    # Lookarounds (rather than \b) so that keywords which start or end with
    # punctuation still get a sensible whole-word boundary.
    keyword_pattern = re.compile(r'(?<!\w)(?:' + '|'.join(alternatives) + r')(?!\w)')

    filteredtext = []
    seen = set()
    for chunk in re.split(r'\s{3,}', string):
        cleaned = chunk.replace('\xa0', ' ').lower()
        if cleaned in seen:
            continue
        if keyword_pattern.search(cleaned):
            seen.add(cleaned)
            filteredtext.append(cleaned)
    return filteredtext


def webtext_from_files(datalocation):
    """Load webtext from text files in local storage.

    Walks `datalocation` recursively and concatenates the contents of every
    file whose name ends in ".txt". Folders and files are visited in sorted
    order so the result is the same on every run.

    Args:
        datalocation: Folder to search (with or without a trailing slash).

    Returns:
        One string holding the contents of all the .txt files found;
        an empty string if there are none.
    """

    pieces = []
    for root, dirs, files in os.walk(datalocation):
        dirs.sort()  # in-place sort makes os.walk descend in a fixed order
        for file in sorted(files):
            if file.endswith(".txt"):
                # Join with `root`, not `datalocation`: the earlier version could
                # not open files that live in subfolders, and never closed them.
                with open(os.path.join(root, file), "r") as fileloc:
                    pieces.append(fileloc.read())
    return "".join(pieces)

In [166]:
# ### Define and compare parsing approaches

def parse_by_newlines(urlstring):
    """Fetch a page, pull out its visible text with BS, and chunk it on whitespace gaps.

    Each visible text node is stripped and the nodes are joined with single
    spaces; the joined text is then split wherever three or more whitespace
    characters occur in a row. Returns the resulting list of strings."""

    # Download the raw HTML:
    with urllib.request.urlopen(urlstring) as response:
        raw_html = response.read()

    # Collect the stripped text of every node that tag_visible() accepts:
    parsed_page = BeautifulSoup(raw_html, "html.parser")
    stripped_nodes = [node.strip() for node in parsed_page.findAll(text=True) if tag_visible(node)]

    return re.split(r'\s{3,}', u" ".join(stripped_nodes))


def parse_by_tags(urlstring):
    """Fetch a page and split its visible text into chunks along HTML tag boundaries.

    Steps:
      1. remove non-visible elements (style, script, head, title, meta) with their contents;
      2. unwrap inline tags (see `inline_tags`) so their text stays joined to the
         surrounding text instead of becoming a chunk of its own;
      3. extract the text with a unique delimiter between the remaining text nodes
         and split on that delimiter;
      4. collapse whitespace inside each chunk and drop empty chunks.

    Changes from the earlier version:
      * the delimiter is a hex string; it used to be built from raw random bytes,
        which could contain newline/tab characters that were then stripped out,
        so the text could not be split reliably;
      * inline tags are actually removed now; the old search for tags named
        e.g. "</b>" never matched anything;
      * newlines and tabs become single spaces instead of being deleted, so words
        on adjacent lines are no longer glued together.

    Args:
        urlstring: URL of the page to fetch.

    Returns:
        A list of non-empty text chunks in document order.
    """

    with urllib.request.urlopen(urlstring) as url:
        HTML_page = url.read()

    soup = BeautifulSoup(HTML_page, "html.parser")

    # Remove non-visible tags together with everything inside them
    for hidden_tag in soup(['style', 'script', 'head', 'title', 'meta', '[document]']):
        hidden_tag.extract()

    # Remove inline tags but keep their contents in place
    for inline_tag in soup(inline_tags):
        inline_tag.unwrap()

    # Re-parse so that text nodes left side by side after unwrapping are merged
    # into one node; otherwise the delimiter would still be inserted between them.
    soup = BeautifulSoup(str(soup), "html.parser")

    # Hex digits only, so the whitespace clean-up below cannot damage the delimiter
    delimiter = os.urandom(32).hex()
    pieces = soup.getText(delimiter).split(delimiter)

    visible_text = []
    for piece in pieces:
        chunk = " ".join(piece.split())  # collapse newlines, tabs and repeated spaces
        if chunk:
            visible_text.append(chunk)
    # Consider joining list elements together with newline by prepending with: "\n".join

    return visible_text

In [169]:
# Parsing by tags has superior chunking accuracy -- print both results to compare:
newline_chunks = parse_by_newlines(example_page)
tag_chunks = parse_by_tags(example_page)
print(newline_chunks, "\n\n", tag_chunks)

['', 'Contact Us: 2680 Mabry Dr. 95835 (916) 567-5760 | admin@westlakecharter.com', 'Staff Login', 'About', 'Administration & Support Staff  Employment Opportunities  Business Services  Admission & Enrollment Information  Supply Donations  Hot Lunch Payments  Hot Lunch Menu', 'Board', 'Board Policies  Board Committees  Board Documents', 'WCS', 'Infinite Campus Login  School Dismissal Manager  Ways to Westlake', 'Teachers  BASE  WAVE', 'Meetings  Contact Us  Volunteer Opportunities  Volunteer Hours  Log Volunteer Hours  Spirit Store', 'Calendars', '17/18 School Calendar  18/19 School Calendar', 'About', 'About  Aimee Wells  2013-09-18T18:36:35+00:00', 'About Westlake Charter Schools  Key School Features', 'International Focus  Thematic Curriculum  Artistic Development  Foreign Language Instruction  School-Wide Enrichment Model', 'Core Values', 'Respect  Excellence  Responsibility  Reflective  Global Perspective  Stewardship  Perseverance  Inquisitive  Joyful Learning  Gratitude', 'Calen

In [90]:
def score_and_categorize(pagetext_list, category_keywords=None):
    """Sort blocks of page text into categories by counting keyword occurrences.

    Takes in a list of all the relevant (filtered) text from a given webpage and
    scores each block for five categories: mission, curriculum, philosophy,
    history, and about/general self-description.

    For each block and category the score is the number of (substring) occurrences
    of that category's keywords in the lower-cased block. If the block contains one
    of the category's anchor phrases (e.g. 'mission', or 'about us'/'about-us'),
    the score is set to 2. A block is added to every category whose score is at
    least 2. A block whose scores add up to at least 2 is also added to 'about',
    even when no single category reaches 2.

    Changes from the earlier version:
      * the about score is accumulated in `about_score`; it used to be added to
        the undefined name `self_score`, which raised an error on the first block;
      * keywords are counted in the lower-cased block, matching the lower-cased
        keyword stems and the case-insensitive anchor checks.

    Args:
        pagetext_list: Iterable of text blocks (strings).
        category_keywords: Optional dict mapping 'mission', 'curriculum',
            'philosophy', 'history' and 'about' to iterables of keywords.
            Defaults to the module-level *_keywords sets.

    Returns:
        A tuple of five lists:
        (mission_list, curriculum_list, philosophy_list, history_list, about_list).
    """

    if category_keywords is None:
        category_keywords = {
            'mission': mission_keywords,
            'curriculum': curriculum_keywords,
            'philosophy': philosophy_keywords,
            'history': history_keywords,
            'about': about_keywords,
        }

    categories = ('mission', 'curriculum', 'philosophy', 'history', 'about')

    # Phrases whose presence alone is enough to place a block in the category
    anchors = {
        'mission': ('mission',),
        'curriculum': ('curriculum',),
        'philosophy': ('philosophy', 'value'),
        'history': ('history',),
        'about': ('about us', 'about-us'),
    }

    results = {category: [] for category in categories}

    for string in pagetext_list:
        lowered = string.lower()

        scores = {}
        for category in categories:
            if any(anchor in lowered for anchor in anchors[category]):
                scores[category] = 2
            else:
                words = category_keywords.get(category, ())
                # Skip empty keywords: str.count('') would count every position
                scores[category] = sum(lowered.count(word) for word in words if word)

        for category in categories[:-1]:
            if scores[category] >= 2:
                results[category].append(string)

        # 'about' doubles as the catch-all for blocks that are relevant overall
        if scores['about'] >= 2 or sum(scores.values()) >= 2:
            results['about'].append(string)

    #return {'mission': mission_list, 'curriculum' : curriculum_list, 'philosophy': philosophy_list, 'history': history_list, 'about': about_list}
    return tuple(results[category] for category in categories)


In [None]:
def score_school(school_dict, school_name, school_address, school_URL, datalocation, iter, numschools):
    """Parse one school's downloaded web pages and store the categorized text in its dict.

    Resets the result fields of `school_dict`, then (unless the school's URL has
    already been handled) looks for HTML files under the school's folder inside
    `datalocation`, parses and filters each page, and adds the text blocks that
    score_and_categorize() assigns to each category to the matching field.

    Relies on the module-level list `parsed` to remember which URLs were handled.

    Changes from the earlier version, which could not run:
      * the folder name is built from `school_name`/`school_address` (the names
        `name` and `address` did not exist);
      * the URL is appended to `parsed`; `parsed += school_URL` would have added
        the URL one character at a time and required a `global` declaration;
      * the category fields start as empty lists and are extended per page; they
        were initialised as strings and updated with an invalid tuple `+=`;
      * the unfinished "parse and filter page text here" step is filled in.

    Args:
        school_dict: Dict for one school (one CSV row); modified in place.
        school_name: Name of the school.
        school_address: Address of the school.
        school_URL: URL of the school's website.
        datalocation: Folder holding one sub-folder per downloaded school website.
        iter: Zero-based count of schools handled before this one (progress message only).
        numschools: Total number of schools (progress message only).

    Returns:
        None. Results are written to `school_dict`; "duplicate_flag" is set to 1
        when the URL was already parsed.
    """

    categories = ('mission', 'curriculum', 'philosophy', 'history', 'about')

    school_number = iter + 1
    print("Parsing " + str(school_name) + ", which is school #" + str(school_number) + " of " + str(numschools) + "...")

    for category in categories:
        school_dict[category] = []
        school_dict[category + '_best'] = ""
    school_dict["webtext"], school_dict["parsed_text"], school_dict["duplicate_flag"] = "", "", 0

    # Check if this URL has already been parsed. If so, skip this school to avoid duplication bias
    if school_URL in parsed:
        print("DUPLICATE URL DETECTED. Skipping " + str(school_name) + "...\n\n")
        school_dict["duplicate_flag"] = 1
        return
    parsed.append(school_URL)

    # NOTE(review): address[-8:-6] is presumably the two-letter state code that
    # precedes a 5-digit ZIP at the end of the address -- confirm against the data.
    folder_name = re.sub(" ", "_", (school_name + " " + school_address[-8:-6]))
    school_folder = datalocation + folder_name + "/"

    for file in glob(school_folder + "*/" + "**", recursive=True):
        # Parse file only if it contains HTML. This is easy: wget has already adjusted extensions appropriately for this
        #if bool(BeautifulSoup(open(fname), "html.parser").find())==True: #More inefficient way to check if file contains HTML, for data not downloaded by wget
        if not file.endswith("html"):
            continue

        print("    Parsing HTML in " + str(file) + "...")
        try:
            # NOTE(review): the original left this step as a placeholder. This
            # assumes the intended pipeline is parse_by_tags -> filterwebsite ->
            # score_and_categorize; the local file is read through a file: URL
            # because parse_by_tags expects a URL -- confirm.
            file_url = "file:" + urllib.request.pathname2url(os.path.abspath(file))
            filtered_text = []
            for chunk in parse_by_tags(file_url):
                filtered_text.extend(filterwebsite(chunk))

            page_lists = score_and_categorize(filtered_text)
            for category, blocks in zip(categories, page_lists):
                school_dict[category] += blocks
        except Exception as e:
            # Best effort: report the problem and carry on with the next file
            print("    ", e)
            print("    ERROR! No webtext gathered for " + str(school_name))

    print("SUCCESS! Parsed and stored website text for " + str(school_name) + "...\n\n")
    return


In [None]:
# ### Preparing data to be parsed

#set charter school data file and corresponding varnames
#data_loc = full_schooldata #run at scale using URL list of full charter population
data_loc = micro_sample13 #run on micro-sample first, for debugging purposes

# NOTE(review): the earlier code sliced the *path string* (micro_sample13[:10]),
# which yields a path that does not exist. Presumably the goal was to debug on the
# first 10 schools, so the limit is applied to the rows instead -- confirm.
# Set to None to use every row.
debug_limit = 10

URL_var,NAME_var,ADDR_var = get_vars(data_loc) #get varnames depending on data source

itervar = 0 #initialize iterator that counts number of schools already parsed
parsed = [] #initialize list of URLs that have already been parsed

#Create dict list from CSV on file, with one dict per school
#(newline='' is what the csv module expects for files handed to a reader)
with open(data_loc, 'r', encoding = 'Latin1', newline='') as csvfile: # open data file
    reader = csv.DictReader(csvfile) # create a reader
    dicts_list = list(reader) # one dict per row

if debug_limit is not None:
    dicts_list = dicts_list[:debug_limit]

#Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
#This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables

In [None]:
# Parse every school in turn.
# - enumerate() supplies the running count: score_school() cannot advance the
#   caller's counter itself, so every school used to be reported as "school #1".
# - the websites live under wget_dataloc; data_loc is the CSV file, not a folder
#   of downloaded pages.
for itervar, school in enumerate(dicts_list):
    score_school(school, school[NAME_var], school[ADDR_var], school[URL_var], wget_dataloc, itervar, len(dicts_list))

In [92]:
# score_and_categorize() returns a tuple of five lists (mission, curriculum,
# philosophy, history, about), so unpack it rather than indexing by name.
# NOTE(review): `westlakelist` is not defined anywhere in this notebook (leftover
# kernel state); presumably it held the filtered text blocks for example_page --
# define it in a cell above before running this one.
(mission_blocks, curriculum_blocks, philosophy_blocks,
 history_blocks, about_blocks) = score_and_categorize(westlakelist)
philosophy_blocks

['westlake charter school is a k-8th grade public elementary school created by parents and educators in natomas. we opened our doors in 2005 and continue to grow serving more and more students each year. wcs’s primary mission is to demonstrate what is possible when school and community collaborate to create inspiring adults with the academic and social-emotional readiness to lead as global citizens.  westlake charter school students are explorers! our curriculum focuses on diversity and appreciation of different cultures, while promoting academic excellence and foreign language acquisition. we offer various specialty classes to all of our students including art, physical education, and spanish. our middle school program focuses on math, science and technology and is executed through the use of technology. westlake charter school has small class sizes of approximately 22-24 students in k-3rd grade and 29 students in 4th-8th grade.  first time visitors to westlake charter school often re