# Testing Scrapy Cluster data

In [1]:
# ## Initializing

# Import all libraries used by the notebook and create the shared stemmer instance
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in CSV files
from glob import glob,iglob # for finding files within nested folders--compare with os.walk
import json, pickle, csv # For saving and loading dictionaries, DataFrames, lists, etc. in JSON, pickle, and CSV formats (csv is already imported above)
from math import log10 # For calculating logarithms of dictionary counts
from datetime import datetime # For timestamping files
import time #, timeout_decorator # To prevent troublesome files from bottlenecking the parsing process, use timeouts
import sys # For working with user input
import logging # for logging output, to help with troubleshooting
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer() # single module-level stemmer instance
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import pandas as pd # modifies data more efficiently than with a list of dicts
from tqdm import tqdm # For progress information over iterations, including with Pandas operations via "progress_apply"

In [2]:
# ### Set script options

# True -> extra progress reports while algorithms run
Debug = False
# True -> files are accessed from within a Jupyter notebook; False -> from the shell (selects file paths)
notebook = True
# True -> load an existing dicts_list from file to add to (confirms with user input first)
usefile = False
# True -> working from the office PC
workstation = False

# Prompting the user for an input file is only useful on the command line,
# so the flag is always switched off inside a notebook.
usefile = usefile and not notebook

# Inline HTML tags; this list helps with eliminating junk tags when parsing HTML
inline_tags = (
    "b big i small tt abbr acronym cite dfn "
    "em kbd strong samp var bdo map object q "
    "span sub sup"
).split()

In [3]:
# ### Set directories

# Pick the root folder for this environment. Every prefix ends with a path
# separator because the paths below are built by plain string concatenation.
if workstation and notebook:
    dir_prefix = "C:\\Users\\Jaren\\Documents\\" # One level further down than the others
elif notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"

save_dir = dir_prefix + "Charter-school-identities" + os.sep + "data" + os.sep # Directory in which to save data files
dicts_dir = dir_prefix + "Charter-school-identities" + os.sep + "dicts" + os.sep # Directory in which to find & save dictionary files
temp_dir = save_dir + "temp" + os.sep # Directory in which to save temporary data files

scfile = dir_prefix + "scrapy_cluster_data" + os.sep + "processed_df.pkl"
micro_sample13 = save_dir + "micro-sample13_coded.csv" # Random micro-sample of 300 US charter schools
URL_schooldata = save_dir + "charter_URLs_2014.csv" # 2014 population of 6,973 US charter schools
full_schooldata = save_dir + "charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
temp_data = save_dir + "school_parser_temp.json" # Full_schooldata dict with output for some schools
example_file = save_dir + "example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

# Only when run from the shell off the workstation: point at the wget downloads
# and override example_file with the copy kept next to them.
if not workstation and not notebook:
    wget_dataloc = dir_prefix + "wget/parll_wget/" #data location for schools downloaded with wget in parallel (requires server access)
    example_folder = wget_dataloc + example_schoolname + "/" # Random charter school folder
    example_file = dir_prefix + "wget/example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

data_year = 2014

# Set logging options
# logging.basicConfig opens the log file immediately, so the folder must exist first.
os.makedirs(temp_dir, exist_ok=True)
# Build the timestamp without spaces or colons: str(datetime.today()) yields
# e.g. "2018-05-01 12:34:56.789", and ":" is not allowed in Windows file names.
log_stamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
log_file = temp_dir + "data_prep_" + log_stamp + ".log"
logging.basicConfig(filename=log_file, level=logging.INFO)

In [4]:
# Load the DataFrame stored in the pickle file at `scfile` into `schooldf`.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if it comes from a trusted source.
schooldf = pd.read_pickle(scfile)

In [33]:
# Print the "data" entries of every row whose URL equals the address below
target_url = "http://www.artsacademyatscottsdale.com/"
url_matches = schooldf["URL"] == target_url
print(list(schooldf.loc[url_matches, "data"]))

[[('http://www.artsacademyatscottsdale.com/', 'False', '0', "Welcome!\nABOUT US \nOur Philosophy\nWHY CHOOSE US?\nYou Can Help\nSTUDENT ART GALLERY\nPHOTO GALLERY\nBoard & Financials\nPARTNER WITH US \nPARTNERSHIP OPPORTUNITIES\nPRESCHOOL \nABOUT OUR PRESCHOOL\nPreschool Forms\nREGISTRATION \nNEW STUDENT REGISTRATION\nRETURNING STUDENTS\nDOCUMENTS\nPARENT TOOLS \nFORMS & DOCUMENTS\nSCHOOL CALENDAR\nREPETTO'S REPORT\nCLASSROOM NEWSLETTERS\nWelcome!\nABOUT US\nOur Philosophy\nWHY CHOOSE US?\nYou Can Help\nSTUDENT ART GALLERY\nPHOTO GALLERY\nBoard & Financials\nPARTNER WITH US\nPARTNERSHIP OPPORTUNITIES\nPRESCHOOL\nABOUT OUR PRESCHOOL\nPreschool Forms\nREGISTRATION\nNEW STUDENT REGISTRATION\nRETURNING STUDENTS\nDOCUMENTS\nPARENT TOOLS\nFORMS & DOCUMENTS\nSCHOOL CALENDAR\nREPETTO'S REPORT\nCLASSROOM NEWSLETTERS\n Search\n2018 AASC Bobcat BASH!\nAnnual Live Auction\nApril 28, 2018 6pm-9pm at Goldie's Sports Cafe\nJoin us for for an evening of fun, friendship and fundraising to benefit our s

In [24]:
# Print every value of the URL column as a plain Python list
print(list(schooldf["URL"]))

['http://ayaprun.lksd.org/', 'http://www.tongassschool.org/', 'https://www.kgbsd.org/ketchikancharter', 'https://paideia.asdk12.org/', 'http://anccs.asdk12.org/', 'https://rilkeschule.asdk12.org/', 'http://highlandacademy.asdk12.org/', 'http://www.frontiercs.org/', 'http://www.winterberrycharterschool.com/', 'https://familypartnership.asdk12.org/', 'https://aquarian.asdk12.org/', 'http://www.asdk12.org/aboutschools/eagleacademy/', 'http://juneaucharterschool.org/', 'http://kaleidoscope.blogs.kpbsd.k12.ak.us/', 'http://soldotnaelementary.blogs.kpbsd.k12.ak.us/', 'http://fireweedacademy.blogs.kpbsd.k12.ak.us/', 'http://auroraborealis.blogs.kpbsd.k12.ak.us/', 'http://www.birchtreecharterschool.org/', 'https://www.matsuk12.us/frc', 'http://twindlybridge.us/', 'http://www.matsuk12.us/amc', 'http://www.midnightsunschool.com/', 'https://www.matsuk12.us/acc', 'http://nome.nosd.schoolaccess.net/~acsa/', 'https://www.k12northstar.org/Page/78', 'http://ekc.k12northstar.org/', 'https://www.k12nort

In [25]:
# Display the SCH_NAME_x column (the last expression is rendered as the cell output)
schooldf.loc[:, "SCH_NAME_x"]

0                                     Ayaprun Elitnaurvik
1       Tongass School of Arts and Sciences Charter Sc...
2                                Ketchikan Charter School
3                              PAIDEIA Cooperative School
4                   Alaska Native Cultural Charter School
5                             Rilke Schule Charter School
6                       Highland Tech High Charter School
7                                 Frontier Charter School
8                                      Winterberry School
9                       Family Partnership Charter School
10                                Aquarian Charter School
11                           Eagle Academy Charter School
12                        Juneau Community Charter School
13                  Kaleidoscope School of Arts & Science
14                     Soldotna Montessori Charter School
15                                       Fireweed Academy
16                         Aurora Borealis Charter School
17            

In [29]:
# Print all column labels of schooldf as a plain Python list
print(list(schooldf.columns))

['Unnamed: 0', 'CHARTAUTHN2', 'MSTREET2', 'AE', 'MSTATE', 'LSTREET114', 'ASALF', 'TOTFRL', 'BIES', 'SCH_TYPE_TEXT', 'SCH_TYPE', 'CHARTAUTH1', 'CHARTR00', 'CHARTAUTHN1', 'SCHNAM13', 'ST_SCHID', 'MSTREET1', 'NOTES', 'ALL_MTH00NUMVALID_1415', 'TOTAL', 'CONUM', 'NCESSCH', 'UNION', 'LAT1516', 'STATUS98', 'OPSTFIPS', 'CBSA', 'LZIP', 'WHALM', 'LEVEL', 'SY_STATUS', 'CD15', 'CNTY15', 'WHALF', 'CHARTR03', 'STATUS12', 'NSLPSTATUS_TEXT', 'ST_LEAID', 'BL', 'CHARTR07', 'MCITY', 'MSTREET3', 'NECTA', 'STFIP15', 'VIRTUAL', 'ISFTEPUP', 'ASALM', 'LSTREET214', 'CONFIRMED_CLOSED', 'ADDRESS14', 'NOGRADES', 'NECTA15', 'CHARTR11', 'CBSA15', 'GSHI', 'OUT_OF_STATE_FLAG', 'SY_STATUS16', 'STABR_x', 'EFFECTIVE_DATE', 'CHARTR13', 'ALL_MTH00PCTPROF_1415', 'LZIP14', 'ISFLE', 'CHARTR01', 'STATUS09', 'AMALF', 'RECON_STATUS15', 'HI', 'CHARTR02', 'CHARTR10', 'MZIP', 'LCITY', 'CHARTER_TEXT16', 'LSTREET314', 'SCHNAM98', 'UGOFFERED', 'SCHNAM09', 'LSTREE', 'UPDATED_STATUS_TEXT', 'AMALM', 'MANUAL_URL', 'SLDU15', 'CHARTR05', '

In [34]:
# Print a hard-coded page-text string so that its "\n" escapes render as real line breaks.
# The text mentions PLC Arts Academy at Scottsdale; presumably pasted from scraped page text -- TODO confirm source.
print("Welcome!\nABOUT US \nOur Philosophy\nWHY CHOOSE US?\nYou Can Help\nSTUDENT ART GALLERY\nPHOTO GALLERY\nBoard & Financials\nPARTNER WITH US \nPARTNERSHIP OPPORTUNITIES\nPRESCHOOL \nABOUT OUR PRESCHOOL\nPreschool Forms\nREGISTRATION \nNEW STUDENT REGISTRATION\nRETURNING STUDENTS\nDOCUMENTS\nPARENT TOOLS \nFORMS & DOCUMENTS\nSCHOOL CALENDAR\nREPETTO'S REPORT\nCLASSROOM NEWSLETTERS\nWelcome!\nABOUT US\nOur Philosophy\nWHY CHOOSE US?\nYou Can Help\nSTUDENT ART GALLERY\nPHOTO GALLERY\nBoard & Financials\nPARTNER WITH US\nPARTNERSHIP OPPORTUNITIES\nPRESCHOOL\nABOUT OUR PRESCHOOL\nPreschool Forms\nREGISTRATION\nNEW STUDENT REGISTRATION\nRETURNING STUDENTS\nDOCUMENTS\nPARENT TOOLS\nFORMS & DOCUMENTS\nSCHOOL CALENDAR\nREPETTO'S REPORT\nCLASSROOM NEWSLETTERS\n Search\nOUR PHILOSOPHY\nOUR VISION\nTo instill a desire in students to become life-long learners empowered with the ability to have a positive effect on the global community.\nOUR MISSION\nTo establish a Student-Family-Teacher-Civic relationship that develops and sustains a Professional Learning Community where arts and technology combine with academics and real life skills to educate and enrich the Whole Individual.\nOUR VALUES\nPLC Arts Academy at Scottsdale is committed to providing a learning environment rich in the arts, where students achieve academic and social excellence while solving real-life problems in a cooperative manner.\nWe Strive to teach all children the importance, value and worth of:\nCare for Self: Integrity - Honesty - Respect\nCare for Others: Kindness - Dependability\nCare for Community: Civic Responsibility\nOPERATING PRINCIPLES\nWe believe that every student has the ability and the right to learn. 
We believe that learning takes place best when educators, students, and parents share a common set of educational values and goals.\nWe believe and are committed to the application of technology as a tool in every classroom.\nWe believe our academic and arts programs will develop the inner disciplines and individual responsibilities needed to live productive lives.\nWe believe the self-esteem and individuality of a student will be developed as their artistic gifts are fine-tuned through various arts programs.\nWe believe that positive interactions and positive feedback from staff to students is the best way to encourage appropriate and desirable behavior and for promoting the development of good habits.\nContact:\naasc@\nplccharterschools.org\n6140 E. Thunderbird Rd.\nScottsdale, AZ 85254\nO: (480) 951-3190\nF: (480) 998-4029\nMrs. Carolyn Repetto, Principal\nMrs. Kim Steele, CEO & Preschool\nHours of Operation:\nOffice: 7:30AM – 4:00PM, M-F Closed on major holidays\nSchool: 8:00AM – 3:00PM\nClubhouse: 2:20PM – 6:00PM\nSummer,Fall & Winter Break \nOffice Hours: \n9:00AM-2:00PM, M-F \nClosed on major holidays\n.\n©\xa02014 PLC Arts Academy at Scottsdale. All Rights Reserved.\xa0\nCall Us\nFind Us\nEmail Us\nLike Us")

Welcome!
ABOUT US 
Our Philosophy
WHY CHOOSE US?
You Can Help
STUDENT ART GALLERY
PHOTO GALLERY
Board & Financials
PARTNER WITH US 
PARTNERSHIP OPPORTUNITIES
PRESCHOOL 
ABOUT OUR PRESCHOOL
Preschool Forms
REGISTRATION 
NEW STUDENT REGISTRATION
RETURNING STUDENTS
DOCUMENTS
PARENT TOOLS 
FORMS & DOCUMENTS
SCHOOL CALENDAR
REPETTO'S REPORT
CLASSROOM NEWSLETTERS
Welcome!
ABOUT US
Our Philosophy
WHY CHOOSE US?
You Can Help
STUDENT ART GALLERY
PHOTO GALLERY
Board & Financials
PARTNER WITH US
PARTNERSHIP OPPORTUNITIES
PRESCHOOL
ABOUT OUR PRESCHOOL
Preschool Forms
REGISTRATION
NEW STUDENT REGISTRATION
RETURNING STUDENTS
DOCUMENTS
PARENT TOOLS
FORMS & DOCUMENTS
SCHOOL CALENDAR
REPETTO'S REPORT
CLASSROOM NEWSLETTERS
 Search
OUR PHILOSOPHY
OUR VISION
To instill a desire in students to become life-long learners empowered with the ability to have a positive effect on the global community.
OUR MISSION
To establish a Student-Family-Teacher-Civic relationship that develops and sustains a Professional L

In [1]:
# Print a hard-coded string so that its tab, newline and \ufeff escapes render as actual whitespace/characters.
# The text mentions Stony Point Academy; presumably pasted from scraped page text -- TODO confirm source.
print("\t\t\t\tHome\t\t\t\n\t\t\t\tOur School\t\t\t\n\t\t\tMission and Vision\t\t\t\n\t\t\tBell Schedule\t\t\t\n\t\t\tMeet our Staff\t\t\t\n\t\t\tSARC\t\t\t\n\t\t\tFood Services\t\t\t\n\t\t\tInstructional Programs\t\t\t\n\t\t\tEmergency Information\t\t\t\n\t\t\tWASC\t\t\t\n\t\t\t\tCounseling\t\t\t\n\t\t\tCollege & Career Readiness\t\t\t\n\t\t\t\tParents\t\t\t\n\t\t\tCommunity Service\t\t\t\n\t\t\tStudent Handbook\t\t\t\n\t\t\tNewsletter\t\t\n>\t\n\t\t\tPast Newsletters\t\t\t\n\t\t\t\tHome Study\t\t\t\n\t\t\t\tContact Us\t\t\t\n\t\t\tMap\t\t\t\n\t\t\t\tMiddle School & High School\t\t\t\n\ufeff\nMission and Vision\n\ufeff\nStony Point Academy is committed to provide a rigorous preparatory program that ensures all students are ready for successful post-secondary pathway. This includes an academically rich curriculum with rigorous content. It includes an expectation that students will apply their knowledge through higher-order skills, and will develop the habits of mind and character traits known to support personal standards within a student-centered environment that emphasizes the Common Core Anchor Standards for College and Career Readiness (CCR) across all content areas at every grade level. 
It is with this preparation that students will become contributing members of the local and global communities.\nStony Point Academy\nBuilding Future Leaders\nJob Opportunities\nContact us\nWebsite Issues\n        \t\t\t\tHome        \t\t\t\n        \t\t\t\tOur School        \t\t\t\n\t\t\tMission and Vision\t\t\t\n\t\t\tBell Schedule\t\t\t\n\t\t\tMeet our Staff\t\t\t\n\t\t\tSARC\t\t\t\n\t\t\tFood Services\t\t\t\n\t\t\tInstructional Programs\t\t\t\n\t\t\tEmergency Information\t\t\t\n\t\t\tWASC\t\t\t\n        \t\t\t\tCounseling        \t\t\t\n\t\t\tCollege & Career Readiness\t\t\t\n        \t\t\t\tParents        \t\t\t\n\t\t\tCommunity Service\t\t\t\n\t\t\tStudent Handbook\t\t\t\n\t\t\tNewsletter\t\t\n>\t\n\t\t\tPast Newsletters\t\t\t\n        \t\t\t\tHome Study        \t\t\t\n        \t\t\t\tContact Us        \t\t\t\n\t\t\tMap\t\t\t\n        \t\t\t\tMiddle School & High School        \t\t\t\n\u2715")

				Home			
				Our School			
			Mission and Vision			
			Bell Schedule			
			Meet our Staff			
			SARC			
			Food Services			
			Instructional Programs			
			Emergency Information			
			WASC			
				Counseling			
			College & Career Readiness			
				Parents			
			Community Service			
			Student Handbook			
			Newsletter		
>	
			Past Newsletters			
				Home Study			
				Contact Us			
			Map			
				Middle School & High School			
﻿
Mission and Vision
﻿
Stony Point Academy is committed to provide a rigorous preparatory program that ensures all students are ready for successful post-secondary pathway. This includes an academically rich curriculum with rigorous content. It includes an expectation that students will apply their knowledge through higher-order skills, and will develop the habits of mind and character traits known to support personal standards within a student-centered environment that emphasizes the Common Core Anchor Standards for College and Career Readiness (CCR) across all 