## Creating schools.csv

1. Install packages
2. Create cities.csv with full state name/ city column to use in getting school information
3. For persisitance creating a schools csv using selenium to get school information from greatschools.org
4. Clean csv for use in schools endpoint

### 1. Import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import os
import requests
import csv
import json
import pandas as pd
from state_abbr import us_state_abbrev as abbr
from selenium import webdriver
import urllib.parse
import re
import html5lib



### 2.  Create cities.csv with full state name/ city column to use in getting school information

In [None]:
# pwd = os.getcwd()

# # create city state list
# cities = pd.read_excel('notebooks/datasets/data/schools/csv/List of Cities.xlsx')

# # just get the second and third colun
# cities = cities[['Unnamed: 1','Unnamed: 2']]

# # create new dictionary with reversed key, value pairs
# full = dict(map(reversed, abbr.items()))

# # map state abbreviations to full name
# cities['states'] = cities['Unnamed: 2'].map(full)

# # making sure state/city combo conform to url format of "-" for " "
# cities['states'] = cities['states'].str.strip()
# cities['states'] = cities['states'].str.replace(" ", "-")
# cities['Unnamed: 1'] = cities['Unnamed: 1'].str.replace(" ", "-")

# # remove extraneous header rows
# cities = cities.iloc[2:]
# cities['city'] = (cities['states'] + '/'+ cities['Unnamed: 1']).str.lower()
# print(cities.head())

# # persist by creating new csv
# cities.to_csv('notebooks/datasets/data/schools/csv/cities.csv')

### 3. For persisitance creating a schools csv using selenium and Beautiful Soup to get school information from greatschools.org

In [11]:
# Capturing the information for the remaining table separately
# Looping through each city in the file
cities = pd.read_csv('csv/cities.csv')

# selenium driver
driver = webdriver.Chrome()

# url for greatschools pre_url and post_url (with state/city inbetween)
url_pre = 'http://www.greatschools.org/'

for i in cities['city']:
  fetching = True

  records = []

  page = 0

  while fetching:  
        page += 1
        url = url_pre +  urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)       
        print("Fetching ", url)

        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # check if last page
        page_status = soup.find('div', {'class': 'pagination-summary'})
        page_status_text = page_status.text.strip()
        print(page_status_text)
        page_status_regex = re.search(r".* (\d+) to (\d+) of (\d+)", page_status_text)
        beginning, ending, total = page_status_regex.groups()
        if int(ending) >= int(total):
            fetching = False
        table = soup.find("table", { "class" : "" })
        for row in table.find_all("tr"):
            cell = row.find_all("td")
            if len(cell) == 7:
                school = row.find('a', {'class':'name'}).text.strip()
                print(school)
                try:
                    score = row.find('div', {'class': 'circle-rating--small'}).text.strip()
                    print(score)
                except AttributeError:
                    score = '0/10'
                rating = row.find('div', {'class': 'scale'}).text.strip()
                print(rating)
                address = row.find('div', {'class': 'address'}).text.strip()
                school_type = cell[1].find(text=True)
                grade =  cell[2].find(text=True)
                print(grade)
                students =  cell[3].find(text=True)
                print(students)
                student_teacher_ratio =  cell[4].find(text=True)
                try: 
                    district =  cell[6].find(text=True)
                except AttributeError:
                    district = 'Unavailable'

            records.append({ 
                'School': school, 
                'Score': score, 
                'Rating': rating, 
                'Address': address, 
                'Type': school_type,
                'Grades' : grade,
                'Total Students Enrolled': students,
                'Students per teacher' : student_teacher_ratio, 
                'District': district
                })

driver.close()

Fetching  http://www.greatschools.org/pennsylvania/bethlehem/schools/?page=1&tableView=Overview&view=table
Showing 1 to 25 of 111 schools found in Bethlehem, PA
Hanover El School
8/10
Above average
K-5
244
Calypso El School
7/10
Above average
PK-5
243
Lehigh Valley Charter High School for the Arts
7/10
Above average
9-12
627
Thomas Jefferson El School
6/10
Average
K-5
238
Lehigh Valley Academy Regional Cs
6/10
Average
K-12
1719
Lincoln El School
5/10
Average
K-5
339
Clearview El School
5/10
Average
K-5
353
Miller Heights El School
5/10
Average
K-5
419
Northeast Middle School
5/10
Average
6-8
786
Asa Packer El School
5/10
Average
K-5
338
James Buchanan El School
4/10
Below average
K-5
290
Governor Wolf El School
4/10
Below average
K-5
447
Marvine El School
4/10
Below average
PK-5
323
Nitschmann Middle School
4/10
Below average
6-8
808
Freedom High School
4/10
Below average
9-12
1926
Lehigh Valley Dual Language Charter School
4/10
Below average
K-8
459
Spring Garden El School
3/10
Below 

In [12]:
data = pd.DataFrame.from_dict(records)

In [15]:
print(data.shape)
data.head()

(116, 9)


Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District
0,Bethlehem Christian Pre-School,2/10,Below average,"1210 East 4th Street, Bethlehem, PA, 18015",Public district,PK-5,513,13:1,Bethlehem Area School District
1,Hanover El School,8/10,Above average,"3890 Jacksonville Road, Bethlehem, PA, 18017",Public district,K-5,244,15:1,Bethlehem Area School District
2,Calypso El School,7/10,Above average,"1021 Calypso Avenue, Bethlehem, PA, 18018",Public district,PK-5,243,13:1,Bethlehem Area School District
3,Lehigh Valley Charter High School for the Arts,7/10,Above average,"321 East 3rd Street, Bethlehem, PA, 18018",Public charter,9-12,627,8:1,Lehigh Valley Charter High School For The Arts
4,Thomas Jefferson El School,6/10,Average,"404 East North Street, Bethlehem, PA, 18018",Public district,K-5,238,13:1,Bethlehem Area School District


In [16]:
data.to_csv('df.csv')

In [None]:
print(df.shape)
df.head()