## Creating schools.csv

1. Install packages
2. Create cities.csv with full state name/ city column to use in getting school information
3. For persisitance creating a schools csv using selenium to get school information from greatschools.org
4. Clean csv for use in schools endpoint

### 1. Import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import os
import requests
import csv
import json
import pandas as pd
from state_abbr import us_state_abbrev as abbr
from selenium import webdriver
import urllib.parse
import re
import html5lib



### 2.  Create cities.csv with full state name/ city column to use in getting school information

In [2]:
# pwd = os.getcwd()

# # create city state list
# cities = pd.read_excel('notebooks/datasets/data/schools/csv/List of Cities.xlsx')

# # just get the second and third colun
# cities = cities[['Unnamed: 1','Unnamed: 2']]

# # create new dictionary with reversed key, value pairs
# full = dict(map(reversed, abbr.items()))

# # map state abbreviations to full name
# cities['states'] = cities['Unnamed: 2'].map(full)

# # making sure state/city combo conform to url format of "-" for " "
# cities['states'] = cities['states'].str.strip()
# cities['states'] = cities['states'].str.replace(" ", "-")
# cities['Unnamed: 1'] = cities['Unnamed: 1'].str.replace(" ", "-")

# # remove extraneous header rows
# cities = cities.iloc[2:]
# cities['city'] = (cities['states'] + '/'+ cities['Unnamed: 1']).str.lower()
# print(cities.head())

# # persist by creating new csv
# cities.to_csv('notebooks/datasets/data/schools/csv/cities.csv')

### 3. For persisitance creating a schools csv using selenium and Beautiful Soup to get school information from greatschools.org

In [3]:
# Looping through each city in the file
# cities = pd.read_csv('csv/cities.csv')

# These cities were not fully scraped, returned a truncated list
cities = ['illinois/chicago', 'texas/houston', 'california/los-angeles', 'florida/miami', 'new-york/new-york', 'texas/san-antonio']

records = []
total_schools = []

# selenium driver
driver = webdriver.Chrome()

# url for greatschools pre_url and post_url (with state/city inbetween)
url_pre = 'http://www.greatschools.org/'

for i in cities['city']:
  fetching = True

  page = 0

  while fetching:  
        page += 1
        url = url_pre +  urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)       
        print("Fetching ", url)

        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # check if last page
        page_status = soup.find('div', {'class': 'pagination-summary'})
        page_status_text = page_status.text.strip()
        print(page_status_text)
        page_status_regex = re.search(r".* (\d+) to (\d+) of (\d+)", page_status_text)
        beginning, ending, total = page_status_regex.groups()
        total_schools.append(total)
        if int(ending) >= int(total):
            fetching = False
            
        table = soup.find("table", { "class" : "" })
        for row in table.find_all("tr"):
            cell = row.find_all("td")
            if len(cell) == 7:
                school = row.find('a', {'class':'name'}).text.strip()
                try:
                    score = row.find('div', {'class': 'circle-rating--small'}).text.strip()
                except AttributeError:
                    score = '0/10'
                rating = row.find('div', {'class': 'scale'}).text.strip()
                try:
                    address = row.find('div', {'class': 'address'}).text.strip()
                except AttributeError:
                    address = "Unavailable"
                school_type = cell[1].find(text=True)
                grade =  cell[2].find(text=True)
                students =  cell[3].find(text=True)
                student_teacher_ratio =  cell[4].find(text=True)
                try: 
                    district =  cell[6].find(text=True)
                except AttributeError:
                    district = 'Unavailable'

                records.append({ 
                    'School': school, 
                    'Score': score, 
                    'Rating': rating, 
                    'Address': address, 
                    'Type': school_type,
                    'Grades' : grade,
                    'Total Students Enrolled': students,
                    'Students per teacher' : student_teacher_ratio, 
                    'District': district
                    })

driver.close()

Fetching  http://www.greatschools.org/ohio/akron/schools/?page=1&tableView=Overview&view=table
Showing 1 to 25 of 208 schools found in Akron, OH
Fetching  http://www.greatschools.org/ohio/akron/schools/?page=2&tableView=Overview&view=table
Showing 26 to 50 of 208 schools found in Akron, OH
Fetching  http://www.greatschools.org/ohio/akron/schools/?page=3&tableView=Overview&view=table
Showing 51 to 75 of 208 schools found in Akron, OH
Fetching  http://www.greatschools.org/ohio/akron/schools/?page=4&tableView=Overview&view=table
Showing 76 to 100 of 208 schools found in Akron, OH
Fetching  http://www.greatschools.org/ohio/akron/schools/?page=5&tableView=Overview&view=table
Showing 101 to 125 of 208 schools found in Akron, OH
Fetching  http://www.greatschools.org/ohio/akron/schools/?page=6&tableView=Overview&view=table
Showing 126 to 150 of 208 schools found in Akron, OH
Fetching  http://www.greatschools.org/ohio/akron/schools/?page=7&tableView=Overview&view=table
Showing 151 to 175 of 208

In [4]:
df = pd.DataFrame.from_dict(records)

In [5]:
print(df.shape)
df.head()

(58782, 9)


Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District
0,Akron Early College High School,10/10,Top rated,"225 South Main Street, Akron, OH, 44308",Public district,9-12,384,34:1,Akron City School District
1,Revere Middle School,9/10,Above average,"3195 Spring Valley Road, Akron, OH, 44333",Public district,6-8,624,13:1,Revere Local School District
2,Arrowhead Primary Elementary School,8/10,Above average,"1600 Raleigh Boulevard, Akron, OH, 44321",Public district,K-4,345,20:1,Copley-Fairlawn City School District
3,Manchester Middle School,8/10,Above average,"760 West Nimisila Road, Akron, OH, 44319",Public district,5-8,387,16:1,Manchester Local School District
4,Nolley Elementary School,8/10,Above average,"6285 Renninger Rd, Akron, OH, 44319",Public district,K-4,483,17:1,Manchester Local School District


In [6]:
df.to_csv('files/schools.csv')

In [7]:
df = pd.read_csv('files/schools.csv')
print(df.shape)
df.head()

(58782, 10)


Unnamed: 0.1,Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District
0,0,Akron Early College High School,10/10,Top rated,"225 South Main Street, Akron, OH, 44308",Public district,9-12,384.0,34:1,Akron City School District
1,1,Revere Middle School,9/10,Above average,"3195 Spring Valley Road, Akron, OH, 44333",Public district,6-8,624.0,13:1,Revere Local School District
2,2,Arrowhead Primary Elementary School,8/10,Above average,"1600 Raleigh Boulevard, Akron, OH, 44321",Public district,K-4,345.0,20:1,Copley-Fairlawn City School District
3,3,Manchester Middle School,8/10,Above average,"760 West Nimisila Road, Akron, OH, 44319",Public district,5-8,387.0,16:1,Manchester Local School District
4,4,Nolley Elementary School,8/10,Above average,"6285 Renninger Rd, Akron, OH, 44319",Public district,K-4,483.0,17:1,Manchester Local School District


### Creating new csv
- for cities that were truncated during scraping
- retrieved first 25 rather than all the records

In [1]:
from bs4 import BeautifulSoup
import os
import requests
import csv
import json
import pandas as pd
from files.state_abbr import us_state_abbrev as abbr
from selenium import webdriver
import urllib.parse
import re
import html5lib



In [2]:
# Looping through each city in the file

# These cities were not fully scraped, returned a truncated list
cities = ['illinois/chicago', 'texas/houston', 'california/los-angeles', 'florida/miami', 'new-york/new-york', 'texas/san-antonio']

records = []
total_schools = []

# selenium driver
driver = webdriver.Chrome()

# url for greatschools pre_url and post_url (with state/city inbetween)
url_pre = 'http://www.greatschools.org/'

for i in cities:
  fetching = True

  page = 0

  while fetching:  
        page += 1
        url = url_pre +  urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)       
        print("Fetching ", url)

        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # check if last page
        page_status = soup.find('div', {'class': 'pagination-summary'})
        print(page_status.text.strip())
        page_status_list = page_status.text.strip().split()
        ending = (page_status_list[3]).replace(',', '')
        total = (page_status_list[5]).replace(',' , '')     

        total_schools.append(total)
        if int(ending) >= int(total):

            fetching = False
            
        table = soup.find("table", { "class" : "" })
        for row in table.find_all("tr"):
            cell = row.find_all("td")
            if len(cell) == 7:
                school = row.find('a', {'class':'name'}).text.strip()
                try:
                    score = row.find('div', {'class': 'circle-rating--small'}).text.strip()
                except AttributeError:
                    score = '0/10'
                rating = row.find('div', {'class': 'scale'}).text.strip()
                try:
                    address = row.find('div', {'class': 'address'}).text.strip()
                except AttributeError:
                    address = "Unavailable"
                school_type = cell[1].find(text=True)
                grade =  cell[2].find(text=True)
                students =  cell[3].find(text=True)
                student_teacher_ratio =  cell[4].find(text=True)
                try: 
                    district =  cell[6].find(text=True)
                except AttributeError:
                    district = 'Unavailable'

                records.append({ 
                    'School': school, 
                    'Score': score, 
                    'Rating': rating, 
                    'Address': address, 
                    'Type': school_type,
                    'Grades' : grade,
                    'Total Students Enrolled': students,
                    'Students per teacher' : student_teacher_ratio, 
                    'District': district
                    })

driver.close()

Fetching  http://www.greatschools.org/illinois/chicago/schools/?page=1&tableView=Overview&view=table
Showing 1 to 25 of 1,850 schools found in Chicago, IL
Fetching  http://www.greatschools.org/illinois/chicago/schools/?page=2&tableView=Overview&view=table
Showing 26 to 50 of 1,850 schools found in Chicago, IL
Fetching  http://www.greatschools.org/illinois/chicago/schools/?page=3&tableView=Overview&view=table
Showing 51 to 75 of 1,850 schools found in Chicago, IL
Fetching  http://www.greatschools.org/illinois/chicago/schools/?page=4&tableView=Overview&view=table
Showing 76 to 100 of 1,850 schools found in Chicago, IL
Fetching  http://www.greatschools.org/illinois/chicago/schools/?page=5&tableView=Overview&view=table
Showing 101 to 125 of 1,850 schools found in Chicago, IL
Fetching  http://www.greatschools.org/illinois/chicago/schools/?page=6&tableView=Overview&view=table
Showing 126 to 150 of 1,850 schools found in Chicago, IL
Fetching  http://www.greatschools.org/illinois/chicago/schoo

In [3]:
df_missing = pd.DataFrame.from_dict(records)

In [5]:
df_missing.to_csv('files/missing_schools.csv')
print(df_missing.shape)
df_missing.head()

(8941, 9)


Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District
0,Jones College Prep High School,10/10,Top rated,"700 South State Street, Chicago, IL, 60605",Public district,9-12,1861,17:1,City Of Chicago School District 299
1,Northside College Preparatory High School,10/10,Top rated,"5501 North Kedzie Avenue, Chicago, IL, 60625",Public district,9-12,1061,16:1,City Of Chicago School District 299
2,Payton College Preparatory High School,10/10,Top rated,"1034 North Wells Street, Chicago, IL, 60610",Public district,9-12,1073,15:1,City Of Chicago School District 299
3,Devry Advantage Academy High School,10/10,Top rated,"3300 North Campbell Avenue, Chicago, IL, 60618",Public district,11-12,193,18:1,City Of Chicago School District 299
4,Westinghouse High School,9/10,Above average,"3223 West Franklin Boulevard, Chicago, IL, 60624",Public district,9-12,1192,16:1,City Of Chicago School District 299
