## Creating schools.csv

1. Install packages
2. Create cities.csv with full state name/ city column to use in getting school information
3. For persisitance creating a schools csv using selenium to get school information from greatschools.org
4. Clean csv for use in schools endpoint

### 1. Import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import os
import requests
import csv
import json
import pandas as pd
from state_abbr import us_state_abbrev as abbr
from selenium import webdriver
import urllib.parse
import re
import html5lib



### 2.  Create cities.csv with full state name/ city column to use in getting school information

In [2]:
# pwd = os.getcwd()

# # create city state list
# cities = pd.read_excel('notebooks/datasets/data/schools/csv/List of Cities.xlsx')

# # just get the second and third colun
# cities = cities[['Unnamed: 1','Unnamed: 2']]

# # create new dictionary with reversed key, value pairs
# full = dict(map(reversed, abbr.items()))

# # map state abbreviations to full name
# cities['states'] = cities['Unnamed: 2'].map(full)

# # making sure state/city combo conform to url format of "-" for " "
# cities['states'] = cities['states'].str.strip()
# cities['states'] = cities['states'].str.replace(" ", "-")
# cities['Unnamed: 1'] = cities['Unnamed: 1'].str.replace(" ", "-")

# # remove extraneous header rows
# cities = cities.iloc[2:]
# cities['city'] = (cities['states'] + '/'+ cities['Unnamed: 1']).str.lower()
# print(cities.head())

# # persist by creating new csv
# cities.to_csv('notebooks/datasets/data/schools/csv/cities.csv')

### 3. For persisitance creating a schools csv using selenium and Beautiful Soup to get school information from greatschools.org

In [3]:
# Looping through each city in the file
cities = pd.read_csv('csv/cities.csv')

# selenium driver
driver = webdriver.Chrome()

# url for greatschools pre_url and post_url (with state/city inbetween)
url_pre = 'http://www.greatschools.org/'

for i in cities['city']:
  fetching = True

  records = []

  page = 0

  while fetching:  
        page += 1
        url = url_pre +  urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)       
        print("Fetching ", url)

        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # check if last page
        page_status = soup.find('div', {'class': 'pagination-summary'})
        page_status_text = page_status.text.strip()
        print(page_status_text)
        page_status_regex = re.search(r".* (\d+) to (\d+) of (\d+)", page_status_text)
        beginning, ending, total = page_status_regex.groups()
        if int(ending) >= int(total):
            fetching = False
        table = soup.find("table", { "class" : "" })
        for row in table.find_all("tr"):
            cell = row.find_all("td")
            if len(cell) == 7:
                school = row.find('a', {'class':'name'}).text.strip()
                try:
                    score = row.find('div', {'class': 'circle-rating--small'}).text.strip()
                except AttributeError:
                    score = '0/10'
                rating = row.find('div', {'class': 'scale'}).text.strip()
                try:
                    address = row.find('div', {'class': 'address'}).text.strip()
                except AttributeError:
                    address = "Unavailable"
                school_type = cell[1].find(text=True)
                grade =  cell[2].find(text=True)
                students =  cell[3].find(text=True)
                student_teacher_ratio =  cell[4].find(text=True)
                try: 
                    district =  cell[6].find(text=True)
                except AttributeError:
                    district = 'Unavailable'

            records.append({ 
                'School': school, 
                'Score': score, 
                'Rating': rating, 
                'Address': address, 
                'Type': school_type,
                'Grades' : grade,
                'Total Students Enrolled': students,
                'Students per teacher' : student_teacher_ratio, 
                'District': district
                })

driver.close()

ching  http://www.greatschools.org/michigan/warren/schools/?page=3&tableView=Overview&view=table
Showing 51 to 75 of 79 schools found in Warren, MI
Fetching  http://www.greatschools.org/michigan/warren/schools/?page=4&tableView=Overview&view=table
Showing 76 to 79 of 79 schools found in Warren, MI
Fetching  http://www.greatschools.org/ohio/warren/schools/?page=1&tableView=Overview&view=table
Showing 1 to 25 of 59 schools found in Warren, OH
Fetching  http://www.greatschools.org/ohio/warren/schools/?page=2&tableView=Overview&view=table
Showing 26 to 50 of 59 schools found in Warren, OH
Fetching  http://www.greatschools.org/ohio/warren/schools/?page=3&tableView=Overview&view=table
Showing 51 to 59 of 59 schools found in Warren, OH
Fetching  http://www.greatschools.org/pennsylvania/warren/schools/?page=1&tableView=Overview&view=table
Showing 1 to 15 of 15 schools found in Warren, PA
Fetching  http://www.greatschools.org/district-of-columbia/washington/schools/?page=1&tableView=Overview&vi

In [4]:
df = pd.DataFrame.from_dict(records)

In [5]:
print(df.shape)
df.head()

(109, 9)


Unnamed: 0,School,Score,Rating,Address,Type,Grades,Total Students Enrolled,Students per teacher,District
0,Desert View Academy,8/10,Above average,"2363 South Kennedy Lane, Yuma, AZ, 85365",Public charter,K-6,789,21:1,Juniper Tree Academy
1,Gowan Science Academy,8/10,Above average,"1590 South Ave C, Yuma, AZ, 85364",Public district,K-6,407,27:1,Crane Elementary District
2,Cibola High School,7/10,Above average,"4100 West 20th Street, Yuma, AZ, 85364",Public district,9-12,2488,26:1,Yuma Union High School District
3,Amerischools Academy North,7/10,Above average,"1220 South 4th Avenue, Yuma, AZ, 85364",Public charter,K-8,209,17:1,The Charter Foundation Inc.
4,Alice Byrne Elementary School,6/10,Average,"811 West 16th Street, Yuma, AZ, 85364",Public district,PK-5,345,22:1,Yuma Elementary District


In [None]:
df.to_csv('csv/schools.csv')

In [None]:
df = pd.read_csv('csv/schools.csv')
print(df.shape)
df.head()