## Creating schools.csv

1. Install packages
2. Create cities.csv with full state name/ city column to use in getting school information
3. For persistence, create a schools csv using Selenium to get school information from greatschools.org
4. Clean csv for use in schools endpoint

### 1. Import necessary libraries

In [None]:
from bs4 import BeautifulSoup
import os
import requests
import csv
import json
import pandas as pd
from state_abbr import us_state_abbrev as abbr
from selenium import webdriver
import urllib.parse
import re

### 2.  Create cities.csv with full state name/ city column to use in getting school information

In [None]:
# Build the "<state>/<city>" URL slugs used when requesting school listings,
# and persist them (together with the intermediate columns) to cities.csv.
pwd = os.getcwd()

# create city state list
cities = pd.read_excel('notebooks/datasets/data/schools/csv/List of Cities.xlsx')

# just get the second and third column; .copy() yields an independent frame so
# the column assignments below never write into a slice of the original
# (this is what triggers pandas' SettingWithCopyWarning)
cities = cities[['Unnamed: 1', 'Unnamed: 2']].copy()

# create new dictionary with reversed key, value pairs
# NOTE(review): presumably abbr maps full state name -> abbreviation, so
# `full` maps abbreviation -> full name; confirm against state_abbr
full = dict(map(reversed, abbr.items()))

# map state abbreviations to full name (values missing from `full` become NaN)
cities['states'] = cities['Unnamed: 2'].map(full)

# making sure state/city combo conform to url format of "-" for " "
cities['states'] = cities['states'].str.strip()
cities['states'] = cities['states'].str.replace(" ", "-")
cities['Unnamed: 1'] = cities['Unnamed: 1'].str.replace(" ", "-")

# remove extraneous header rows; copy again before adding the 'city' column
cities = cities.iloc[2:].copy()
cities['city'] = (cities['states'] + '/' + cities['Unnamed: 1']).str.lower()
print(cities.head())

# persist by creating new csv
cities.to_csv('notebooks/datasets/data/schools/csv/cities.csv')

### 3. For persistence, create a schools csv using Selenium and Beautiful Soup to get school information from greatschools.org

In [22]:
# Scrape the paginated school table on greatschools.org for every city slug in
# cities.csv and collect name, score, rating and address into `schools`.

# Call cities csv to get cities stored in database
cities = pd.read_csv('csv/cities.csv')

# Getting school information
School = []
Score = []
Rating = []
Address = []
Type = []
Grades = []
Enrollment = []
Student_Teacher = []
District = []

# base url for greatschools; the state/city slug and query string are appended
url_pre = 'http://www.greatschools.org/'

# Matches the "<first> to <last> of <total>" part of the pagination summary.
# Compiled once because it is reused for every page of every city.
page_status_pattern = re.compile(r".* (\d+) to (\d+) of (\d+)")

driver = webdriver.Chrome()
try:
    # Skip rows without a slug (NaN): urllib.parse.quote only accepts str/bytes.
    for i in cities['city'].dropna():
        fetching = True
        page = 0
        while fetching:
            page += 1
            url = url_pre + urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)
            print("Fetching ", url)

            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # check if last page
            page_status = soup.find('div', {'class': 'pagination-summary'})
            page_status_regex = None
            if page_status is not None:
                page_status_regex = page_status_pattern.search(page_status.text.strip())

            if page_status_regex is None:
                # No (parsable) pagination summary, so the number of pages is
                # unknown: keep whatever this page holds and stop paging this
                # city instead of aborting the whole scrape.
                fetching = False
            else:
                _, ending, total = page_status_regex.groups()
                if int(ending) >= int(total):
                    fetching = False

            # collect school information for the current page
            names = [tag.text.strip() for tag in soup.find_all('a', {'class': 'name'})]
            ratings = [tag.text.strip() for tag in soup.find_all('div', {'class': 'circle-rating--small'})]
            scores = [tag.text.strip() for tag in soup.find_all('div', {'class': 'scale'})]
            addresses = [tag.text.strip() for tag in soup.find_all('div', {'class': 'address'})]

            # The four lists are gathered independently and zipped together
            # below, so a page where one field is missing for some school
            # shifts every later row. Surface that instead of hiding it.
            if len({len(names), len(ratings), len(scores), len(addresses)}) > 1:
                print("Warning: field counts differ on", url,
                      "(names={}, scores={}, ratings={}, addresses={}); rows may be misaligned".format(
                          len(names), len(scores), len(ratings), len(addresses)))

            School.extend(names)
            Rating.extend(ratings)
            Score.extend(scores)
            Address.extend(addresses)
finally:
    # Always release the browser, even when a request or parse step raises.
    driver.quit()

# zip() stops at the shortest list, so only complete rows are kept.
schools = pd.DataFrame(list(zip(School, Score, Rating, Address)), columns=['School', 'Score', 'Rating', 'Address'])

iew=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=18&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=19&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=20&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=21&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=22&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=23&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=24&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=25&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=26&tableView=Overview&view=table
Fetching  http://www.greatschools.org/arizona/tucson/schools/?page=27&tableV

In [24]:
# Sanity check: report (rows, columns), then preview the first five records.
row_count, column_count = schools.shape
print((row_count, column_count))
schools.head(n=5)

(33004, 4)


Unnamed: 0,School,Score,Rating,Address
0,Akron Early College High School,Top rated,10/10,"225 South Main Street, Akron, OH, 44308"
1,Revere Middle School,Above average,9/10,"3195 Spring Valley Road, Akron, OH, 44333"
2,Arrowhead Primary Elementary School,Above average,8/10,"1600 Raleigh Boulevard, Akron, OH, 44321"
3,Manchester Middle School,Above average,8/10,"760 West Nimisila Road, Akron, OH, 44319"
4,Nolley Elementary School,Above average,8/10,"6285 Renninger Rd, Akron, OH, 44319"


In [25]:
# 5. Persist the scraped school table to disk as a csv file
schools_csv_path = 'csv/schools_cleaned.csv'
schools.to_csv(schools_csv_path)

In [None]:
# 6. Capturing the information for the remaining table separately
# Looping through each city in the file and stacking the first HTML table of
# every city page into one dataframe `df`.
from io import StringIO

# NOTE(review): `url_post` is not defined in any cell visible in this notebook;
# it must be set (the path/query appended after the state/city slug) before
# running this cell.

# The driver used by the earlier scraping cell has already been closed, so a
# fresh browser session is started here.
driver = webdriver.Chrome()

# Collect per-city tables and concatenate once at the end: DataFrame.append
# was removed in pandas 2.0, and repeated appends copy the frame every time.
tables = []
try:
    # Skip rows without a slug (NaN): urllib.parse.quote only accepts str/bytes.
    for i in cities['city'].dropna():
        endpoint = url_pre + urllib.parse.quote(i) + url_post
        print("Fetching ", endpoint)
        driver.get(endpoint)
        html = driver.page_source
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        table = pd.read_html(StringIO(html))
        # keep all the school information for the current city
        tables.append(table[0])
finally:
    # Always release the browser, even when a request or parse step raises.
    driver.quit()

# pd.concat raises on an empty list, so fall back to an empty dataframe.
df = pd.concat(tables) if tables else pd.DataFrame()