## Creating schools.csv

1. Install packages
2. Create cities.csv with full state name/ city column to use in getting school information
3. For persisitance creating a schools csv using selenium to get school information from greatschools.org
4. Clean csv for use in schools endpoint

### 1. Import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import os
import requests
import csv
import json
import pandas as pd
from state_abbr import us_state_abbrev as abbr
from selenium import webdriver
import urllib.parse
import re
import html5lib



### 2.  Create cities.csv with full state name/ city column to use in getting school information

In [None]:
pwd = os.getcwd()

# create city state list
cities = pd.read_excel('notebooks/datasets/data/schools/csv/List of Cities.xlsx')

# just get the second and third colun
cities = cities[['Unnamed: 1','Unnamed: 2']]

# create new dictionary with reversed key, value pairs
full = dict(map(reversed, abbr.items()))

# map state abbreviations to full name
cities['states'] = cities['Unnamed: 2'].map(full)

# making sure state/city combo conform to url format of "-" for " "
cities['states'] = cities['states'].str.strip()
cities['states'] = cities['states'].str.replace(" ", "-")
cities['Unnamed: 1'] = cities['Unnamed: 1'].str.replace(" ", "-")

# remove extraneous header rows
cities = cities.iloc[2:]
cities['city'] = (cities['states'] + '/'+ cities['Unnamed: 1']).str.lower()
print(cities.head())

# persist by creating new csv
cities.to_csv('notebooks/datasets/data/schools/csv/cities.csv')

### 3. For persisitance creating a schools csv using selenium and Beautiful Soup to get school information from greatschools.org

In [None]:
# Looping through each city in the file
# Call cities csv to get cities stored in database
cities = pd.read_csv('csv/cities.csv')

# Extracting school information
School = []
Score = []
Rating = []
Address = []

driver = webdriver.Chrome()

# url for greatschools pre_url and post_url (with state/city inbetween)
url_pre = 'http://www.greatschools.org/'

for i in cities['city']:
  fetching = True
  page = 0
  while fetching:  
        page += 1
        url = url_pre +  urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)       
        print("Fetching ", url)

        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # check if last page
        page_status = soup.find('div', {'class': 'pagination-summary'})
        page_status_text = page_status.text.strip()
        page_status_regex = re.search(r".* (\d+) to (\d+) of (\d+)", page_status_text)
        beginning, ending, total = page_status_regex.groups()
        if int(ending) >= int(total):
            fetching = False

        # appending school information for current cities
        # SCHOOL NAME
        for school in soup.find_all('a', {'class': 'name'}):
            School.append(school.text.strip())

        # RATING
        try: 
            for rating in soup.find_all('div', {'class': 'circle-rating--small'}):
                numeric = rating.text.strip()
                Rating.append(numeric)
        except AttributeError:
            numeric = 0

        # SCORE
        for score in soup.find_all('div', {'class': 'scale'}):
            Score.append(score.text.strip())

        # ADDRESS
        for address in soup.find_all('div', {'class': 'address'}):
            Address.append(address.text.strip())
        
driver.close()

schools = pd.DataFrame(list(zip(School, Score, Rating, Address)), columns = ['School', 'Score', 'Rating', 'Address'])

In [None]:
print(schools.shape)
schools.head()

In [None]:
# 5. For persisitance creating a schools csv
schools.to_csv('csv/schools_cleaned.csv')

In [4]:
# 6. Capturing the information for the remaining table separately
# Looping through each city in the file
# cities = pd.read_csv('csv/cities.csv')
cities = pd.read_csv('csv/cities.csv')

School = []

# create empty dataframe
df = pd.DataFrame()

driver = webdriver.Chrome()

# url for greatschools pre_url and post_url (with state/city inbetween)
url_pre = 'http://www.greatschools.org/'

for i in cities['city']:
  fetching = True
  page = 0
  while fetching:  
        page += 1
        url = url_pre +  urllib.parse.quote(i) + '/schools/?page={}&tableView=Overview&view=table'.format(page)       
        print("Fetching ", url)

        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # check if last page
        page_status = soup.find('div', {'class': 'pagination-summary'})
        page_status_text = page_status.text.strip()
        print(page_status_text)
        page_status_regex = re.search(r".* (\d+) to (\d+) of (\d+)", page_status_text)
        beginning, ending, total = page_status_regex.groups()
        if int(ending) >= int(total):
            fetching = False

        table = pd.read_html(html)
        # appending to dataframe all the school information for current cities
        df = df.append(table[0])

        for school in soup.find_all('a', {'class': 'name'}):
            print(school.text.strip())
            School.append(school.text.strip())

driver.close()

ter
Head Start
Salmon Center For Early Ed
Fetching  http://www.greatschools.org/michigan/wyoming/schools/?page=1&tableView=Overview&view=table
Showing 1 to 25 of 50 schools found in Wyoming, MI
Vanguard Charter Academy
Grandville Century Park Learning Center
West Godwin Elementary School
Lee Middle School
Grandville Grand View Elementary School
Pinewood Middle School
Rogers High School
Lee High School
Jackson Park Intermediate
West Elementary School
Newhall Middle School
Parkview Elementary School
Oriole Park Elementary School
Godfrey Elementary School
Valleywood Middle School
San Juan Diego Academy
Tri-Unity Christian Elementary School
Spartan Stores Ymca Child Care Center
Beech Head Start
Appletree Christian Learning Center
Hastings Day Care
For the Kidz Gymnastics
Wyoming Regional Center
Godfrey-Lee Early Childhood Center
St. John Vianney School
Vanguard Charter Academy
Grandville Century Park Learning Center
West Godwin Elementary School
Lee Middle School
Grandville Grand View Elem

In [5]:
column_values = pd.Series(School)

In [6]:
column_values.to_csv('school_name.csv')

In [7]:
df.to_csv('schools.csv')

In [9]:
df.insert(loc=0, column='School Name', value=column_values)

In [10]:
df.head()

Unnamed: 0,School Name,School,Type,Grades,Total students enrolled,Students per teacher,Reviews,District
0,Akron Early College High School,10/10Top ratedAkron Early College High School2...,Public district,9-12,384.0,34:1,8 reviews,Akron City School District
1,Revere Middle School,9/10Above averageRevere Middle School3195 Spri...,Public district,6-8,624.0,13:1,1 review,Revere Local School District
2,Arrowhead Primary Elementary School,8/10Above averageArrowhead Primary Elementary ...,Public district,K-4,345.0,20:1,6 reviews,Copley-Fairlawn City School District
3,Manchester Middle School,8/10Above averageManchester Middle School760 W...,Public district,5-8,387.0,16:1,2 reviews,Manchester Local School District
4,Nolley Elementary School,8/10Above averageNolley Elementary School6285 ...,Public district,K-4,483.0,17:1,2 reviews,Manchester Local School District


In [12]:
schools = pd.read_csv('csv/schools_cleaned.csv')

In [13]:
schools = schools.rename(columns = {'School': 'School Name'})

In [14]:
schools = schools.merge(df, on=['School Name'])

In [15]:
schools.head()

Unnamed: 0.1,Unnamed: 0,School Name,Score,Rating,Address,School,Type,Grades,Total students enrolled,Students per teacher,Reviews,District
0,0,Akron Early College High School,Top rated,10/10,"225 South Main Street, Akron, OH, 44308",10/10Top ratedAkron Early College High School2...,Public district,9-12,384.0,34:1,8 reviews,Akron City School District
1,0,Akron Early College High School,Top rated,10/10,"225 South Main Street, Akron, OH, 44308",4/10Below averageBetty Jane Community Learning...,Public district,K-5,453.0,16:1,6 reviews,Akron City School District
2,0,Akron Early College High School,Top rated,10/10,"225 South Main Street, Akron, OH, 44308",3/10Below averageRoosevelt Elementary School66...,Public district,PK-3,409.0,17:1,No reviews yet,Springfield Local School District
3,0,Akron Early College High School,Top rated,10/10,"225 South Main Street, Akron, OH, 44308",Currently unratedWonder World Infant Dcc1360 E...,Private,PK,,,No reviews yet,
4,0,Akron Early College High School,Top rated,10/10,"225 South Main Street, Akron, OH, 44308",Currently unratedNordonia Hills Ymca Day Care2...,Private,PK,,,No reviews yet,


In [16]:
print(schools.shape)

(126272, 12)


In [17]:
schools.to_csv('new_schools.csv')