In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Google search using Python

### Description and external resources:

> This script uses two related functions to scrape the best URL from online sources: 
>> The Google Places API. See the [GitHub page](https://github.com/slimkrazy/python-google-places) for the Python wrapper and sample code, [Google Web Services](https://developers.google.com/places/web-service/) for general documentation, and [here](https://developers.google.com/places/web-service/details) for details on Place Details requests.

>> The Google Search function (manually filtered). See [here](https://pypi.python.org/pypi/google) for source code and [here](http://pythonhosted.org/google/) for documentation.

> To get an API key for the Google Places API (or Knowledge Graph API), go to the [Google API Console](http://code.google.com/apis/console).
> To upgrade your quota limits, sign up for billing--it's free and raises your daily request quota from 1K to 150K (!!).

> The code below doesn't use Google's Knowledge Graph (KG) Search API because this turns out NOT to reveal websites related to search results (despite these being displayed in the KG cards visible at right in a standard Google search). The KG API is only useful for scraping KG id, description, name, and other basic/irrelevant info. To see examples of how the KG API constructs a search URL, etc., see [here](http://searchengineland.com/cool-tricks-hack-googles-knowledge-graph-results-featuring-donald-trump-268231).

> Possibly useful note on debugging: An issue causing the GooglePlaces package to unnecessarily give a "ValueError" and stop was resolved in [July 2017](https://github.com/slimkrazy/python-google-places/issues/59).
> Other instances of this error may occur if Google Places API cannot identify a location as given. Dealing with this is a matter of proper Exception handling (which seems to be working fine below).

## Initializing Python search environment

In [2]:
# IMPORTING KEY PACKAGES
from google import search  # automated Google Search package
from googleplaces import GooglePlaces, types, lang  # Google Places API

import csv, re, os  # Standard packages
import urllib, requests  # for scraping

In [3]:
# Initializing Google Places API search functionality
# The key lives in a local text file so that it stays out of the notebook source.
# Read it inside a `with` block so the file handle is closed right away, and
# drop all whitespace (including a trailing "\r\n") that would corrupt the key.
with open("places_api_key.txt") as key_file:
    places_api_key = re.sub(r"\s+", "", key_file.read())

# Never echo the key itself: cell outputs are saved with the notebook and shared.
print("Loaded Places API key (" + str(len(places_api_key)) + " characters).")

google_places = GooglePlaces(places_api_key)

[API key redacted]


In [32]:
def dicts_to_csv(list_of_dicts, file_name, header):
    '''Write a list of dictionaries to a CSV file called file_name.

    Parameters:
        list_of_dicts: iterable of dicts, one per output row.
        file_name: path of the CSV file to create (an existing file is overwritten).
        header: column names; they are written as the first row and decide the column order.

    Returns:
        None. The function prints a "Saving to ..." message and writes the file.

    Raises:
        ValueError: raised by csv.DictWriter if a dict has a key that is not in header.
    '''

    # newline='' hands line-ending control to the csv module (otherwise extra blank
    # rows can appear on platforms that translate "\n"); utf-8 matches the encoding
    # this notebook uses when it reads its CSV files back in.
    with open(file_name, 'w', newline='', encoding='utf-8') as output_file:
        print("Saving to " + str(file_name) + " ...")
        dict_writer = csv.DictWriter(output_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(list_of_dicts)

In [4]:
# Here's a list of sites we DON'T want to spider, 
# but that an automated Google search might return...
# and we might thus accidentally spider unless we filter them out (as below)!

bad_sites = []
with open('../bad_sites.csv', 'r', encoding = 'utf-8') as csvfile:
    for row in csvfile:
        site = row.strip()  # drop the line ending and any stray surrounding whitespace
        # Skip blank lines: an empty string is a substring of every URL, so a single
        # blank entry would make the `domain in url` filter reject every search result.
        if site:
            bad_sites.append(site)

# print(bad_sites)

In [5]:
# Demo: query the Google Places API wrapper for one known school
school_name = "River City Scholars Charter Academy"
address = "944 Evergreen Street, Grand Rapids, MI 49507"

query_result = google_places.nearby_search(
    location=address,
    name=school_name,
    radius=15000,
    types=[types.TYPE_SCHOOL],
    rankby='distance',
)

for place in query_result.places:
    print(place.name)
    place.get_details()  # makes a further API call
    #print(place.details) # A dict matching the JSON response from Google.
    for detail in ("website", "formatted_address"):
        print(getattr(place, detail))

# If Google reports more results, request the following page
if query_result.has_next_page_token:
    next_token = query_result.next_page_token
    query_result_next_page = google_places.nearby_search(pagetoken=next_token)

River City Scholars
http://rivercityscholars.org/
944 Evergreen St SE, Grand Rapids, MI 49507, USA


In [41]:
# Demo of the automated Google search function: print every URL it returns for one school
demo_query = 'DR DAVID C WALKER INT 6500 IH 35 N STE C, SAN ANTONIO, TX 78218'
for url in search(demo_query, stop=5, pause=5.0):
    print(url)

http://www.har.com/school/015806106/dr-david-c-walker-elementary-school
https://www.excellence-sa.org/walker
https://www.niche.com/k12/dr-david-c-walker-intermediate-school-san-antonio-tx/
https://www.facebook.com/pages/Dr-David-C-Walker-Int/598905323548274
https://www.neighborhoodscout.com/tx/san-antonio/schools/480006211404
https://www.homesnap.com/schools/TX/San_Antonio/Dr_David_C_Walker_Intermediate
http://www.charterschooltools.org/charterSchools.cfm?stateID=43
https://www.localdatabase.com/texas/San_Antonio
https://egov.uscis.gov/crisgwi/go?action=offices.summary&OfficeLocator.office_type=&&OfficeLocator.statecode=TX
http://www.texastroopstoteachers.org/images/TEACharters09-city..pdf


## For reference (deprecated)

In [7]:
## Helpful bash-fu (shell history kept for reference; it sits inside a string literal, so none of it is executed)

'''
!cat > bad_sites

  114  cat > testlist.txt
  115  cat testlist.txt 
  116  for i in $(cat testlist.txt | head -n 4); do echo $i; done
  117  for i in $(cat testlist.txt | head -n 4); do echo wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done
  118  for i in $(cat testlist.txt | head -n 4); do echo wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; echo; echo; done
  119  for i in $(cat testlist.txt | head -n 4); do wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done
  120  ls -la
  121  rm -f 500425-s 55362003.pdf franklin-benjamin-charter-school-mesa index.html 
  122* for i in $(cat testlist.txt | head -n 4); do wget --mirror --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done
'''

"\n!cat > bad_sites\n\n  114  cat > testlist.txt\n  115  cat testlist.txt \n  116  for i in $(cat testlist.txt | head -n 4); do echo $i; done\n  117  for i in $(cat testlist.txt | head -n 4); do echo wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done\n  118  for i in $(cat testlist.txt | head -n 4); do echo wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; echo; echo; done\n  119  for i in $(cat testlist.txt | head -n 4); do wget --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done\n  120  ls -la\n  121  rm -f 500425-s 55362003.pdf franklin-benjamin-charter-school-mesa index.html \n  122* for i in $(cat testlist.txt | head -n 4); do wget --mirror --exclude-domains=$(echo $(cat ../Charter-school-identities/bad_sites.txt  ) | tr ' ' ,) $i; done\n"

## Reading in data

In [13]:
# Load the school records into `sample`, a list with one dictionary per CSV row
sample = []  # make empty list in which to store the dictionaries

# Prefer the file of saved search results if it is available on disk;
# otherwise fall back to the original data.
if os.path.exists('../sample.csv'):
    file_path = '../sample.csv'
else:
    file_path = '../charter_URLs_Apr17.csv'

# newline='' is what the csv module asks for when reading, so that line breaks
# embedded in quoted fields are parsed correctly.
with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
    print('  Reading in ' + str(file_path) + ' ...')
    reader = csv.DictReader(csvfile)  # maps each row to a dict keyed by the header row
    sample.extend(reader)  # append every row to the list

  Reading in ../sample.csv ...


In [9]:
# Make sure every school record has a "URL" entry; values already present are left untouched
for school in sample:
    school.setdefault("URL", "")

In [22]:
def count_left(list_of_dicts, varname):
    '''Print how many dicts in list_of_dicts lack a usable value for the key varname.

    A dict counts as missing when the key is absent, or when its value is
    the empty string or None.

    Parameters:
        list_of_dicts: iterable of dicts to inspect.
        varname: the key to look up in each dict (also used in the printed message).

    Returns:
        None. The count is reported with print().
    '''

    # .get() returns None for an absent key, so records that never received the
    # key are counted as missing instead of raising KeyError.
    count = sum(1 for school in list_of_dicts if school.get(varname) in ("", None))

    print(str(count) + " schools in this data are missing " + str(varname) + "s.")

# Report how many schools in the loaded sample still have no URL
count_left(sample, 'URL')

75 schools in this data are missing URLs.


In [10]:
# Take a look at one entry's contents (index 1, i.e. the second record) and the variables list in our sample (a list of dictionaries)
print(sample[1]["SEARCH"], "\n", sample[1]["OLD_URL"], "\n", sample[1]["ADDRESS"], "\n")
print(sample[1].keys())

POLK STATE COLLEGE COLLEGIATE HIGH SCHOOL 3425 WINTER LK RD LAC1200, WINTER HAVEN, FL 33881 
 https://www.polk.edu/charter-high-schools/ 
 3425 WINTER LK RD LAC1200, WINTER HAVEN, FL 33881 

dict_keys(['NCESSCH', 'STABR', 'ADDRESS', 'SEARCH', 'NUM_BAD_URLS', 'URL', 'MANUAL_URL', 'OLD_URL', 'SCH_NAME'])


## Getting URLs

In [31]:
def _is_bad_url(url, bad_sites_list):
    '''Return True if url contains any non-empty entry of bad_sites_list as a substring.'''
    # Empty entries are ignored: "" is a substring of every string and would flag every URL.
    return any(domain and domain in url for domain in bad_sites_list)


def _report_if_known(manual_url, good_url):
    '''Print a confirmation when a non-empty manual_url equals the discovered good_url.'''
    if manual_url != "" and manual_url == good_url:
        print("    Awesome! The known and discovered URLs are the SAME!")


def getURL(school_name, address, bad_sites_list, manual_url, known_urls):

    '''This function finds the one best URL for a school using two methods:

    1. A Google Places API "nearby search" for a school with this name around this address
    (ranked by distance, with a radius value of 15000; the Places API expresses radius in
    meters, i.e. 15 km, to account for proximal relocations). The first place returned that
    has a website on record which is not a bad site wins. If no place qualifies, or the API
    call fails (e.g. the address on record is unreadable), this passes to method #2.

    2. An automated Google search using the school's name + address. This is an essential backup
    plan to the Google Places API, because sometimes the address on record is not in Google's
    database. For example, look at: "3520 Central Pkwy Ste 143 Mezz, Cincinnati, OH 45223".
    The first result that is not a bad site wins; results are consumed one at a time and
    iteration stops at the first good URL.

    Parameters:
        school_name: name of the school (string).
        address: address of the school (string); also passed to the Places API as the location.
        bad_sites_list: iterable of substrings (e.g. trulia.com, greatschools.org, mapquest);
            any URL containing one of them is rejected. Empty entries are ignored.
        manual_url: a known URL for this school, or "" if none. Used only to print a
            confirmation message when the discovered URL is identical to it.
        known_urls: list that the accepted URL is appended to (mutated in place).

    Returns:
        A tuple (k, good_url): k is the number of bad URLs skipped across both methods,
        and good_url is the first acceptable URL found, or "" if none was found.

    Raises:
        Errors from the Places API stage are caught and reported with a printed message.
        Errors raised by the Google search stage (e.g. an HTTP error when Google
        refuses the request) are NOT caught here and propagate to the caller.
    '''


    ## INITIALIZE

    good_url = ""    # output goes here
    k = 0    # initialize counter for number of URLs skipped

    radsearch = 15000  # radius passed to the Google Places API search (meters, i.e. 15 km)
    numgoo = 20  # define number of google results to collect for method #2

    search_terms = school_name + " " + address
    print("Getting URL for", school_name + ", " + address + "...")    # show school name & address


    ## FIRST URL-SCRAPE ATTEMPT: GOOGLE PLACES API
    # Search for nearest school with this name around this address

    try:
        query_result = google_places.nearby_search(
            location=address, name=school_name,
            radius=radsearch, types=[types.TYPE_SCHOOL], rankby='distance')

        for place in query_result.places:
            place.get_details()  # Make further API call to get detailed info on this place

            # A place without a website must not be accepted as a "good" URL:
            # try the next place instead, and fall through to the Google search if none qualifies.
            url = getattr(place, "website", None)
            if not url:
                print("  No URL available through Google Places API. Moving on to Google search.")
                continue

            if _is_bad_url(url, bad_sites_list):
                k += 1    # If this url is in bad_sites_list, add 1 to counter and move on
                continue

            good_url = url
            known_urls.append(good_url)
            print("    Success! URL obtained from Google Places API with " + str(k) + " bad URLs avoided.")
            _report_if_known(manual_url, good_url)
            return (k, good_url)  # Returns valid URL of the Place discovered in Google Places API

    except Exception:
        # Deliberate best-effort: any API failure just hands over to method #2.
        # `Exception` (not a bare except) so that Ctrl-C still interrupts a long scraping run.
        print("  Google Places API search failed. Moving on to Google search.")


    ## SECOND URL-SCRAPE ATTEMPT: FILTERED GOOGLE SEARCH
    # Automate Google search and take first result that doesn't have a bad_sites_list element in it.
    # Results are consumed as they arrive, so nothing further is requested once a good URL is found.

    for url in search(search_terms, stop=numgoo, pause=7.5):
        if _is_bad_url(url, bad_sites_list):
            k += 1    # If this url is in bad_sites_list, add 1 to counter and move on
        else:
            good_url = url
            known_urls.append(good_url)
            print("    Success! URL obtained by Google search with " + str(k) + " bad URLs avoided.")
            break    # Exit for loop after first good url is found

    _report_if_known(manual_url, good_url)

    if good_url == "":
        print("  WARNING! No good URL found via API or google search\n")

    return (k, good_url)

In [23]:
# Shared state used by the scraping and saving cells
numschools = 0  # initialize scraping counter (incremented once for each school passed to getURL)
known_URLs = []  # initialize list of known URLs (passed to getURL, which appends each URL it accepts)

# Column names for the output CSV, taken from the first record.
# Note: .keys() is a live view of sample[0]'s keys, not a snapshot.
keys = sample[0].keys()  # define keys for writing function
fname = "../sample.csv"  # define file name for writing function

In [18]:
# Run getURL for every school that still lacks a URL and store what it returns
for school in sample:
    if school["URL"]:
        continue  # a URL is already on record, so don't bother scraping it again

    numschools += 1

    # start with empty strings
    school["NUM_BAD_URLS"] = ""
    school["URL"] = ""

    num_bad, found_url = getURL(school["SCH_NAME"], school["ADDRESS"], bad_sites, school["MANUAL_URL"], known_URLs)
    school["NUM_BAD_URLS"] = num_bad
    school["URL"] = found_url

print("\n\nURLs discovered for " + str(numschools) + " schools.")

Getting URL for ACADEMY OF ACCELERATED LEARNING 6711 BELLFORT ST, HOUSTON, TX 77087 ...
Getting URL for ACADEMY OF ARTS & MINDS 3138 COMMODORE PLZ, MIAMI, FL 33133 ...
Getting URL for Academy of Arts and Academics 615 Main St, Springfield, OR 97477 ...
Getting URL for Academy of Arts and Sciences 3038 Leavitt Rd, Lorain, OH 44052 ...
Getting URL for Academy of Arts and Sciences: Del Mar Elementary (K-5) 4560 Alvarado Canyon Rd., San Diego, CA 92120 ...
Getting URL for Academy of Arts and Sciences: Del Mar Middle & High (6-12) 4560 Alvarado Canyon Rd., San Diego, CA 92120 ...
Getting URL for Academy of Arts and Sciences: El Cajon Elementary (K-5) 4560 Alvarado Canyon Rd., San Diego, CA 92120 ...
Getting URL for Academy of Arts and Sciences: Fresno 1865 Herndon Ave. K88, Clovis, CA 93611 ...
Getting URL for Academy of Arts and Sciences: Los Angeles (9-12) 550 S. Hope St. #2600, Los Angeles, CA 90071 ...
Getting URL for Academy of Arts and Sciences: Los Angeles (K-8) 550 S. Hope St. #2600

Getting URL for Achievement Preparatory PCS Elementary 1500 Mississippi Avenue SE, Washington, DC 20032 ...
Getting URL for Acorn Montessori Charter School 8556 East Loos Drive, Prescott Valley, AZ 86314 ...
Getting URL for Acorn Montessori Charter School  Inc. - West 7555 E. Long Look Drive, Prescott Valley, AZ 86314 ...
Getting URL for ACT Academy Cyber CS 2111 Eastburn Avenue, Philadelphia, PA 19138 ...
Getting URL for Ad Prima CS 3556 Frankford Avenue, Philadelphia, PA 19134 ...
Getting URL for Adalberto M. Guerrero School 2797 N. Introspect Drive, Tucson, AZ 85745 ...
Getting URL for Adams Traditional Academy 2323 W. Parkside Lane, Phoenix, AZ 85027 ...
Getting URL for ADDENBROOKE CLASSICAL ACADEMY 480 SOUTH KIPLING, LAKEWOOD, CO 80215 ...
Getting URL for Adelante Charter 1102 E. Yanonali St., Santa Barbara, CA 93103 ...
Getting URL for Advanced Learning Academy of Wisconsin 1050 E Woodland Ave, Barron, WI 54812 ...
Getting URL for Advanced Math and Science Academy Charter School 

Getting URL for Alliance Health Services Academy High 12226 S. W.ern Ave., Los Angeles, CA 90047 ...
Getting URL for Alliance Jack H. Skirball Middle 603 E. 115th St., Los Angeles, CA 90059 ...
Getting URL for Alliance Judy Ivie Burton Technology Academy High 10101 S. BRd.way, Los Angeles, CA 90003 ...
Getting URL for Alliance Marc & Eva Stern Math and Science 5151 State University Dr., Los Angeles, CA 90032 ...
Getting URL for Alliance Margaret M. Bloomfield Technology Academy High 8691 California Ave., South Gate, CA 90280 ...
Getting URL for Alliance Media Arts and Entertainment Design High 113 S. Rowan Ave., Los Angeles, CA 90063 ...
Getting URL for Alliance Ouchi-O'Donovan 6-12 Complex 5356 S. Fifth Ave., Los Angeles, CA 90043 ...
Getting URL for Alliance Patti And Peter Neuwirth Leadership Academy 4610 S. Main St., Los Angeles, CA 90037 ...
Getting URL for Alliance Renee and Meyer Luskin Academy High 2941 W. 70th St., Los Angeles, CA 90043 ...
Getting URL for Alliance School of M

Getting URL for Andrew Academy 4050 E 38th St, Indianapolis, IN 46218 ...
Getting URL for Andrew H. Wilson Charter School 3617 General Pershing Street, New Orleans, LA 70125 ...
Getting URL for Andrew J Brown Academy 3600 N German Church Rd, Indianapolis, IN 46236 ...
Getting URL for ANIMAS HIGH SCHOOL 3206 MAIN AVENUE, DURANGO, CO 81320 ...
Getting URL for Animo Avalon Charter Middle 12700 Avalon Blvd., Los Angeles, CA 90061 ...
Getting URL for Animo College Preparatory Academy 2265 E. 103rd St., Los Angeles, CA 90002 ...
Getting URL for Animo Ellen Ochoa Charter Middle 5156 Whittier Blvd., Los Angeles, CA 90022 ...
Getting URL for Animo Inglewood Charter High 3425 W. Manchester Blvd., Inglewood, CA 90305 ...
Getting URL for Animo Jackie Robinson High 3500 S. Hill St., Los Angeles, CA 90007 ...
Getting URL for Animo James B. Taylor Charter Middle 810 E. 111th Pl., Los Angeles, CA 90059 ...
Getting URL for Animo Jefferson Charter Middle 1655 E. 27th St., Ste. A, Los Angeles, CA 90011 .

Getting URL for ARLINGTON CLASSICS ACADEMY - MIDDLE 5200 S BOWEN RD, ARLINGTON, TX 76017 ...
Getting URL for ARLINGTON CLASSICS ACADEMY - PRI 2800 W ARKANSAS LN, ARLINGTON, TX 76016 ...
Getting URL for ARLINGTON CLASSICS ACADEMY -INTERMEDIATE 5200 S BOWEN RD, ARLINGTON, TX 76017 ...
Getting URL for Arlington Community Charter School 1200 Main St, Arlington, OR 97812 ...
Getting URL for Armadillo Technical Institute 106 Rose St, Phoenix, OR 97535 ...
Getting URL for ARROW ACADEMY - CHAMPIONS ACADEMY 2113 CYPRESS LANDING DR, HOUSTON, TX 77090 ...
Getting URL for ARROW ACADEMY - HARVEST PREPARATORY ACADEMY 17770 IMPERIAL VALLEY DR, HOUSTON, TX 77060 ...
Getting URL for ARROW ACADEMY - LAS AMERICAS LEARNING CENTER 5808 RENWICK DR, HOUSTON, TX 77081 ...
Getting URL for ARROW ACADEMY - LIBERATION ACADEMY 401 PRESENT ST, MISSOURI CITY, TX 77489 ...
Getting URL for ARROW ACADEMY - MCCORMACK HONORS ACADEMY 5874 BELLFORT ST, HOUSTON, TX 77033 ...
Getting URL for ARROW ACADEMY - ODYSSEY PREPARATO

Getting URL for ATHLOS LEADERSHIP ACADEMY 1701 GOLIAD RD, SAN ANTONIO, TX 78223 ...
Getting URL for Atlanta Classical Academy 3260 Northside Dr NW, Atlanta, GA 30305 ...
Getting URL for Atlanta Heights Charter School 3670 Martin Luther King Jr Dr, Atlanta, GA 30331 ...
Getting URL for Atlanta Neighborhood Charter - Elementary 688 Grant St SE, Atlanta, GA 30315 ...
Getting URL for Atlanta Neighborhood Charter - Middle 820 Essie Ave SE, Atlanta, GA 30316 ...
Getting URL for Atlantic City Community Charter School 200 North Texas Ave, Atlantic City, NJ 8401 ...
Getting URL for ATLANTIC MONTESSORI CHARTER SCHOOL 9893 PINES BLVD, PEMBROKE PINES, FL 33024 ...
Getting URL for Atlantis Charter School 2501 South Main Street, Fall River, MA 2724 ...
Getting URL for ATLAS PREPARATORY SCHOOL 1628 SOUTH MURRAY BLVD, COLORADO SPRINGS, CO 80916 ...
Getting URL for Audeo Charter 10170 Huennekens St., San Diego, CA 92121 ...
Getting URL for AUDRE AND BERNARD RAPOPORT ACADEMY 2000 J J FLEWELLEN RD, WACO,

HTTPError: HTTP Error 503: Service Unavailable

> The above approach works to get a good URL for 6,677 out of the 6,752 schools in this data set. Not bad! 

> For some reason, the Google search algorithm (method #2) is less likely to work after passing from the Google Places API.

> To fill in for the remaining 75, let's skip the function's layers of code and just call the google search function by hand.

In [33]:
# Fallback pass: for each school still without a URL, run the Google search directly
# and keep the first result that matches none of the bad sites.
for school in sample:
    if school["URL"] != "":
        continue  # nothing to do for schools that already have a URL

    k = 0  # number of bad URLs skipped for this school
    school["NUM_BAD_URLS"] = ""

    print("Scraping URL for " + school["SEARCH"] + "...")
    urls_list = list(search(school["SEARCH"], stop=20, pause=10.0))
    print("  URLs list collected successfully!")

    for url in urls_list:
        is_bad = any(domain in url for domain in bad_sites)
        if is_bad:
            k += 1  # skip this result and keep looking
            continue

        good_url = url
        print("    Success! URL obtained by Google search with " + str(k) + " bad URLs avoided.")

        school["URL"] = good_url
        school["NUM_BAD_URLS"] = k

        # Report progress and checkpoint the whole sample to disk after each success
        count_left(sample, 'URL')
        dicts_to_csv(sample, fname, keys)
        print()
        break  # first good URL found; move on to the next school

# Final tally and save once every school has been visited
count_left(sample, 'URL')
dicts_to_csv(sample, fname, keys)

Scraping URL for GLOBAL VILLAGE ACADEMY - FORT COLLINS 8005 HIGHLAND MEADOWS PARKWAY, FORT COLLINS, CO 80528...
  URLs list collected successfully!
    Success! URL obtained by Google search with 8 bad URLs avoided.
9 schools in this data are missing URLs.
Saving to ../sample.csv ...

Scraping URL for Graham Primary School 140 E 16th Ave, Columbus, OH 43201...
  URLs list collected successfully!
    Success! URL obtained by Google search with 0 bad URLs avoided.
8 schools in this data are missing URLs.
Saving to ../sample.csv ...

Scraping URL for GRAND CENTER ARTS ACADEMY HIGH 711 NORTH GRAND AVENUE, ST. LOUIS, MO 63103...
  URLs list collected successfully!
    Success! URL obtained by Google search with 0 bad URLs avoided.
7 schools in this data are missing URLs.
Saving to ../sample.csv ...

Scraping URL for Great Hearts Academies - Archway Glendale Parcel #200-08-098E, Peoria, AZ 85383...
  URLs list collected successfully!
    Success! URL obtained by Google search with 2 bad URLs

In [40]:
# Inspect the records from index 2700 through the end of the sample
sample[2700:]

[{'ADDRESS': '2124 E ST ELMO RD A, AUSTIN, TX 78744',
  'MANUAL_URL': '',
  'NCESSCH': '480024113063',
  'NUM_BAD_URLS': '0',
  'OLD_URL': '',
  'SCH_NAME': 'HARMONY SCHOOL OF INNOVATION - AUSTIN',
  'SEARCH': 'HARMONY SCHOOL OF INNOVATION - AUSTIN 2124 E ST ELMO RD A, AUSTIN, TX 78744',
  'STABR': 'TX',
  'URL': 'http://www.hsiaustin.org/'},
 {'ADDRESS': '1024 W ROSEMEADE PKWY, CARROLLTON, TX 75007',
  'MANUAL_URL': '',
  'NCESSCH': '480029212548',
  'NUM_BAD_URLS': '0',
  'OLD_URL': 'http://www.hsatx.org/',
  'SCH_NAME': 'HARMONY SCHOOL OF INNOVATION - DALLAS',
  'SEARCH': 'HARMONY SCHOOL OF INNOVATION - DALLAS 1024 W ROSEMEADE PKWY, CARROLLTON, TX 75007',
  'STABR': 'TX',
  'URL': 'http://hsicarrollton.org/'},
 {'ADDRESS': '5210 FAIRBANKS DR, EL PASO, TX 79924',
  'MANUAL_URL': '',
  'NCESSCH': '480027212339',
  'NUM_BAD_URLS': '0',
  'OLD_URL': 'http://hsielpaso.org/',
  'SCH_NAME': 'HARMONY SCHOOL OF INNOVATION - EL PASO',
  'SEARCH': 'HARMONY SCHOOL OF INNOVATION - EL PASO 5210 F

In [27]:
# Save sample to file (can continue to load and add to it):
# first report how many URLs are still missing, then write every record to fname.
count_left(sample, 'URL')
dicts_to_csv(sample, fname, keys)

54 schools in this data are missing URLs.
Saving to ../sample.csv...


In [39]:
# Final check: report how many schools are still missing a URL
count_left(sample, 'URL')

0 schools in this data are missing URLs.
