In [90]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time


In [91]:
# Example: reverse-geocode a single coordinate pair with the Nominatim API.
# The lat/lon are hard-coded in the query string (per the original note, the first entry from df).
url_ex = """https://nominatim.openstreetmap.org/reverse?format=json&lat=47.5112&lon=-122.257"""
location_details_ex = requests.get(url_ex) # HTTP GET; the response body is a json object
location_details_ex.json() # last expression of the cell, so the parsed json is displayed

{'place_id': 144995141,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 236673600,
 'lat': '47.5112302',
 'lon': '-122.25676111324441',
 'display_name': '10012, 61st Avenue South, Rainier Beach, Tamill, Seattle, King County, Washington, 98178, United States of America',
 'address': {'house_number': '10012',
  'road': '61st Avenue South',
  'neighbourhood': 'Rainier Beach',
  'hamlet': 'Tamill',
  'city': 'Seattle',
  'county': 'King County',
  'state': 'Washington',
  'postcode': '98178',
  'country': 'United States of America',
  'country_code': 'us'},
 'boundingbox': ['47.511189', '47.5112943', '-122.2568571', '-122.2566651']}

In [92]:
# Decode the response once and pick out the fields needed for the address string
address_ex = location_details_ex.json()['address']
house_number_ex = address_ex['house_number']
road_ex = address_ex['road'].replace(' ', '-')  # hyphens instead of spaces inside the street name
city_ex = address_ex['city']
state_ex = address_ex['state']
zipcode_ex = address_ex['postcode']

# Hyphen-join the pieces so they can be used in a url
address_elements_ex = [house_number_ex, road_ex, city_ex, state_ex, zipcode_ex]
address_sum_ex = '-'.join(address_elements_ex)

address_sum_ex

'10012-61st-Avenue-South-Seattle-Washington-98178'

In [93]:
# Fetch the walkscore.com page for the example address and parse its HTML
score_page_ex = requests.get(f"https://www.walkscore.com/score/{address_sum_ex}")
soup_ex = BeautifulSoup(score_page_ex.content, 'lxml')

#paragraph description with walkability score stated is found within a span tag
score_description_ex = soup_ex.find("span", id='score-description-sentence')

score_sentence_ex = score_description_ex.text.strip() #pull text from span tag

#search part of score_sentence that includes walkability score
#(\d+) captures only the digits; a greedy (.*) would swallow extra text
#if "out of 100" appeared more than once in the sentence
result = re.search(r'Walk Score of (\d+) out of 100', score_sentence_ex)

result.group(1) #check result to ensure only the score was pulled


'39'

In [94]:
def get_walkability_score(latitude, longitude, timeout=30):
    """Return the walkability score for a latitude/longitude pair.

    The coordinates are reverse-geocoded through the Nominatim API, the
    returned display name is reformatted into a walkscore.com url, and the
    score is read from the description sentence on that page.

    Parameters
    ----------
    latitude : float or int
    longitude : float or int
    timeout : float, optional
        Seconds to wait on each HTTP request before ``requests`` raises a
        timeout error (default 30). Without it a stalled connection would
        block the calling loop indefinitely.

    Returns
    -------
    int or str
        The walk score as an integer, or the string
        "Go back and check this address" when no score could be extracted
        (no address in the geocoder response, no description sentence on the
        page, or no score inside that sentence).
    """
    not_found = "Go back and check this address"

    url = f"https://nominatim.openstreetmap.org/reverse?format=json&lat={latitude}&lon={longitude}"
    location_details = requests.get(url, timeout=timeout) #api request will return a json object

    # The response may lack 'display_name' (e.g. an error payload); treat
    # that like any other lookup that needs manual follow-up.
    address = location_details.json().get('display_name') #only need the value from the display name key
    if not address:
        return not_found
    address_url = address.replace(',', '').replace(' ', '-') #reformat address string

    score_page = requests.get(f"https://www.walkscore.com/score/{address_url}", timeout=timeout)
    soup = BeautifulSoup(score_page.content, 'lxml')
    score_description = soup.find("span", id="score-description-sentence") #sentence with walkability score

    if score_description is None:
        return not_found

    score_sentence = score_description.text.strip() #sentence with leading and trailing spaces eliminated
    # (\d+) captures only the digits; a greedy (.*) over-captures when
    # "out of 100" occurs more than once, which then breaks int().
    search_result = re.search(r"Walk Score of (\d+) out of 100", score_sentence)
    if search_result is None:
        return not_found

    return int(search_result.group(1)) #walk score as an integer


In [95]:
# Load the property table and pair up each row's latitude and longitude
df = pd.read_csv('DataTable.csv')
coords = [(lat, lon) for lat, lon in zip(df['lat'], df['long'])]

In [96]:
# Row count of the table, i.e. how many coordinate pairs need a walk score
len(df)

21420

In [97]:
# Trial run: score only the first 10 coordinate pairs
walk_scores_list_ex = []
for idx in range(10):
    lat, lon = coords[idx]
    walk_scores_list_ex.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [98]:
# Display the scores collected by the 10-pair trial run
walk_scores_list_ex

[39, 49, 19, 31, 6, 31, 55, 51, 21, 42]

The first attempt to scrape all coordinate pairs in a single run was unsuccessful. It seems that there is a buffering issue when making a bulk request to the Nominatim API. Therefore, we will only process 2500 coordinate pairs at a time. The nine cells below perform the same operation, each on a different set of coordinate pairs (the last set holds the remaining 1420 pairs).

In [99]:
# Batch 1: coordinate pairs at positions 0-2499
# NOTE(review): Nominatim's usage policy sets a ceiling of 1 request per second;
# confirm the pause below (plus request latency) keeps this loop within it.
w_1 = []
for idx in range(2500):
    lat, lon = coords[idx]
    w_1.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [100]:
# Batch 2: coordinate pairs at positions 2500-4999
time.sleep(5)  # pause before starting the batch
w_2 = []
for idx in range(2500, 5000):
    lat, lon = coords[idx]
    w_2.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [101]:
# Batch 3: coordinate pairs at positions 5000-7499
time.sleep(5)  # pause before starting the batch
w_3 = []
for idx in range(5000, 7500):
    lat, lon = coords[idx]
    w_3.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [102]:
# Batch 4: coordinate pairs at positions 7500-9999
time.sleep(5)  # pause before starting the batch
w_4 = []
for idx in range(7500, 10000):
    lat, lon = coords[idx]
    w_4.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [103]:
# Batch 5: coordinate pairs at positions 10000-12499
time.sleep(5)  # pause before starting the batch
w_5 = []
for idx in range(10000, 12500):
    lat, lon = coords[idx]
    w_5.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [104]:
# Batch 6: coordinate pairs at positions 12500-14999
time.sleep(5)  # pause before starting the batch
w_6 = []
for idx in range(12500, 15000):
    lat, lon = coords[idx]
    w_6.append(get_walkability_score(lat, lon))
    time.sleep(0.1)  # short pause between consecutive lookups

In [105]:
# Batch 7: coordinate pairs at positions 15000-17499
time.sleep(5)  # pause before starting the batch
w_7 = []
for idx in range(15000, 17500):
    lat, lon = coords[idx]
    w_7.append(get_walkability_score(lat, lon))
    time.sleep(0.15)  # slightly longer pause than the earlier batches

In [106]:
# Batch 8: coordinate pairs at positions 17500-19999
time.sleep(5)  # pause before starting the batch
w_8 = []
for idx in range(17500, 20000):
    lat, lon = coords[idx]
    w_8.append(get_walkability_score(lat, lon))
    time.sleep(0.15)  # slightly longer pause than the earlier batches

In [107]:
# Batch 9 (final): every remaining coordinate pair from position 20000 onward
# (1420 pairs for the 21420-row table). The upper bound comes from
# len(coords) rather than a hard-coded row count, so the loop stays correct
# if the input table changes size.
time.sleep(5)  # pause before starting the batch
w_9 = []
for i in range(20000, len(coords)):
    walk_9 = get_walkability_score(coords[i][0], coords[i][1])
    w_9.append(walk_9)
    time.sleep(0.15)  # slightly longer pause than the earlier batches

In [108]:
# Stitch the nine batch lists together, in order, into a single new list
walk_scores_final = []
for batch in (w_1, w_2, w_3, w_4, w_5, w_6, w_7, w_8, w_9):
    walk_scores_final.extend(batch)

In [109]:
# Sanity check: the combined list should hold one entry per dataframe row
# (compare the displayed value with len(df))
len(walk_scores_final)

21420

In [110]:
#we need to check which coordinates did not return a walkability score
# get_walkability_score returns this placeholder string when it cannot find a
# score. Iterate over walk_scores_final itself (not range(len(df))) so the
# positions always refer to the list that is being inspected.
check_indices = [
    idx
    for idx, score in enumerate(walk_scores_final)
    if score == 'Go back and check this address'
]

check_indices
        

[580,
 2813,
 3901,
 7226,
 7278,
 7762,
 7871,
 11101,
 11523,
 11687,
 11755,
 12057,
 16114,
 16130,
 18423,
 19223,
 19900,
 21078]

There are 18 locations where we will need to look up the walkability score manually. If a walk score does not exist for a location, that location will be given a value of 0.

In [111]:
# Print the (latitude, longitude) of every location that still needs a manual lookup
for idx in check_indices:
    print(coords[idx])

(47.7255, -122.29700000000001)
(47.4997, -122.37899999999999)
(47.6828, -122.329)
(47.6425, -122.374)
(47.6425, -122.374)
(47.6863, -122.09299999999999)
(47.6977, -122.126)
(47.4489, -122.456)
(47.4203, -122.15700000000001)
(47.4224, -122.15899999999999)
(47.4198, -122.15799999999999)
(47.6181, -122.117)
(47.38800000000001, -122.234)
(47.3881, -122.234)
(47.5444, -122.165)
(47.6836, -122.208)
(47.7718, -122.208)
(47.6624, -121.868)


In [112]:
# Walk scores entered by hand, one per entry of check_indices and in the same order
# (the following cell assigns manual_scores[i] to position check_indices[i]).
# A value of 0 stands for a location where no walk score exists (see the preceding markdown note).
manual_scores = [80, 10, 83, 65, 65, 10, 19, 50, 8, 8, 5, 43, 79, 79, 55, 65, 45, 0]

In [113]:
#update walk_scores_final with correct walkability scores
# The two lists are matched purely by position, so refuse to continue if they
# have drifted out of sync (e.g. check_indices was recomputed after an edit);
# otherwise scores would be silently dropped or assigned to the wrong rows.
if len(check_indices) != len(manual_scores):
    raise ValueError(
        f"{len(check_indices)} indices to fix but {len(manual_scores)} manual scores supplied"
    )

for idx, score in zip(check_indices, manual_scores):
    walk_scores_final[idx] = score

In [114]:
# Final check: report every position whose entry is not exactly of type int
for idx, score in enumerate(walk_scores_final):
    if type(score) != int:
        print(f"Problem at index {idx}")

The above cell did not print anything, which is great! The `walk_scores_final` list contains nothing but integers, so we now have a walkability score for every coordinate pair. We will put this list into a dataframe so that it can be saved as a CSV file and concatenated to the master table later on.

In [116]:
# Wrap the scores in a one-column dataframe and preview the first rows
walkability_df = pd.DataFrame({'walk_score': walk_scores_final})
walkability_df.head()

Unnamed: 0,walk_score
0,39
1,49
2,19
3,31
4,6


In [117]:
# Save the scores to disk; index=False leaves the row index out of the file,
# so the csv contains only the walk_score column
walkability_df.to_csv('walkability_scores.csv', index=False)