## Convert address to lat/lng

In [1]:
import pandas as pd
import requests
from tqdm.auto import tqdm
tqdm.pandas()



In [2]:
df = pd.read_csv('05-address.csv')
df.head()

Unnamed: 0,school,count,circuit,address
0,Harvard University,24,District of Columbia,"Massachusetts Hall, Cambridge, MA 02138"
1,Yale University,16,District of Columbia,"105 Wall Street, New Haven, CT 06520"
2,Stanford University,14,District of Columbia,"450 Serra Mall, Stanford, CA 94305-2061"
3,Duke University,11,District of Columbia,"207 Allen Bldg, Durham, NC 27708"
4,University of Virginia,9,District of Columbia,"1827 University Avenue, Charlottesville, VA 22904"


In [3]:
API_KEY = ''

In [4]:
def geocode(address):

    params = {
        'q': address,
        'apikey': API_KEY
    }

    response = requests.get('https://geocode.search.hereapi.com/v1/geocode', params=params)
    data = response.json()
    if len(data['items']) > 0:
        match = data['items'][0]
        # Create a dictionary of everything important
        result = {
            'lat': match['position']['lat'],
            'lng': match['position']['lng']
        }
        return pd.Series(result)

# use tqdm to get the time
geocoded = df.address.progress_apply(geocode)

geocoded.head()

  0%|          | 0/591 [00:00<?, ?it/s]

Unnamed: 0,lat,lng
0,42.37717,-71.13507
1,41.31115,-72.92677
2,37.42765,-122.17006
3,35.99981,-78.94058
4,38.03662,-78.50258


In [5]:
# Combine dataframes
df_merge = pd.concat([df, geocoded], axis=1)
df_merge.head()

Unnamed: 0,school,count,circuit,address,lat,lng
0,Harvard University,24,District of Columbia,"Massachusetts Hall, Cambridge, MA 02138",42.37717,-71.13507
1,Yale University,16,District of Columbia,"105 Wall Street, New Haven, CT 06520",41.31115,-72.92677
2,Stanford University,14,District of Columbia,"450 Serra Mall, Stanford, CA 94305-2061",37.42765,-122.17006
3,Duke University,11,District of Columbia,"207 Allen Bldg, Durham, NC 27708",35.99981,-78.94058
4,University of Virginia,9,District of Columbia,"1827 University Avenue, Charlottesville, VA 22904",38.03662,-78.50258


In [6]:
# Check the nulls -- should only be international schools
df_merge[df_merge.isnull().any(axis=1)]

Unnamed: 0,school,count,circuit,address,lat,lng
17,University of Oxford,2,District of Columbia,,6.48812,2.6138
29,"Heidelberg University, Germany",1,District of Columbia,,6.48812,2.6138
72,"Heidelberg University, Germany",1,Eighth,,6.48812,2.6138
140,University of Cambridge,3,Federal,,6.48812,2.6138
196,University of Cambridge,3,Fifth,,6.48812,2.6138
235,University of Oxford,6,First,,6.48812,2.6138
253,University of Puerto Rico School of Public Adm...,1,First,,6.48812,2.6138
309,University of Oxford,4,Ninth,,6.48812,2.6138
373,European University Institute,4,Second,,6.48812,2.6138
377,University of Cambridge,3,Second,,6.48812,2.6138


In [7]:
# Replace the NaN lat/lng values with actual NaNs
df_merge['lat'].replace(6.48812, np.nan, inplace=True)
df_merge['lng'].replace(2.6138, np.nan, inplace=True)

In [8]:
# Remove NAs so they don't show up in the final map
df_merge = df_merge.dropna()
# df_merge[df_merge.isnull().any(axis=1)]

In [9]:
# Save file
df_merge.to_csv('06-geocode.csv', index=False)