In [None]:
import os
import time
import folium
import geopandas
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim
# Nominatim client used for the bootcamp (CourseReport) locations.
geolocator = Nominatim(user_agent='•••••••@••••••.com')


def flatten(nested):
    """Concatenate an iterable of iterables into a single flat list."""
    return [item for sub in nested for item in sub]


import googlemaps

# The API key is read from the environment so it is never stored in the
# notebook; set GOOGLE_MAPS_API_KEY before starting the kernel (a missing
# variable raises KeyError here rather than failing later mid-geocode).
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])

In [None]:
%%capture
%%tqdm.pandas()

### import all scraped data

In [None]:
# Scraped CourseReport listings; geocoded in the bootcamp section below.
cr = pd.read_csv('scraped_coursereport.csv')
# Portal listings; combined with the survey responses before geocoding.
portal = pd.read_csv('portal_data.csv')

### add survey data to degrees

In [None]:
survey = pd.read_csv('SurveyHero-Responses-249227.csv')

# Survey checkbox column -> degree label, listed in the order the labels are joined.
degree_labels = (
    ("Master's", "Masters"),
    ("Bachelor", "Bachelor's"),
    ("PhD", "Ph.D."),
)

# A checkbox counts as ticked only when its cell is exactly 'x'.
ticked = survey[[column for column, _ in degree_labels]].eq('x')
labels = [label for _, label in degree_labels]

# Build the whole 'class' column in one assignment (e.g. "Masters / Ph.D.";
# empty string when nothing is ticked) so the column exists even for an
# empty survey and no per-row .loc writes are needed.
survey['class'] = [
    ' / '.join(label for label, is_ticked in zip(labels, flags) if is_ticked)
    for flags in ticked.to_numpy()
]

survey = survey.rename(columns={
    'Institution/Organization:': 'institution',
    'Program/Track name:': 'course_name',
    'Location:.1': 'location'})

# Keep only the columns shared with the portal data, drop responses without an
# institution, then discard the last two remaining rows.
# NOTE(review): the reason for dropping the final two rows is not visible here
# (presumably test or junk responses) - confirm before reusing with a new export.
survey = (
    survey[['institution', 'course_name', 'location', 'class']]
    .dropna(subset=['institution'])
    .iloc[:-2]
)


# combine Portal data with survey data
degs = pd.concat([portal, survey], sort=False)

### geocode Portal data

**(Using Google API because colleges & universities are often accurately located in Google maps)**

In [None]:
# Free-text query handed to the geocoder: "<institution>, <location>".
# A missing location is replaced by '' (leaving a trailing ", ").
# NOTE(review): a missing institution is not filled, so that row's query is NaN - confirm this is acceptable.
degs['geo_search'] = degs['institution'] + ', ' + degs['location'].fillna('')

def get_gmap_geos(locations, max_errors=1e9, client=None, throttle=.05, retry_wait=2):
    """Geocode every location with a Google Maps client, retrying once on failure.

    Parameters
    ----------
    locations : iterable
        Search strings to geocode.
    max_errors : number, optional
        Stop early once this many locations have failed both attempts.
        A falsy value (0 / None) disables the limit.
    client : object, optional
        Object exposing ``geocode(location)``. Defaults to the notebook-level
        ``gmaps`` client.
    throttle : float, optional
        Seconds to sleep after each successful request.
    retry_wait : float, optional
        Seconds to wait before the single retry of a failed request.

    Returns
    -------
    (geocoded, failed) : (dict, list)
        ``geocoded`` maps each processed location to the client's result, or
        to ``''`` when both attempts raised. ``failed`` lists those locations.
        The same tuple is returned when the run stops early on ``max_errors``.
    """
    if client is None:
        client = gmaps
    locations = list(locations)  # accept any iterable; len() is needed for progress
    total = len(locations)

    failed = []
    geocoded = {}
    errors = 0
    start_time = time.perf_counter()

    for iteration, location in enumerate(locations):
        try:
            geocoded[location] = client.geocode(location)
            time.sleep(throttle)  # <-- throttle
        except Exception:
            # Back off, then give the request one more chance.
            time.sleep(retry_wait)
            try:
                geocoded[location] = client.geocode(location)
            except Exception:
                geocoded[location] = ''
                print('error at:', location)
                errors += 1
                failed.append(location)

        if max_errors and errors >= max_errors:
            print('stopped at', iteration, '(max errors reached)')
            break

        now = time.strftime('%H:%M:%S')
        done = iteration + 1  # report completed requests, not the 0-based index
        print(f'\r{now} | {done}/{total} locations geocoded ({round(done/total*100, 2)}%) | {errors} total errors', end='')

    print(f'\njob completed in {time.perf_counter() - start_time}s\n{errors} requests timed out')
    # "Not found" = the request succeeded but returned an empty result list;
    # failed requests are stored as '' and are already counted above.
    not_found = sum(1 for result in geocoded.values() if result == [])
    print(not_found, 'locations not found')
    return geocoded, failed

def gmap_gcoder(df):
    """Geocode ``df['geo_search']`` and attach the results to ``df``.

    Each distinct search string is geocoded once via ``get_gmap_geos``. Four
    columns are then written onto ``df`` itself (the input is modified in
    place and also returned):

    * ``gmaps_info`` - the geocoder result for the row's search string
    * ``coord``      - ``(lat, lng)`` tuple, ``(nan, nan)`` when unavailable
    * ``latitude`` / ``longitude`` - the two halves of ``coord``
    """
    locations = set(df['geo_search'])

    geocoded, _failed = get_gmap_geos(locations)

    # Map results back onto df. A search string with no entry (the geocoder
    # stopped early) gets '' - the same placeholder used for failed requests.
    df['gmaps_info'] = df['geo_search'].apply(lambda query: geocoded.get(query, ''))

    def extract_coords(geo):
        """Return (lat, lng) from the last entry of a geocoder result."""
        try:
            last = geo[-1]  # drop generalized extra results
            point = last['geometry']['location']
            return point['lat'], point['lng']
        except (IndexError, KeyError, TypeError):
            # Empty result / '' placeholder, unexpected payload, or a non-sequence.
            return np.nan, np.nan

    df['coord'] = df['gmaps_info'].apply(extract_coords)
    df['latitude'] = df['coord'].apply(lambda pair: pair[0])
    df['longitude'] = df['coord'].apply(lambda pair: pair[1])
    return df

In [None]:
# RUN THE GEOCODER - will take 10-15 minutes
# gmap_gcoder writes its result columns onto `degs` itself and returns that same
# frame, so `degs_gcoded` and `degs` refer to one object after this cell.

degs_gcoded = gmap_gcoder(degs)

In [None]:
# Two rows are duplicates when every column from the first one through
# 'description' matches; the first occurrence of each is kept.
identity_columns = list(degs_gcoded.loc[:, :'description'].columns)
degs_gcoded = degs_gcoded.drop_duplicates(subset=identity_columns, keep='first')

# Persist the geocoded degree programs without the index.
degs_gcoded.to_csv('degs_gcoded.csv', index=False)

note: more data (from online) is added to `mp_gcoded` (in Notebook 1.5) before mapping

---

## format & geocode Bootcamp (CourseReport) data

explode rows by city & geocode with Nominatim:

**(Using Nominatim because bootcamps often don't show up in Google Maps, so the longer search time is not worth it)**

In [None]:
def _candidate_locations(value, multi_locs):
    """Return the cleaned, geocodable place names held in one location cell."""
    if not isinstance(value, str):
        return []  # missing location (NaN / None)
    parts = value.split(', ') if multi_locs else [value]
    # Drop any parenthetical suffix, e.g. "Austin (HQ)" -> "Austin".
    names = (part.split('(')[0].strip() for part in parts)
    return [
        name for name in names
        if name and name != '-' and name.lower() != 'online' and name != 'Multiple locations'
    ]


def _geocode_with_retry(geocode, name, base_delay, max_tries):
    """Geocode one name; return (lat, lon), or None if not found / always failing."""
    delay = base_delay
    for _ in range(max_tries):
        time.sleep(delay)
        delay *= 2  # double wait time in each retry
        try:
            hit = geocode(name)
        except Exception:
            continue
        if hit is None:
            # No match is a definitive answer, not a transient error: do not retry.
            print('NOT FOUND:', name)
            return None
        return (hit.latitude, hit.longitude)
    print('TIMEOUT: Error at', name)
    return None


def geo_split(df, l_feat, multi_locs=True, geocode=None, base_delay=.25,
              max_tries=4, show_progress=True):
    """Give every row one location and attach its coordinates.

    Parameters
    ----------
    df : DataFrame
        Input rows; not modified.
    l_feat : str
        Name of the column holding the location text.
    multi_locs : bool, optional
        When True a cell may list several places separated by ``', '`` and
        the row is repeated once per place; when False the cell is one place.
    geocode : callable, optional
        ``geocode(name)`` returning an object with ``latitude`` / ``longitude``
        attributes, or None when there is no match. Defaults to the
        notebook-level ``geolocator.geocode``.
    base_delay : float, optional
        Seconds slept before the first attempt; doubled before each retry.
    max_tries : int, optional
        Maximum number of attempts per place.
    show_progress : bool, optional
        Wrap the geocoding loop in a tqdm progress bar.

    Returns
    -------
    DataFrame
        One row per (input row, place) with ``l_feat`` set to the cleaned
        place name plus ``latitude`` and ``longitude`` columns. Placeholder
        values ('-', 'online', 'Multiple locations'), missing locations and
        places that could not be geocoded produce no output row; the latter
        are reported with a printed message.
    """
    if geocode is None:
        geocode = geolocator.geocode

    # Geocode each distinct cleaned name once; rows use the same cleaning, so
    # every lookup key matches.
    unique_locs = set()
    for value in df[l_feat]:
        unique_locs.update(_candidate_locations(value, multi_locs))

    loc_geos = {}
    for name in (tqdm(unique_locs) if show_progress else unique_locs):
        coords = _geocode_with_retry(geocode, name, base_delay, max_tries)
        if coords is not None:
            loc_geos[name] = coords

    indi_locs = []
    # iterrows always yields a single row, even when index labels repeat.
    for _, d_row in df.iterrows():
        for name in _candidate_locations(d_row[l_feat], multi_locs):
            if name not in loc_geos:
                print('error at', name)
                continue
            new_row = d_row.copy()
            new_row[l_feat] = name
            new_row['latitude'], new_row['longitude'] = loc_geos[name]
            indi_locs.append(new_row)
    return pd.DataFrame(indi_locs)

In [None]:
cr_gcoded = geo_split(cr, 'Location')
# Save the frame built on the line above; the previously referenced `boot_locs`
# is not defined anywhere in this notebook and fails on a fresh kernel.
cr_gcoded.to_csv('boot_locs.csv', index=False)