In [1]:
import json
import os
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

pd.set_option('display.max_columns', None, 'display.max_colwidth', 500)
api_key = os.environ['GMAP_API']

In [None]:
#navigate to donation center map
driver = webdriver.Chrome()
driver.get('https://www.donatingplasma.org/donation/find-a-donor-center')
#wait for page to load correctly, sometimes it was slow
time.sleep(3)
# the find_elements_by_* helpers were removed in Selenium 4.3;
# the By-based locator below works on both Selenium 3 and Selenium 4
results = driver.find_elements(By.CLASS_NAME, 'result-inner')

In [None]:
#put info into dict
#I use 'properties.' names here to make geojson conversion easier later
centers = []
for center in results:
    # collected outside the try so the failure handler can show what was read so far
    post = {}
    try:
        # name and address are mandatory: if either is missing the center is skipped
        post['properties.name'] = center.find_element(By.CLASS_NAME, 'loc-name').text
        post['properties.address'] = center.find_element(By.CLASS_NAME, 'loc-address').text
        try:
            post['properties.phone'] = center.find_element(By.CLASS_NAME, 'loc-phone').text
        except NoSuchElementException:
            # phone is optional, the entry is kept without it
            pass
        centers.append(post)
    except WebDriverException as err:
        #check problems by hand:
        # only print what was already collected; repeating the element lookups here
        # would raise the same exception again from inside the handler
        print(post, '\n', err, '\n---------------')

In [None]:
# sanity check: show the first three scraped entries
centers[0:3]

In [None]:
# build a DataFrame from the scraped dicts and persist it as the raw snapshot
df = pd.DataFrame(data=centers)
df.to_csv(path_or_buf='donor_centers_raw.csv', index=False)
# quick look at the first rows to confirm the conversion worked
df.head(5)

In [None]:
#-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
# shortcut: load the raw snapshot from disk so the scraping cells above can be skipped
# (the read below is active; it re-reads the file written by the previous cell)
df = pd.read_csv(filepath_or_buffer='donor_centers_raw.csv')
#df.head()

In [None]:
#this function geocodes the addresses from the "address" column into new columns 'lat, 'long', county and 'state'
#for geojson a 'coordinates' key with both lat & long would've been better, but for all other use cases this is probably more helpful
def get_coordinates(row):
    """Geocode one row's address with the Google Geocoding API.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the address string under 'properties.address'.

    Returns
    -------
    pandas.Series
        Always has the keys 'geometry.lat', 'geometry.long',
        'properties.county' and 'properties.state'. When the lookup fails
        (request error, non-JSON body, no results, unexpected response shape)
        every value is NaN, so that ``df.apply(get_coordinates, axis=1)``
        keeps producing a DataFrame with the same four columns.

    Notes
    -----
    Reads the module-level ``api_key``.
    """
    # sometimes the API was a bit picky with me, so I inserted a time.sleep here. 
    #THIS WILL ADD 10+ MINUTES to the whole process if de-commented!
    # time.sleep(5)
    not_found = pd.Series({
        'geometry.lat': float('nan'),
        'geometry.long': float('nan'),
        'properties.county': float('nan'),
        'properties.state': float('nan'),
    })
    address = row['properties.address']
    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    #make sure api_key is defined as environment variable
    keys = {'address': address, 'key': api_key}
    try:
        # a timeout keeps one stalled request from hanging the whole apply()
        geo = requests.get(url, params=keys, timeout=30)
        geodata = geo.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON
        print('Failed: ', address, err)
        return not_found
    try:
        best_match = geodata['results'][0]
        location = best_match['geometry']['location']
        components = best_match['address_components']
        # NOTE(review): county/state are picked by position from the end of
        # address_components; this assumes a fixed component layout - confirm
        # against addresses that are formatted differently.
        return pd.Series({
            'geometry.lat': location['lat'],
            'geometry.long': location['lng'],
            'properties.county': components[-4]['short_name'],
            'properties.state': components[-3]['short_name'],
        })
    except (KeyError, IndexError, TypeError):
        print('Failed: ', geodata)
        return not_found

In [None]:
# run function & save to new file
# geocode into a separate frame first, then drop any geocode columns that df already
# carries, so re-running this cell does not fail in join() on overlapping column names
geocoded = df.apply(get_coordinates, axis = 1)
df = geocoded.join(df.drop(columns = geocoded.columns, errors = 'ignore'))
df.to_csv('donor_centers_geocoded.csv', index = False)
df.head()

In [2]:
#-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
#uncomment this cell to load file instead of going through the geocoding again
#df = pd.read_csv('donor_centers_geocoded.csv')
#df.head()

In [3]:
# first step towards company extraction: list the ten most frequent center names
df['properties.name'].value_counts().iloc[:10]

CSL Plasma                                 170
Grifols Biomat USA                         101
BioLife Plasma Services                     88
Grifols Talecris Plasma Resources, Inc.     73
Octapharma Plasma Inc.                      49
BPL Plasma, Inc.                            27
Biotest Plasma                              22
Octapharma Plasma, Inc.                     20
KEDPLASMA, LLC                              14
Plasma Service Europe GmbH                   9
Name: properties.name, dtype: int64

In [4]:
#very simple function to extract company names from the center name
# order matters: the first entry found inside the name wins
list_of_companies = ['haema', 'grifols', 'biolife', 'csl', 'bpl', 'octapharm', 'biotest', 'kedplasma']
def get_company(row):
    """Return the first known company keyword contained in the center name.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the center name under 'properties.name'.

    Returns
    -------
    str or None
        The matching entry of ``list_of_companies`` (lower case), or None when
        nothing matches or the name is not a string (for example NaN after the
        data was reloaded from CSV).
    """
    name = row['properties.name']
    if not isinstance(name, str):
        return None
    name = name.lower()
    return next((company for company in list_of_companies if company in name), None)

In [5]:
# tag every center with its company keyword, then count centers per company
df = df.assign(**{'properties.company': df.apply(get_company, axis=1)})
df['properties.company'].value_counts()

csl          176
grifols      175
biolife       96
octapharm     72
haema         36
bpl           33
biotest       28
kedplasma     25
Name: properties.company, dtype: int64

In [6]:
#function builds geojson from df
# every column that has a 'properties.' in the name becomes a geojson property, 'geometry.lat' / 'geometry.long' become the Point geometry
def to_geojson(dataframe):
    """Convert row records into a GeoJSON FeatureCollection.

    Parameters
    ----------
    dataframe : iterable of dict
        One dict per row. Each must contain 'geometry.lat' and 'geometry.long';
        every key of the form 'properties.<name>' is copied into the feature's
        properties under '<name>'. Other keys are ignored.

    Returns
    -------
    dict
        ``{"type": "FeatureCollection", "features": [...]}`` with one Feature
        per input row, in input order.
    """
    def _is_missing(value):
        # None comes from JSON null; value != value is true only for NaN
        return value is None or value != value

    geo_data = {"type": "FeatureCollection", "features": []}
    for row in dataframe:
        lat = row['geometry.lat']
        lng = row['geometry.long']
        if _is_missing(lat) or _is_missing(lng):
            # a Feature without a location carries a null geometry rather than
            # a Point with null coordinates, which is not a valid position
            geometry = None
        else:
            # GeoJSON positions are ordered [longitude, latitude]
            geometry = {'type': 'Point', 'coordinates': [lng, lat]}
        this_dict = {"type": "Feature", "properties": {}, "geometry": geometry}
        for key, value in row.items():
            key_names = key.split('.')
            if str(key_names[0]) == 'properties':
                this_dict['properties'][key_names[1]] = value
        geo_data['features'].append(this_dict)
    return geo_data

In [7]:
# serialise the frame row-wise (one JSON object per center), parse it back into
# plain Python dicts and hand those to the geojson builder
records_as_json = df.to_json(orient='records')
centers_json = json.loads(records_as_json)
centers_geo = to_geojson(dataframe=centers_json)

In [8]:
# dump the feature collection to disk
with open('geo-data_centers.geojson', 'w') as outfile:
    outfile.write(json.dumps(centers_geo))

In [None]:
#THIS IS ONLY FOR JONS METHOD TO RUN MAP W/O SERVER! REMOVE THIS COMMENT LINE BEFORE RUNNING!
#%%bash 
#wc -w geo-data_centers.js
#echo "infoData =" | cat - geo-data_centers.js > temp.js && mv temp.js geo-data_centers.js
#wc -w geo-data_centers.js