In [11]:
import os

import googlemaps
import numpy as np
import pandas as pd

In [12]:

# read in cleaned data sets


df1 = pd.read_csv('../data/csv/sampled_wells_cleaned.csv', converters={'id': lambda x: str(x.strip()),
                                                                'zip': lambda x: str(x.strip()),
                                                                'city': lambda x: str(x.strip()),
                                                                'add': lambda x: str(x.strip())})

df2 = pd.read_csv('../data/csv/permitted_wells_cleaned.csv', converters={'id': lambda x: str(x.strip()),
                                                                'zip': lambda x: str(x.strip()),
                                                                'add_zip': lambda x: str(x.strip()),
                                                                'city': lambda x: str(x.strip()),
                                                                'add': lambda x: str(x.strip())})




In [13]:
# joins df2 to df1 on the following columns. 

df2 = df2[['id', 'X', 'Y']]

ar = pd.merge(df1, df2, on='id', how='left')


In [14]:
# checks the number of valid entries in each column

ar.count()

add            1687
city           1687
state          1687
zip            1687
id             1687
date           1687
ar             1687
ph             1687
sample_id      1687
date_tested    1687
year_tested    1687
group          1687
group_five     1687
group_mcl      1687
X               718
Y               718
dtype: int64

In [15]:
# creates single address string for geocoding

ar['full_add'] = ar['add'] + ', ' + ar['city'] + ', ' + 'NC ' + ar['zip']

In [16]:
# checks the number of missing values in the new columns

ar[['X', 'Y']].isnull().sum()

X    969
Y    969
dtype: int64

In [17]:
ar = ar[['id','full_add', 'date_tested', 'year_tested', 'X', 'Y', 'ar', 'group', 'group_five', 'group_mcl', 'ph']]

969 samples are missing XY coordinates from the permit data and will have to be geocoded using the Google Maps API

Run if not all samples got XY coordinates from permit data

Comment out if no geocoding is needed

In [18]:
api_key = "AIzaSyD4MWa0YgnE8mvIIxxTqJzMbzqippwbOFs"
gmaps_key = googlemaps.Client(key=api_key)

# geocodes useing full address ('full_add') for the ar dataframe, 
# outputs X and Y coordinates into seperate new geoX and geoY columns
# the geocode function will use the googlemaps library and geocode api to geocode the addresses

def geocode(row):
    try:
        result = gmaps_key.geocode(row['full_add'])
        geoX = result[0]['geometry']['location']['lng']
        geoY = result[0]['geometry']['location']['lat']
        return pd.Series([geoX, geoY])
    except:
        return pd.Series([np.nan, np.nan])

# applies the geocode function to the ar dataframe
# the geocode function will create two new columns, geoX and geoY, in the ar dataframe

ar[['geoX', 'geoY']] = ar.apply(geocode, axis=1)

# check the number of missing values in the new columns

ar[['geoX', 'geoY']].isnull().sum()



geoX    0
geoY    0
dtype: int64

In [19]:
ar['sample_id'] = ar.index

ar.to_csv("../data/csv/ar_samples_merged_xy.csv", index=False)