Load requisite packages

In [1]:
import csv
import json
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import s3fs
from dfply import *

# Part I: Extract/Transform

Load the historical OpenAQ data (already cleaned), which was collected using Amazon Athena and saved to disk

In [2]:
# Location of the already-cleaned historical OpenAQ export on local disk
historical_path = "/Users/halabanz/Desktop/openaq_project/historical_clean.txt"

# Read the whole file into memory, then preview the first rows
df = pd.read_csv(historical_path)
df.head()

Unnamed: 0,parameter,location,value,unit,city,country,utc,local,latitude,longitude,hour_utc,hour_local
0,pm25,BONIFAY,-0.1,µg/m³,HOLMES,US,2016-03-17,2016-03-17,30.8475,-85.6044,12,6
1,pm25,HOOVER,6.8,µg/m³,BIRMINGHAM,US,2016-03-17,2016-03-17,33.3864,-86.8167,12,6
2,pm25,MCADORY,17.6,µg/m³,BIRMINGHAM,US,2016-03-17,2016-03-17,33.3311,-87.0036,12,6
3,pm25,COLUSA - SUNRISE BLV,5.0,µg/m³,COLUSA,US,2016-03-17,2016-03-17,39.2031,-122.0169,12,4
4,pm25,MONTG,14.9,µg/m³,MONTGOMERY,US,2016-03-17,2016-03-17,32.4069,-86.2564,12,6


Let's count how many observations there are

In [3]:
# Number of observations (rows) in the historical data
df.shape[0]

19715374

To reduce computation and scrape time (df has close to 20 million rows), I keep only the first row for each location name, since every row for a given location will be in the same state

In [4]:
# Keep only the first row seen for each location name; df now holds one row per location
df = df.drop_duplicates(subset="location", keep="first")

Use Google's Geocoding API to look up the state (and city) of each remaining location from its coordinates

In [5]:
# Reverse-geocode the coordinates of every unique location with Google's
# Geocoding API and collect one row per returned street address in state_df
# (columns: formatted_address, latitude, longitude, state, city).
url_latlong = "https://maps.googleapis.com/maps/api/geocode/json?"

# Prefer a key supplied through the environment so it never has to be pasted
# into the notebook; the placeholder is only used when the variable is unset.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY", "************")

# Sorted copy, so the de-duplicated df itself stays unaltered just in case
df_unique_coord = df.sort_values("location")

# (latitude, longitude) pairs, one per location, to feed into the request parameters
lat_long_list = zip(df_unique_coord['latitude'], df_unique_coord['longitude'])

# One single-row frame per returned address; concatenated once after the loop.
# Kept as single-row frames (each with index 0) so state_df has the same
# index as before for the cells that consume it.
address_frames = []

# now iterate through these coordinates and retrieve the matching addresses
for lat, lng in lat_long_list:

    parameters_latlong = {
        "latlng": str(lat) + "," + str(lng),
        "location_type": "RANGE_INTERPOLATED",
        "result_type": "street_address",
        "key": api_key}

    # A timeout keeps one stalled connection from hanging the whole scrape,
    # and raise_for_status surfaces HTTP-level failures immediately.
    response_latlong = requests.get(url_latlong, params=parameters_latlong, timeout=30)
    response_latlong.raise_for_status()

    x_latlong = response_latlong.json()

    # The API reports its own failures (bad key, quota exceeded, ...) in the
    # "status" field together with an empty result list. Fail fast on those
    # instead of silently dropping the location; address_frames still holds
    # everything collected so far. "ZERO_RESULTS" just means no street
    # address matched the filters, so that location is skipped.
    status = x_latlong.get("status")
    if status not in ("OK", "ZERO_RESULTS"):
        raise RuntimeError(
            "Geocoding failed for " + parameters_latlong["latlng"] + ": "
            + str(status) + " " + str(x_latlong.get("error_message", "")))

    # state names can be extracted from 'formatted_address'
    for address in x_latlong.get("results", []):
        address_frames.append(pd.DataFrame(
            [(address['formatted_address'], lat, lng)],
            columns = ['formatted_address', 'latitude', 'longitude']))

if not address_frames:
    raise ValueError("The Geocoding API returned no street addresses for any location")

state_df = pd.concat(address_frames)

# Addresses look like "<street>, <city>, <ST> <zip>, <country>". Count the
# parts from the right so an extra leading part (e.g. a building name) does
# not shift the city/state positions.
# NOTE(review): non-US-style addresses are not handled specially; Google's
# docs recommend reading address_components rather than parsing
# formatted_address -- confirm before relying on the state column for them.

# create state column
state_df['state'] = state_df['formatted_address'].map(lambda a: a.split(", ")[-2].split(" ")[0])

# create city column
state_df['city'] = state_df['formatted_address'].map(lambda a: a.split(", ")[-3])

state_df.head()

Unnamed: 0,formatted_address,latitude,longitude,state,city
0,"3115 Alhambra Dr, Cameron Park, CA 95682, USA",38.684383,-120.98825,CA,Cameron Park
0,"3592 Ponderosa Rd, Shingle Springs, CA 95682, USA",38.67387,-120.94336,CA,Shingle Springs
0,"2727 Coffey Ave, Bellevue, NE 68123, USA",41.133316,-95.95608,NE,Bellevue
0,"2386 S Pinehurst Pl, Ontario, CA 91761, USA",34.0309,-117.6174,CA,Ontario
0,"18 Eureka Ave, Fairbanks, AK 99701, USA",64.84593,-147.69328,AK,Fairbanks


Find the min/max latitude and longitude for each state to form a bounding box. The idea is that when I join this data to the OpenAQ data, the bounds let me avoid matching cities that share a name across different states.

In [10]:
# Build the per-state latitude/longitude bounds, one row per (state, city).

# Work on a new frame so state_df stays unaltered just in case; "formatted_address"
# is dropped since state and city already live in separate columns
state_df_bounds = state_df.drop(columns = ["formatted_address"])

# Group by state, then attach that state's min/max lat/long to each of its rows
state_df_bounds = state_df_bounds >> group_by(X.state) >> mutate(min_lat = X.latitude.min(), max_lat = X.latitude.max(), 
                                              min_long = X.longitude.min(), max_long = X.longitude.max())

# capitalize city names to make consistent with openaq (done before
# de-duplicating so that the comparison below ignores case)
state_df_bounds['city'] = state_df_bounds['city'].str.upper()

# Keep one row per (state, city). De-duplicating on city alone would throw
# away a city whenever another state has a city of the same name, which are
# exactly the cases the state bounds are meant to tell apart.
state_df_bounds = state_df_bounds.drop_duplicates(subset = ["state", "city"], keep = 'first')

# drop coord columns
state_df_bounds = state_df_bounds.drop(columns = ['latitude', 'longitude'])

state_df_bounds['state'].nunique() # 50 states + DC + PR 

52

In [11]:
# Show the first 50 rows of the state bounds
state_df_bounds.iloc[:50]

Unnamed: 0,state,city,min_lat,max_lat,min_long,max_long
0,00730,PONCE,18.009556,18.009556,-66.62725,-66.62725
1,AK,FAIRBANKS,58.38889,64.84593,-161.767,-134.56555
2,AK,BETHEL,58.38889,64.84593,-161.767,-134.56555
3,AK,PALMER,58.38889,64.84593,-161.767,-134.56555
5,AK,JUNEAU,58.38889,64.84593,-161.767,-134.56555
6,AK,ANCHORAGE,58.38889,64.84593,-161.767,-134.56555
7,AK,NORTH POLE,58.38889,64.84593,-161.767,-134.56555
10,AK,EAGLE RIVER,58.38889,64.84593,-161.767,-134.56555
11,AL,CHICKASAW,30.7697,34.6917,-88.27792,-84.99966
12,AL,WARRIOR,30.7697,34.6917,-88.27792,-84.99966


I have pollution data for 755 cities across the US

In [12]:
# Count the distinct city names present in the bounds table
city_names = state_df_bounds['city']
city_names.nunique()

755

# Part II: Load

Save to csv on disk

In [13]:
# Write the state bounds to disk with a header row and without the row index
# (index=False is the documented way to omit it).
output_path = "/Users/halabanz/Desktop/openaq_project/state/state_final_update.csv"
state_df_bounds.to_csv(output_path, index=False, header=True)