In [323]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

from shapely.geometry import shape, Point, Polygon
from shapely.ops import unary_union

%matplotlib inline
# Show every column when displaying wide frames. Use the canonical option name:
# "display.max.columns" only resolved because option names are matched as
# regular expressions ('.' happened to match '_').
pd.set_option("display.max_columns", None)

## TODOs:
- Retitle columns to be useful
- Find a more complete census tract GeoJSON, or at least one that isn't hosted by the LA Times (the current file is liable to go away at any moment)
- Explore census tracts without sensors

### PurpleAir Data descriptors
##### Determined
- pm = current PM2.5 reading
- pm1 = raw PM1 reading
- pm_10 = raw PM10 reading
- pm_0 = current PM2.5 reading
- pm_1 = 10 minute PM2.5 average
- pm_2 = 30 minute PM2.5 average
- pm_3 = 1 hour PM2.5 average
- pm_4 = 6 hour PM2.5 average
- pm_5 = 24 hour PM2.5 average
- pm_6 = One week PM2.5 average
- p1 = Particles >= 0.3 µm
- p2 = Particles >= 0.5 µm
- p3 = Particles >= 1.0 µm
- p4 = Particles >= 2.5 µm
- p5 = Particles >= 5.0 µm
- p6 = Particles >= 10.0 µm
- flags = Data flagged for unusually high readings
- age = Sensor data age (when data was last received) in minutes
- isOwner = Currently logged in user is the sensor owner
- Adc = The voltage reading on the analog input of the control board

##### Undetermined
- conf
- Icon
- Voc
- Ozone1
- CH

In [2]:
def los_angeles_county_sensors(sensors_df, boundary_url=None, timeout=30):
    """Return the rows of ``sensors_df`` whose location lies inside LA County.

    Parameters
    ----------
    sensors_df : pandas.DataFrame
        Sensor table with ``Lon`` and ``Lat`` columns.
    boundary_url : str, optional
        URL of a GeoJSON FeatureCollection describing the county. Defaults to
        the LA County dataset on opendata.arcgis.com.
    timeout : float, optional
        Seconds to wait for the boundary download before giving up.

    Returns
    -------
    pandas.DataFrame
        The subset of ``sensors_df`` (same columns, original row order) whose
        point is contained in the union of the downloaded shapes. Points lying
        exactly on the boundary are not considered contained.

    Raises
    ------
    requests.HTTPError
        If the boundary download returns an HTTP error status.
    """
    if boundary_url is None:
        boundary_url = 'https://opendata.arcgis.com/datasets/10f1e37c065347e693cf4e8ee753c09b_15.geojson'

    # Get LA County geometries in JSON form; fail loudly on a hung connection
    # or an HTTP error instead of trying to parse an error page.
    response = requests.get(boundary_url, timeout=timeout)
    response.raise_for_status()
    shapes = response.json()

    # Union of individual LA County shapes. Speeds up check if sensor is in LAC.
    polygons = [shape(feature['geometry']) for feature in shapes['features']]
    lac_shape = unary_union(polygons)

    # One boolean per sensor row. Points are built from Lon and Lat (in that
    # order!). A positional mask, unlike a list of index labels, selects each
    # matching row exactly once even when index labels repeat.
    in_county = np.fromiter(
        (
            lac_shape.contains(Point(lon, lat))
            for lon, lat in zip(sensors_df['Lon'], sensors_df['Lat'])
        ),
        dtype=bool,
        count=len(sensors_df),
    )

    return sensors_df.loc[in_county]

In [369]:
# Use 'experimental' data from purpleair
url = 'https://www.purpleair.com/data.json'

# Bound the wait and surface HTTP errors here, rather than letting an error
# response reach the cells below that expect 'data' and 'fields' keys.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

In [375]:
# Assemble the sensor table from the PurpleAir payload, keyed by sensor ID
sensors_df = pd.DataFrame(data['data'], columns=data['fields']).set_index('ID')

# Keep only the PM readings, the data age and the coordinates; every other
# field is dropped (remaining columns stay in their original order)
cols_to_keep = ['pm', 'pm1', 'pm_10', 'age', 'Lat', 'Lon']
cols_to_drop = list(filter(lambda name: name not in cols_to_keep, sensors_df.columns))
sensors_df = sensors_df.drop(columns=cols_to_drop)

# Restrict to the purpleair sensors located in LA County
la_county_sensors_df = los_angeles_county_sensors(sensors_df)
la_county_sensors_df

Unnamed: 0_level_0,pm,age,pm1,pm_10,Lat,Lon
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
407,8.1,0,6.6,9.4,33.842610,-118.339360
417,13.5,1,13.0,14.8,33.716675,-118.309906
489,5.6,0,4.3,6.6,33.795550,-118.260940
565,10.5,3632,7.8,12.1,33.709614,-118.287970
567,7.9,1,5.9,8.8,34.126790,-118.062220
...,...,...,...,...,...,...
70793,6.9,1,4.1,8.0,34.009174,-118.409040
71409,11.1,8651,6.9,14.0,34.191765,-118.125946
72249,6.8,0,4.6,7.4,34.114580,-118.295000
72541,6.9,0,4.4,7.9,33.998460,-118.477090


#### Census Tract data is provided by LATimes (not ideal), but is complete.

In [103]:
test_url = 'http://s3-us-west-2.amazonaws.com/boundaries.latimes.com/archive/1.0/boundary-set/census-tracts-2012.geojson'

# Bound the wait and surface HTTP errors here, rather than failing later when
# the cells below look for the 'features' key.
response = requests.get(test_url, timeout=60)
response.raise_for_status()
shape_data = response.json()

In [353]:
# One row per GeoJSON feature: the tract's name and its geometry converted to
# a shapely object
census_tract_df = pd.DataFrame().assign(
    tract=[feat['properties']['name'] for feat in shape_data['features']],
    shape=[shape(feat['geometry']) for feat in shape_data['features']],
)
census_tract_df

Unnamed: 0,tract,shape
0,06037101110,"(POLYGON ((-118.302291 34.258697, -118.300787 ..."
1,06037101122,"(POLYGON ((-118.303334 34.273536, -118.303178 ..."
2,06037101210,"(POLYGON ((-118.299451 34.255978, -118.285924 ..."
3,06037101220,"(POLYGON ((-118.285924 34.248959, -118.285924 ..."
4,06037101300,"(POLYGON ((-118.272473 34.232527, -118.271936 ..."
...,...,...
2341,06037980031,"(POLYGON ((-118.285303 33.708598, -118.283369 ..."
2342,06037980033,"(POLYGON ((-118.244627 33.710767, -118.231803 ..."
2343,06037990100,"(POLYGON ((-118.951142 33.996432, -118.950564 ..."
2344,06037990200,"(POLYGON ((-118.631676 34.000011, -118.635977 ..."


In [354]:
# One shapely Point per LA County sensor (Lon first, then Lat), indexed by sensor ID
sensor_points_df = pd.DataFrame().assign(
    point=la_county_sensors_df.apply(
        lambda sensor: Point(sensor['Lon'], sensor['Lat']),
        axis=1,
    )
)
sensor_points_df

Unnamed: 0_level_0,point
ID,Unnamed: 1_level_1
407,POINT (-118.33936 33.84261)
417,POINT (-118.309906 33.716675)
489,POINT (-118.26094 33.79555)
565,POINT (-118.28797 33.709614)
567,POINT (-118.06222 34.12679)
...,...
70793,POINT (-118.40904 34.009174)
71409,POINT (-118.125946 34.191765)
72249,POINT (-118.295 34.11458)
72541,POINT (-118.47709 33.99846)


In [361]:
# For every census tract, rank the sensors by distance to the tract's shape and
# keep the closest few as (sensor ID, distance) pairs, nearest first.
# NOTE: distances are computed directly on the lon/lat coordinates, so they are
# in degrees rather than metres; they are only used for ranking here.
N_NEAREST_SENSORS = 4

# The sensor points are the same for every tract, so extract the
# (sensor ID, point) pairs once instead of re-iterating the frame per tract.
sensor_points = list(sensor_points_df['point'].items())

census_distances = []
for tract_shape in census_tract_df['shape']:
    distances = [
        (sensor_index, tract_shape.distance(sensor_point))
        for sensor_index, sensor_point in sensor_points
    ]
    distances.sort(key=lambda pair: pair[1])
    census_distances.append(distances[:N_NEAREST_SENSORS])
census_tract_df['distances'] = census_distances

In [362]:
# The 'distances' column is already attached to census_tract_df at the end of
# the previous cell, so this cell only displays the result.
census_tract_df

Unnamed: 0,tract,shape,distances
0,06037101110,"(POLYGON ((-118.302291 34.258697, -118.300787 ...","[(67111, 0.011165129869373657), (6420, 0.01742..."
1,06037101122,"(POLYGON ((-118.303334 34.273536, -118.303178 ...","[(67111, 0.004781749156943835), (64765, 0.0148..."
2,06037101210,"(POLYGON ((-118.299451 34.255978, -118.285924 ...","[(67111, 0.011773999999988407), (6420, 0.02091..."
3,06037101220,"(POLYGON ((-118.285924 34.248959, -118.285924 ...","[(67111, 0.0027674638208957884), (63387, 0.014..."
4,06037101300,"(POLYGON ((-118.272473 34.232527, -118.271936 ...","[(67111, 0.0), (63387, 0.0023240824666152767),..."
...,...,...,...
2341,06037980031,"(POLYGON ((-118.285303 33.708598, -118.283369 ...","[(565, 0.0025986465656384627), (23945, 0.00457..."
2342,06037980033,"(POLYGON ((-118.244627 33.710767, -118.231803 ...","[(22317, 0.0), (57517, 0.0), (47959, 0.0070564..."
2343,06037990100,"(POLYGON ((-118.951142 33.996432, -118.950564 ...","[(20819, 0.004159033377153688), (26107, 0.0043..."
2344,06037990200,"(POLYGON ((-118.631676 34.000011, -118.635977 ...","[(37129, 0.004480631400035367), (14973, 0.0052..."


In [383]:
# Each 'distances' entry is a list of (sensor ID, distance) pairs sorted
# nearest-first, so the first pair's ID is the closest sensor. Map over that one
# column instead of applying row-wise across the whole frame; a tract with no
# candidate sensors gets None rather than raising IndexError.
census_tract_df['idx_closest_sensor'] = census_tract_df['distances'].map(
    lambda nearest: nearest[0][0] if nearest else None
)
census_tract_df

Unnamed: 0,tract,shape,distances,idx_closest_sensor
0,06037101110,"(POLYGON ((-118.302291 34.258697, -118.300787 ...","[(67111, 0.011165129869373657), (6420, 0.01742...",67111
1,06037101122,"(POLYGON ((-118.303334 34.273536, -118.303178 ...","[(67111, 0.004781749156943835), (64765, 0.0148...",67111
2,06037101210,"(POLYGON ((-118.299451 34.255978, -118.285924 ...","[(67111, 0.011773999999988407), (6420, 0.02091...",67111
3,06037101220,"(POLYGON ((-118.285924 34.248959, -118.285924 ...","[(67111, 0.0027674638208957884), (63387, 0.014...",67111
4,06037101300,"(POLYGON ((-118.272473 34.232527, -118.271936 ...","[(67111, 0.0), (63387, 0.0023240824666152767),...",67111
...,...,...,...,...
2341,06037980031,"(POLYGON ((-118.285303 33.708598, -118.283369 ...","[(565, 0.0025986465656384627), (23945, 0.00457...",565
2342,06037980033,"(POLYGON ((-118.244627 33.710767, -118.231803 ...","[(22317, 0.0), (57517, 0.0), (47959, 0.0070564...",22317
2343,06037990100,"(POLYGON ((-118.951142 33.996432, -118.950564 ...","[(20819, 0.004159033377153688), (26107, 0.0043...",20819
2344,06037990200,"(POLYGON ((-118.631676 34.000011, -118.635977 ...","[(37129, 0.004480631400035367), (14973, 0.0052...",37129


#### Census Tract data is provided by LATimes (not ideal), but is complete.

In [276]:
# Iterate through sensors, checking which census tract shape the sensor's lat/lon falls within.
# Tract names and shapes are taken from census_tract_df so that this cell only
# depends on data built earlier in the notebook and runs on a fresh kernel.
tract_shapes = list(zip(census_tract_df['tract'], census_tract_df['shape']))

# Maps sensor ID -> name of the census tract containing the sensor
ind_cen = {}
for index, row in la_county_sensors_df.iterrows():
    # Build Point object from sensor's Lon and Lat (in that order!) values
    point = Point(row.Lon, row.Lat)

    for tract, polygon in tract_shapes:
        if polygon.contains(point):
            ind_cen[index] = tract

In [277]:
# Sensor IDs that were not matched to any census tract (an empty list means every sensor was matched)
[sensor_id for sensor_id in la_county_sensors_df.index if sensor_id not in ind_cen]

[]

#### Different approach: query the FCC's Census Area API to get each sensor's census tract
#### Takes about 3.5 minutes to query 560 locations

In [267]:
# 3.5 minutes to query for 560 locations
%%time
tracts = {}
for index, row in la_county_sensors_df.iterrows():
    url = f'https://geo.fcc.gov/api/census/area?lat={row.Lat}&lon={row.Lon}&format=json'
    results = requests.get(url).json()
    tracts[index] = results['results'][0]['block_fips'][:11]

CPU times: user 10.4 s, sys: 587 ms, total: 11 s
Wall time: 3min 34s


In [288]:
# Report every sensor whose LATimes-derived tract differs from the tract
# returned by the FCC query
for sensor_id, latimes_tract in ind_cen.items():
    fcc_tract = tracts[sensor_id]
    if latimes_tract == fcc_tract:
        continue
    print('LATIMES: ', latimes_tract, 'QUERY: ', fcc_tract)

LATIMES:  06037297500 QUERY:  06037297300
LATIMES:  06037464100 QUERY:  06037464200
LATIMES:  06037650300 QUERY:  06037620601
