In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

from shapely.geometry import shape, Point, Polygon
from shapely.ops import unary_union

import rtree as rt

%matplotlib inline
# Show every column when displaying wide DataFrames. Use the canonical option
# name; "display.max.columns" only worked through pandas' regex option matching.
pd.set_option("display.max_columns", None)

## TODOs:
- Retitle columns to be useful
- Find a more complete census tract GeoJSON, or one that isn't hosted by the LA Times (the current host is liable to go away at any moment)
- How to handle tracts with multiple sensors in them?
- Determine flow:
    - Census tracts won't change, but each sensor can go offline
    - How often to collect PurpleAir data
        - Entire pipeline needs to run at each collection to ensure current/accurate sensors
    - Census tract polygon object gets mapped to a nearest PurpleAir sensor
        - How to make nearest neighbor calculation as efficient as possible
    - How to rebuild geojson/arcgis object from dataframes
    - Where to host processed data?
    - How to package data for dashboard

### PurpleAir Data descriptors
##### Determined
- pm = current PM2.5 reading
- pm1 = raw PM1 reading
- pm_10 = raw PM10 reading
- pm_0 = current PM2.5 reading
- pm_1 = 10 minute PM2.5 average
- pm_2 = 30 minute PM2.5 average
- pm_3 = 1 hour PM2.5 average
- pm_4 = 6 hour PM2.5 average
- pm_5 = 24 hour PM2.5 average
- pm_6 = One week PM2.5 average
- p1 = Particles >= 0.3 µm
- p2 = Particles >= 0.5 µm
- p3 = Particles >= 1.0 µm
- p4 = Particles >= 2.5 µm
- p5 = Particles >= 5.0 µm
- p6 = Particles >= 10.0 µm
- flags = Data flagged for unusually high readings
- age = Sensor data age (when data was last received) in minutes
- isOwner = Currently logged in user is the sensor owner
- Adc = The voltage reading on the analog input of the control board

##### Undetermined
- conf
- Icon
- Voc
- Ozone1
- CH

In [2]:
# Returns dataframe of purpleair sensors contained within LA County
def los_angeles_county_sensors(
    sensors_df,
    boundary_url='https://opendata.arcgis.com/datasets/10f1e37c065347e693cf4e8ee753c09b_15.geojson',
    timeout=30,
):
    """Return the rows of ``sensors_df`` whose coordinates lie inside LA County.

    The county boundary is downloaded as GeoJSON from ``boundary_url`` and all
    of its features are merged into a single geometry. Sensors are first
    narrowed down with an rtree bounding-box query and then confirmed with an
    exact point-in-polygon test.

    Note that no filtering on sensor ``age`` is done here; stale sensors that
    are located inside the county are returned as well.

    Parameters
    ----------
    sensors_df : pandas.DataFrame
        Must have ``Lon`` and ``Lat`` columns. Index labels are passed through
        ``int()`` because rtree ids have to be integers.
    boundary_url : str, optional
        GeoJSON resource whose ``features`` make up the county boundary.
    timeout : float, optional
        Seconds to wait on the boundary request before giving up.

    Returns
    -------
    pandas.DataFrame
        ``sensors_df.loc[...]`` restricted to the sensors inside the boundary,
        in the order produced by the rtree query.

    Raises
    ------
    requests.HTTPError
        If the boundary request returns an error status.
    """
    # Get LA County geometries in JSON form; fail loudly on an HTTP error
    # instead of trying to parse an error page as GeoJSON.
    response = requests.get(boundary_url, timeout=timeout)
    response.raise_for_status()
    boundary_geojson = response.json()

    # Union of individual LA County shapes. Speeds up check if sensor is in LAC.
    county_shape = unary_union(
        [shape(feature['geometry']) for feature in boundary_geojson['features']]
    )

    # Sensors without a complete coordinate pair cannot be placed on the map.
    has_coords = sensors_df['Lon'].notna() & sensors_df['Lat'].notna()
    located = sensors_df[has_coords]

    # Populate rtree index with sensor points, remembering each Point so it
    # does not have to be rebuilt for the exact containment test below.
    sensor_index = rt.index.Index()
    points_by_id = {}
    for sensor_id, lon, lat in zip(located.index, located['Lon'], located['Lat']):
        # Build Point object from sensor's Lon and Lat (in that order!) values
        point = Point(lon, lat)
        points_by_id[int(sensor_id)] = point
        sensor_index.insert(int(sensor_id), point.bounds)

    # Cheap pre-filter: ids of sensors inside the county's bounding box
    candidate_ids = [int(i) for i in sensor_index.intersection(county_shape.bounds)]

    # Exact check: the geometries must intersect, not just their bounding boxes
    la_county_ids = [
        sensor_id
        for sensor_id in candidate_ids
        if county_shape.contains(points_by_id[sensor_id])
    ]

    return sensors_df.loc[la_county_ids]

In [3]:
# Use 'experimental' data from purpleair
url = 'https://www.purpleair.com/data.json'

# Give up on a hung connection and stop on an HTTP error status, rather than
# failing later with a confusing JSON/KeyError while building the dataframe.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# data['data'] holds the rows and data['fields'] the matching column names
sensors_df = pd.DataFrame(data['data'], columns=data['fields'])
sensors_df = sensors_df.set_index('ID')

# Drop everything except the readings, data age and coordinates.
# Dropping (instead of selecting) keeps the feed's original column order.
cols_to_keep = ['pm', 'pm1', 'pm_10', 'age', 'Lat', 'Lon']
cols_to_drop = [col for col in sensors_df.columns if col not in cols_to_keep]

sensors_df = sensors_df.drop(columns=cols_to_drop)

In [4]:
# Get purpleair sensors in LA County: keep only the rows of sensors_df that
# los_angeles_county_sensors() selects, then display the resulting dataframe.
la_county_sensors_df = los_angeles_county_sensors(sensors_df)
la_county_sensors_df

Unnamed: 0_level_0,pm,age,pm1,pm_10,Lat,Lon
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28973,64.8,0,35.4,74.7,33.943050,-117.998170
6898,66.6,0,39.7,75.8,33.980698,-117.953700
9368,16.6,32903,9.2,19.7,34.001434,-117.831310
28547,51.7,1,31.2,65.9,34.009304,-117.876620
1656,36.6,10186,23.1,44.4,34.017160,-117.981575
...,...,...,...,...,...,...
37129,52.8,1,31.0,65.8,34.024400,-118.513020
62461,61.3,1,34.7,72.9,34.150840,-118.519990
64617,6.3,0,3.5,7.2,34.150673,-118.519806
17771,28.0,1,19.6,30.4,34.718820,-118.290700


#### Census Tract data is provided by LATimes (not ideal), but is complete.

In [5]:
# Census tract boundaries (GeoJSON) from the LA Times boundary archive
test_url = 'http://s3-us-west-2.amazonaws.com/boundaries.latimes.com/archive/1.0/boundary-set/census-tracts-2012.geojson'

# Give up on a hung connection and stop on an HTTP error status instead of
# trying to parse an error response as GeoJSON.
response = requests.get(test_url, timeout=30)
response.raise_for_status()
shape_data = response.json()

In [6]:
# Walk the GeoJSON features once, collecting each tract's numeric name and
# the shapely geometry built from its 'geometry' member.
tract_ids = []
tract_shapes = []
for feature in shape_data['features']:
    tract_ids.append(int(feature['properties']['name']))
    tract_shapes.append(shape(feature['geometry']))

# One row per feature, indexed by tract number
census_tract_df = pd.DataFrame({'tract': tract_ids, 'shape': tract_shapes}).set_index('tract')
census_tract_df

Unnamed: 0_level_0,shape
tract,Unnamed: 1_level_1
6037101110,"(POLYGON ((-118.302291 34.258697, -118.300787 ..."
6037101122,"(POLYGON ((-118.303334 34.273536, -118.303178 ..."
6037101210,"(POLYGON ((-118.299451 34.255978, -118.285924 ..."
6037101220,"(POLYGON ((-118.285924 34.248959, -118.285924 ..."
6037101300,"(POLYGON ((-118.272473 34.232527, -118.271936 ..."
...,...
6037980031,"(POLYGON ((-118.285303 33.708598, -118.283369 ..."
6037980033,"(POLYGON ((-118.244627 33.710767, -118.231803 ..."
6037990100,"(POLYGON ((-118.951142 33.996432, -118.950564 ..."
6037990200,"(POLYGON ((-118.631676 34.000011, -118.635977 ..."


In [7]:
# One shapely Point per LA County sensor, built as (Lon, Lat) and kept under
# the same index as la_county_sensors_df.
sensor_points_df = la_county_sensors_df.apply(
    lambda sensor: Point(sensor['Lon'], sensor['Lat']),
    axis=1,
).to_frame(name='point')
sensor_points_df

Unnamed: 0_level_0,point
ID,Unnamed: 1_level_1
28973,POINT (-117.99817 33.94305)
6898,POINT (-117.9537 33.980698)
9368,POINT (-117.83131 34.001434)
28547,POINT (-117.87662 34.009304)
1656,POINT (-117.981575 34.01716)
...,...
37129,POINT (-118.51302 34.0244)
62461,POINT (-118.51999 34.15084)
64617,POINT (-118.519806 34.150673)
17771,POINT (-118.2907 34.71882)


In [8]:
# Build a fresh rtree index holding the bounds of every sensor point,
# keyed by the sensor's (integer) index label.
idx = rt.index.Index()
for sensor_id, sensor_point in sensor_points_df['point'].items():
    idx.insert(int(sensor_id), sensor_point.bounds)

In [9]:
# For each census tract, ask the rtree index for the indexed sensor point that
# is closest to the tract's bounding box and pair it with the tract number.
# Requesting more than 1 result would give the n closest sensors to average.
nearest_sensor = [
    (tract, list(idx.nearest(tract_shape.bounds, 1))[0])
    for tract, tract_shape in census_tract_df['shape'].items()
]
nearest_sensor

[(6037101110, 71813),
 (6037101122, 71813),
 (6037101210, 67111),
 (6037101220, 67111),
 (6037101300, 67111),
 (6037101400, 67111),
 (6037102103, 73229),
 (6037102104, 36587),
 (6037102105, 20443),
 (6037102107, 73229),
 (6037103101, 71813),
 (6037103102, 6420),
 (6037103200, 6420),
 (6037103300, 73229),
 (6037103400, 6420),
 (6037104103, 46705),
 (6037104105, 46705),
 (6037104108, 46705),
 (6037104124, 73229),
 (6037104201, 46705),
 (6037104203, 46705),
 (6037104204, 46705),
 (6037104310, 46705),
 (6037104320, 46705),
 (6037104401, 46705),
 (6037104403, 46705),
 (6037104404, 46705),
 (6037104500, 46705),
 (6037104610, 46705),
 (6037104620, 46705),
 (6037104701, 46705),
 (6037104703, 46705),
 (6037104704, 46705),
 (6037104810, 46705),
 (6037104821, 46705),
 (6037104822, 46705),
 (6037106010, 37551),
 (6037106020, 37551),
 (6037106111, 46705),
 (6037106112, 46705),
 (6037106113, 46705),
 (6037106114, 46705),
 (6037106403, 37551),
 (6037106405, 37551),
 (6037106406, 37551),
 (6037106407,

#### Census Tract data is provided by LATimes (not ideal), but is complete.

In [10]:
# # Iterate through sensors, checking which shape object the sensor's lat/lon falls within
# ind_cen = {}
# for index, row in la_county_sensors_df.iterrows():
#     # Build Point object from sensor's Lon and Lat (in that order!) values
#     point = Point(row.Lon, row.Lat)
    
#     for tract, polygon in polygons.items():
#         if polygon.contains(point):
#             ind_cen[index] = tract

In [11]:
# [i for i in la_county_sensors_df.index if i not in ind_cen.keys()]

#### Different approach: query the FCC's database to get each sensor's census tract
#### Takes about 3.5 minutes to query 560 locations

In [12]:
# # 3.5 minutes to query for 560 locations
# %%time
# tracts = {}
# for index, row in la_county_sensors_df.iterrows():
#     url = f'https://geo.fcc.gov/api/census/area?lat={row.Lat}&lon={row.Lon}&format=json'
#     results = requests.get(url).json()
#     tracts[index] = results['results'][0]['block_fips'][:11]

In [13]:
# for index, census in ind_cen.items():
#     if census != tracts[index]:
#         print('LATIMES: ', ind_cen[index], 'QUERY: ', tracts[index])