In [1]:
import os 
import requests
from bs4 import BeautifulSoup
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# API keys are created at https://develop.purpleair.com/keys
#
# The keys are read from environment variables so that credentials are never
# stored in (or committed with) the notebook. Any key that was previously
# committed in this file should be revoked and re-issued.
#
# When a variable is not set, the corresponding key is None; cells that call
# the PurpleAir API are then expected to fail, which also guards against
# running API cells by accident.

# read key, passed to pull_sensor_data for API requests
api_read_key = os.environ.get("PURPLEAIR_API_READ_KEY")

# write key
api_write_key = os.environ.get("PURPLEAIR_API_WRITE_KEY")

In [3]:
# Load the notebook's variables from purple-air-variables.json into p_air_vars.
# The encoding is passed explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to a platform-dependent default encoding.
with open('purple-air-variables.json', encoding='utf-8') as f:
    p_air_vars = json.load(f)

# pull in the list of test sensors as sensor_test
sensor_test = p_air_vars['sensor_test']
sensor_test

[53, 77, 81, 443, 820]

In [4]:
# create function to pull all data from a sensor
def pull_sensor_data(sensor_id, p_air_READ_API_key, timeout=30):
    """Fetch the record of a single PurpleAir sensor.

    Parameters
    ----------
    sensor_id : int or str
        Sensor index; interpolated into the request URL.
    p_air_READ_API_key : str
        PurpleAir read key, sent in the ``X-API-Key`` request header.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Defaults to 30 so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict or None
        The value stored under the ``'sensor'`` key of the JSON response, or
        ``None`` when the response has no such key.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = f"https://api.purpleair.com/v1/sensors/{sensor_id}"
    headers = {
        "X-API-Key": p_air_READ_API_key
        }

    sensor_response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of quietly returning None, which
    # would only surface later as a confusing error on the DataFrame.
    sensor_response.raise_for_status()

    sensor_response_json = sensor_response.json()
    sensor_data = sensor_response_json.get('sensor')
    return sensor_data

In [5]:
# Query the API once per test sensor, keeping the results in the same order
# as sensor_test.
list_of_test_sensors = [
    pull_sensor_data(sensor_id, api_read_key) for sensor_id in sensor_test
]

In [6]:
# Build one DataFrame row per test sensor from the collected API records,
# then display it.
sample_sensor_df = pd.DataFrame(list_of_test_sensors)
sample_sensor_df

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,private,is_owner,name,icon,location_type,model,...,5.0_um_count_b,10.0_um_count_b,pm1.0_cf_1_b,pm1.0_atm_b,pm2.5_atm_b,pm2.5_cf_1_b,pm10.0_atm_b,pm10.0_cf_1_b,stats_b,altitude
0,53,1520025982,1454548891,1707077195,0,0,Lakeshore,0,0,UNKNOWN,...,,,,,,,,,,
1,77,1575074907,1456896339,1707077203,0,0,Sunnyside,0,0,PA-I,...,,,,,,,,,,
2,81,1465680292,1465657200,1707077203,0,0,Sherwood Hills 2,0,0,UNKNOWN,...,,,,,,,,,,
3,443,1559966508,1478491864,1707077121,0,0,Weber-Morgan Health Department P1,0,0,PA-II,...,0.0,0.0,6.21,6.21,6.44,6.44,6.44,6.44,"{'pm2.5': 6.4, 'pm2.5_10minute': 7.0, 'pm2.5_3...",
4,820,1575003401,1483643179,1707077255,0,0,Granite Basement,0,1,PA-II,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'pm2.5': 0.0, 'pm2.5_10minute': 0.0, 'pm2.5_3...",5131.0


In [7]:
# These three fields hold Unix epoch seconds; convert each of them to a
# pandas datetime column.
for epoch_column in ('last_modified', 'date_created', 'last_seen'):
    sample_sensor_df[epoch_column] = pd.to_datetime(sample_sensor_df[epoch_column], unit='s')

In [8]:
# The export below is commented out: the .csv already exists in the repo after the first run, so it no longer needs to be regenerated.

#sample_sensor_df.to_csv("sample_sensors.csv")

# Use all sensor data below

In [9]:
# Pull the list of ALL SLC sensors out of the loaded variables as slc_sensor_list.
slc_sensor_list = p_air_vars['slc_sensor_list']


In [10]:
## EXPENSIVE cell. Pulls all 100+ columns for all 600+ sensors in SLC, outdoor and indoor
## The results are saved to slc_sensor_data_20240130.csv

# pull all SLC sensors into a list
# Cost of cell: 95480
# Time to run: 4 minutes

#list_of_slc_sensors = []

#for sensor in slc_sensor_list:
#    sensor_data = pull_sensor_data(sensor, api_read_key)
#    list_of_slc_sensors.append(sensor_data)

In [11]:
#slc_sensors_df = pd.DataFrame(list_of_slc_sensors)
#slc_sensors_df.to_csv('slc_sensor_data_20240130.csv')

In [12]:
# Read in the SLC sensor data from the 1/30/24 snapshot.
# NOTE(review): the snapshot appears to have been saved together with its
# index, which is loaded here as an ordinary 'Unnamed: 0' column (visible in
# the outputs of the following cells).
slc_sensor_eda = pd.read_csv('slc_sensor_data_20240130.csv')


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the snapshot's columns.
slc_sensor_eda.describe()

Unnamed: 0.1,Unnamed: 0,sensor_index,last_modified,date_created,last_seen,private,is_owner,icon,location_type,led_brightness,...,pm2.5_cf_1_b,pm10.0_atm_b,pm10.0_cf_1_b,altitude,ozone1,humidity_b,temperature_b,pressure_b,voc,voc_b
count,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,...,487.0,487.0,487.0,605.0,1.0,56.0,56.0,56.0,56.0,56.0
mean,307.5,72891.12013,1619314000.0,1586838000.0,1706636000.0,0.0,0.0,0.0,0.258117,35.357143,...,197.918234,140.514045,200.471335,4932.745455,145.0,25.482143,72.482143,856.775893,117.473214,117.4775
std,177.968162,60211.10269,45827850.0,55733700.0,42181.51,0.0,0.0,0.0,0.437954,7.914321,...,934.066822,621.099381,933.62679,796.948542,,10.421989,14.322254,20.808866,113.629944,113.626979
min,0.0,53.0,1465680000.0,1454549000.0,1706164000.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,4219.0,145.0,11.0,43.0,798.91,50.0,50.0
25%,153.75,16260.0,1575398000.0,1537884000.0,1706641000.0,0.0,0.0,0.0,0.0,35.0,...,6.035,7.2,7.2,4418.0,145.0,18.0,61.0,849.665,56.925,56.9225
50%,307.5,49683.0,1626983000.0,1583598000.0,1706641000.0,0.0,0.0,0.0,0.0,35.0,...,21.02,23.05,23.05,4652.0,145.0,24.5,70.0,863.955,69.25,69.25
75%,461.25,122074.5,1647643000.0,1629497000.0,1706641000.0,0.0,0.0,0.0,1.0,35.0,...,32.06,35.335,35.335,5075.0,145.0,29.0,86.0,869.9925,114.55,114.54
max,615.0,208783.0,1706563000.0,1703862000.0,1706641000.0,0.0,0.0,0.0,1.0,100.0,...,6091.4,4060.25,6091.4,8804.0,145.0,65.0,93.0,880.02,532.4,532.37


In [14]:
# Number of distinct values in each column of the snapshot.
slc_sensor_eda.nunique()

Unnamed: 0       616
sensor_index     616
last_modified    616
date_created     616
last_seen        278
                ... 
humidity_b        27
temperature_b     33
pressure_b        55
voc               53
voc_b             54
Length: 110, dtype: int64

In [15]:
# Print the value counts of every column except the first two, which are
# already known to hold only unique values.
for col_name, col_values in slc_sensor_eda.iloc[:, 2:].items():
    counts = col_values.value_counts()
    print(f'Column: {col_name}\n{counts}\n')

Column: last_modified
1520025982    1
1630852908    1
1629328369    1
1631251784    1
1631251712    1
             ..
1616983651    1
1548026904    1
1575088844    1
1575004554    1
1706498344    1
Name: last_modified, Length: 616, dtype: int64

Column: date_created
1454548891    1
1625008888    1
1624396549    1
1624482883    1
1624482885    1
             ..
1547074682    1
1547228302    1
1547232395    1
1547745418    1
1703862440    1
Name: date_created, Length: 616, dtype: int64

Column: last_seen
1706641222    9
1706641173    7
1706641183    7
1706641257    7
1706641272    7
             ..
1706641121    1
1706641041    1
1706641178    1
1706641202    1
1706641396    1
Name: last_seen, Length: 278, dtype: int64

Column: private
0    616
Name: private, dtype: int64

Column: is_owner
0    616
Name: is_owner, dtype: int64

Column: name
Home                  3
Wasatch Hollow        2
Riverton              2
Buckboard             2
Payson Utah           2
                     ..
Bonne

In [16]:
# These three fields hold Unix epoch seconds; convert each of them to a
# pandas datetime column, then preview the result.
for epoch_column in ('last_modified', 'date_created', 'last_seen'):
    slc_sensor_eda[epoch_column] = pd.to_datetime(slc_sensor_eda[epoch_column], unit='s')

slc_sensor_eda.head()

Unnamed: 0.1,Unnamed: 0,sensor_index,last_modified,date_created,last_seen,private,is_owner,name,icon,location_type,...,pm10.0_atm_b,pm10.0_cf_1_b,stats_b,altitude,ozone1,humidity_b,temperature_b,pressure_b,voc,voc_b
0,0,53,2018-03-02 21:26:22,2016-02-04 01:21:31,2024-01-30 18:59:33,0,0,Lakeshore,0,0,...,,,,,,,,,,
1,1,77,2019-11-30 00:48:27,2016-03-02 05:25:39,2024-01-30 18:59:30,0,0,Sunnyside,0,0,...,,,,,,,,,,
2,2,81,2016-06-11 21:24:52,2016-06-11 15:00:00,2024-01-30 18:59:22,0,0,Sherwood Hills 2,0,0,...,,,,,,,,,,
3,3,443,2019-06-08 04:01:48,2016-11-07 04:11:04,2024-01-30 18:58:00,0,0,Weber-Morgan Health Department P1,0,0,...,10.02,10.02,"{'pm2.5': 9.8, 'pm2.5_10minute': 7.8, 'pm2.5_3...",,,,,,,
4,4,459,2022-05-29 03:50:54,2016-11-07 18:32:48,2024-01-30 18:58:27,0,0,Flight Park North 2,0,0,...,7.5,7.5,"{'pm2.5': 6.8, 'pm2.5_10minute': 9.4, 'pm2.5_3...",5122.0,,,,,,


In [17]:
# Earliest 'date_created' value in the snapshot. Despite the name, this holds
# a timestamp rather than a sensor row.
oldest_sensor = slc_sensor_eda['date_created'].min()
oldest_sensor

Timestamp('2016-02-04 01:21:31')