# Querying LondonAir for pollution data

# 0. Set-up

In [1]:
## Import libraries
import json
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests

# 1. Functions

In [2]:
def request_json(link, timeout=30):
    """Fetch ``link`` with an HTTP GET and return its body parsed as JSON.

    Parameters
    ----------
    link : str
        Full URL to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so that one unresponsive endpoint cannot hang a
        loop over many sites indefinitely.

    Returns
    -------
    dict or list
        The decoded JSON payload.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server replies with a 4xx/5xx status code.
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` is exceeded.
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.text)

In [3]:
## Function to read measures given by the API

def get_measures(json):
    """Flatten one site's API response into a DataFrame of measurements.

    Parameters
    ----------
    json : dict
        Parsed API response. It must contain ``json['AirQualityData']`` with
        an ``'@SiteCode'`` entry and a ``'Data'`` entry whose items carry
        ``'@SpeciesCode'``, ``'@MeasurementDateGMT'`` and ``'@Value'``.
        (The parameter name shadows the ``json`` module inside this function;
        it is kept so existing keyword callers keep working.)

    Returns
    -------
    pandas.DataFrame
        One row per measurement with the columns ``Sitecode``,
        ``MeasurementDateGMT``, ``SpeciesCode`` and ``Value`` (in that
        order). Values are passed through as received, except that an empty
        string in ``'@Value'`` is replaced by the integer ``0``.

    Raises
    ------
    KeyError
        If ``'AirQualityData'``, ``'@SiteCode'`` or one of the per-measurement
        keys is missing.
    """
    air_quality = json['AirQualityData']
    site_code = air_quality['@SiteCode']

    # A missing or empty 'Data' entry simply means "no measurements".
    readings = air_quality.get('Data') or []
    # NOTE(review): the '@'-prefixed keys suggest an XML-derived payload, where
    # a lone element may arrive as a single object instead of a list - confirm
    # against the API. Wrapping it keeps that case from raising.
    if isinstance(readings, dict):
        readings = [readings]

    # NOTE(review): empty readings become 0 and are then indistinguishable
    # from a genuine zero measurement - confirm this is the intended encoding.
    records = [
        {
            'Sitecode': site_code,
            'MeasurementDateGMT': reading['@MeasurementDateGMT'],
            'SpeciesCode': reading['@SpeciesCode'],
            'Value': 0 if reading['@Value'] == '' else reading['@Value'],
        }
        for reading in readings
    ]

    # Passing `columns` fixes the column order and keeps the schema even when
    # there are no records at all.
    return pd.DataFrame(
        records,
        columns=['Sitecode', 'MeasurementDateGMT', 'SpeciesCode', 'Value'],
    )

In [4]:
## Function to get measures for all sites between a certain timeframe

def get_record(sites, start_date, end_date):
    """Download and combine the measurements of several sites.

    For each site code one request is sent to
    ``<base_api>/Data/Site/SiteCode=<site>/StartDate=<start>/EndDate=<end>/Json``
    and the response is flattened with ``get_measures``.

    Parameters
    ----------
    sites : iterable of str
        Site codes to query.
    start_date, end_date : str
        Bounds of the query window, inserted verbatim into the URL.

    Returns
    -------
    pandas.DataFrame
        The per-site frames stacked in the order of ``sites``. Each site keeps
        its own row index, so callers usually follow up with
        ``reset_index(drop=True)``. An empty ``sites`` gives an empty frame.

    Notes
    -----
    Relies on the notebook-level ``base_api``, ``request_json`` and
    ``get_measures`` being defined before the function is called.
    """
    # Collect the per-site frames and concatenate once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # inside the loop copies all previous rows on every iteration.
    site_frames = []
    for site_code in sites:
        link_request = (
            f'{base_api}/Data/Site/SiteCode={site_code}'
            f'/StartDate={start_date}/EndDate={end_date}/Json'
        )
        site_frames.append(get_measures(request_json(link_request)))

    # pd.concat raises on an empty list, so keep the "no sites" result explicit.
    if not site_frames:
        return pd.DataFrame()
    return pd.concat(site_frames)

# 2. Execution

In [5]:
# Root URL of the air-quality API; get_record appends the per-site query path to it.
base_api = 'https://api.erg.ic.ac.uk/AirQuality'

In [6]:
## Hard-coded site codes to query.
## TODO: replace with a dynamic lookup based on the mapped subway stations.
sites_to_query = ['TH4', 'BQ7', 'EA8', 'EI8', 'EI3', 'CR9', 'GB6', 'CT3', 'HG4', 'EA6',
                  'HI0', 'ST9', 'CR5', 'CT2', 'EN4', 'EN5', 'IS6', 'HP1', 'MY1', 'WAB',
                  'CT8', 'ST6', 'CT4', 'RB7', 'BX1', 'RI1', 'WA9', 'WAA', 'KC1', 'LW1',
                  'GN0', 'BG2', 'ST5', 'ME9', 'CD1', 'BX2', 'SKA', 'KT4', 'TH2', 'BY7',
                  'LW4', 'GV2', 'CT6', 'BT8', 'HV3', 'EN7', 'LB4', 'LW2', 'HG1', 'RB4',
                  'HR1', 'EN1', 'HR2', 'IS2', 'WMB', 'WMC', 'ST4', 'HV1', 'IM1', 'WM5',
                  'GR8', 'GN4', 'WAC', 'BT5', 'ME2', 'GR7', 'BG1', 'HK6', 'LW5', 'WA2',
                  'WA7', 'SK6', 'GR9', 'BL0', 'LH0', 'TD5', 'LB6', 'GV1', 'RI2', 'KT5',
                  'BT6', 'GN5', 'WM6', 'CR8', 'KT6', 'LB5', 'EI1', 'GN6', 'SK5', 'WM0',
                  'CR7', 'BT4', 'NB1', 'WMD']

In [7]:
## Query window for the LondonAir API: all sites close to the subway stations,
## for the past `nbr_day_query` day(s)

# Length of the look-back window, in days.
nbr_day_query = 1

# NOTE(review): datetime.today() is local time, while the API reports
# MeasurementDateGMT - confirm the date boundaries behave as intended.
end = datetime.today()
end_date = end.strftime('%Y-%m-%d')

start = end - timedelta(days=nbr_day_query)
start_date = start.strftime('%Y-%m-%d')

site_list = sites_to_query

In [8]:
# Fetch every site's measurements, then renumber the rows 0..n-1 across all
# sites (each site's frame arrives with its own index).
record_1 = get_record(site_list, start_date, end_date).reset_index(drop=True)

# 3. Output

In [9]:
# Show the combined records; the rendered table elides the middle rows of a long frame.
record_1

Unnamed: 0,Sitecode,MeasurementDateGMT,SpeciesCode,Value
0,TH4,2021-03-16 00:00:00,NO2,41.4
1,TH4,2021-03-16 01:00:00,NO2,34.8
2,TH4,2021-03-16 02:00:00,NO2,28.6
3,TH4,2021-03-16 03:00:00,NO2,28.6
4,TH4,2021-03-16 04:00:00,NO2,33.3
...,...,...,...,...
6547,WMD,2021-03-16 19:00:00,PM25,13
6548,WMD,2021-03-16 20:00:00,PM25,12
6549,WMD,2021-03-16 21:00:00,PM25,10
6550,WMD,2021-03-16 22:00:00,PM25,11
