## To scrape pollution data from India govt dashboard

from https://www.listendata.com/2022/07/pollution-in-india-real-time-aqi-data.html

In [19]:
import requests
import json
import pandas as pd
import re
import datetime
import time
import base64
from itertools import product

In [20]:
# The following api should work to test (can only get 10 requests at a time with this)
api = '579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b'

# for full data access, create an account with https://data.gov.in/
# then in dashboard, go to `my account` and create api

In [25]:
# from https://www.listendata.com/2022/07/pollution-in-india-real-time-aqi-data.html
stationsData = pd.read_csv('./data/PM25/stations.txt')

def getData(api, filters):
    url1 = "https://api.data.gov.in/resource/3b01bcb8-0b14-4abf-b6f2-c1bfd384ba69?api-key=" + api + "&format=json&limit=500"
    criteriaAll = [[(k, re.sub(r'\s+', '%20', v)) for v in criteria[k]] for k in criteria]
    url2 = [url1 + ''.join(f'&filters[{ls}]={value}' for ls, value in p) for p in product(*criteriaAll)]
    
    pollutionDfAll = pd.DataFrame()
    for i in url2:
        response = requests.get(i, verify=True)
        response_dict = json.loads(response.text)
        pollutionDf = pd.DataFrame(response_dict['records'])
        pollutionDfAll = pd.concat([pollutionDfAll, pollutionDf])
    
    return pollutionDfAll

def get_data_cpcb(id, dt):
    
    datetime2 = dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    
    key  = '{"station_id":"' + id + '","date":"' + datetime2 + '"}'
    body = base64.b64encode(key.encode()).decode()
    
    timeZoneoffset = int((datetime.datetime.utcnow() - datetime.datetime.now()).total_seconds()/60)
    token = '{"time":' + str(int(time.time())) + ',"timeZoneOffset":'+ str(timeZoneoffset ) +'}'
    accessToken = base64.b64encode(str(token).encode()).decode()
    
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accesstoken': accessToken,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://app.cpcbccr.com',
        'referer': 'https://app.cpcbccr.com/AQI_India/',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-US,en;q=0.9'
    }
    
    response = requests.post('https://app.cpcbccr.com/aqi_dashboard/aqi_all_Parameters', headers=headers, data=body, verify=True)
    response_dict = json.loads(response.text)
    print(response_dict)
    info = pd.DataFrame({'title':response_dict['title'], 'date':response_dict['date']}, index=[0])
    pollutionDf = pd.concat([pd.DataFrame([response_dict['aqi']]), info], axis=1)    
    pollutants  = pd.concat([pd.DataFrame(response_dict['metrics']), info], axis=1)
    
    return pollutionDf, pollutants

In [26]:
# Copy of list of sites at ./data/PM25/stations.txt
# example:
# 'site_1549,"Hardev Nagar, Bathinda - PPCB",74.907758,30.233011,TRUE,78,Bathinda,Punjab'

## To get current data:

In [31]:
criteria = {'city':["Hardev Nagar, Bathinda - PPCB"], 'pollutant_id': ["PM10", "PM2.5"]}
mydata = getData(api, criteria)
mydata.head()

Unnamed: 0,id,country,state,city,station,last_update,pollutant_id,pollutant_min,pollutant_max,pollutant_avg
0,2273,India,Punjab,Bathinda,"Hardev Nagar, Bathinda - PPCB",07-12-2023 14:00:00,PM10,62,136,98
1,1037,India,Haryana,Yamuna Nagar,"Gobind Pura, Yamuna Nagar - HSPCB",07-12-2023 14:00:00,PM10,119,292,152
0,2272,India,Punjab,Bathinda,"Hardev Nagar, Bathinda - PPCB",07-12-2023 14:00:00,PM2.5,52,135,76
1,1036,India,Haryana,Yamuna Nagar,"Gobind Pura, Yamuna Nagar - HSPCB",07-12-2023 14:00:00,PM2.5,104,333,174


## To get historic data:
Note, this calls the Central Pollution Control Board (CPCB) dashboard at: http://app.cpcbccr.com/ccr/#/caaqm-dashboard-all/caaqm-landing  In theory, one can download data from this site directly (many authors report doing so). At the time of writing this (7 Dec 2023), this site is not working, thus the following code did not work. 

In [32]:
id = 'Hardev Nagar, Bathinda - PPCB'
summary, pollutants = get_data_cpcb(id, datetime.datetime(2019, 10, 1, 18))

{'detail': 'unauthorized request'}


KeyError: 'title'