In [3]:
import requests
import json
import subprocess
from functools import partial
import os
import pandas as pd
import re
from datetime import datetime, timedelta
from functools import partial

In [3]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.7 Safari/537.36'
}
 
subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")

In [5]:
def get_idx():
    """
    Retrieves a list of station indices ('idx') from the World Air Quality Index API.
    
    :return: A list of 'idx' values for the monitoring stations.
    """
    url = 'https://api.waqi.info/mapq2/bounds'
    data = {
        'bounds': "-179.9999,-89.9999,179.9999,89.9999",
        'country': "",
        'inc': "placeholders",
        'viewer': "webgl",
        'zoom': 2
    }
    request = requests.post(url, data=data, headers=headers).text
    a = json.loads(request)
    result = []
    for j in a["data"]:
        idx = j['idx']
        if idx.isdigit():
            result.append(idx)
    return result

In [6]:
def get_py_json(url):
    '''
    Fetches and extracts encrypted data from a given URL.
    
    :param url: URL of a specific region to fetch the data from.
    :return: A list of extracted encrypted data messages. Returns an empty list if no data is found.
    '''
    resp = requests.get(url, headers=headers).text
    lis = resp.split('\n\n')
    result = []
    for i in range(1, len(lis) - 1, 2):
        st = lis[i][18:]
        dic = json.loads(st)
        if 'msg' in dic:
            result.append(dic["msg"])
    if result:
        return result[1:]
    return result

In [17]:
# Decode functions extracted from JavaScript

def decode_a(encoded_str, start_timestamp, delta_hours):
    e, i, r, o = 0, 0, 0, 1
    a = []

    def s(t, r):
        for _ in range(max(r, 1)):
            nonlocal e, i
            e += 1
            i += t
            datetime_val = datetime.fromtimestamp(start_timestamp) + timedelta(hours=e * delta_hours)
            a.append({'t': datetime_val, 'v': i * o})

    def c(n):
        return ord(encoded_str[n])

    def d(t):
        return 48 <= c(t) <= 57

    def parse_next():
        nonlocal l
        t, n = 0, 1
        if c(l + 1) == 45:
            n = -1
            l += 1
        while d(l + 1):
            t = 10 * t + (c(l + 1) - 48)
            l += 1
        if c(l + 1) == 46:
            l += 1
        return n * t

    l = 0
    while l < len(encoded_str):
        h = c(l)
        if l == 0 and h == 42:
            o = 1 / parse_next()
            l += 1
        elif h in [36, 37, 39]:
            e += h - 35
        elif h == 47:
            o = parse_next()
            l += 1
        elif h == 33:
            s(parse_next(), r)
            r = 0
        elif h == 124:
            e += parse_next() - 1
        elif 65 <= h <= 90:
            s(h - 65, r)
            r = 0
        elif 97 <= h <= 122:
            s(-(h - 97) - 1, r)
            r = 0
        elif 48 <= h <= 57:
            r = 10 * r + h - 48
        else:
            raise ValueError(f"decode: invalid character {chr(h)} ({encoded_str[l]}) at {l}")

        l += 1

    return a

def decode_s(encoded_str, start_timestamp, delta_hours, pol_name):
    if not encoded_str:
        return None

    try:
        if encoded_str[0] == "1":
            values = decode_a(encoded_str[1:], start_timestamp, delta_hours)
            return {"name": pol_name, "values": values}
        elif encoded_str[0] == "2":
            raise NotImplementedError("Decoding for '2' is not implemented yet")
        else:
            raise ValueError("Unknown encoding type")
    except Exception as e:
        print(f"decode error: {e}")
        return None

In [36]:
def get_index_data(items):
    '''
    Process the decoded data and arrange it into a structured format.
    
    :param items: List of dictionaries containing decoded data for each pollutant.
    :return: A list of lists containing structured data and the city name.
    '''
    pollutants_data = { 'pm25': [], 'pm10': [], 'o3': [], 'no2': [], 'so2': [], 'co': [] }

    for item in items:
        name = item['name']
        for value in item['values']:
            pollutants_data[name].append((value['t'], value['v']))

    combined_data = []
    for pol in pollutants_data:
        for time_val in pollutants_data[pol]:
            combined_data.append((time_val[0], pol, time_val[1]))
    combined_data.sort(key=lambda x: x[0])

    data_list = [['time', 'pm25', 'pm10', 'O3', 'NO2', 'SO2', 'CO']]
    unique_dates = sorted(list(set([data[0] for data in combined_data])))
    for date in unique_dates:
        row = [date.strftime("%Y-%m-%d")]
        for pol in ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']:
            value = next((x[2] for x in combined_data if x[0] == date and x[1] == pol), '')
            row.append(value)
        data_list.append(row)

    return data_list

In [51]:
# Example input data
input_data = {
    "now": "2024-01-04T09:42:14Z",
    "st": 473352,
    "ps": {
        "co": "1|0Haca",
        "no2": "1|0Ga2B",
        "pm10": "1|0!28kEA",
        "pm25": "1!79!-28Ee",
        "so2": "1|0.2ADB"
    },
    "dh": 24,
    "time": {
        "span": ["2024-01-04T00:00:00Z", "2024-01-04T00:00:00Z"]
    },
    "meta": {
        "si": {
            "sources": [
                {
                    "name": "Citizen Weather Observer Program (CWOP/APRS)",
                    "url": "http://wxqa.com/",
                    "pols": ["weather"],
                    "logo": ""
                },
                {
                    "name": "Guiyang Municipal Environmental Protection Bureau (贵阳市环境保护局)",
                    "url": "http://www.ghb.gov.cn/",
                    "pols": None,
                    "logo": ""
                }
            ],
            "city": {
                "name": "CN:mep/贵阳/马鞍山",
                "idx": 1368
            },
            "timezone": "8.00"
        },
        "dt": "0ms"
    }
}

city_name = input_data['meta']['si']['city']['name']

# Convert start time to timestamp
start_time_str = input_data['time']['span'][0].replace("Z", "+00:00")
start_timestamp = datetime.fromisoformat(start_time_str).timestamp() - (3 * 24 * 3600)

# Extract delta hours
delta_hours = input_data['dh']

# Decode the data and get structured data and city name
decoded_items = [decode_s(value, start_timestamp, delta_hours, key) for key, value in input_data['ps'].items()]
data_list= get_index_data(decoded_items)

# Displaying the first few rows of the data list for illustration
data_list[:5]

[['time', 'pm25', 'pm10', 'O3', 'NO2', 'SO2', 'CO'],
 ['2024-01-01', '', 28, '', 6, 0, 7],
 ['2024-01-02', 79, 17, '', 5, 0, 6],
 ['2024-01-03', 51, 21, '', 6, 3, 3],
 ['2024-01-04', 55, 21, '', 7, 4, 2]]

In [57]:
def get_decoded_data_list(encoded_data_list):
    """
    Processes a list of encoded air quality data and decodes each entry.

    :param encoded_data_list: A list of dictionaries, each containing encoded data for a specific region.
    :return: A list of decoded data for each region in the input list.
    """
    decoded_data_list = []
    for encoded_data in encoded_data_list:
        try:
            start_time_str = encoded_data['time']['span'][0].replace("Z", "+00:00")
            start_timestamp = datetime.fromisoformat(start_time_str).timestamp() - (3 * 24 * 3600)

            # Extract delta hours
            delta_hours = encoded_data['dh']

            # Decode the data and get structured data and city name
            decoded_items = [decode_s(value, start_timestamp, delta_hours, key) for key, value in encoded_data['ps'].items()]
            decoded_data_list.append(decoded_items)
        except Exception as e:
            print(f"Error: {e}")
    return decoded_data_list

In [34]:
def get_index_data_list(decoded_data_list):
    """
    Processes a list of decoded air quality data, structuring each item into a more readable format.

    :param decoded_data_list: A list of decoded data, where each item is a list of dictionaries with decoded values for a region.
    :return: A list of structured data for each region in the input list.
    """
    index_data_list = []
    for decoded_data in decoded_data_list:
        try:
            index_data_list.append(get_index_data(decoded_data))
        except Exception as e:
            print(f"Error: {e}")
    return index_data_list

In [101]:
def write_file(city, data_lists):
    # Combine all data lists
    combined_data = []
    for data in data_lists:
        # Exclude headers and filter data by date range
        for row in data[1:]:
            date_str = row[0]
            date_obj = datetime.strptime(date_str, "%Y-%m-%d")
            if datetime(2021, 1, 1) <= date_obj <= datetime(2023, 12, 31):
                combined_data.append(row)

    # Sort the combined data by date
    combined_data.sort(key=lambda x: x[0])

    # Add headers
    headers = data_lists[0][0]  # Assuming all data lists have the same headers

    # Replace invalid characters in city name and create directory path
    lst = re.sub(r'[?、 .╲*"<>|,]', '_', city).replace(":", "/").split("/")
    city_1 = '/'
    for i in range(0, len(lst) - 1):
        city_1 += lst[i] + '/'
    path = os.path.join("data/waqi" + city_1)

    # Create directory if it does not exist
    if not os.path.exists(path) or not os.path.isdir(path):
        os.makedirs(path)
    # Save to CSV
    file_name = os.path.join(path + lst[-1] + '.csv')
    print(file_name)
    df = pd.DataFrame(combined_data, columns=headers)
    df.to_csv(file_name, encoding='utf-8', index=False)


In [102]:
# Get a list of indices for air quality monitoring stations
idx = get_idx()

# Iterate through each index
for i in idx:
    # Construct the URL for fetching encoded data
    url = f'https://api.waqi.info/api/attsse/{i}/yd.json'

    # Read previously processed URLs from a file to avoid duplication
    with open('./data/waqi/url.txt', 'r', encoding='utf-8') as f:
        line = f.read().splitlines()
    
    # Process the URL only if it's not already processed
    if url not in line:
        try:
            # Fetching encoded data from the URL
            encoded_data_list = get_py_json(url)

            # Extracting city name from the first item in the encoded data list
            city_name = encoded_data_list[0]['meta']['si']['city']['name']

            # Decoding the fetched data
            decoded_data_list = get_decoded_data_list(encoded_data_list)

            # Structuring the decoded data for readability and analysis
            data_list = get_index_data_list(decoded_data_list)

            # Output the city name
            print(city_name)

            # Writing the structured data to a CSV file
            write_file(city_name, data_list)

            # Recording the processed URL in a file
            with open('./data/waqi/url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{url}\n')

        except Exception as e:
            # Print any errors encountered during the process
            print(e)


decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
decode error: string index out of range
