# Scraping Air Temperature and Rainfall Data from Data.gov.sg API

Developer: Ong Chin Hwee

Language: Python 3.7.4

This data scraping script is developed as a personal project to scrap NEA 
meteorological data from Data.gov.sg APIs. The project initiator (@hweecat) has 
active plans to expand this personal project to scrap data from other NEA Dataset 
APIs. 

Currently, this script is able to scrap data from the following APIs:
1. Realtime Weather Readings across Singapore

    a. Air Temperature across Singapore

    b. Rainfall across Singapore

This script is currently being actively updated to include scraping from other NEA dataset APIs.

## Extracting data from Data.gov.sg API

First, import the required libraries:

In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle

# parse API output
import requests
import json

# timezone
import pytz

# process management
from tqdm import trange
from tqdm import tqdm
from time import sleep

from retrying import retry

In [2]:
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_airtemp_data_from_date(date):
    url = "https://api.data.gov.sg/v1/environment/air-temperature?date=" + str(date) # for daily API request
    JSONContent = requests.get(url).json()
    content = json.dumps(JSONContent, indent = 4, sort_keys=True)
    try:
        json_retrieved = (
            content[content.find("items")+7:content.find("metadata")-13] + ']'
            ).replace(" ", "").replace("\n", "")
        df_retrieved = pd.read_json(json_retrieved, orient="columns")
        print("Data for " + str(date) + " scraped!")
        return df_retrieved
    except:
        print("Data for " + str(date) + " empty!")
        return pd.DataFrame({'readings' : [], 'timestamp' : []})


In [3]:
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_rainfall_data_from_date(date):
    url = "https://api.data.gov.sg/v1/environment/rainfall?date=" + str(date) # for daily API request
    JSONContent = requests.get(url).json()
    content = json.dumps(JSONContent, indent = 4, sort_keys=True)
    try:
        json_retrieved = (
            content[content.find("items")+7:content.find("metadata")-13] + ']'
            ).replace(" ", "").replace("\n", "")
        df_retrieved = pd.read_json(json_retrieved, orient="columns")    
        print("Data for " + str(date) + " scraped!")
        return df_retrieved
    except:
        print("Data for " + str(date) + " empty!")
        return pd.DataFrame({'readings' : [], 'timestamp' : []})


In [4]:
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_relative_humidity_data_from_date(date):
    url = "https://api.data.gov.sg/v1/environment/relative-humidity?date=" + str(date) # for daily API request
    JSONContent = requests.get(url).json()
    content = json.dumps(JSONContent, indent = 4, sort_keys=True)
    try:
        json_retrieved = (
            content[content.find("items")+7:content.find("metadata")-13] + ']'
            ).replace(" ", "").replace("\n", "")
        df_retrieved = pd.read_json(json_retrieved, orient="columns")
        print("Data for " + str(date) + " scraped!")
        return df_retrieved
    except:
        print("Data for " + str(date) + " empty!")
        return pd.DataFrame({'readings' : [], 'timestamp' : []})


In [5]:
def get_data_from_date_range(date_range, data_type):
    df_date_list = []
    for date in date_range:
        try:
            if data_type == 'air-temperature':
                df_date = get_airtemp_data_from_date(str(date))
            elif data_type == 'rainfall':
                df_date = get_rainfall_data_from_date(str(date))
            elif data_type == 'relative-humidity':
                df_date = get_relative_humidity_data_from_date(str(date))
        except ValueError:
            continue
        df_date_list.append(df_date)
        sleep(0.1)
    return pd.concat(df_date_list).reset_index()

In [6]:
def get_device_id(date, data_type):
    url = "https://api.data.gov.sg/v1/environment/" + data_type + "?date=" + str(date) # for daily API request
    JSONContent = requests.get(url).json()
    content = json.dumps(JSONContent, indent = 4, sort_keys=True)
    json_device_id = content[
        content.find("stations")+10:-3
        ].replace(" ", "").replace("\n", "")
    df_device_id = pd.read_json(json_device_id, orient="object")
    return df_device_id

In [7]:
def df_data_from_json_to_df(reading):
    df_to_append = pd.DataFrame(list(df_data['readings'][reading]))

    # fill in null values for station ids without reading values for some timestamps
    for station_id in list(df_device_id['id']):
        if station_id not in list(df_to_append['station_id']):
            df_to_append = df_to_append.append(
                pd.DataFrame(
                    {"station_id":[station_id],
                    "value": [np.nan]}
                    )
                )      
    df_to_append_null_filled = df_to_append.reset_index().drop(columns=['index']).reset_index(drop=True)

    return df_to_append_null_filled

For pandas version 0.24.2 or earlier:

In [8]:
def utc_to_local(dt):
    local_tz = pytz.timezone('UTC')
    dt = local_tz.localize(dt)
    target_tz = pytz.timezone('Asia/Singapore')
    dt = target_tz.normalize(dt).replace(tzinfo=None)
    return dt

def remove_tzinfo(dt):
    return dt.replace(tzinfo=None)

In [9]:
date_entry = input('Enter a date in YYYY-MM-DD format: ')
# date_entry = '2019-04-20'
date_time_str = str(date_entry)
try:        
    base = datetime.datetime.strptime(date_time_str, '%Y-%m-%d').date()
    if base > datetime.datetime.now().date():
        print('Date input is in the future.')
        raise ValueError
except ValueError:
    print('Date input is not valid. Defaulting to current date.')
    base = datetime.datetime.now().date()
    date_list = [base]
else:
    numdays_entry = input('Enter number of days from date entered: ')
    numdays = int(numdays_entry)
    try:
        date_list = [base + datetime.timedelta(days=x) for x in range(numdays)]
        if date_list[-1] > datetime.datetime.now().date():
            print('Date range goes into the future.')
            raise ValueError
    except ValueError:
        print('Date range input is not valid. Defaulting to input date.')
        date_list = [base + datetime.timedelta(days=x)
        for x in range(int((datetime.datetime.now().date() - base).days+1))]

Enter a date in YYYY-MM-DD format: 2019-05-01
Enter number of days from date entered: 7


In [10]:
# Initialize type of data to extract from NEA data API
datatype_entry = input(
'Choose type of data to extract from API - \
1. air temperature 2. rainfall 3. relative humidity: ')
# datatype_entry = 3
datatype_choice = int(datatype_entry)
while (datatype_choice):
    if datatype_choice == 1:
        data_type = 'air-temperature'
        break
    elif datatype_choice == 2:
        data_type = 'rainfall'
        break
    elif datatype_choice == 3:
        data_type = 'relative-humidity'
        break
    else:
        datatype_entry = input('Invalid input.\
Please choose type of data to extract from API \
- 1. air temperature 2. rainfall: ')
        datatype_choice = int(datatype_entry)

Choose type of data to extract from API - 1. air temperature 2. rainfall 3. relative humidity: 3


In [11]:
# Extract daily data from Data.gov.sg API
# for a defined date range, represented in JSON format
try:
    df_data = get_data_from_date_range(date_list, data_type)
except Exception as e:
    print(e)

Data for 2019-05-01 scraped!
Data for 2019-05-02 scraped!
Data for 2019-05-03 scraped!
Data for 2019-05-04 scraped!
Data for 2019-05-05 scraped!
Data for 2019-05-06 scraped!
Data for 2019-05-07 scraped!


In [12]:
df_data

Unnamed: 0,index,readings,timestamp
0,0,"[{'station_id': 'S109', 'value': 86}, {'statio...",2019-05-01 00:01:00+08:00
1,1,"[{'station_id': 'S109', 'value': 86.3}, {'stat...",2019-05-01 00:02:00+08:00
2,2,"[{'station_id': 'S109', 'value': 86.5}, {'stat...",2019-05-01 00:03:00+08:00
3,3,"[{'station_id': 'S109', 'value': 86.8}, {'stat...",2019-05-01 00:04:00+08:00
4,4,"[{'station_id': 'S109', 'value': 87}, {'statio...",2019-05-01 00:05:00+08:00
...,...,...,...
10058,1433,"[{'station_id': 'S109', 'value': 74.8}, {'stat...",2019-05-07 23:55:00+08:00
10059,1434,"[{'station_id': 'S109', 'value': 75.1}, {'stat...",2019-05-07 23:56:00+08:00
10060,1435,"[{'station_id': 'S109', 'value': 75.5}, {'stat...",2019-05-07 23:57:00+08:00
10061,1436,"[{'station_id': 'S109', 'value': 75.5}, {'stat...",2019-05-07 23:58:00+08:00


In [13]:
# Get device ID dataframe
df_device_id = pd.concat(
    [get_device_id(date, data_type) for date in tqdm(date_list)])
with open('df_device_id.pickle', 'wb') as f:
    pickle.dump(df_device_id, f)

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:20<00:00,  2.86s/it]


In [14]:
df_device_id

Unnamed: 0,device_id,id,location,name
0,S109,S109,"{'latitude': 1.3764, 'longitude': 103.8492}",AngMoKioAvenue5
1,S117,S117,"{'latitude': 1.256, 'longitude': 103.679}",BanyanRoad
2,S50,S50,"{'latitude': 1.3336999999999999, 'longitude': ...",ClementiRoad
3,S107,S107,"{'latitude': 1.3135, 'longitude': 103.9625}",EastCoastParkway
4,S43,S43,"{'latitude': 1.3399, 'longitude': 103.8878}",KimChuanRoad
...,...,...,...,...
11,S116,S116,"{'latitude': 1.2810000000000001, 'longitude': ...",WestCoastHighway
12,S104,S104,"{'latitude': 1.44387, 'longitude': 103.78538}",WoodlandsAvenue9
13,S100,S100,"{'latitude': 1.4172, 'longitude': 103.74855}",WoodlandsRoad
14,S122,S122,"{'latitude': 1.41731, 'longitude': 103.8249}",SembawangRoad


In [15]:
# Create list of station IDs
device_id_list = set(
    df_device_id[['device_id', 'id']].set_index('device_id')['id'])

device_id_list

{'S100',
 'S104',
 'S106',
 'S107',
 'S108',
 'S109',
 'S111',
 'S116',
 'S117',
 'S121',
 'S122',
 'S24',
 'S43',
 'S44',
 'S50',
 'S60'}

In [16]:
# Create dictionary of station IDs for switch case to initialize station ID
device_id_dict = {id:id for id in device_id_list}

device_id_dict

{'S121': 'S121',
 'S24': 'S24',
 'S106': 'S106',
 'S116': 'S116',
 'S109': 'S109',
 'S108': 'S108',
 'S107': 'S107',
 'S43': 'S43',
 'S111': 'S111',
 'S117': 'S117',
 'S122': 'S122',
 'S100': 'S100',
 'S44': 'S44',
 'S104': 'S104',
 'S60': 'S60',
 'S50': 'S50'}

In [17]:
# create list of dataframes containing extracted reading values converted from JSON format
df_reading = [df_data_from_json_to_df(reading)
    for reading in trange(len(df_data['readings']))]

  

100%|████████████████████████████████████████████████████████████████████████████| 10063/10063 [03:30<00:00, 47.79it/s]


In [18]:
# concatenate dataframes in list within date range    
df_extracted = pd.concat(df_reading)

df_extracted

Unnamed: 0,station_id,value
0,S109,86.0
1,S117,83.7
2,S50,82.4
3,S107,89.6
4,S43,83.7
...,...,...
11,S24,84.4
12,S116,85.2
13,S104,74.3
14,S100,73.0


In [None]:
# Initialize device ID to select from extracted data
stationid_entry = input('The station IDs are: \n' + \
    str(list(device_id_dict.keys())) + \
    '\nChoose station ID to extract data from: ')
stationid_choice = str(stationid_entry)
while True:
    stationid_choice = device_id_dict.get(stationid_choice, None)
    if stationid_choice == None:
        stationid_entry = input('Invalid station ID. \
Please choose station ID to extract data from: ')
        stationid_choice = str(stationid_entry)
    else:
        break

In [None]:
# extract sensor readings for a specific station id
df_extracted_stationid = df_extracted[
    df_extracted['station_id']==stationid_choice
    ].reset_index(drop=True)
df_extracted_cleaned = pd.concat(
    [df_data, df_extracted_stationid], axis=1
    ).drop(columns=['readings'])

In [None]:
df_extracted_stationid

In [None]:
df_extracted_cleaned

For pandas version 0.24.2 or earlier, timestamp needs to be converted from default UTC timezone to SGT timezone:

In [None]:
# Convert from UTC Time to SGT Time
if not int((pd.__version__).split('.')[1]) >= 25:
    df_extracted_cleaned['timestamp'] = \
    [utc_to_local(dt) for dt in df_extracted_cleaned['timestamp']]

In [None]:
# write to CSV
df_extracted_cleaned.to_csv(
    data_type + '/nea_' + data_type + '_' + stationid_choice + \
    '_from_' + str(date_list[0]) + '_to_' + str(date_list[-1]) + \
    '.csv')
print('Data extraction complete!')