## Import necessary packages

In [129]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

import time

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import os

import requests

# Scraping sites keyed by a short location code.
# Each value is a (latitude, longitude) tuple, the order get_url() expects.
LOCATION = dict(
    BKK=(13.729984, 100.536443),
    Chiangmai=(18.840633, 98.969661),
    Khonkaen=(16.445329, 102.835251),
    Rayong=(12.671521, 101.275875),
    Saraburi=(14.685833, 100.871996),
    Surat=(9.126057, 99.325355),
)

## Scraping for Wind @850 hPa and Temp @Surface

In [123]:
def get_url(datetime, coordinate, mode):
    '''
    This function returns the earth.nullschool.net URL for a given date/time, coordinate, and mode (wind, temp)
    [Input]
        datetime - datetime python object (Thailand local time zone, GMT+7)
        coordinate - a tuple of (latitude, longitude)
        mode - "wind" (for wind speed and direction at 850 hPa) / "temp" (for surface temperature)
    [Output]
        url - URL string used for further scraping
    [Raises]
        ValueError - if mode is neither "wind" nor "temp"
    '''
    # Resolve the mode first: an unsupported mode used to fall through both
    # branches and fail with UnboundLocalError on the return statement.
    if mode == "wind":
        layer_path = 'wind/isobaric/850hPa'
    elif mode == "temp":
        layer_path = 'wind/surface/level/overlay=temp'
    else:
        raise ValueError(f'mode must be "wind" or "temp", got {mode!r}')

    latitude, longitude = str(coordinate[0]), str(coordinate[1])

    # transform local datetime (GMT+7) to UTC datetime; the URL timestamp carries a "Z" suffix
    utc_datetime = datetime - timedelta(hours = 7)

    # zero-padded month/day/HHMM, e.g. "06/30/1700"
    month_day_time = utc_datetime.strftime('%m/%d/%H%M')

    # note: the URL takes longitude first, then latitude
    return f'https://earth.nullschool.net/#{utc_datetime.year}/{month_day_time}Z/{layer_path}/loc={longitude},{latitude}'


def extract_data(data, mode):
    '''
    Convert the raw text read from the page into the value(s) stored for one location.
    [Input]
        data - text read from the data area of the page
        mode - "wind" / "temp"
    [Output]
        "wind" - tuple (wind_speed, wind_direction) of ints: the direction is the first
                 whitespace-separated token with its last character dropped, the speed
                 is the third token
        "temp" - the text itself, unchanged
        any other mode - None
    '''
    if mode == "wind":
        tokens = data.split()
        direction = int(tokens[0][:-1])  # drop the trailing unit character
        speed = int(tokens[2])
        return (speed, direction)

    if mode == "temp":
        return data

    return None

def save_result_from_scraping(df, mode):
    '''
    Write the scraped table to ./{mode}_scraped_data.csv (overwriting any previous file)
    and print a one-line progress message.
    [Input]
        df - pandas DataFrame with a 'date_time' column
        mode - string used as the file name prefix (e.g. "wind" / "temp")
    [Output]
        None
    '''
    path = f'./{mode}_scraped_data.csv'

    df.to_csv(path, index = False)

    if len(df) == 0:
        # df.iloc[-1] raises IndexError on an empty table, so report it separately
        print(f"No rows scraped --> empty table saved to {path}")
        return

    print(f"Lastest Data --> Date: {df.iloc[-1]['date_time']} saved to {path}")


def _wait_until_loaded(status_area, timeout, poll_interval = 0.1):
    '''Poll the status element until its text is empty; raise TimeoutError after `timeout` seconds.'''
    deadline = time.monotonic() + timeout
    while status_area.text != "":
        if time.monotonic() > deadline:
            raise TimeoutError(f"page status still {status_area.text!r} after {timeout} seconds")
        # sleep between polls instead of spinning on the webdriver
        time.sleep(poll_interval)


def scrape_data(start_dt, end_dt, location_codes, mode, checkpoint_every = 50, load_timeout = 300):
    '''
    Scrape wind or temperature values from earth.nullschool.net for several locations,
    stepping through time with the page's "next" button until the page date passes end_dt.
    [Input]
        start_dt - datetime python object used to build the starting URL (see get_url)
        end_dt - datetime python object; scraping stops once the page date is later than this
        location_codes - non-empty sequence of keys of LOCATION
        mode - "wind" / "temp"
        checkpoint_every - save the partial table every this many time steps (0/None disables)
        load_timeout - seconds to wait for the status card to clear after each step
    [Output]
        df - pandas DataFrame with a 'date_time' column followed by, per location,
             wind_speed_*/wind_direction_* columns (wind) or a temp_* column (temp)
    [Raises]
        ValueError - unsupported mode or empty location_codes (raised before the browser starts)
        TimeoutError - the status card did not clear within load_timeout seconds
    '''
    # Validate up front so a bad call does not launch Chrome and then fail half-way.
    data_selectors = {
        "wind": 'div#spotlight-panel div[data-name="spotlight-a"]',
        "temp": 'div#spotlight-panel div[data-name="spotlight-b"] div',
    }
    if mode not in ("wind", "temp"):
        raise ValueError(f'mode must be "wind" or "temp", got {mode!r}')
    if len(location_codes) == 0:
        raise ValueError("location_codes must contain at least one location")

    # column name for dataframe
    column_names = ['date_time']
    for location in location_codes:
        if mode == "wind":
            column_names.append(f"wind_speed_{location}")
            column_names.append(f"wind_direction_{location}")
        else:
            column_names.append(f"temp_{location}")

    # start a webdriver
    service = ChromeService(executable_path=ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # try/finally: the browser must be closed even when scraping fails part-way
    try:
        # navigate the driver to the starting URL
        driver.get(url = get_url(start_dt, LOCATION[location_codes[0]], mode))
        time.sleep(5) #wait for web page to load

        # click hamburger button
        driver.find_element(By.CSS_SELECTOR, 'button.card.no-touch-tt').click()

        # locate the next button, data area, date area and status area once; they are re-read every step
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[data-name="nav-next1"]')
        data_area = driver.find_element(By.CSS_SELECTOR, data_selectors[mode])
        date_area = driver.find_element(By.CSS_SELECTOR, 'div[data-name="date-field"]')
        status_area = driver.find_element(By.CSS_SELECTOR, 'div[data-name="status-card"] div.field')

        rows = []
        while True:
            # the page, not a local counter, is the source of truth for the current time step
            dt = datetime.strptime(date_area.text, '%Y-%m-%d %H:%M Local')
            if dt > end_dt:
                break

            row = [dt]

            # go through all the locations at this time step
            for location in location_codes:
                driver.get(url = get_url(dt, LOCATION[location], mode))
                data = extract_data(data_area.text, mode)

                if mode == "wind":
                    wind_speed, wind_direction = data
                    row.append(wind_speed)
                    row.append(wind_direction)
                else:
                    row.append(data) #temp

            next_button.click()
            rows.append(row)

            # periodic checkpoint so a crash does not lose everything scraped so far
            if checkpoint_every and len(rows) % checkpoint_every == 0:
                save_result_from_scraping(pd.DataFrame(rows, columns = column_names), mode)

            # wait for the next time step to finish loading (status text empty) before reading it
            _wait_until_loaded(status_area, load_timeout)

        print("====================================")
        print("FINISH SCRAPING")
        df = pd.DataFrame(rows, columns = column_names)
        if rows:
            save_result_from_scraping(df, mode)
        else:
            # nothing fell inside the requested window; skip the save, which reports the last row
            print("No data scraped for the requested range")
        return df
    finally:
        driver.quit()

In [124]:
# Scraping window as (year, month, day, hour); both ends are passed to scrape_data below.
start_year, start_month, start_date, start_hour = (2020, 7, 1, 0)
end_year, end_month, end_date, end_hour = (2021, 7, 1, 23)

START_DT = datetime(year = start_year, month = start_month, day = start_date, hour = start_hour)
END_DT = datetime(year = end_year, month = end_month, day = end_date, hour = end_hour)

# Location codes to scrape; each one must be a key of LOCATION.
LOCS = [
    "BKK",
    "Chiangmai",
    "Khonkaen",
    "Rayong",
    "Saraburi",
    "Surat",
]

# scrape for wind
ALL_DATA_WIND = scrape_data(start_dt = START_DT, end_dt = END_DT, location_codes = LOCS, mode = "wind")



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/james/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Lastest Data --> Date: 2020-07-07 01:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-07-13 07:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-07-19 13:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-07-25 01:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-07-31 07:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-08-06 13:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-08-12 19:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-08-19 01:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-08-25 07:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-08-31 13:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-09-06 19:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-09-13 01:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2020-09-19 07:00:00 saved to ./wind_scraped_data.csv
Lastest Data

Lastest Data --> Date: 2021-06-15 15:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-17 17:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-19 19:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-21 21:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-23 23:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-26 01:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-28 03:00:00 saved to ./wind_scraped_data.csv
Lastest Data --> Date: 2021-06-30 05:00:00 saved to ./wind_scraped_data.csv
FINISH SCRAPING
Lastest Data --> Date: 2021-07-01 23:00:00 saved to ./wind_scraped_data.csv


In [125]:
# Display the wind table returned by scrape_data (bare last expression is rendered as the cell output).
ALL_DATA_WIND

Unnamed: 0,date_time,wind_speed_BKK,wind_direction_BKK,wind_speed_Chiangmai,wind_direction_Chiangmai,wind_speed_Khonkaen,wind_direction_Khonkaen,wind_speed_Rayong,wind_direction_Rayong,wind_speed_Saraburi,wind_direction_Saraburi,wind_speed_Surat,wind_direction_Surat
0,2020-06-30 22:00:00,35,270,17,225,21,230,47,270,41,250,23,260
1,2020-07-01 01:00:00,42,260,24,265,37,240,41,260,41,255,21,255
2,2020-07-01 04:00:00,55,270,23,280,39,255,46,260,49,265,29,255
3,2020-07-01 07:00:00,53,250,18,265,36,260,40,255,52,260,35,250
4,2020-07-01 10:00:00,51,260,22,265,34,265,38,250,61,260,36,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5837,2021-07-01 19:00:00,18,250,23,250,31,225,25,255,23,240,9,245
5838,2021-07-01 20:00:00,21,255,25,250,31,225,26,255,24,235,9,245
5839,2021-07-01 21:00:00,20,250,26,255,30,225,26,250,22,240,9,235
5840,2021-07-01 22:00:00,16,245,27,260,28,220,24,245,25,240,9,235


In [127]:
# scrape for temp (same window and locations as the wind scrape)
ALL_DATA_TEMP = scrape_data(START_DT, END_DT, LOCS, "temp")



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/james/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Lastest Data --> Date: 2020-07-07 01:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-07-13 07:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-07-19 13:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-07-25 19:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-08-01 01:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-08-07 07:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-08-13 13:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-08-19 19:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-08-26 01:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-09-01 07:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-09-07 13:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-09-13 19:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2020-09-20 01:00:00 saved to ./temp_scraped_data.csv
Lastest Data

Lastest Data --> Date: 2021-06-15 21:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-17 23:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-20 01:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-22 03:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-24 05:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-26 07:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-28 09:00:00 saved to ./temp_scraped_data.csv
Lastest Data --> Date: 2021-06-30 11:00:00 saved to ./temp_scraped_data.csv
FINISH SCRAPING
Lastest Data --> Date: 2021-07-01 23:00:00 saved to ./temp_scraped_data.csv


In [128]:
# Display the temperature table returned by scrape_data (bare last expression is rendered as the cell output).
ALL_DATA_TEMP

Unnamed: 0,date_time,temp_BKK,temp_Chiangmai,temp_Khonkaen,temp_Rayong,temp_Saraburi,temp_Surat
0,2020-06-30 22:00:00,29.4,25.2,27.6,28.5,25.9,25.9
1,2020-07-01 01:00:00,28.5,24.0,26.7,28.2,25.5,25.2
2,2020-07-01 04:00:00,28.3,23.4,25.5,27.8,25.1,24.7
3,2020-07-01 07:00:00,28.7,24.1,26.0,28.2,25.8,25.4
4,2020-07-01 10:00:00,29.7,29.7,29.4,29.6,28.7,29.3
...,...,...,...,...,...,...,...
5831,2021-07-01 19:00:00,29.1,24.6,30.7,28.5,29.7,26.8
5832,2021-07-01 20:00:00,29.5,24.1,29.8,28.3,28.0,26.2
5833,2021-07-01 21:00:00,29.8,23.9,28.9,28.2,27.0,25.9
5834,2021-07-01 22:00:00,29.1,23.5,28.0,28.2,26.0,25.7


## Scraping for Hotspot
Daily hotspot counts for 5 regions (Thailand, Myanmar, Cambodia, Lao_PDR, P_Malaysia), scraped separately for day time and night time.

Retrieved From: http://asmc.asean.org/asmc-haze-hotspot-daily-new/#Hotspot

In [247]:
# Region names sent to the hotspot endpoint; also fixes the column order of the result.
REGIONS = [
    'Thailand',
    'Myanmar',
    'Cambodia',
    'Lao_PDR',
    'P_Malaysia',  # P_Malaysia = Peninsular Malaysia
]

def scrape_hot_spot_data(start = None, end = None, timeout = 60):
    '''
    Download daily hotspot counts for every region in REGIONS from the ASMC endpoint,
    once for day time and once for night time.
    [Input]
        start - first date (datetime python object); defaults to 2017-07-01
        end - last date (datetime python object); defaults to 2021-07-01
        timeout - seconds to wait for each HTTP request
    [Output]
        dfs - dict with keys 'day' and 'night'; each value is a pandas DataFrame sorted by
              'date' with one hotspot_{region}_{day|night} column per region (lower-cased)
    [Raises]
        requests.HTTPError - the endpoint answered with an error status
    '''
    url = 'http://asmc.asean.org/wp-content/themes/asmctheme/page-functions/functions-ajax-haze-daily-hotspot-count-new.php'

    date_s = datetime(2017, 7, 1) if start is None else start
    date_e = datetime(2021, 7, 1) if end is None else end

    def format_day(day):
        # Same text as strftime("%-d %b, %Y"), but "%-d" is a platform-specific
        # directive (unsupported on Windows), so the unpadded day is built by hand.
        return f'{day.day} {day.strftime("%b")}, {day.year}'

    print(f'Start scraping daily hotspot from date {format_day(date_s)} to date {format_day(date_e)}')

    dfs = {}
    for daynight in ['day', 'night']:
        print(f'--> Scraping {daynight} time...')

        # the request is anchored on the day after the end date and counts backwards
        past_days = (date_e - date_s).days + 1
        if (daynight == "night"):
            # kept from the original: night requests ask for one extra day
            # NOTE(review): reason is not documented here - confirm against the endpoint
            past_days += 1

        payload = {'date': format_day(date_e + timedelta(days = 1)),
                   'pastDays': past_days,
                   'regions[]': REGIONS,
                   'daynight': daynight,
                   'conf': 'High'} #confidence level

        # timeout: never hang forever; raise_for_status: fail on an HTTP error
        # instead of trying to parse an error page as JSON
        response = requests.post(url, data = payload, timeout = timeout)
        response.raise_for_status()

        rows = [
            [datetime.strptime(record['date'], '%Y-%m-%d')] + [record[r] for r in REGIONS]
            for record in response.json()
        ]

        #column names
        column_names = ['date'] + [f'hotspot_{r}_{daynight}'.lower() for r in REGIONS]

        #create dataframe
        dfs[daynight] = (
            pd.DataFrame(rows, columns = column_names)
            .sort_values(by = 'date')
            .reset_index(drop = True)
        )

    print('\nFinish!')
    return dfs

In [248]:
# Combine hot spot at day time and night time together

dfs = scrape_hot_spot_data()

# Outer join keeps dates that appear in only one of the two tables; then order by date.
df_h = (
    dfs['day']
    .merge(dfs['night'], on = 'date', how = 'outer')
    .sort_values(by = 'date')
    .reset_index(drop = True)
)

# rearrange columns so each region's day and night counts sit side by side
n_regions = len(dfs['day'].columns) - 1
new_column_names = ['date']
for r in REGIONS:
    new_column_names += [f'hotspot_{r}_{part}'.lower() for part in ('day', 'night')]

df_h = df_h[new_column_names]
df_h

Start scraping daily hotspot from date 1 Jul, 2017 to date 1 Jul, 2021
--> Scraping day time...
--> Scraping night time...

Finish!


Unnamed: 0,date,hotspot_thailand_day,hotspot_thailand_night,hotspot_myanmar_day,hotspot_myanmar_night,hotspot_cambodia_day,hotspot_cambodia_night,hotspot_lao_pdr_day,hotspot_lao_pdr_night,hotspot_p_malaysia_day,hotspot_p_malaysia_night
0,2017-06-30,,0.0,,0.0,,0.0,,0.0,,0.0
1,2017-07-01,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0
2,2017-07-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-07-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-07-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1458,2021-06-27,6.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,2.0,0.0
1459,2021-06-28,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0
1460,2021-06-29,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0
1461,2021-06-30,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0


In [249]:
# Save the combined day/night hotspot table as CSV (row index is not written).
path = './hotspot_scraped_data.csv'
df_h.to_csv(path, index = False)