### Import necessary packages

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

import time

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import os

import requests

### Constants

In [2]:
# (latitude, longitude) of each province of interest
LOCATION = dict(
    BKK = (13.729984, 100.536443),
    Chiangmai = (18.840633, 98.969661),
    Khonkaen = (16.445329, 102.835251),
    Rayong = (12.671521, 101.275875),
    Saraburi = (14.685833, 100.871996),
    Surat = (9.126057, 99.325355),
)

# province codes, in the same order as the LOCATION keys
LOCS = list(LOCATION)

## General Functions

In [3]:
def save_result_from_scraping(df, mode):
    '''
    Save a DataFrame of scraped data as ./scraped_data/{mode}_scraped_data.csv (without the index).
    Fully duplicated rows are dropped from the saved copy; the DataFrame passed in is left untouched.
    [Input]
        df - pandas DataFrame
        mode - "wind", "temp", "hotspot" (used as the file-name prefix)
    '''
    #make sure the output folder exists (no error if it is already there)
    base_path = './scraped_data'
    os.makedirs(base_path, exist_ok = True)

    #drop duplicates if any exist; rebinding the local name keeps the caller's DataFrame unmodified
    n_duplicated = df.duplicated().sum()
    if n_duplicated != 0:
        print(f'Dropping {n_duplicated} duplicated rows')
        df = df.drop_duplicates()

    #save dataframe
    path = os.path.join(base_path, f'{mode}_scraped_data.csv')
    df.to_csv(path, index = False)
    
def get_current_dt():
    '''
    This function returns the current local date/time as a string formatted 'YYYY-MM-DD HH:MM:SS'
    '''
    return f'{datetime.now():%Y-%m-%d %H:%M:%S}'

## Functions involved for scraping Wind (@850 hPa) and Temp (@Surface) Data

In [12]:
def get_url(datetime, coordinate, mode):
    '''
    This function returns the earth.nullschool.net URL for the given date/time, coordinate, and mode (wind, temp)
    [Input]
        datetime - datetime python object (Thailand local time zone, GMT+7)
        coordinate - a tuple of (latitude, longitude)
        mode - wind (for wind speed and direction at 850 hPa) / temp (for surface temperature)
    [Output]
        url - appropriate url used for further scraping
    [Raises]
        ValueError - if mode is neither "wind" nor "temp"
    '''

    #part of the URL that selects the data layer for each supported mode
    layer_paths = {
        "wind": "wind/isobaric/850hPa",
        "temp": "wind/surface/level/overlay=temp",
    }
    if mode not in layer_paths:
        raise ValueError(f'Unknown mode {mode!r}; expected "wind" or "temp"')

    latitude, longitude = str(coordinate[0]), str(coordinate[1])

    #the URL timestamp carries a "Z" suffix, so transform local datetime (GMT+7) to UTC datetime
    utc_datetime = datetime - timedelta(hours = 7)

    #month, day, hour and minute are zero-padded to two digits (time in format HHMM); year is inserted as-is
    timestamp = f'{utc_datetime.year}/{utc_datetime:%m/%d/%H%M}Z'

    #the loc parameter is written as longitude first, then latitude
    return f'https://earth.nullschool.net/#{timestamp}/{layer_paths[mode]}/loc={longitude},{latitude}'

def scrape_wind_or_temp_data(start_dt, end_dt, location_codes, mode):
    '''
    This function will launch a Chrome webdriver, step through the timesteps shown on the web page
    and gather the reading of every requested location into a DataFrame.
    The DataFrame is saved through save_result_from_scraping (./scraped_data/{mode}_scraped_data.csv)
    after every 50 timesteps and once more when scraping finishes.
    The webdriver is always closed, even when scraping fails part-way.
    [Input]
        start_dt, end_dt - datetime python object
        location_codes - list of location names to scrape (keys of LOCATION)
        mode - wind (for wind speed and direction at 850 hPa) / temp (for surface temperature)
    [Output]
        df - dataframe of scrape data
    [Raises]
        ValueError - if mode is neither "wind" nor "temp"
    '''

    # reject unsupported modes up front, before a browser is launched
    if mode not in ("wind", "temp"):
        raise ValueError(f'Unknown mode {mode!r}; expected "wind" or "temp"')

    # [Initiating Scraping] ====================================================================================
    print(f"Starting to scrape {mode} data, from {start_dt} to {end_dt}")
    print(f"Interested Areas: {location_codes}")

    print(f"\n[{get_current_dt()}] --> Initiating webdriver and variables involved...")

    # start a webdriver
    service = ChromeService(executable_path=ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    ALL_DATA = [] #this list contains one row (list) of data per scraped timestep

    try:
        # get the starting URL
        url = get_url(start_dt, LOCATION[location_codes[0]], mode)

        # navigate the driver to the starting URL
        driver.get(url = url)
        time.sleep(10) #wait for web page to load

        # click hamburger button
        hamburger_button = driver.find_element(By.CSS_SELECTOR, 'button.card.no-touch-tt')
        hamburger_button.click()

        # locate the next button, data area, date area and status area
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[data-name="nav-next1"]')

        if (mode == "wind"):
            data_area = driver.find_element(By.CSS_SELECTOR, 'div#spotlight-panel div[data-name="spotlight-a"]')
        elif (mode == "temp"):
            data_area = driver.find_element(By.CSS_SELECTOR, 'div#spotlight-panel div[data-name="spotlight-b"] div')

        date_area = driver.find_element(By.CSS_SELECTOR, 'div[data-name="date-field"]')

        status_area = driver.find_element(By.CSS_SELECTOR, 'div[data-name="status-card"] div.field')

        # column names for dataframe
        column_names = ['date_time']
        if (mode == "wind"):
            for location in location_codes:
                column_names.append(f"wind_speed_{location}")
                column_names.append(f"wind_direction_{location}")
        elif (mode == "temp"):
            for location in location_codes:
                column_names.append(f"temp_{location}")

        print(f"[{get_current_dt()}] --> Finish Initiating, Starting to scrape...")
        # ==========================================================================================================

        counter = 0 #this counter is used to trigger saving data while scraping

        while (True):
            # the timestep currently shown on the page, read from the date field
            dt = datetime.strptime(date_area.text, '%Y-%m-%d %H:%M Local')
            if (dt > end_dt): #if the dt read from the web is greater than end_dt, stop scraping
                break

            data_ts = [dt] #list of data for each timestep

            # go through all the locations
            for location in location_codes:
                url = get_url(dt, LOCATION[location], mode) #get the url
                driver.get(url = url)
                data = data_area.text

                if (mode == "wind"):
                    # first token (minus its last character) is the direction, third token is the speed
                    # NOTE(review): presumably the dropped character is a degree sign - confirm
                    data = data.split()
                    wind_direction = int(data[0][:-1])
                    wind_speed = int(data[2])
                    data_ts.append(wind_speed)
                    data_ts.append(wind_direction)
                elif (mode == "temp"):
                    data_ts.append(data) #temp (kept as the text shown on the page)

                time.sleep(0.05) #prevent scraping too fast

            next_button.click() #go to next timestep
            ALL_DATA.append(data_ts)

            counter += 1
            if (counter%50 == 0): #for every 50 data entries scraped, we will save
                df = pd.DataFrame(ALL_DATA, columns = column_names)
                print(f"[{get_current_dt()}] --> Saving Latest Data - Timestamp: {df.iloc[-1]['date_time']}")
                save_result_from_scraping(df, mode)

            while (True): #wait for the information to finish loading first
                if (status_area.text == "Data download failed"):
                    # NOTE(review): elements located before the refresh are reused afterwards -
                    # confirm they remain valid after driver.refresh()
                    driver.refresh()
                    print(f"[{get_current_dt()}] --> Data download failed occurs, refreshing the driver...")
                    time.sleep(10)
                    print(f"[{get_current_dt()}] --> finish refreshing")
                    break

                if (status_area.text == ""):
                    time.sleep(0.1) #prevent scraping too fast
                    break
    finally:
        driver.quit() #close the webdriver, whether scraping finished or raised

    print(f"[{get_current_dt()}] --> Finish Scraping! Webdriver closed and Saving Final Result...")

    # save the final result
    df = pd.DataFrame(ALL_DATA, columns = column_names)
    save_result_from_scraping(df, mode)

    print(f"[{get_current_dt()}] --> Final Result saved! There are total {df.shape[0]} timesteps")

    return df

## Scraping Data (Wind @850 hPa)

In [13]:
# first timestep to scrape
start_year = 2020
start_month = 7
start_date = 1
start_hour = 0

# last timestep to scrape
end_year = 2021
end_month = 7
end_date = 1
end_hour = 23

START_DT = datetime(year = start_year, month = start_month, day = start_date, hour = start_hour)
END_DT = datetime(year = end_year, month = end_month, day = end_date, hour = end_hour)

# scrape for wind @850hPa
wind_data = scrape_wind_or_temp_data(start_dt = START_DT, end_dt = END_DT, location_codes = LOCS, mode = "wind")



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome


Starting to scrape wind data, from 2020-07-01 00:00:00 to 2021-07-01 23:00:00
Interested Areas: ['BKK', 'Chiangmai', 'Khonkaen', 'Rayong', 'Saraburi', 'Surat']

[2022-04-15 21:15:12] --> Initiating webdriver and variables involved...


Driver [/Users/james/.wdm/drivers/chromedriver/mac64/100.0.4896.60/chromedriver] found in cache


[2022-04-15 21:15:25] --> Finish Initiating, Starting to scrape...
[2022-04-15 21:16:04] --> Saving Latest Data - Timestamp: 2020-07-07 01:00:00
[2022-04-15 21:16:42] --> Saving Latest Data - Timestamp: 2020-07-13 07:00:00
[2022-04-15 21:17:21] --> Saving Latest Data - Timestamp: 2020-07-19 13:00:00
[2022-04-15 21:18:00] --> Saving Latest Data - Timestamp: 2020-07-25 19:00:00
[2022-04-15 21:18:39] --> Saving Latest Data - Timestamp: 2020-08-01 01:00:00
[2022-04-15 21:19:17] --> Saving Latest Data - Timestamp: 2020-08-07 07:00:00
[2022-04-15 21:19:55] --> Saving Latest Data - Timestamp: 2020-08-13 13:00:00
[2022-04-15 21:20:34] --> Saving Latest Data - Timestamp: 2020-08-19 19:00:00
[2022-04-15 21:21:12] --> Saving Latest Data - Timestamp: 2020-08-26 01:00:00
[2022-04-15 21:21:52] --> Saving Latest Data - Timestamp: 2020-09-01 07:00:00
[2022-04-15 21:22:31] --> Saving Latest Data - Timestamp: 2020-09-07 13:00:00
[2022-04-15 21:23:11] --> Saving Latest Data - Timestamp: 2020-09-13 19:00:

[2022-04-16 00:09:34] --> Saving Latest Data - Timestamp: 2021-06-09 15:00:00
[2022-04-16 00:11:32] --> Saving Latest Data - Timestamp: 2021-06-11 17:00:00
[2022-04-16 00:13:35] --> Saving Latest Data - Timestamp: 2021-06-13 19:00:00
[2022-04-16 00:15:29] --> Saving Latest Data - Timestamp: 2021-06-15 21:00:00
[2022-04-16 00:17:28] --> Saving Latest Data - Timestamp: 2021-06-17 23:00:00
[2022-04-16 00:19:23] --> Saving Latest Data - Timestamp: 2021-06-20 01:00:00
[2022-04-16 00:21:33] --> Saving Latest Data - Timestamp: 2021-06-22 03:00:00
[2022-04-16 00:23:42] --> Saving Latest Data - Timestamp: 2021-06-24 05:00:00
[2022-04-16 00:25:41] --> Saving Latest Data - Timestamp: 2021-06-26 07:00:00
[2022-04-16 00:27:39] --> Saving Latest Data - Timestamp: 2021-06-28 09:00:00
[2022-04-16 00:29:41] --> Saving Latest Data - Timestamp: 2021-06-30 11:00:00
[2022-04-16 00:31:02] --> Finish Scraping! Webdriver closed and Saving Final Result...
[2022-04-16 00:31:02] --> Final Result saved! There are

In [14]:
# display the scraped wind table (date_time plus a wind speed and a wind direction column per location)
wind_data

Unnamed: 0,date_time,wind_speed_BKK,wind_direction_BKK,wind_speed_Chiangmai,wind_direction_Chiangmai,wind_speed_Khonkaen,wind_direction_Khonkaen,wind_speed_Rayong,wind_direction_Rayong,wind_speed_Saraburi,wind_direction_Saraburi,wind_speed_Surat,wind_direction_Surat
0,2020-06-30 22:00:00,35,270,17,225,21,230,47,270,41,250,23,260
1,2020-07-01 01:00:00,42,260,24,265,37,240,41,260,41,255,21,255
2,2020-07-01 04:00:00,55,270,23,280,39,255,46,260,49,265,29,255
3,2020-07-01 07:00:00,53,250,18,265,36,260,40,255,52,260,35,250
4,2020-07-01 10:00:00,51,260,22,265,34,265,38,250,61,260,36,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5831,2021-07-01 19:00:00,18,250,23,250,31,225,25,255,23,240,9,245
5832,2021-07-01 20:00:00,21,255,25,250,31,225,26,255,24,235,9,245
5833,2021-07-01 21:00:00,20,250,26,255,30,225,26,250,22,240,9,235
5834,2021-07-01 22:00:00,16,245,27,260,28,220,24,245,25,240,9,235


## Scraping Data (Temp @Surface)

In [15]:
# scrape for temp surface
# first timestep to scrape
start_year = 2020
start_month = 7
start_date = 1
start_hour = 0

# last timestep to scrape
end_year = 2021
end_month = 7
end_date = 1
end_hour = 23

START_DT = datetime(year = start_year, month = start_month, day = start_date, hour = start_hour)
END_DT = datetime(year = end_year, month = end_month, day = end_date, hour = end_hour)

temp_data = scrape_wind_or_temp_data(start_dt = START_DT, end_dt = END_DT, location_codes = LOCS, mode = "temp")



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome


Starting to scrape temp data, from 2020-07-01 00:00:00 to 2021-07-01 23:00:00
Interested Areas: ['BKK', 'Chiangmai', 'Khonkaen', 'Rayong', 'Saraburi', 'Surat']

[2022-04-16 00:32:50] --> Initiating webdriver and variables involved...


Trying to download new driver from https://chromedriver.storage.googleapis.com/100.0.4896.60/chromedriver_mac64.zip
Driver has been saved in cache [/Users/james/.wdm/drivers/chromedriver/mac64/100.0.4896.60]


[2022-04-16 00:33:05] --> Finish Initiating, Starting to scrape...
[2022-04-16 00:33:44] --> Saving Latest Data - Timestamp: 2020-07-07 01:00:00
[2022-04-16 00:34:50] --> Saving Latest Data - Timestamp: 2020-07-13 07:00:00
[2022-04-16 00:36:50] --> Saving Latest Data - Timestamp: 2020-07-19 13:00:00
[2022-04-16 00:38:50] --> Saving Latest Data - Timestamp: 2020-07-25 19:00:00
[2022-04-16 00:40:57] --> Saving Latest Data - Timestamp: 2020-08-01 01:00:00
[2022-04-16 00:43:11] --> Saving Latest Data - Timestamp: 2020-08-07 07:00:00
[2022-04-16 00:45:07] --> Saving Latest Data - Timestamp: 2020-08-13 13:00:00
[2022-04-16 00:47:16] --> Saving Latest Data - Timestamp: 2020-08-19 19:00:00
[2022-04-16 00:49:14] --> Saving Latest Data - Timestamp: 2020-08-26 01:00:00
[2022-04-16 00:51:17] --> Saving Latest Data - Timestamp: 2020-09-01 07:00:00
[2022-04-16 00:53:20] --> Saving Latest Data - Timestamp: 2020-09-07 13:00:00
[2022-04-16 00:55:19] --> Saving Latest Data - Timestamp: 2020-09-13 19:00:

[2022-04-16 04:03:54] --> Saving Latest Data - Timestamp: 2021-06-09 15:00:00
[2022-04-16 04:05:51] --> Saving Latest Data - Timestamp: 2021-06-11 17:00:00
[2022-04-16 04:07:41] --> Saving Latest Data - Timestamp: 2021-06-13 19:00:00
[2022-04-16 04:09:36] --> Saving Latest Data - Timestamp: 2021-06-15 21:00:00
[2022-04-16 04:11:37] --> Saving Latest Data - Timestamp: 2021-06-17 23:00:00
[2022-04-16 04:13:38] --> Saving Latest Data - Timestamp: 2021-06-20 01:00:00
[2022-04-16 04:15:35] --> Saving Latest Data - Timestamp: 2021-06-22 03:00:00
[2022-04-16 04:17:39] --> Saving Latest Data - Timestamp: 2021-06-24 05:00:00
[2022-04-16 04:19:38] --> Saving Latest Data - Timestamp: 2021-06-26 07:00:00
[2022-04-16 04:21:49] --> Saving Latest Data - Timestamp: 2021-06-28 09:00:00
[2022-04-16 04:23:48] --> Saving Latest Data - Timestamp: 2021-06-30 11:00:00
[2022-04-16 04:25:22] --> Finish Scraping! Webdriver closed and Saving Final Result...
[2022-04-16 04:25:22] --> Final Result saved! There are

In [16]:
# display the scraped temperature table (date_time plus one temp column per location)
temp_data

Unnamed: 0,date_time,temp_BKK,temp_Chiangmai,temp_Khonkaen,temp_Rayong,temp_Saraburi,temp_Surat
0,2020-06-30 22:00:00,29.4,25.2,27.6,28.5,25.9,25.9
1,2020-07-01 01:00:00,28.5,24.0,26.7,28.2,25.5,25.2
2,2020-07-01 04:00:00,28.3,23.4,25.5,27.8,25.1,24.7
3,2020-07-01 07:00:00,28.7,24.1,26.0,28.2,25.8,25.4
4,2020-07-01 10:00:00,29.7,29.7,29.4,29.6,28.7,29.3
...,...,...,...,...,...,...,...
5831,2021-07-01 19:00:00,29.1,24.6,30.7,28.5,29.7,26.8
5832,2021-07-01 20:00:00,29.5,24.1,29.8,28.3,28.0,26.2
5833,2021-07-01 21:00:00,29.8,23.9,28.9,28.2,27.0,25.9
5834,2021-07-01 22:00:00,29.1,23.5,28.0,28.2,26.0,25.7


# Scraping for Hotspot
Daily hotspot counts for 5 regions (Thailand, Myanmar, Cambodia, Lao_PDR, P_Malaysia)

Retrieved From: http://asmc.asean.org/asmc-haze-hotspot-daily-new/#Hotspot

## Functions involved for scraping Hotspot Data

In [4]:
def scrape_hotspot_data(start_date, end_date, REGIONS):
    '''
    This function will make a HTTP Post Request to url to get daily hotspot data (both daytime and nighttime)
    Dates in [start_date, end_date] that are absent from the response are kept as rows with missing values.
    [Input]
        start_date, end_date - python datetime object
        REGIONS - list of regions
    [Output]
        dfs - dict containing keys: {'day', 'night'} with each contain a dataframe as a value
    [Raises]
        requests.HTTPError - if the server answers with an error status
        requests.Timeout - if the server does not answer in time
    '''

    def _format_date(dt):
        #e.g. "1 Jul, 2017": day of month without zero padding
        #(built by hand because the "%-d" strftime directive is not available on every platform)
        return f'{dt.day} {dt:%b, %Y}'

    url = 'http://asmc.asean.org/wp-content/themes/asmctheme/page-functions/functions-ajax-haze-daily-hotspot-count-new.php'
    daynights = ['day', 'night']

    print(f'[{get_current_dt()}] --> Start scraping daily hotspot from date {_format_date(start_date)} to date {_format_date(end_date)}')

    #request parameters shared by the day and night requests:
    #the query date is the day after end_date, pastDays counts the days from start_date to end_date inclusive
    query_date = _format_date(end_date + timedelta(days = 1))
    pastDays = (end_date - start_date).days + 1

    #every calendar day of the requested range, used to expose dates missing from the response
    date_range = pd.date_range(start = start_date, end = end_date, freq = 'D').to_frame(name = "date").set_index("date")

    dfs = {}
    for daynight in daynights:
        data = {'date': query_date,
                'pastDays': pastDays,
                'regions[]': REGIONS,
                'daynight': daynight,
                'conf': 'High'} #confidence level

        print(f'\n[{get_current_dt()}] --> Posting HTTP Request for {daynight} time...,\n Parameters: {data}')
        #bounded wait and explicit status check, so a dead or failing server gives a clear error
        response = requests.post(url, data = data, timeout = 60)
        response.raise_for_status()
        records = response.json()

        DATA = []
        for d in records:
            entry_date = datetime.strptime(d['date'], '%Y-%m-%d')
            DATA.append([entry_date] + [d[r] for r in REGIONS])

        #column names
        column_names = ['date']
        for r in REGIONS:
            column_names.append(f'hotspot_{r}_{daynight}'.lower())

        #create dataframe
        df = pd.DataFrame(DATA, columns = column_names)
        df.sort_values(by = 'date', inplace = True)
        df.reset_index(drop = True, inplace = True)

        #since there might be some missing data
        df = pd.merge(df, date_range, on = 'date', how = 'outer')
        df.sort_values(by = 'date', inplace = True)

        dfs[daynight] = df
        print(f'[{get_current_dt()}] --> Getting {daynight} response, total of {df.shape[0]} entries...')

    print(f'\n[{get_current_dt()}] --> Finish Scraping!')
    return dfs

### Scraping Data (Daily Hotspot)

In [5]:
# regions to request (P_Malaysia = Peninsula Malaysia)
REGIONS = [
    'Thailand',
    'Myanmar',
    'Cambodia',
    'Lao_PDR',
    'P_Malaysia',
]

# first and last day passed to the hotspot scraper
start_date = datetime(year = 2017, month = 7, day = 1)
end_date = datetime(year = 2021, month = 7, day = 1)

dfs = scrape_hotspot_data(start_date = start_date, end_date = end_date, REGIONS = REGIONS)

[2022-04-15 19:05:17] --> Start scraping daily hotspot from date 1 Jul, 2017 to date 1 Jul, 2021

[2022-04-15 19:05:17] --> Posting HTTP Request for day time...,
 Parameters: {'date': '2 Jul, 2021', 'pastDays': 1462, 'regions[]': ['Thailand', 'Myanmar', 'Cambodia', 'Lao_PDR', 'P_Malaysia'], 'daynight': 'day', 'conf': 'High'}
[2022-04-15 19:05:17] --> Getting day response, total of 1462 entries...

[2022-04-15 19:05:17] --> Posting HTTP Request for night time...,
 Parameters: {'date': '2 Jul, 2021', 'pastDays': 1462, 'regions[]': ['Thailand', 'Myanmar', 'Cambodia', 'Lao_PDR', 'P_Malaysia'], 'daynight': 'night', 'conf': 'High'}
[2022-04-15 19:05:17] --> Getting night response, total of 1462 entries...

[2022-04-15 19:05:17] --> Finish Scraping!


In [6]:
# DataFrame.info() prints its summary itself and returns None, so it is called directly
# instead of being wrapped in print() (which adds a stray "None" line to the output)
dfs['day'].info()
# dates for which at least one daytime column has no value
print(f"\nMissing Dates: \n {dfs['day'][dfs['day'].isnull().any(axis=1)]['date']}")
dfs['day']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1462 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date                    1462 non-null   datetime64[ns]
 1   hotspot_thailand_day    1461 non-null   float64       
 2   hotspot_myanmar_day     1461 non-null   float64       
 3   hotspot_cambodia_day    1461 non-null   float64       
 4   hotspot_lao_pdr_day     1461 non-null   float64       
 5   hotspot_p_malaysia_day  1461 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 80.0 KB
None

Missing Dates: 
 1461   2019-05-08
Name: date, dtype: datetime64[ns]


Unnamed: 0,date,hotspot_thailand_day,hotspot_myanmar_day,hotspot_cambodia_day,hotspot_lao_pdr_day,hotspot_p_malaysia_day
0,2017-07-01,0.0,0.0,3.0,0.0,1.0
1,2017-07-02,0.0,0.0,0.0,0.0,0.0
2,2017-07-03,0.0,0.0,0.0,0.0,0.0
3,2017-07-04,0.0,0.0,0.0,0.0,4.0
4,2017-07-05,0.0,0.0,1.0,0.0,6.0
...,...,...,...,...,...,...
1456,2021-06-27,6.0,0.0,14.0,0.0,2.0
1457,2021-06-28,0.0,0.0,26.0,0.0,0.0
1458,2021-06-29,0.0,0.0,4.0,2.0,0.0
1459,2021-06-30,0.0,0.0,5.0,0.0,1.0


In [7]:
# DataFrame.info() prints its summary itself and returns None, so it is called directly
# instead of being wrapped in print() (which adds a stray "None" line to the output)
dfs['night'].info()
# dates for which at least one nighttime column has no value
print(f"\nMissing Dates: \n{dfs['night'][dfs['night'].isnull().any(axis=1)]['date']}")
dfs['night']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1462 entries, 0 to 1351
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      1462 non-null   datetime64[ns]
 1   hotspot_thailand_night    1352 non-null   float64       
 2   hotspot_myanmar_night     1352 non-null   float64       
 3   hotspot_cambodia_night    1352 non-null   float64       
 4   hotspot_lao_pdr_night     1352 non-null   float64       
 5   hotspot_p_malaysia_night  1352 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 80.0 KB
None

Missing Dates: 
1352   2017-08-13
1353   2017-08-18
1354   2017-09-04
1355   2017-09-12
1356   2017-09-23
          ...    
1457   2020-09-16
1458   2020-09-21
1459   2020-09-23
1460   2020-09-30
1461   2020-10-01
Name: date, Length: 110, dtype: datetime64[ns]


Unnamed: 0,date,hotspot_thailand_night,hotspot_myanmar_night,hotspot_cambodia_night,hotspot_lao_pdr_night,hotspot_p_malaysia_night
0,2017-07-01,0.0,0.0,0.0,0.0,0.0
1,2017-07-02,0.0,0.0,0.0,0.0,0.0
2,2017-07-03,0.0,0.0,0.0,0.0,0.0
3,2017-07-04,0.0,0.0,0.0,0.0,0.0
4,2017-07-05,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1347,2021-06-27,0.0,0.0,0.0,0.0,0.0
1348,2021-06-28,0.0,0.0,0.0,0.0,0.0
1349,2021-06-29,0.0,0.0,0.0,0.0,0.0
1350,2021-06-30,0.0,0.0,0.0,0.0,0.0


In [8]:
# Join the daytime and nighttime hotspot tables on their date and order the rows by date
hotspot_data = pd.merge(dfs['day'], dfs['night'], on = 'date', how = 'outer').sort_values(by = 'date')

# number of region columns in the daytime table (everything except 'date')
n_regions = len(dfs['day'].columns) - 1

# rearrange columns so that each region's day column is followed by its night column
new_column_names = ['date']
for r in REGIONS:
    new_column_names.extend([
        f'hotspot_{r}_day'.lower(),
        f'hotspot_{r}_night'.lower(),
    ])

hotspot_data = hotspot_data[new_column_names]
hotspot_data

Unnamed: 0,date,hotspot_thailand_day,hotspot_thailand_night,hotspot_myanmar_day,hotspot_myanmar_night,hotspot_cambodia_day,hotspot_cambodia_night,hotspot_lao_pdr_day,hotspot_lao_pdr_night,hotspot_p_malaysia_day,hotspot_p_malaysia_night
0,2017-07-01,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0
1,2017-07-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-07-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-07-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,2017-07-05,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1457,2021-06-27,6.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,2.0,0.0
1458,2021-06-28,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0
1459,2021-06-29,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0
1460,2021-06-30,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# save the combined day/night hotspot table as a csv file:
# with mode 'hotspot' the helper writes to ./scraped_data/hotspot_scraped_data.csv
save_result_from_scraping(hotspot_data, 'hotspot')