In [8]:
import time
import pandas as pd
from datetime import datetime as dt
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException, WebDriverException
import os

# Chrome launch options used for every browser session: each switch below is
# registered on the same Options object, in this order.
chrome_options = Options()
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_chrome_flag)
del _chrome_flag  # leave only chrome_options behind in the module namespace

# Scrape window; pd.date_range below yields one entry per calendar day and
# includes both endpoints.
start_date = dt(2019, 5, 12)
end_date = dt(2019, 12, 31)

# Value of the data-station-id attribute looked up on each page.
station_id = "488200"

# The two placeholders are filled with YYYYMMDD and HHMM respectively.
base_url = "https://meteologix.com/vn/observations/vietnam/wind-direction/{}-{}z.html"

# One URL per (day, hour) pair: days vary slowest, hours 00..23 fastest.
urls = [
    base_url.format(day.strftime("%Y%m%d"), "%02d00" % hour_of_day)
    for day in pd.date_range(start_date, end_date)
    for hour_of_day in range(24)
]

# Destination files and the number of URLs processed between CSV flushes.
output_file = "wind_direction/HaNoi_wind_direction_2019.csv"
error_log_file = "wind_direction/failed_urls.txt"
batch_size = 100

def initialize_csv(path=None):
    """Create the output CSV containing only its header row, if it is missing.

    Args:
        path: CSV file to create. When omitted, the module-level
            ``output_file`` is used.

    An already existing file is left untouched, so rows written by an earlier
    run are kept. The parent directory is created first when it does not
    exist; otherwise pandas refuses to write into a missing directory.
    """
    target = output_file if path is None else path
    if os.path.exists(target):
        return

    parent_dir = os.path.dirname(target)
    if parent_dir:  # empty when the target is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    header_only = pd.DataFrame(columns=["date", "station_id", "time", "wind direction"])
    header_only.to_csv(target, index=False, mode='w')

def log_error(url):
    """Append ``url`` on its own line to the file named by ``error_log_file``."""
    with open(error_log_file, mode='a') as failed_log:
        failed_log.write("{}\n".format(url))

def _parse_station_title(title):
    """Return ``(time, wind direction)`` taken from a station's title text.

    The title is split on ``|``; the first field is used as the wind
    direction and the third as the time. A missing title, or one with fewer
    than three fields, gives ``(None, None)``.
    """
    if not title:
        return None, None
    parts = title.split('|')
    if len(parts) < 3:
        return None, None
    return parts[2].strip(), parts[0].strip()


def fetch_data(url):
    """Load one observation page and extract the row for ``station_id``.

    Args:
        url: Page to load. The text before the first ``-`` in its last path
            segment is stored as the row's ``date``.

    Returns:
        A list holding one dict with the keys ``date``, ``station_id``,
        ``time`` and ``wind direction``; or an empty list when the station
        element is not in the parsed page or the page could not be loaded.
        On a load failure or timeout the URL is also appended to the error
        log via ``log_error``.
    """
    print(url)
    data = []
    # Driver start-up stays outside the try on purpose: a broken Chrome or
    # chromedriver install should fail loudly rather than be logged (and
    # slept on) once per URL.
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Navigation lives inside the try so that a failed page load is
        # logged like a timeout and the browser is still closed by `finally`
        # instead of leaking and aborting the whole run.
        driver.get(url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, f"[data-station-id='{station_id}']"))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        element = soup.find(attrs={"data-station-id": station_id})

        if element is not None:
            date = url.split('/')[-1].split('-')[0]
            time_value, wind_direction = _parse_station_title(element.get("title"))
            data.append({
                "date": date,
                "station_id": station_id,
                "time": time_value,
                "wind direction": wind_direction,
            })
    except (TimeoutException, WebDriverException) as e:
        print(f"Error fetching data for URL {url}: {e}")
        log_error(url)
        time.sleep(10)  # pause before this worker picks up its next URL
    finally:
        driver.quit()
    return data

def save_batch_to_csv(batch_data, path=None):
    """Append a batch of scraped rows to the CSV, without a header line.

    Args:
        batch_data: List of row dicts keyed by ``date``, ``station_id``,
            ``time`` and ``wind direction``.
        path: CSV file to append to. When omitted, the module-level
            ``output_file`` is used.

    Because no header is written, rows are matched to the file's columns by
    position only. The column order is therefore pinned here to the header
    order instead of relying on each dict's key order. An empty batch writes
    nothing.
    """
    if not batch_data:
        return

    target = output_file if path is None else path
    columns = ["date", "station_id", "time", "wind direction"]
    df = pd.DataFrame(batch_data, columns=columns)
    df.to_csv(target, index=False, mode='a', header=False)

# Make sure the CSV and its header row exist before any rows are appended.
initialize_csv()

batch_data = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    try:
        # executor.map yields results in the same order as `urls`, so rows
        # reach the CSV in chronological order even though pages are fetched
        # by five workers in parallel.
        for i, result in enumerate(executor.map(fetch_data, urls), start=1):
            batch_data.extend(result)

            # Flush every `batch_size` processed URLs (not rows); skip the
            # write when none of those URLs produced a row.
            if i % batch_size == 0 and batch_data:
                save_batch_to_csv(batch_data)
                batch_data.clear()
    finally:
        # Write whatever is still buffered: the final partial batch on a
        # normal run, or the rows collected so far if the loop was
        # interrupted or a worker raised an unexpected exception.
        if batch_data:
            save_batch_to_csv(batch_data)


https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0000z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0100z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0200z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0300z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0400z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0500z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0600z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0700z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0800z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-0900z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-1000z.html
https://meteologix.com/vn/observations/vietnam/wind-direction/20190512-1100z.html
https://meteolog