# Collecting Data

In [133]:
from bs4 import BeautifulSoup
import os
import requests
import urllib.robotparser
import pandas as pd 
import datetime 
import time 
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException

In [134]:
# Create the output directory for the scraped files. exist_ok=True lets this
# cell be re-run without raising FileExistsError once the directory exists.
os.makedirs('data', exist_ok=True)

In [135]:
def _parse_weather_row(columns):
    """Extract the weather fields from the ``<td>`` cells of one table row.

    Args:
        columns: The ``<td>`` tags of a single row of the history table.

    Returns:
        A tuple of strings ``(time, weather, temp, rain, cloud, pressure,
        wind, gust, dir)``, or ``None`` when the row has fewer than nine
        cells and is therefore not treated as a data row.
    """
    if len(columns) < 9:
        return None

    # The weather description is read from the icon's alt text.
    weather_img = columns[1].find('img')
    weather = weather_img['alt'] if weather_img else "N/A"

    rain_div = columns[3].find('div', class_='days-rain-number')
    rain = rain_div.get_text(strip=True) if rain_div else "N/A"

    # The direction is stored as the arrow icon's inline style string
    # (presumably a CSS rotation -- verify against the live page).
    wind_svg = columns[8].find('svg')
    wind_dir = wind_svg['style'] if wind_svg else "N/A"

    return (
        columns[0].get_text(strip=True),  # time of day
        weather,
        columns[2].get_text(strip=True),  # temperature
        rain,
        columns[4].get_text(strip=True),  # cloud
        columns[5].get_text(strip=True),  # pressure
        columns[6].get_text(strip=True),  # wind speed
        columns[7].get_text(strip=True),  # wind gust
        wind_dir,
    )


def get_weather_data(start_date, end_date):
    """Scrape Quy Nhon weather history day by day into a tab-separated file.

    Opens the worldweatheronline history page in Chrome, submits each date in
    the range through the page's date form, parses the resulting
    ``days-details-table`` and appends one line per table row to
    ``data/weatherQN_<start_date.year>.csv``. Despite the ``.csv`` extension
    the fields are separated by tabs. The file is overwritten on every call.

    A day that fails (element not found, table missing, any other error) is
    reported with ``print`` and skipped; the loop continues with the next day.

    Args:
        start_date: First day to collect (``datetime.datetime``). Its year
            determines the output file name.
        end_date: Last day to collect (``datetime.datetime``), inclusive.
    """
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.worldweatheronline.com/quy-nhon-weather-history/vn.aspx')

        # Create the output file (and its directory, so the function does not
        # depend on an earlier notebook cell having been run).
        file_name = f'data/weatherQN_{start_date.year}.csv'
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write('Date\tTime\tWeather\tTemp\tRain\tCloud\tPressure\tWind\tGust\tDir\n')

            # Go through each day. The bound is inclusive so that a range
            # ending on Dec 31 actually collects Dec 31.
            current_date = start_date
            while current_date <= end_date:
                # Convert the date to the format used by the page's date input
                date_str = current_date.strftime('%Y-%m-%d')

                try:
                    # Change input date; the value is set through JavaScript
                    # rather than by typing into the field.
                    input_date = browser.find_element(By.ID, 'ctl00_MainContentHolder_txtPastDate')
                    input_date.clear()
                    browser.execute_script("arguments[0].value = arguments[1];", input_date, date_str)

                    # Hide an ad overlay if one is present.
                    # NOTE: 'ad-message' is a placeholder; replace it with the
                    # overlay's actual class name.
                    try:
                        overlay = browser.find_element(By.CLASS_NAME, 'ad-message')
                        browser.execute_script("arguments[0].style.display = 'none';", overlay)
                    except NoSuchElementException:
                        pass

                    # Click submit, falling back to a JavaScript click when
                    # another element intercepts the native click.
                    submit_button = browser.find_element(By.ID, 'ctl00_MainContentHolder_butShowPastWeather')
                    browser.execute_script("arguments[0].scrollIntoView(true);", submit_button)
                    time.sleep(1)
                    try:
                        submit_button.click()
                    except ElementClickInterceptedException:
                        print(f"Element click intercepted for date {date_str}, trying JavaScript click.")
                        browser.execute_script("arguments[0].click();", submit_button)

                    # Fixed pause to give the page time to load the new day.
                    time.sleep(2)

                    # Parse HTML with BeautifulSoup
                    soup = BeautifulSoup(browser.page_source, 'html.parser')

                    # Get data from table; find() returns None when the table
                    # is absent, so report that instead of crashing on it.
                    table = soup.find('table', class_='days-details-table')
                    if table is None:
                        print(f"No weather table found for date {date_str}, skipping.")
                    else:
                        # The first row is skipped (treated as the header).
                        for row in table.find_all('tr')[1:]:
                            fields = _parse_weather_row(row.find_all('td'))
                            if fields is not None:
                                f.write('\t'.join((date_str, *fields)) + '\n')

                except NoSuchElementException as e:
                    print(f"Error: Cannot find element - {e}")
                except Exception as e:
                    # Best-effort scraping: report and move on to the next day.
                    print(f"Unknown error: {e}")

                # Go to next day
                current_date += datetime.timedelta(days=1)
    finally:
        # Always close the browser, even if the run is interrupted or the
        # output file cannot be opened.
        browser.quit()

In [None]:
# Run the scraper over the 2009 date range.
start_2009 = datetime.datetime(year=2009, month=1, day=1)
end_2009 = datetime.datetime(year=2009, month=12, day=31)
get_weather_data(start_date=start_2009, end_date=end_2009)

In [None]:
# Run the scraper over the 2010 date range.
start_2010 = datetime.datetime(year=2010, month=1, day=1)
end_2010 = datetime.datetime(year=2010, month=12, day=31)
get_weather_data(start_date=start_2010, end_date=end_2010)

In [None]:
# Run the scraper over the 2011 date range.
start_2011 = datetime.datetime(year=2011, month=1, day=1)
end_2011 = datetime.datetime(year=2011, month=12, day=31)
get_weather_data(start_date=start_2011, end_date=end_2011)

In [None]:
# Run the scraper over the 2012 date range.
start_2012 = datetime.datetime(year=2012, month=1, day=1)
end_2012 = datetime.datetime(year=2012, month=12, day=31)
get_weather_data(start_date=start_2012, end_date=end_2012)

In [None]:
start_2013 = datetime.datetime(2013, 1, 1)
end_2013 = datetime.datetime(2013, 1, 31)
get_weather_data(start_2013, end_2013)

In [None]:
# Run the scraper over the 2014 date range.
start_2014 = datetime.datetime(year=2014, month=1, day=1)
end_2014 = datetime.datetime(year=2014, month=12, day=31)
get_weather_data(start_date=start_2014, end_date=end_2014)

In [None]:
# Run the scraper over the 2015 date range.
start_2015 = datetime.datetime(year=2015, month=1, day=1)
end_2015 = datetime.datetime(year=2015, month=12, day=31)
get_weather_data(start_date=start_2015, end_date=end_2015)

In [None]:
# Run the scraper over the 2016 date range.
start_2016 = datetime.datetime(year=2016, month=1, day=1)
end_2016 = datetime.datetime(year=2016, month=12, day=31)
get_weather_data(start_date=start_2016, end_date=end_2016)

In [None]:
# Run the scraper over the 2017 date range.
start_2017 = datetime.datetime(year=2017, month=1, day=1)
end_2017 = datetime.datetime(year=2017, month=12, day=31)
get_weather_data(start_date=start_2017, end_date=end_2017)

In [None]:
# Run the scraper over the 2018 date range.
start_2018 = datetime.datetime(year=2018, month=1, day=1)
end_2018 = datetime.datetime(year=2018, month=12, day=31)
get_weather_data(start_date=start_2018, end_date=end_2018)

In [None]:
# Run the scraper over the 2019 date range.
start_2019 = datetime.datetime(year=2019, month=1, day=1)
end_2019 = datetime.datetime(year=2019, month=12, day=31)
get_weather_data(start_date=start_2019, end_date=end_2019)

In [None]:
# Run the scraper over the 2020 date range.
start_2020 = datetime.datetime(year=2020, month=1, day=1)
end_2020 = datetime.datetime(year=2020, month=12, day=31)
get_weather_data(start_date=start_2020, end_date=end_2020)

In [None]:
# Run the scraper over the 2021 date range. The call previously passed the
# 2009 variables, which re-scraped 2009 and never collected 2021.
start_2021 = datetime.datetime(2021, 1, 1)
end_2021 = datetime.datetime(2021, 12, 31)
get_weather_data(start_2021, end_2021)

In [None]:
# Run the scraper over the 2022 date range.
start_2022 = datetime.datetime(year=2022, month=1, day=1)
end_2022 = datetime.datetime(year=2022, month=12, day=31)
get_weather_data(start_date=start_2022, end_date=end_2022)

In [None]:
# Run the scraper over the 2023 date range.
start_2023 = datetime.datetime(year=2023, month=1, day=1)
end_2023 = datetime.datetime(year=2023, month=12, day=31)
get_weather_data(start_date=start_2023, end_date=end_2023)

In [None]:
# Run the scraper over the 2024 date range.
start_2024 = datetime.datetime(year=2024, month=1, day=1)
end_2024 = datetime.datetime(year=2024, month=12, day=31)
get_weather_data(start_date=start_2024, end_date=end_2024)