# Fetch raw data and validate

In [2]:
import logging
import os
from io import StringIO
from pathlib import Path
from typing import Optional

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.service import Service as ChromiumService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType

def _click_option_by_text(select_element, text: str) -> None:
    """Click the ``<option>`` of *select_element* whose visible text is *text*.

    Raises:
        KeyError: If no option of the menu carries that text.
    """
    options = {
        option.text: option
        for option in select_element.find_elements(By.TAG_NAME, "option")
    }
    options[text].click()


def download_one_file_of_raw_data(year: int, month: int) -> Optional[str]:
    """
    Download one month of NB Power System Information as a parquet file.

    The archive page is driven with a Chrome WebDriver: the requested year
    and month are picked in the page's drop-down menus, the resulting CSV
    text is read from the page's ``<pre>`` element, parsed with pandas and
    written to ``../data/raw/nb_power_<year>_<month>.parquet``.

    Args:
        year: Year to select in the archive's year menu.
        month: Month number to select in the archive's month menu.

    Returns:
        The path of the written parquet file (as a string), or ``None``
        when the extraction failed; failures are printed and logged rather
        than raised.

    Raises:
        TypeError: If ``year`` or ``month`` is not an ``int``.
    """
    # Validate parameter types first so bad input fails fast, before any
    # log file or browser configuration is touched.
    if not isinstance(year, int):
        raise TypeError("Interger value required for Year parameter")
    if not isinstance(month, int):
        raise TypeError("Interger value required for Month parameter")

    URL = 'https://tso.nbpower.com/Public/en/system_information_archive.aspx'
    log_file = f'../log/extract_nb_power_{year}_{month:02d}.log'
    path = f'../data/raw/nb_power_{year}_{month:02d}.parquet'

    # Configure logger. The log directory is created on demand because
    # basicConfig opens the file immediately.
    # NOTE: basicConfig only has an effect the first time the root logger is
    # configured in a process; later calls keep the first log file.
    Path(log_file).parent.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(filename=log_file, level=logging.ERROR)
    logger = logging.getLogger(__name__)
    # The root logger stays at ERROR (keeps third-party noise out of the
    # file); this logger is lowered to INFO so the progress messages below
    # are actually written instead of being discarded.
    logger.setLevel(logging.INFO)
    logger.info('Start Extract Session')

    # Chrome driver settings
    option = webdriver.ChromeOptions()
    option.add_argument("start-maximized")
    prefs = {'download.default_directory': os.getcwd() + '/data/raw'}
    option.add_experimental_option('prefs', prefs)

    # Stays None until the browser really started, so the cleanup in
    # ``finally`` never touches an unbound name.
    driver = None
    try:
        # Initialize Chrome WebDriver
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=option,
        )

        # Load page
        driver.get(URL)

        # Locate both menus before interacting with either of them
        month_sel = driver.find_element(
            By.XPATH,
            '/html/body/form/div[5]/div[1]/table[1]/tbody/tr[2]/td[1]/select',
        )
        year_sel = driver.find_element(
            By.XPATH, '//*[@id="ctl00_cphMainContent_ddlYear"]'
        )

        # Use input parameters to select year, then month. A value missing
        # from a menu surfaces as KeyError and is reported below.
        _click_option_by_text(year_sel, str(year))
        _click_option_by_text(month_sel, str(month))

        # View data
        driver.find_element(
            By.XPATH, '//*[@id="ctl00_cphMainContent_lbGetData"]'
        ).click()

        # Parse data into data frame then write to parquet
        output_text = driver.find_element(By.TAG_NAME, "pre").text
        df = pd.read_csv(StringIO(output_text), sep=",", header=0)

        Path(path).parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(path)
        logger.info('Extract completed!')
        return path

    except KeyError:
        logger.error('No menu entry for year=%s month=%s', year, month)
        print("Invalid year and month parameters selected")

    except WebDriverException:
        logger.exception('WebDriver failure while extracting %s-%02d', year, month)
        print("Base URL no longer valid - leads to NoSuchElementException")

    except Exception as e:
        # Log the exception (with traceback) along with additional information
        logger.exception('An error occurred: %s', e)
        print(f'An error occurred: {e}')

    finally:
        # Close the WebDriver on every path. The success path returns from
        # inside ``try``, so an ``else`` clause would never run.
        if driver is not None:
            driver.quit()

    return None

In [3]:
# Happy path: valid int year/month; the call returns the written parquet path.
download_one_file_of_raw_data(year=2023, month=1)

'../data/raw/nb_power_2023_01.parquet'

In [4]:
# Year presumably absent from the archive's year menu at the time of this run:
# the function prints a message instead of raising.
download_one_file_of_raw_data(year=2025, month=1)

Invalid year and month parameters selected


In [5]:
# Non-int year: the parameter validation raises TypeError.
download_one_file_of_raw_data(year='2020', month=1)

TypeError: Interger value required for Year parameter

In [6]:
# Non-int month: the parameter validation raises TypeError.
download_one_file_of_raw_data(year=2020, month='1')

TypeError: Interger value required for Month parameter