<a href="https://colab.research.google.com/github/jinyjib98/comp5339/blob/main/comp5339_a1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COMP5339 Assignment 1

## Import Packages

In [1]:
!pip install selenium
!pip install pyperclip



In [83]:
# Data Acquisition
import requests
import os
import pandas as pd
from pathlib import Path
import time
import pyperclip

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# Data Integration and Cleaning
import numpy as np
import re
import json
from datetime import datetime


# Data Augmentation

# Data Transformation and Storage

## Data Acquisition

In [20]:
class DataRetriever:
    def __init__(self, output_dir='./data'):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        self.driver = None
        self.wait = None

    # Download a file using HTTP request
    def download_file_http(self, url, filename, subfolder):

        try:
            save_dir = self.output_dir / subfolder
            save_dir.mkdir(exist_ok=True) # Create subfolder if it doesn't exist
            filepath = save_dir / filename

            print(f'Downloading: {filename}')
            print(f'From: {url}')

            response = self.session.get(url, stream=True, timeout=30)
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            print(f'Downloaded: {filename}')
            return filepath

        except Exception as e:
            print(f'Failed to download {filename}: {str(e)}')
            return None

    # Set up Selenium driver
    def setup_selenium_driver(self, subfolder):
            # Set up Chrome options for Selenium script
            chrome_options = Options()
            # chrome_options.add_argument("--headless")

            # Specify anti-detection options
            chrome_options.add_argument("--disable-web-security")
            chrome_options.add_argument("--allow-running-insecure-content")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_experimental_option('useAutomationExtension', False)
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")

            # Set download directory
            download_path = str((self.output_dir / subfolder).absolute())
            Path(download_path).mkdir(exist_ok=True)
            print(f'Setting download directory to: {download_path}')

            prefs = {
                "download.default_directory": download_path,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "safebrowsing.enabled": True,
                "safebrowsing.disable_download_protection": True,
                "download.extensions_to_open": "",
                "download.open_pdf_in_system_reader": False,
                "plugins.always_open_pdf_externally": True
            }

            chrome_options.add_experimental_option("prefs", prefs)

            # Initialise WebDriver
            try:
                self.driver = webdriver.Chrome(options=chrome_options)
                self.wait = WebDriverWait(self.driver, 120)
                print('Chrome WebDriver initialized')
                return True
            except Exception as e:
                print(f'Failed to initialize WebDriver: {str(e)}')
                return False

    # Close WebDriver
    def close_driver(self):

        if self.driver:
            self.driver.quit()
            self.driver = None
            self.wait = None

    # Specify timeout for download
    def wait_for_download(self, download_dir, timeout=120):
        print('Waiting for download to complete...')

        start_time = time.time()
        initial_files = set(os.listdir(download_dir))

        while time.time() - start_time < timeout:
            current_files = set(os.listdir(download_dir))
            new_files = current_files - initial_files

            if new_files:
                # Check if any files are still downloading (.crdownload extension)
                downloading = [f for f in new_files if f.endswith('.crdownload')]
                if not downloading:
                    print(f'Download complete: {list(new_files)}')
                    return list(new_files)

            # Show progress every 10 seconds
            elapsed = time.time() - start_time
            if int(elapsed) % 10 == 0 and elapsed > 0:
                print(f'Waiting... ({elapsed:.0f}s elapsed)')

            time.sleep(1)

        print(f'Download timeout after {timeout} seconds')
        return []

    # Retrieve NGER data
    def retrieve_cer_nger_data(self):
        '''
        How it works:
        Find the API Copy button and click it
        '''
        print('\n=== Task 1: Retrieving CER NGER Data ===')

        if not self.setup_selenium_driver('cer_nger'):
            return []

        try:
            url = 'https://data.cer.gov.au/datasets/NGER/ID0243'
            print(f"Loading: {url}")
            self.driver.get(url)

            # Wait for page to load
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            time.sleep(5)  # Wait for dynamic content to load

            # Click the Copy API URL button
            api_button = self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[.//span[contains(text(), 'Copy API URL')]]")))

            api_button.click()
            api_url = pyperclip.paste()
            
            print(f"API URL: {api_url}")
            
            api_call = requests.get(api_url) # get the content of the API
            api_call.raise_for_status()
            nger_json = api_call.json()


            df = pd.DataFrame(nger_json)

            download_dir = self.output_dir / 'cer_nger'
            download_dir.mkdir(exist_ok = True)
            filepath = download_dir / 'NGER.ID0243.csv' # save the content as a csv file
            df.to_csv(filepath, index = False)
        
        except Exception as e:
            print(f"Error retrieving NGER data: {str(e)}")

        finally:
            self.close_driver()


    # Retrieve CER Renewable Energy Data
    def retrieve_cer_renewable_data(self):
        '''
        How it works:
        Download the files using BeautifulSoup
        '''
        print('\n=== Task 2: Retrieving CER Renewable Energy Data ===')

        target_files = []
        downloaded_files = []

        url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        file_tags = soup.find_all('a', class_ = 'cer-accordion--table cer-button button--secondary')
        for file in file_tags:
            href = file.get('href', '')
            text = file.get_text(strip = True).lower()

            if 'csv' in text and ('power stations' in text and 'projects' in text):
                full_url = f'https://www.cer.gov.au{href}'

                target_files.append({
                    'url': full_url,
                    'filename': f"{full_url.split('/')[-1]}.csv"
                })

        for i, target in enumerate(target_files, 1):
            print(f"\nDownloading {i}/3: {target['filename']}")

            filepath = self.download_file_http(
                target['url'],
                target['filename'],
                'cer_renewable'
            )

            if filepath:
                downloaded_files.append(filepath)

            # Wait for 1 second to avoid overloading the server
            time.sleep(1)
        
        print(f"\nSuccessfully downloaded {len(downloaded_files)}/3 CER files")
        return downloaded_files
                    
                

    # Retrieve ABS Economy and Industry Data
    def retrieve_abs_data(self):
        '''
        How it works:
        Use Selenium to find the download link and click it
        '''
        print('\n=== Task 3: Retrieving ABS Economy and Industry Data ===')

        if not self.setup_selenium_driver('abs_data'):
            return []

        try:
            url = 'https://www.abs.gov.au/methodologies/data-region-methodology/2011-24'
            print(f'Loading: {url}')
            self.driver.get(url)

            # Wait for page to load
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            time.sleep(3)

            # Scroll to data downloads section
            downloads_section = self.driver.find_element(By.ID, 'data-downloads')
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", downloads_section)
            time.sleep(2)
            print('Found and scrolled to data downloads section')

            # Target the specific file
            target_href = '/methodologies/data-region-methodology/2011-24/14100DO0003_2011-24.xlsx'

            # Find the download link and click it
            download_link = self.driver.find_element(By.CSS_SELECTOR, f'a[href="{target_href}"]')

            print(f'Found ABS file: {download_link.text.strip()}')
            print(f'Downloading in progress...')

            # Scroll to element and click
            self.driver.execute_script('arguments[0].scrollIntoView(true);', download_link)
            time.sleep(2)
            download_link.click()
            print('Clicked download link')

            # Wait for download
            download_dir = self.output_dir / 'abs_data'
            downloaded_file = self.wait_for_download(download_dir, timeout=180)

            # Check if the file is downloaded
            if downloaded_file:
                print(f'Download ABS file')
            else:
                print('Download failed')

        except Exception as e:
            print(f'Error retrieving NGER data: {str(e)}')

        finally:
            self.close_driver()

        print(f'\nSuccessfully downloaded {len(downloaded_file)} ABS file')
        return downloaded_file


    # Function to run the whole script
    def run_script(self):
        print(f'Output directory: {self.output_dir.absolute()}')

        # Run all tasks
        self.retrieve_cer_nger_data()
        self.retrieve_cer_renewable_data()
        self.retrieve_abs_data()


        # List all downloaded files
        print(f'\nFiles in: {self.output_dir.absolute()}')

        cer_nger_dir = self.output_dir / 'cer_nger'
        if cer_nger_dir.exists():
            print(f'\nCER NGER files:')
            for file in sorted(cer_nger_dir.glob('*.csv')):
                print(f'    {file.name}')

        cer_dir = self.output_dir / 'cer_renewable'
        if cer_dir.exists():
            print(f'\nCER Renewable files:')
            for file in sorted(cer_dir.glob('*.csv')):
                print(f'    {file.name}')

        abs_dir = self.output_dir / 'abs_data'
        if abs_dir.exists():
            print(f'\nABS Economy files:')
            for file in sorted(abs_dir.glob('*.xlsx')):
                print(f'    {file.name}')

In [21]:
download = DataRetriever()
download.run_script()

Output directory: /Users/hyungjinkim/Desktop/USYD/2025-2/COMP5339/Assignment/comp5339/data

=== Task 1: Retrieving CER NGER Data ===
Setting download directory to: /Users/hyungjinkim/Desktop/USYD/2025-2/COMP5339/Assignment/comp5339/data/cer_nger
Chrome WebDriver initialized
Loading: https://data.cer.gov.au/datasets/NGER/ID0243
API URL: https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0243?select%3D%2A

=== Task 2: Retrieving CER Renewable Energy Data ===

Downloading 1/3: power-stations-and-projects-accredited.csv
Downloading: power-stations-and-projects-accredited.csv
From: https://www.cer.gov.au/document/power-stations-and-projects-accredited
Downloaded: power-stations-and-projects-accredited.csv

Downloading 2/3: power-stations-and-projects-committed.csv
Downloading: power-stations-and-projects-committed.csv
From: https://www.cer.gov.au/document/power-stations-and-projects-committed
Downloaded: power-stations-and-projects-committed.csv

Downloading 3/3: power

## Data Integration and Cleaning

In [None]:
base_dir = Path(''./data')

# Load CER NGER dataset
nger_file = base_dir / 'cer_nger' / 'NGER.ID0243.csv'
nger_df = pd.read_csv(nger_file)

In [25]:
nger_df.head()

Unnamed: 0,reportingentity,facilityname,type,state,electricityproductionGJ,electricityproductionMWh,totalscope1emissionstCO2e,totalscope2emissionstCO2e,totalemissionstCO2e,emissionintensitytCO2eMWh,gridconnected,grid,primaryfuel,importantnotes
0,ACCIONA ENERGY OCEANIA PTY LTD,Cathedral Rocks Wind Farm,F,SA,481948,133874,57,127.0,184,0.0,On,NEM,Wind,-
1,ACCIONA ENERGY OCEANIA PTY LTD,Gunning Wind Farm,F,NSW,491409,136502,50,218.0,268,0.0,On,NEM,Wind,-
2,ACCIONA ENERGY OCEANIA PTY LTD,Mortlake South Wind Farm,F,VIC,1019352,283153,202,1128.0,1330,0.0,On,NEM,Wind,-
3,ACCIONA ENERGY OCEANIA PTY LTD,Mt Gellibrand Wind Farm,F,VIC,1025451,284847,99,1273.0,1372,0.0,On,NEM,Wind,-
4,ACCIONA ENERGY OCEANIA PTY LTD,Waubra Wind Farm,F,VIC,1954964,543046,186,1114.0,1300,0.0,On,NEM,Wind,-


In [None]:
# nger_df.gridconnected.value_counts()
# nger_df.grid.value_counts()
# nger_df.groupby('state')['grid'].value_counts().reset_index()
# nger_df.type.value_counts()
# nger_df.loc[nger_df.type == 'C', :]
# nger_df.groupby('gridconnected')['grid'].value_counts().reset_index()


Unnamed: 0,state,grid,count
0,-,-,151
1,ACT,NEM,9
2,NSW,NEM,94
3,NSW,Off-grid,1
4,NT,Off-grid,63
5,NT,DKIS,5
6,NT,NEM,1
7,QLD,NEM,77
8,QLD,Off-grid,38
9,QLD,Mt Isa,2


In [73]:
# Drop unnecessary columns
dropped_nger = nger_df.copy()

# importantnotes
dropped_nger.importantnotes.value_counts() # >> most of the items are "-"
dropped_nger.drop(columns = ['importantnotes'], inplace = True)
dropped_nger.head()

# Drop correlated columns
dropped_nger.electricityproductionMWh == dropped_nger.electricityproductionGJ / 3.6
dropped_nger.drop(columns = ['electricityproductionGJ', 'totalemissionstCO2e', 'gridconnected'], inplace = True) # >> electricityproductionGJ is just same data using different unit
dropped_nger.drop(index = dropped_nger.loc[dropped_nger.type == 'C'].index.tolist(), inplace = True)
dropped_nger.drop(columns = ['type'], inplace = True)
dropped_nger = dropped_nger.reset_index(drop = True)
dropped_nger.head()


Unnamed: 0,reportingentity,facilityname,state,electricityproductionMWh,totalscope1emissionstCO2e,totalscope2emissionstCO2e,emissionintensitytCO2eMWh,grid,primaryfuel
0,ACCIONA ENERGY OCEANIA PTY LTD,Cathedral Rocks Wind Farm,SA,133874,57,127.0,0.0,NEM,Wind
1,ACCIONA ENERGY OCEANIA PTY LTD,Gunning Wind Farm,NSW,136502,50,218.0,0.0,NEM,Wind
2,ACCIONA ENERGY OCEANIA PTY LTD,Mortlake South Wind Farm,VIC,283153,202,1128.0,0.0,NEM,Wind
3,ACCIONA ENERGY OCEANIA PTY LTD,Mt Gellibrand Wind Farm,VIC,284847,99,1273.0,0.0,NEM,Wind
4,ACCIONA ENERGY OCEANIA PTY LTD,Waubra Wind Farm,VIC,543046,186,1114.0,0.0,NEM,Wind


In [79]:
dropped_nger.describe()

Unnamed: 0,electricityproductionMWh,totalscope1emissionstCO2e,totalscope2emissionstCO2e,emissionintensitytCO2eMWh
count,624.0,624.0,624.0,624.0
mean,351475.0,223688.8,3365.508013,0.330769
std,1455809.0,1419224.0,33415.316233,0.50401
min,0.0,0.0,0.0,0.0
25%,2582.5,22.75,0.0,0.0
50%,33650.0,409.5,10.5,0.07
75%,214469.2,2353.0,409.5,0.66
max,15689640.0,18531080.0,684673.0,8.57


In [80]:
dropped_nger.grid.value_counts()

grid
NEM         381
Off-grid    185
SWIS         43
NWIS          8
DKIS          5
Mt Isa        2
Name: count, dtype: int64

In [30]:
# Load CER Renewable datasets
renewable_dir = base_dir / "cer_renewable"

accredited = pd.read_csv(renewable_dir / 'power-stations-and-projects-accredited.csv')
committed = pd.read_csv(renewable_dir / 'power-stations-and-projects-committed.csv')
probable = pd.read_csv(renewable_dir / 'power-stations-and-projects-probable.csv')

In [81]:
accredited['Installed capacity (MW)'].sum()

np.float64(2593.565)

In [32]:
accredited.head(2)

Unnamed: 0,Accreditation code,Power station name,State,Postcode,Installed capacity (MW),Fuel Source (s),Accreditation start date,Approval date
0,SRPXQLE8,"Laura Johnson Home, Townview - Solar w SGU - QLD",QLD,4825,0.2265,Solar,15/10/2024,13/01/2025
1,SRPYNS39,Leppington - Solar - NSW,NSW,2179,0.732,Solar,22/11/2024,13/01/2025


### Accredited

In [84]:
accredited.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Accreditation code        280 non-null    object 
 1   Power station name        280 non-null    object 
 2   State                     280 non-null    object 
 3   Postcode                  280 non-null    int64  
 4   Installed capacity (MW)   280 non-null    float64
 5   Fuel Source (s)           280 non-null    object 
 6   Accreditation start date  280 non-null    object 
 7   Approval date             280 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 17.6+ KB


In [None]:
# Rename columns & drop unnecessary columns
accredited_clean = accredited.copy()
accredited_clean = accredited_clean.rename(columns = {
                                                        'Power station name': 'facilityname',
                                                        'State': 'state',
                                                        'Installed capacity (MW)': 'capacityMWh',
                                                        'Postcode': 'postcode',
                                                        'Fuel Source (s)': 'primaryfuel',
                                                        'Approval date': 'statusdate',
                                                        'Accreditation code': 'code'})


# Create a new column 'status' for distinguishing between accredited, committed, and probable
accredited_clean['status'] = 'accredited'

# Create a new column 'duration' for calculating the time span between the approval date and the start date
accredited_clean['duration'] = accredited_clean['statusdate'] - accredited_clean['Accreditation start date']
# Drop the 'Accreditation start date' column
accredited_clean.drop(columns = ['Accreditation start date'], inplace = True)

# Convert primaryfuel to Biomass if it contains Waste or Gas
accredited_clean.primaryfuel = np.where(accredited_clean.primaryfuel.str.contains('Waste|Gas', case = False, na = False), 'Biomass', accredited_clean.primaryfuel)
accredited_clean.primaryfuel.value_counts()

# Delete unnecessary information from facilityname


accredited_clean.head()

Unnamed: 0,code,facilityname,state,postcode,capacityMWh,primaryfuel,statusdate,status
0,SRPXQLE8,"Laura Johnson Home, Townview - Solar w SGU - QLD",QLD,4825,0.2265,Solar,13/01/2025,accredited
1,SRPYNS39,Leppington - Solar - NSW,NSW,2179,0.732,Solar,13/01/2025,accredited
2,SRPYNS58,Quakers Hillside Care Community - Solar w SGU ...,NSW,2763,0.1996,Solar,13/01/2025,accredited
3,SRPXVCN4,Rest Nominees - Solar wSGU - VIC,VIC,3008,0.1188,Solar,13/01/2025,accredited
4,SRPXQLF9,Retail First Mt Ommaney-Solar-QLD,QLD,4074,1.0004,Solar,13/01/2025,accredited


In [None]:
accredited2 = accredited_clean.copy()

def split_name(name):
    parts = [p.strip() for p in name.split('-')]
    parts = [p for p in parts if not re.search(r"\d+(\.\d+)?\s*kW", p, flags=re.IGNORECASE)]

    if len(parts) >= 3:
        facility = " - ".join(parts[:-2]).strip()
        fuel = parts[-2]
        state = parts[-1]

    elif len(parts) == 2:
        facility = parts[0]
        fuel = None
        state = parts[1]
    
    else:
        facility = parts[0]
        fuel = None
        state = None

    return pd.Series([facility, fuel, state])

accredited2[['facilityname1', 'primaryfuel1', 'state1']] = accredited2['facilityname'].apply(split_name)
accredited2.head()




Unnamed: 0,code,facilityname,state,postcode,capacityMWh,primaryfuel,statusdate,status,facilityname1,primaryfuel1,state1
70,SRPXQLF6,"Heidke St, Avoca - Solar w SGU - QLD",QLD,4670,0.198,Solar,17/03/2025,accredited,"Heidke St, Avoca",Solar w SGU,QLD
71,SRPYNS82,HNELHD Tamworth - Solar - NSW,NSW,2340,1.063,Solar,17/03/2025,accredited,HNELHD Tamworth,Solar,NSW
72,SRPYNS79,ISLHD Shoalhaven Hospital - Solar - NSW,NSW,2541,0.5,Solar,17/03/2025,accredited,ISLHD Shoalhaven Hospital,Solar,NSW
73,SRPYNS60,Kingspan Insulated Panels Pty Limited - Solar ...,NSW,2760,0.16,Solar,17/03/2025,accredited,Kingspan Insulated Panels Pty Limited,Solar w SGU,NSW
74,SRPXQLE5,Ornatas Tasmanian Lobster Hatchery - Solar - QLD,QLD,4816,0.158,Solar,17/03/2025,accredited,Ornatas Tasmanian Lobster Hatchery,Solar,QLD
75,WD00SA24,Goyder South Wind Farm 1B - Wind - SA,SA,5417,195.693,Wind,27/03/2025,accredited,Goyder South Wind Farm 1B,Wind,SA
76,SRPYNS08,Agile Energy - ECB - 348kW Solar w SGU - NSW,NSW,2250,0.348,Solar,31/03/2025,accredited,Agile Energy - ECB,348kW Solar w SGU,NSW
77,SRPVWAL0,ATCO Gas Australia Jandakot Microgrid - Solar ...,WA,6164,0.301,Solar,31/03/2025,accredited,ATCO Gas Australia Jandakot Microgrid,Solar,WA
78,SRPXQLH8,Caloundra NMI 3117361486 - Solar w SGU - QLD,QLD,4551,0.348,Solar,31/03/2025,accredited,Caloundra NMI 3117361486,Solar w SGU,QLD
79,SRPVWAM6,City of Stirling (Administration Centre) – Sol...,WA,6021,0.47,Solar,31/03/2025,accredited,City of Stirling (Administration Centre) – Sol...,,WA


### Committed

In [37]:
committed.head(2)

Unnamed: 0,Project Name,State,MW Capacity,Fuel Source,Committed Date (Month/Year)
0,East Rockingham Resource Recovery Facility,WA,29.0,Biomass,Dec-2019
1,Mangalore Renewable Energy Project,VIC,5.0,Solar,Sep-2021


In [None]:
# Rename columns & drop unnecessary columns
committed_clean = committed.copy()
committed_clean = committed_clean.rename(columns = {
                                                        'Project Name': 'facilityname',
                                                        'State': 'state',
                                                        'MW Capacity': 'capacityMWh',
                                                        'Postcode': 'postcode',
                                                        'Fuel Source (s)': 'primaryfuel',
                                                        'Approval date': 'statusdate',
                                                        'Accreditation code': 'code'})


accredited_clean['status'] = 'accredited'
accredited_clean.drop(columns = ['Accreditation start date'], inplace = True)

accredited_clean.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Project Name                 84 non-null     object 
 1   State                        84 non-null     object 
 2   MW Capacity                  84 non-null     float64
 3   Fuel Source                  84 non-null     object 
 4   Committed Date (Month/Year)  35 non-null     object 
 5   Accreditation code           280 non-null    object 
 6   Power station name           280 non-null    object 
 7   State                        280 non-null    object 
 8   Postcode                     280 non-null    float64
 9   Installed capacity (MW)      280 non-null    float64
 10  Fuel Source (s)              280 non-null    object 
 11  Accreditation start date     280 non-null    object 
 12  Approval date                280 non-null    object 
dtypes: float64(3), objec