### Setup

In [44]:
pip install jupysql duckdb-engine

Note: you may need to restart the kernel to use updated packages.


##### DuckDB Connection

In [45]:
import pandas as pd
import numpy as np
import duckdb

conn = duckdb.connect("my_database.db")


### For data source 1: https://data.cer.gov.au/datasets/NGER/ID0243

##### Data Preparation

In [None]:
# Load the electricity sector emissions and generation data for the last ten years (2014 - 2024)
import requests
import pandas as pd
import time

def find_similar_column(target_col, available_cols):
    target_lower = target_col.lower()
    
    for col in available_cols:
        col_lower = col.lower()
        if target_lower == col_lower:
            return col
        
        target_clean = target_lower.replace('_', '').replace(' ', '').replace('-', '')
        col_clean = col_lower.replace('_', '').replace(' ', '').replace('-', '')
        
        if target_clean == col_clean:
            return col
        if target_clean in col_clean or col_clean in target_clean:
            return col
    
    return None

def find_similar_column(target_col, available_cols):
    target_lower = target_col.lower()
    
    for col in available_cols:
        col_lower = col.lower()
        
        # Exact match (case insensitive)
        if target_lower == col_lower:
            return col
        
        # Remove underscores, spaces, and common separators for comparison
        target_clean = target_lower.replace('_', '').replace(' ', '').replace('-', '')
        col_clean = col_lower.replace('_', '').replace(' ', '').replace('-', '')
        
        # Exact match after cleaning
        if target_clean == col_clean:
            return col
        
        # Bidirectional containing match
        if target_clean in col_clean or col_clean in target_clean:
            return col
    
    return None

def fetch_nger_data():
    urls = [
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0075?select%3D%2A",  # 2014
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0076?select%3D%2A",  # 2015
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0077?select%3D%2A",  # 2016
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0078?select%3D%2A",  # 2017
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0079?select%3D%2A",  # 2018
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0080?select%3D%2A",  # 2019
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0081?select%3D%2A",  # 2020
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0082?select%3D%2A",  # 2021
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0083?select%3D%2A",  # 2022
        "https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0243?select%3D%2A"   # 2023-2024
    ]
    
    years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023-2024]
    all_data = []
    base_columns = None
    
    for i, url in enumerate(urls):
        year = years[i]
        print(f"Fetching {year} data...")
        
        try:
            response = requests.get(url, timeout=30)
            data = response.json()
            
            if data:
                current_columns = list(data[0].keys())
                print(f"  Columns ({len(current_columns)}): {current_columns}")
                
                if i == len(urls) - 1:  # Last year (2023-2024) as base
                    base_columns = current_columns
                    print(f"  Base columns set (latest year)")
                    
                    for record in data:
                        record['data_year'] = year
                        all_data.append(record)
                else:
                    # Store data for later processing
                    if 'temp_data' not in locals():
                        temp_data = {}
                    temp_data[year] = {'data': data, 'columns': current_columns}
                
                print(f"  Processed {len(data)} records")
            else:
                print(f"  No data")
                
        except Exception as e:
            print(f"  Failed: {e}")
        
        time.sleep(0.5)
    
    # Process earlier years using the base columns from latest year
    if base_columns and 'temp_data' in locals():
        print(f"\nProcessing earlier years with base columns from 2023-2024:")
        for year, year_info in temp_data.items():
            print(f"  Mapping {year} columns:")
            data = year_info['data']
            current_columns = year_info['columns']
            
            column_mapping = {}
            for base_col in base_columns:
                similar_col = find_similar_column(base_col, current_columns)
                if similar_col:
                    column_mapping[base_col] = similar_col
                    if base_col != similar_col:
                        print(f"    {similar_col} -> {base_col}")
                else:
                    print(f"    {base_col} -> NOT FOUND")
            
            # Convert data using mappings
            for record in data:
                new_record = {'data_year': year}
                for base_col in base_columns:
                    source_col = column_mapping.get(base_col, base_col)
                    new_record[base_col] = record.get(source_col, None)
                all_data.append(new_record)
    
    # Create DataFrame
    if all_data and base_columns:
        df = pd.DataFrame(all_data)
        columns_order = base_columns + ['data_year']
        df = df[columns_order]
        
        print(f"\nCompleted: {len(df)} records, {len(df.columns)} columns")
        return df
    else:
        print("No data retrieved")
        return pd.DataFrame()

# Run
if __name__ == "__main__":
    df = fetch_nger_data()
    
    if not df.empty:
        df.to_csv('nger_data.csv', index=False)
        print(f"Saved to nger_data.csv")
        
        print(f"\nData preview:")
        print(df.head())
        
df = pd.read_csv("NGER.ID0243.csv")
print(df.shape) # Check the shape of the dataframe
columns = list(df) # Get the column names
columns

(775, 14)


['Reporting entity',
 'Facility name',
 'Type',
 'State',
 'Electricity production GJ',
 'Electricity production MWh',
 'Total scope 1 emissions t CO2 e',
 'Total scope 2 emissions t CO2 e',
 'Total emissions t CO2 e',
 'Emission intensity t CO2 e MWh',
 'Grid connected',
 'Grid',
 'Primary fuel',
 'Important notes']

In [47]:
df.head(5)

Unnamed: 0,Reporting entity,Facility name,Type,State,Electricity production GJ,Electricity production MWh,Total scope 1 emissions t CO2 e,Total scope 2 emissions t CO2 e,Total emissions t CO2 e,Emission intensity t CO2 e MWh,Grid connected,Grid,Primary fuel,Important notes
0,ACCIONA ENERGY OCEANIA PTY LTD,Cathedral Rocks Wind Farm,F,SA,481948,133874,57,127,184,0.0,On,NEM,Wind,-
1,ACCIONA ENERGY OCEANIA PTY LTD,Gunning Wind Farm,F,NSW,491409,136502,50,218,268,0.0,On,NEM,Wind,-
2,ACCIONA ENERGY OCEANIA PTY LTD,Mortlake South Wind Farm,F,VIC,1019352,283153,202,1128,1330,0.0,On,NEM,Wind,-
3,ACCIONA ENERGY OCEANIA PTY LTD,Mt Gellibrand Wind Farm,F,VIC,1025451,284847,99,1273,1372,0.0,On,NEM,Wind,-
4,ACCIONA ENERGY OCEANIA PTY LTD,Waubra Wind Farm,F,VIC,1954964,543046,186,1114,1300,0.0,On,NEM,Wind,-


##### Data Cleaning

In [48]:
df.dtypes

Reporting entity                    object
Facility name                       object
Type                                object
State                               object
Electricity production GJ            int64
Electricity production MWh           int64
Total scope 1 emissions t CO2 e      int64
Total scope 2 emissions t CO2 e      int64
Total emissions t CO2 e              int64
Emission intensity t CO2 e MWh     float64
Grid connected                      object
Grid                                object
Primary fuel                        object
Important notes                     object
dtype: object

In [49]:
wrk_df = df.copy() #Make a working copy of the dataset
wrk_df = wrk_df.drop(columns=['Important notes']) # Drop the 'Important notes' column as it is not needed for analysis
wrk_df.replace('-', np.nan, inplace=True) # Replace '-' with NaN for easier handling of missing values
print(wrk_df.shape) # Check the shape of the cleaned dataframe

(775, 13)


In [50]:
conn.sql("CREATE TABLE IF NOT EXISTS electricity_emissions AS SELECT * FROM wrk_df")
conn.sql("SELECT * FROM electricity_emissions").df()

Unnamed: 0,Reporting entity,Facility name,Type,State,Electricity production GJ,Electricity production MWh,Total scope 1 emissions t CO2 e,Total scope 2 emissions t CO2 e,Total emissions t CO2 e,Emission intensity t CO2 e MWh,Grid connected,Grid,Primary fuel
0,ACCIONA ENERGY OCEANIA PTY LTD,Cathedral Rocks Wind Farm,F,SA,481948,133874,57,127,184,0.0,On,NEM,Wind
1,ACCIONA ENERGY OCEANIA PTY LTD,Gunning Wind Farm,F,NSW,491409,136502,50,218,268,0.0,On,NEM,Wind
2,ACCIONA ENERGY OCEANIA PTY LTD,Mortlake South Wind Farm,F,VIC,1019352,283153,202,1128,1330,0.0,On,NEM,Wind
3,ACCIONA ENERGY OCEANIA PTY LTD,Mt Gellibrand Wind Farm,F,VIC,1025451,284847,99,1273,1372,0.0,On,NEM,Wind
4,ACCIONA ENERGY OCEANIA PTY LTD,Waubra Wind Farm,F,VIC,1954964,543046,186,1114,1300,0.0,On,NEM,Wind
...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,WIRTGEN ENERGY GLENROWAN PTY. LTD,Corporate Total,C,,766484,212912,60,543,603,,,,
771,WOOLOOGA HOLDCO 2 PTY LTD,Woolooga SF,F,QLD,1185741,329373,59,3658,3717,0.0,On,NEM,Solar
772,WOOLOOGA HOLDCO 2 PTY LTD,Corporate Total,C,,1185741,329373,59,3658,3717,,,,
773,YATPOOL SOLAR FARM HOLDCO PTY LTD,YATPOOL SOLAR FARM,F,VIC,461024,128062,0,361,361,0.0,On,NEM,Solar


### For data source 2: https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data


##### Data Retrieval

In [51]:
#Web Scraping to find CSV links on the CER page 
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
links = soup.find_all('a')
for link in links:
    href = link.get('href')
    if href and href.startswith('/document/'):
        if href.endswith('accredited') or href.endswith('committed') or href.endswith('probable'):
            response = requests.get("https://cer.gov.au" + href)
            print(href)
            filename = os.path.basename(href) + ".csv"
            if os.path.exists(filename):
                print(f"{filename} already exists. Skipping download.")
                continue
            else:
                with open(filename, "wb") as f:
                    f.write(response.content)
            

/document/power-stations-and-projects-accredited
power-stations-and-projects-accredited.csv already exists. Skipping download.
/document/power-stations-and-projects-committed
power-stations-and-projects-committed.csv already exists. Skipping download.
/document/power-stations-and-projects-probable
power-stations-and-projects-probable.csv already exists. Skipping download.


In [52]:
accredited_power_station_df = pd.read_csv("power-stations-and-projects-accredited.csv")
committed_power_station_df = pd.read_csv("power-stations-and-projects-committed.csv")
probable_power_station_df = pd.read_csv("power-stations-and-projects-probable.csv")
accredited_power_station_df.head(5)

Unnamed: 0,Accreditation code,Power station name,State,Postcode,Installed capacity (MW),Fuel Source (s),Accreditation start date,Approval date
0,SRPXQLE8,"Laura Johnson Home, Townview - Solar w SGU - QLD",QLD,4825,0.2265,Solar,15/10/2024,13/01/2025
1,SRPYNS39,Leppington - Solar - NSW,NSW,2179,0.732,Solar,22/11/2024,13/01/2025
2,SRPYNS58,Quakers Hillside Care Community - Solar w SGU ...,NSW,2763,0.1996,Solar,19/12/2024,13/01/2025
3,SRPXVCN4,Rest Nominees - Solar wSGU - VIC,VIC,3008,0.1188,Solar,20/09/2024,13/01/2025
4,SRPXQLF9,Retail First Mt Ommaney-Solar-QLD,QLD,4074,1.0004,Solar,29/10/2024,13/01/2025


In [53]:
accredited_power_station_df.dtypes

Accreditation code           object
Power station name           object
State                        object
Postcode                      int64
Installed capacity (MW)     float64
Fuel Source (s)              object
Accreditation start date     object
Approval date                object
dtype: object

In [54]:
accredited_power_station_df['Accreditation start date'] = pd.to_datetime(accredited_power_station_df['Accreditation start date'], format="%d/%m/%Y")
accredited_power_station_df['Approval date'] = pd.to_datetime(accredited_power_station_df['Approval date'], format="%d/%m/%Y")
accredited_power_station_df.dtypes

Accreditation code                  object
Power station name                  object
State                               object
Postcode                             int64
Installed capacity (MW)            float64
Fuel Source (s)                     object
Accreditation start date    datetime64[ns]
Approval date               datetime64[ns]
dtype: object

In [55]:
committed_power_station_df.dtypes

Project Name                    object
State                           object
MW Capacity                    float64
Fuel Source                     object
Committed Date (Month/Year)     object
dtype: object

In [56]:
probable_power_station_df.dtypes

Project Name     object
State            object
MW Capacity     float64
Fuel Source      object
dtype: object

In [57]:
conn.sql("drop table if exists accredited_power_stations")
conn.sql("CREATE TABLE IF NOT EXISTS accredited_power_stations AS SELECT *, 'accredited' AS project_status FROM accredited_power_station_df")
conn.sql("select * from accredited_power_stations").df()

Unnamed: 0,Accreditation code,Power station name,State,Postcode,Installed capacity (MW),Fuel Source (s),Accreditation start date,Approval date,project_status
0,SRPXQLE8,"Laura Johnson Home, Townview - Solar w SGU - QLD",QLD,4825,0.2265,Solar,2024-10-15,2025-01-13,accredited
1,SRPYNS39,Leppington - Solar - NSW,NSW,2179,0.7320,Solar,2024-11-22,2025-01-13,accredited
2,SRPYNS58,Quakers Hillside Care Community - Solar w SGU ...,NSW,2763,0.1996,Solar,2024-12-19,2025-01-13,accredited
3,SRPXVCN4,Rest Nominees - Solar wSGU - VIC,VIC,3008,0.1188,Solar,2024-09-20,2025-01-13,accredited
4,SRPXQLF9,Retail First Mt Ommaney-Solar-QLD,QLD,4074,1.0004,Solar,2024-10-29,2025-01-13,accredited
...,...,...,...,...,...,...,...,...,...
275,SRPXVCT8,Liuzzi 71 Gower St - Solar - VIC,VIC,3072,0.4000,Solar,2025-07-21,2025-08-27,accredited
276,SRPXQLM5,Mercy Community - Solar w SGU - QLD,QLD,4014,0.8530,Solar,2025-08-04,2025-08-27,accredited
277,SRPXVCP9,Norther Airfield - Solar - VIC,VIC,3045,11.2780,Solar,2025-06-01,2025-08-27,accredited
278,SRPVWAN8,Rose Farms - Solar wSGU- WA,WA,6220,0.2000,Solar,2025-08-04,2025-08-27,accredited


In [58]:
conn.sql("create table if not exists committed_power_stations as select *, 'committed' as project_status from committed_power_station_df;")
conn.sql("select * from committed_power_stations;").df()

Unnamed: 0,Project Name,State,MW Capacity,Fuel Source,Committed Date (Month/Year),project_status
0,East Rockingham Resource Recovery Facility,WA,29.0,Biomass,Dec-2019,committed
1,Mangalore Renewable Energy Project,VIC,5.0,Solar,Sep-2021,committed
2,Orange Community Renewable Energy Park,NSW,5.0,Solar,Jul-2022,committed
3,Moorebank Logistics Park,NSW,60.0,Solar,Sep-2022,committed
4,Wangaratta Solar Farm,VIC,40.0,Solar,Jul-2023,committed
5,Kidston Pumped Hydro Storage Project,QLD,250.0,Hydro,Jul-2023,committed
6,New England Solar Farm - Stage 2,NSW,320.0,Solar,Jul-2023,committed
7,Bellevue Gold Hybrid Power Station (Wind),WA,24.0,Wind,Sep-2023,committed
8,Glenellen solar project,NSW,200.0,Solar,Nov-2023,committed
9,Forest Glen Solar Farm,NSW,90.0,Solar,Dec-2023,committed


In [59]:
conn.sql("create table if not exists probable_power_stations as select *, 'probable' as project_status from probable_power_station_df;")
conn.sql("select * from probable_power_stations;").df()

Unnamed: 0,Project Name,State,MW Capacity,Fuel Source,project_status
0,Barnawartha Solar Farm,VIC,64.0,Solar,probable
1,Barwon solar farm,VIC,250.0,Solar,probable
2,Boddington Giga Energy,WA,400.0,Solar,probable
3,Bulli Creek Solar project Stage 1,QLD,775.0,Solar,probable
4,Bullyard Solar Farm,QLD,100.0,Solar,probable
5,Bungaban Wind Farm,QLD,1400.0,Wind,probable
6,Byford Solar Project,WA,30.0,Solar,probable
7,Campbells Forest Solar Farm,VIC,205.0,Solar,probable
8,Coppabella Wind Farm,NSW,284.0,Wind,probable
9,Derby Solar Farm & Battery,VIC,95.0,Solar,probable


In [60]:
conn.query("SHOW TABLES").df()

Unnamed: 0,name
0,accredited_power_stations
1,committed_power_stations
2,electricity_emissions
3,probable_power_stations


In [61]:
import requests
def geocode_address(address):
    API_KEY = "AIzaSyDy1sAL8Lepu8JgRbGnLLGFKu8FE8E-krU"
    response = requests.get(f"https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={API_KEY}")
    geometry = response.json().get("results", [])[0].get("geometry", {}).get("location", {})
    lat, lng = geometry.get("lat"), geometry.get("lng")
    #print(f"Address: {address}, Latitude: {lat}, Longitude: {lng}") 
    return lat, lng
    

lat, lng = geocode_address("Barnawartha Solar Farm")
lat, lng

(-36.0959962, 146.6965122)

In [62]:
%%sql
create view if not exists power_stations as   
select
    name,
    location,
    capacity,
    type
from
    accredited_power_stations
union
select
    `Project name` as name,
    `Location` as location,
    `Capacity (MW)` as capacity,
    `Generation technology` as type 
from
    committed_power_stations
union
select
    `Project name` as name,
    `Location` as location,
    `Capacity (MW)` as capacity,
    `Generation technology` as type
from
    probable_power_station;

UsageError: Cell magic `%%sql` not found.


### For data source 3: https://www.abs.gov.au/methodologies/data-region-methodology/2011-24#data-downloads