In [1]:
import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

To scrape data from the retired tab of the BC offsets registry, use the code below. Note: there are 65 pages to scrape.

In [None]:
# Troubleshooting: if the driver raises errors, first delete the cached driver
# by running this in a terminal:
#   rm -rf ~/.wdm/drivers/chromedriver/
# and then run the code below. The code is wrapped in a triple-quoted string,
# so it does not execute as written; remove the surrounding ''' lines to run it.
'''
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Force webdriver-manager to get the latest compatible driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

print("ChromeDriver is working!")
driver.quit()
'''

ChromeDriver is working!


The number of items per page was found by looking at the URL pattern: the `start` parameter increases by 15 from one page to the next, i.e. the view is limited to 15 items per page.

In [None]:
# ---------------------------------------------------------------------------
# Registry tab selection
# ---------------------------------------------------------------------------
# Keep exactly one (npage, base_url) pair active. Each URL ends in "start={}";
# the placeholder is filled with the row offset of the requested page
# (page index * rows per page), not with the page number itself.
# The commented-out pairs are the other registry tabs.

# Account Holders
#npage = 6
#base_url = "https://carbonregistry.gov.bc.ca/br-reg/public/bc/index.jsp?entity=account&name=&standardId=&acronym=&unitClass=&sort=account_name&dir=ASC&start={}"

# Projects
#npage = 2
#base_url = "https://carbonregistry.gov.bc.ca/br-reg/public/bc/index.jsp?entity=project&name=&standardId=&acronym=&unitClass=&sort=project_name&dir=ASC&start={}"

# Issuances
#npage = 8
#base_url = "https://carbonregistry.gov.bc.ca/br-reg/public/bc/index.jsp?entity=issuance&name=&standardId=&acronym=&unitClass=&sort=account_name&dir=ASC&start={}"

# Holdings (currently active)
npage = 6
base_url = "https://carbonregistry.gov.bc.ca/br-reg/public/bc/index.jsp?entity=holding&name=&standardId=&acronym=&unitClass=&sort=account_name&dir=ASC&start={}"

# Retirements (no npage is recorded for this tab here; set one before enabling)
#base_url = "https://carbonregistry.gov.bc.ca/br-reg/public/bc/index.jsp?entity=retirement&name=&standardId=&acronym=&unitClass=&sort=retirement_date&dir=ASC&start={}"

# Cancellations (no npage is recorded for this tab here; set one before enabling)
#base_url = "https://carbonregistry.gov.bc.ca/br-reg/public/bc/index.jsp?entity=Cancelled&name=&standardId=&acronym=&unitClass=&sort=retirement_date&dir=ASC&start={}"

# ---------------------------------------------------------------------------
# Browser session
# ---------------------------------------------------------------------------
# Start Chrome through Selenium using the driver that ChromeDriverManager
# installs. A WebDriver lets a program control a browser; `driver` is the
# handle the scraping code below uses to load pages.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Function to scrape data from one page
def scrape_page(page_num, wait_seconds=10, page_size=15):
    """Scrape the first table found on one results page of the registry.

    The page is loaded in the module-level Selenium ``driver`` using the
    module-level ``base_url`` template, then parsed with BeautifulSoup.

    Parameters
    ----------
    page_num : int
        Zero-based page index. The ``start`` value inserted into ``base_url``
        is ``page_num * page_size``.
    wait_seconds : float, optional
        Fixed pause after navigation before the page source is read.
        Defaults to 10; increase it for slower pages.
    page_size : int, optional
        Number of rows the site shows per page. Defaults to 15.

    Returns
    -------
    list of list of str
        One list of whitespace-stripped cell texts per table row. The first
        ``<tr>`` (header) is skipped, and so is any row that contains no
        ``<td>`` cells. Returns an empty list, after printing a message,
        when the page has no ``<table>`` element.
    """
    # Build the URL for the requested page from its row offset.
    url = base_url.format(page_num * page_size)

    # Navigate to the page using Selenium.
    driver.get(url)

    # Pause before reading the source so the page has time to finish loading.
    time.sleep(wait_seconds)

    # Parse whatever the browser currently holds for this page.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Only the first <table> in the document is considered.
    table = soup.find('table')
    if table is None:
        print(f"Table not found on page {page_num + 1}")
        return []

    data = []
    for row in table.find_all('tr')[1:]:  # [1:] skips the header row
        cells = row.find_all('td')
        if not cells:
            # A row without <td> cells would otherwise be stored as an empty
            # list and end up as a blank record in the final table.
            continue
        data.append([cell.get_text(strip=True) for cell in cells])

    return data

# Scrape pages 1..npage of the selected tab and collect every row in all_data.
# npage is set next to base_url; raise it there to scrape more pages.
all_data = []
try:
    for page in range(npage):
        print(f"Scraping page {page + 1}...")
        page_data = scrape_page(page)
        if page_data:
            all_data.extend(page_data)
finally:
    # Close the browser even when a page raises, so a failed run does not
    # leave a Chrome session behind.
    driver.quit()

Scraping page 1...


In [None]:
# Name of the scraped tab; it becomes the suffix of the output file name.
tab_var = 'holdings'

# Column names for the scraped table. Keep the list that matches the tab chosen
# when scraping active; pd.DataFrame raises if the number of names does not
# match the number of cells in the scraped rows.
# Account Holders
#cols=['Account Name', 'Classification', 'Website']
# Projects
#cols=['Name', 'Project Type', 'Status', 'Validator', 'Proponent', 'Details']
# Issuances / Listings
# cols=["Vintage", "Project", "Account", "Project Type", "Verifier", "Units", 'Measurement', "Details"]
# Holdings
cols=["Vintage", "Project", "Account", "Standard", "Project Type", "Verifier", "Units", 'Measurement', 'Type', "Details"]
# Retirements
#cols=["Retirement Date", "Vintage", "Project", "Account", "Project Type", "Retirement Quantity", 'Measurement', 'Details']
# Cancellations
#cols=["Cancellation Date", "Vintage", "Project", 'Account',"Standard", "Project Type", "Cancellation Quantity", 'Measurement', 'Type','Details']
df = pd.DataFrame(all_data, columns=cols)

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
out_path = Path(f'../data/offset_registries/BCcarbon_registry/BCcarbon_registry_{tab_var}.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)