### Imports

In [None]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

### Directory Definitions

In [None]:
data_dir = "html_files/"
formatted_data_dir = "formatted_cases/"
case_data_path = "case_data/cases.csv"
scraped_searches_path = "scraped_searches/"

# Create the directory if it doesn't exist
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Create the directory if it doesn't exist
if not os.path.exists(formatted_data_dir):
    os.makedirs(formatted_data_dir)

# Unannotated Data

### Get URLs of every case

In [None]:
errors_list = []
wait_timeout = 10

decisions_by_year_path = "decisions_by_year/"
driver = webdriver.Chrome()
for year in range(2005,2023):
    url = "https://www.canlii.org/en/on/onltb/nav/date/" + str(year) + "/"
    driver.get(url)
    
    # Wait for the presence of the specific element with class "name"
    wait = WebDriverWait(driver, wait_timeout)
    wait.until(EC.presence_of_element_located((By.ID, "decisionsListing")))
    time.sleep(2)

    # Wait for the "Show more results" button to be present within the tbody element
    show_more_results_locator = (By.CSS_SELECTOR, "span.link.showMoreResults")
    wait.until(EC.presence_of_element_located(show_more_results_locator))

    while True:
        # Check if the "Show more results" button is present
        show_more_results_button = driver.find_element(*show_more_results_locator)
        if not show_more_results_button.is_displayed():
            break  # Exit the loop if the button is no longer displayed

        # Click the "Show more results" button
        show_more_results_button.click()

        # Wait for the page to load after clicking the button
        time.sleep(2)
        # wait = WebDriverWait(driver, wait_timeout)
        # wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='result-item']")))
    
    # Get all content, including dynamically generated content
    html_content = driver.page_source

    # Save the content to a file
    with open(decisions_by_year_path + str(year) + ".html", "w", encoding="utf-8") as file:
        file.write(html_content)

    print(year, "saved.")
    
driver.quit()    
    
    

### Put URLs into DF by year

urls_by_year_df = pd.DataFrame(columns=['year', 'case_URL'])

for file in os.listdir(decisions_by_year_path):
    if os.path.isfile(decisions_by_year_path + file):
        # try:
        if not file.startswith('.'):
            with open(decisions_by_year_path + file) as f:
                html = f.read()
                soup = BeautifulSoup(html, "html.parser")
                
                decisions_list = soup.find('tbody', id='decisionsListing')
                anchor_tags = decisions_list.find_all("a")
                for a in anchor_tags:
                    case_url = 'https://www.canlii.org/' + a['href']
                    data = {'year': [os.path.splitext(file)[0]], 'case_URL': [case_url]}
                    urls_by_year_df = urls_by_year_df.append(pd.DataFrame(data), ignore_index=True)


In [None]:
# urls_by_year_df[urls_by_year_df['year'] == "2008"]
urls_by_year_df

### All URLs to documents

Done:
- 2022

In [None]:
urls_by_year_df = urls_by_year_df[urls_by_year_df['year'] == "2022"]
urls_list = urls_by_year_df['case_URL'].tolist()
# Directory to save the HTML files
output_dir = "45k_scraped_html_files/"

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
driver = webdriver.Chrome()
# Loop through the case IDs and URLs
for i, url in enumerate(urls_list):
    # Generate a file name based on the case ID
    filename = f"case_{i}.html"
    if os.path.isfile(os.path.join(output_dir, filename)):
        print(filename, "already added, skipping...")
    else:
        # Navigate to the URL
        driver.get(url)

        # Get the page source
        html_content = driver.page_source

        # Save the HTML content to a file
        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as file:
            file.write(html_content)

        print(f"Saved HTML content for case ID {i}.")

# Close the WebDriver
driver.quit()

### Get data from html files

In [None]:
output_dir = "45k_scraped_html_files/"
formatted_data_dir = "45k_formatted_cases/"

for file in os.listdir(output_dir):
    try:
        if os.path.isfile(output_dir + file) and not file.startswith('.'):
            # if os.path.isfile(formatted_data_dir + case_ID + '.txt'):
            #     print(file, "already added, skipping...")
            # else:
            print("Adding ", file, "...")
            with open(output_dir + file) as f:
                html = f.read()
            soup = BeautifulSoup(html, "html.parser")

            # find metadata
            document_meta = soup.find("div", {"id": "documentMeta"}) 
            meta_items = document_meta.find_all("div", {"class": "row py-1"})

            case_ID = ""
            meta_data = []
            for meta_item in meta_items:
                children_text = []
                for x in meta_item.findChildren()[:2]:
                    children_text.append(x.text)
                child_string = '\t'.join(children_text)
                if "file number" in child_string.lower():
                    case_ID = child_string.split("\t")[1].strip()
                    # print(case_ID)
                meta_data.append(child_string)

            # print(meta_data)
            # find text
            document_body = soup.find("div", {"class": "documentcontent"}).get_text()

            # write to file
            with open(formatted_data_dir + case_ID + '.txt', 'w') as file:
                file.write('Metadata:\n')
                file.write('\n'.join(meta_data))
                file.write('Content:\n')
                file.write(document_body)
    except:
        print("Error with:", file)

# Annotated Data

### Get case IDs

In [231]:
df = pd.read_csv(case_data_path)
case_IDs = df['What is the file number of the case?'].tolist()
# case_IDs

In [None]:
# Set up Selenium WebDriver 
driver = webdriver.Chrome()

errors_list = []
wait_timeout = 10
for case_ID in case_IDs:
    if os.path.isfile(scraped_searches_path + case_ID + ".html"):
        print(case_ID, "already added, skipping...")
    else:
        try:
            url = "https://canlii.org/en/#search/id=" + case_ID
            driver.get(url)
            
            # Wait for the presence of the specific element with class "name"
            wait = WebDriverWait(driver, wait_timeout)
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "name")))
            
            # Get all content, including dynamically generated content
            html_content = driver.page_source

            # Save the content to a file
            with open(scraped_searches_path + case_ID + ".html", "w", encoding="utf-8") as file:
                file.write(html_content)

            print(case_ID, "saved.")
        except Exception as e:
            errors_list.append(case_ID)
            print(f"Error occurred for case ID {case_ID}: {str(e)}")

# Close the WebDriver
driver.quit()


In [233]:
urls_df = pd.DataFrame(columns=['case_ID', 'case_URL'])

for file in os.listdir(scraped_searches_path):
    if os.path.isfile(scraped_searches_path + file):
        try:
            with open(scraped_searches_path + file) as f:
                html = f.read()
                soup = BeautifulSoup(html, "html.parser")
                # Find the <a> element within the <span> element
                a_element = soup.find('span', class_='name').find('a')

                # Extract the value of the href attribute
                href = a_element['href']
                case_url = 'https://www.canlii.org/' + href
                # print(case_url)
                data = {'case_ID': [os.path.splitext(file)[0]], 'case_URL': [case_url]}
                urls_df = urls_df.append(pd.DataFrame(data), ignore_index=True)
        except Exception as e:
            errors_list.append(case_ID)
            print(f"Error occurred for case ID {case_ID}: {str(e)}")

Error occurred for case ID TEL-01869-19: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


In [235]:
# urls_df.iloc[11, 1]
# urls_df

### Get all HTML files from URLS_DF

In [None]:
from selenium import webdriver
import os

# Set up Selenium WebDriver (you may need to download and configure the appropriate WebDriver for your browser)
driver = webdriver.Chrome()

# Get the lists of case IDs and URLs
ids_list = urls_df['case_ID'].tolist()
urls_list = urls_df['case_URL'].tolist()

# Directory to save the HTML files
output_dir = "scraped_html_files/"

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Loop through the case IDs and URLs
for case_id, url in zip(ids_list, urls_list):
    # Generate a file name based on the case ID
    filename = f"{case_id}.html"
    if os.path.isfile(os.path.join(output_dir, filename)):
        print(filename, "already added, skipping...")
    else:
        # Navigate to the URL
        driver.get(url)

        # Get the page source
        html_content = driver.page_source

        # Save the HTML content to a file
        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as file:
            file.write(html_content)

        print(f"Saved HTML content for case ID {case_id}.")

# Close the WebDriver
driver.quit()


In [None]:
# ids_list = urls_df['case_ID'].tolist()
# urls_list = urls_df['case_URL'].tolist()
# output_dir = "scraped_html_files/"
# # urls_list

# for id, url in zip(ids_list, urls_list):
#     # Generate a file name based on the index
#     filename = f"{id}.html"
#     response = requests.get(url)
#     # Save the HTML content to a file
#     with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
#         file.write(response.text)

In [None]:
# driver.quit()

### Scraping from folder

In [None]:
output_dir = "scraped_html_files/"
formatted_data_dir = "formatted_cases/"
for file in os.listdir(output_dir):
    if os.path.isfile(output_dir + file):
        if os.path.isfile(formatted_data_dir + os.path.splitext(file)[0] + ".txt"):
            print(file, "already added, skipping...")
        else:
            if not file.startswith('.'):
                print("Adding ", file, "...")
                with open(output_dir + file) as f:
                    html = f.read()
                soup = BeautifulSoup(html, "html.parser")

                # find metadata
                document_meta = soup.find("div", {"id": "documentMeta"}) 
                meta_items = document_meta.find_all("div", {"class": "row py-1"})
                meta_data = []
                for meta_item in meta_items:
                    meta_data.append('\t'.join([x.text for x in meta_item.findChildren()]))
                # print(meta_data)
                # find text
                document_body = soup.find("div", {"class": "documentcontent"}).get_text()

                # write to file
                with open(formatted_data_dir + os.path.splitext(file)[0] + '.txt', 'w') as file:
                    file.write('Metadata:\n')
                    file.write('\n'.join(meta_data))
                    file.write('Content:\n')
                    file.write(document_body)