In [50]:
import requests
from bs4 import BeautifulSoup
import time
import lxml.etree as ET
import csv
import pandas as pd
from datetime import datetime

In [None]:
# Uncomment and run once; skip this cell if Selenium and chromedriver are already installed.
# !pip install selenium
# !apt-get update
# !apt install chromium-chromedriver

In [61]:
from selenium import webdriver

def driversetup():
    """Build a headless Chrome WebDriver and return it.

    The Chrome command-line switches are applied in the order listed below,
    then a script is run in the freshly opened browser so that
    ``navigator.webdriver`` reads as ``undefined``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    chrome_flags = (
        '--headless',               # run Selenium in headless mode
        '--no-sandbox',
        '--disable-dev-shm-usage',  # overcome limited resource problems
        "lang=en",
        "start-maximized",          # open browser in maximized mode
        "disable-infobars",         # disable infobars
        "--disable-extensions",     # disable extensions
        "--incognito",
        "--disable-blink-features=AutomationControlled",
    )

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    browser = webdriver.Chrome(options=options)
    browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
    return browser


# Shared browser instance used by later cells as a fallback page loader
driver = driversetup()

In [52]:
# URL of the page containing the table of districts
url = 'https://338canada.com/districts.htm'

# Fetch the page. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops early on an HTTP error instead of
# silently parsing an error page into an empty list of links.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Collect the first link found in the left-aligned cell of each table row
links = []
for row in soup.find_all('tr'):
    cell = row.find('td', align='left')
    if cell is None:
        continue
    a_tag = cell.find('a', href=True)
    if a_tag is not None:
        links.append(a_tag['href'].strip())

# Maximum number of links processed by the later cells (they slice links[:n]).
# Lower it (e.g. to 5) for a quick test run.
n = 339

In [53]:
# Empty accumulators: later cells add one SVG string and one district number per link
svg_contents, district_numbers = [], []

['https://338canada.com/10001e.htm', 'https://338canada.com/10002e.htm', 'https://338canada.com/10003e.htm', 'https://338canada.com/10004e.htm', 'https://338canada.com/10005e.htm', 'https://338canada.com/10006e.htm', 'https://338canada.com/10007e.htm', 'https://338canada.com/11001e.htm', 'https://338canada.com/11002e.htm', 'https://338canada.com/11003e.htm', 'https://338canada.com/11004e.htm', 'https://338canada.com/12001e.htm', 'https://338canada.com/12002e.htm', 'https://338canada.com/12003e.htm', 'https://338canada.com/12004e.htm', 'https://338canada.com/12005e.htm', 'https://338canada.com/12006e.htm', 'https://338canada.com/12007e.htm', 'https://338canada.com/12008e.htm', 'https://338canada.com/12009e.htm', 'https://338canada.com/12010e.htm', 'https://338canada.com/12011e.htm', 'https://338canada.com/13001e.htm', 'https://338canada.com/13002e.htm', 'https://338canada.com/13003e.htm', 'https://338canada.com/13004e.htm', 'https://338canada.com/13005e.htm', 'https://338canada.com/1300

In [62]:
# Process only the first n links.
# Start from an empty list so that re-running this cell does not append a
# second copy of every SVG.
svg_contents = []

for index, link in enumerate(links[:n]):
    # Construct SVG ID for each link based on its index
    svg_id = f'ridinghisto-{index}'

    # First attempt: plain HTTP request (bounded by a timeout so one stalled
    # page cannot hang the whole run).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        svg_element = soup.find('svg', id=svg_id)
    except requests.RequestException as exc:
        print(f"Request failed for {link}: {exc}")
        svg_element = None

    # Second attempt: load the page in the Selenium browser and search its
    # page source for the same SVG.
    if svg_element is None:
        print(link)
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        svg_element = soup.find('svg', id=svg_id)

    if svg_element is None:
        # Store an empty <svg> rather than the text "None": every link then
        # still contributes exactly one <svg> element, so entry i of
        # svg_contents keeps corresponding to link i.
        print(f"WARNING: no <svg id='{svg_id}'> found for {link}; storing an empty placeholder.")
        svg_contents.append(f'<svg id="{svg_id}"></svg>')
    else:
        # Save the SVG's XML content as a string in the list
        svg_contents.append(str(svg_element))


https://338canada.com/24068e.htm


In [64]:
# Rebuild the list from scratch so that re-running this cell cannot append
# duplicate entries. The district number is the last path segment of the
# link with the 'e.htm' part removed.
district_numbers = [
    link.split('/')[-1].replace('e.htm', '')
    for link in links[:n]
]


In [65]:
# Destination text file for the scraped SVGs
filename = "all_svgs_combined.txt"

# Join every stored SVG string into a single newline-separated string
combined_svg_contents = "\n".join(svg_contents)

with open(filename, 'w') as out_file:
    out_file.write(combined_svg_contents)

print(f"All SVG contents have been saved to {filename}.")

All SVG contents have been saved to all_svgs_combined.txt.


In [66]:
# (party abbreviation, fill colour) pairs, in lookup-table order
_PARTY_COLOR_PAIRS = (
    ('LPC', '#d90000'),
    ('CPC', '#0202ff'),
    ('NDP', '#E17C0D'),
    ('PPC', '#551A8B'),
    ('GPC', '#269b26'),
    ('BQ', '#12bbff'),
    ('IND', '#606060'),
    ('MAV', '#84BD00'),
)

# Lookup table from a text element's fill colour to the party abbreviation
color_to_party = {hex_code: party for party, hex_code in _PARTY_COLOR_PAIRS}

def extract_projections_with_party_names_and_split(svg_content, label_x="307.2"):
    """Extract per-party projection figures from one or more SVG documents.

    Every ``<svg>`` element found in ``svg_content`` produces one dictionary.
    Inside each SVG, a ``<text>`` element is treated as a projection label
    when its text contains "±" and its ``x`` attribute equals ``label_x``.
    The text is split on "±": the part before it (trailing "%" removed) is
    stored under ``"<party> %"`` and the part after it under
    ``"<party> Margin"``. The party is looked up from the element's ``fill``
    colour in ``color_to_party``.

    Args:
        svg_content: Markup (string) containing zero or more ``<svg>`` elements.
        label_x: Value of the ``x`` attribute that identifies projection
            labels. Defaults to ``"307.2"``, the value previously hard-coded.

    Returns:
        A list of dictionaries, one per ``<svg>`` element in document order.
        Values are strings. A label whose colour is missing or not in
        ``color_to_party`` is stored under the party name ``"Unknown"``.
    """
    soup = BeautifulSoup(svg_content, 'html.parser')

    # Hex colours are case-insensitive, and the table itself mixes upper- and
    # lower-case codes, so compare on a lower-cased copy of the table.
    party_by_color = {color.lower(): party for color, party in color_to_party.items()}

    all_projections = []
    for svg in soup.find_all('svg'):
        proj_data = {}

        for element in svg.find_all('text'):
            if "±" not in element.text or element.get('x') != label_x:
                continue

            text_parts = element.text.strip().split("±")
            percentage = text_parts[0].strip().rstrip('%')
            margin = text_parts[1].strip().rstrip('%')

            # .get() avoids a KeyError when a label carries no fill attribute
            color = element.get('fill') or ''
            party_name = party_by_color.get(color.strip().lower(), "Unknown")

            proj_data[f'{party_name} %'] = percentage
            proj_data[f'{party_name} Margin'] = margin

        all_projections.append(proj_data)

    return all_projections

# Load the combined SVG text from 'all_svgs_combined.txt' back into memory
with open('all_svgs_combined.txt', 'r') as svg_file:
    svg_content = svg_file.read()

# Turn the loaded SVG text into a list of per-SVG projection dictionaries
projection_data = extract_projections_with_party_names_and_split(svg_content)

In [67]:
# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(projection_data)

# District numbers are attached by position. If the two lengths differ, the
# assignment below silently truncates or pads with NaN and rows may carry the
# wrong district number, so make the mismatch visible.
if len(df) != len(district_numbers):
    print(
        f"WARNING: {len(df)} projection rows but {len(district_numbers)} district numbers; "
        "the 'District Number' column may be misaligned."
    )

df['District Number'] = pd.Series(district_numbers)

# Move 'District Number' to the front, keeping the other columns in order
columns = ['District Number'] + [col for col in df.columns if col != 'District Number']
df = df[columns]

# Output files are stamped with today's date (YYYY-MM-DD)
today_date = datetime.now().strftime("%Y-%m-%d")
excel_filename = f"projection_data_{today_date}.xlsx"
csv_filename = f"projection_data_{today_date}.csv"

df.to_excel(excel_filename, index=False)
df.to_csv(csv_filename, index=False)

# Report the file names that were actually written (they include the date)
print(f"Data has been saved to '{excel_filename}' and '{csv_filename}'.")

Data has been saved to 'projection_data.xlsx' and 'projection_data.csv'.
