In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

import pickle

# Shared Chrome configuration used by every scraper in this notebook when it
# starts a browser via webdriver.Chrome(...).
CHROME_OPTIONS = webdriver.ChromeOptions()
# Run Chrome without opening a visible browser window.
CHROME_OPTIONS.add_argument("--headless")

In [None]:
def load_cookies(driver, cookies):
    """Register every cookie in ``cookies`` with ``driver``.

    Args:
        driver: Object exposing ``add_cookie`` (a Selenium WebDriver in this
            notebook).
        cookies: Iterable of cookie objects; each one is handed to
            ``driver.add_cookie`` in iteration order.
    """
    for entry in cookies:
        driver.add_cookie(entry)

In [None]:
# Get fivethirtyeight data

def scrape_fivethirtyeight(url = "https://projects.fivethirtyeight.com/polls/president-general/2024/national/"):
    """Scrape the two headline labels from a FiveThirtyEight polling page.

    Args:
        url: Page to load. Defaults to the 2024 national presidential page.

    Returns:
        list[str]: Text of the first two ``text`` elements found inside the
        first ``.label-group`` element. Presumably each is a
        "<candidate> <value>" string, since ``get_averages`` feeds the result
        to ``convert_to_dict`` -- confirm against the live page.

    Raises:
        TimeoutException: ``.label-group`` did not appear within 30 seconds.
        StaleElementReferenceException: May propagate if the located elements
            are replaced in the DOM while their text is being read.
    """
    # Pass options by keyword: `options` is not the first positional parameter
    # of webdriver.Chrome in every Selenium 4 release.
    d = webdriver.Chrome(options=CHROME_OPTIONS)

    try:
        d.get(url)
        # The wait returns the located element, so it can be reused directly.
        label_group = WebDriverWait(d, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".label-group"))
        )
        labels = label_group.find_elements(By.TAG_NAME, "text")
        # Selenium errors propagate unchanged so their message and traceback
        # are preserved for the caller.
        return [label.text for label in labels[:2]]
    finally:
        # Always shut the browser down, even when scraping fails.
        d.quit()

In [None]:
# Get realclearpolling data

def scrape_realclearpolling(url = "https://www.realclearpolling.com/polls/president/general/2024/trump-vs-harris-vs-kennedy-vs-stein-vs-west"):
    """Scrape two table cells from a RealClearPolling average page.

    Args:
        url: Page to load. Defaults to the 2024 five-way general-election page.

    Returns:
        list[str]: ``['Harris <x>%', 'Trump <y>%']`` where ``<x>`` and ``<y>``
        are the text of the fifth and sixth ``td`` elements on the page
        (indices 4 and 5). NOTE(review): this assumes those cells hold the
        Harris and Trump averages -- confirm against the page layout.

    Raises:
        TimeoutException: No table with more than five ``td`` cells appeared
            within 30 seconds.
        StaleElementReferenceException: May propagate if cells are replaced in
            the DOM while their text is being read.
    """

    def table_is_populated(drv):
        # Both conditions must hold. Joining two condition objects with `and`
        # would only evaluate the second one, because the first is always
        # truthy.
        if not drv.find_elements(By.TAG_NAME, "table"):
            return False
        return len(drv.find_elements(By.TAG_NAME, "td")) > 5

    # Pass options by keyword: `options` is not the first positional parameter
    # of webdriver.Chrome in every Selenium 4 release.
    d = webdriver.Chrome(options=CHROME_OPTIONS)

    try:
        d.get(url)
        WebDriverWait(d, 30).until(table_is_populated)
        tds = d.find_elements(By.TAG_NAME, "td")
        # Selenium errors propagate unchanged so their message and traceback
        # are preserved for the caller.
        return [f'Harris {tds[4].text}%', f'Trump {tds[5].text}%']
    finally:
        # Always shut the browser down, even when scraping fails.
        d.quit()

In [None]:
# Get NYT data

import time

def scrape_nyt(url = "https://www.nytimes.com/interactive/2024/us/elections/polls-president.html"):
    """Scrape the end-of-line labels from the NYT polling-average chart.

    The page is loaded once so that cookies from ``nyt_cookies.pkl`` can be
    installed, then reloaded. The second button inside
    ``#summaryharris .multi-buttons`` is clicked, and the function waits until
    exactly three non-empty ``.g-value`` labels are present.

    Args:
        url: Page to load. Defaults to the 2024 presidential polls page.

    Returns:
        list[str]: One "<second line> <first line>" string for each
        ``.g-endlabel-inner`` label whose text does not contain ``'Kennedy'``.

    Raises:
        FileNotFoundError: ``nyt_cookies.pkl`` is missing.
        TimeoutException: An expected element did not appear in time.
        StaleElementReferenceException: May propagate if elements are replaced
            in the DOM while being read.
        IndexError: A label's text does not contain a newline.
    """
    value_selector = "#summaryharris .primary-matchup .g-endlabel-inner .g-value"

    def clean_text(text):
        # The label text spans two lines; emit them in swapped order.
        parts = text.split('\n')
        return " ".join([parts[1], parts[0]])

    def values_rendered(drv):
        # Query once per poll so the count and the text checks see the same
        # set of elements.
        values = drv.find_elements(By.CSS_SELECTOR, value_selector)
        return len(values) == 3 and all(value.text.strip() for value in values)

    # Pass options by keyword: `options` is not the first positional parameter
    # of webdriver.Chrome in every Selenium 4 release.
    d = webdriver.Chrome(options=CHROME_OPTIONS)

    try:
        # Selenium only accepts cookies for the currently loaded domain, so the
        # page has to be opened before the cookies are installed.
        d.get(url)

        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file. Only use a cookie file you created yourself.
        with open('nyt_cookies.pkl', "rb") as f:
            cookies = pickle.load(f)
        load_cookies(d, cookies)

        # Refresh the page to apply the cookies, then navigate to the target.
        d.refresh()
        d.get(url)

        WebDriverWait(d, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#summaryharris .multi-buttons"))
        )
        button = WebDriverWait(d, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#summaryharris .multi-buttons button:nth-child(2)"))
        )
        ActionChains(d).move_to_element(button).click().perform()

        WebDriverWait(d, 30).until(values_rendered)

        items = d.find_elements(By.CSS_SELECTOR, "#summaryharris .primary-matchup .g-endlabel-inner")
        # Read each label's text once; every `.text` access is a browser call.
        label_texts = [item.text for item in items]
        # Selenium errors propagate unchanged so their message and traceback
        # are preserved for the caller.
        return [clean_text(text) for text in label_texts if 'Kennedy' not in text]
    finally:
        # Always shut the browser down, even when scraping fails.
        d.quit()

In [None]:
# Get Nate Silver data

def scrape_natesilver(url = "https://www.natesilver.net/p/nate-silver-2024-president-election-polls-model"):
    """Scrape the line labels from the chart embedded on Nate Silver's page.

    The chart lives inside the ``#iframe-datawrapper`` iframe, so the driver
    switches into that frame before looking for ``.d3l-line-label`` elements.

    Args:
        url: Page to load. Defaults to the 2024 presidential model page.

    Returns:
        list[str]: For each label whose text does not contain ``'Kennedy'``,
        the last space-separated token of the label text with newlines
        replaced by spaces.

    Raises:
        TimeoutException: The iframe or ``.d3l-line-labels`` did not become
            available within 30 seconds.
        StaleElementReferenceException: May propagate if labels are replaced
            in the DOM while their text is being read.
    """
    # Pass options by keyword: `options` is not the first positional parameter
    # of webdriver.Chrome in every Selenium 4 release.
    d = webdriver.Chrome(options=CHROME_OPTIONS)

    try:
        d.get(url)
        # Wait for the iframe instead of looking it up immediately; it may not
        # be in the DOM yet right after navigation.
        WebDriverWait(d, 30).until(
            EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#iframe-datawrapper"))
        )
        label_container = WebDriverWait(d, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".d3l-line-labels"))
        )

        results = []
        for item in label_container.find_elements(By.CSS_SELECTOR, ".d3l-line-label"):
            label_text = item.text
            if "Kennedy" in label_text:
                continue
            results.append(label_text.split(' ')[-1].replace("\n", " "))
        # Selenium errors propagate unchanged so their message and traceback
        # are preserved for the caller.
        return results
    finally:
        # Always shut the browser down, even when scraping fails.
        d.quit()

In [None]:
def convert_to_dict(data):
    """Map the first whitespace-separated token of each string to its second.

    Args:
        data: Iterable of strings, each containing at least two
            whitespace-separated tokens (e.g. ``"Harris 48.1%"``).

    Returns:
        dict: ``{first_token: second_token}``. Tokens after the second are
        ignored, and a repeated first token keeps the last value seen.

    Raises:
        IndexError: An entry has fewer than two tokens.
    """
    mapping = {}
    for entry in data:
        tokens = entry.split()
        mapping[tokens[0]] = tokens[1]
    return mapping

# Registry of aggregator name -> scraper function. get_averages() iterates over
# it, and the keys double as the aggregator column names in the results table.
AGGREGATOR_MAP = {
    'fivethirtyeight': scrape_fivethirtyeight,
    'realclearpolling': scrape_realclearpolling,
    'nyt': scrape_nyt,
    'natesilver': scrape_natesilver,
}


def get_averages(urls = None):
    """Run every registered scraper and collect its results by aggregator.

    Args:
        urls: Optional mapping of aggregator name to a URL override. An
            aggregator that is missing from the mapping, or mapped to a falsy
            value, is scraped with its own default URL. ``None`` (the default)
            means no overrides.

    Returns:
        dict: ``{aggregator_name: convert_to_dict(scraper_output)}`` for every
        entry in ``AGGREGATOR_MAP``.

    Raises:
        Whatever the individual scrapers raise; the first failure aborts the
        whole run.
    """
    # Use None as the default, not a shared mutable dict.
    overrides = urls if urls is not None else {}

    averages = {}
    for aggregator, scraper in AGGREGATOR_MAP.items():
        url = overrides.get(aggregator)
        scraped = scraper(url) if url else scraper()
        averages[aggregator] = convert_to_dict(scraped)

    return averages

In [None]:
import pandas as pd
import numpy as np

In [121]:
# Scrape every aggregator, then reshape into one row per candidate:
# the index is today's date and the columns are the candidate name followed by
# one column per aggregator.
df = (
    pd.DataFrame(get_averages())
    .assign(
        date=pd.Timestamp.now().strftime('%Y-%m-%d'),
        candidate=lambda frame: frame.index,
    )
    .set_index('date')
    .loc[:, ['candidate', 'fivethirtyeight', 'realclearpolling', 'nyt', 'natesilver']]
)

df

KeyboardInterrupt: 

In [None]:
# Append the rows of `df` to polls.csv. mode='a' adds to the end of the file and
# header=False omits the column names, so re-running this cell appends the same
# rows again.
df.to_csv('polls.csv', mode='a', header=False)

In [None]:
from datetime import datetime

# Internet Archive snapshot URL pattern. {date} is filled with a YYYYMMDD string
# and {url} with the address of the original page.
WAYBACK_URL_TEMPLATE = "https://web.archive.org/web/{date}/{url}"

In [None]:
def scrape_historical_data(start_date: str, end_date: str = None):
    """Scrape archived FiveThirtyEight and RealClearPolling pages day by day.

    For every calendar day from ``start_date`` to ``end_date`` (inclusive),
    the Wayback Machine URLs of three reference pages are printed, and the
    archived FiveThirtyEight and RealClearPolling pages are scraped.

    NOTE(review): the RealClearPolling URL that is printed (five-way race) is
    not the one that is scraped (``trump-vs-harris``) -- confirm this is
    intended.

    Args:
        start_date: First day to scrape, formatted ``YYYY-MM-DD``.
        end_date: Last day to scrape, formatted ``YYYY-MM-DD``. Defaults to
            today when ``None``.

    Returns:
        pandas.DataFrame: One row per day with columns ``date``,
        ``fivethirtyeight`` and ``realclearpolling``. The last two hold the
        raw return values of the respective scrapers.
    """
    day_format = '%Y-%m-%d'
    if end_date is None:
        end_date = datetime.now().strftime(day_format)

    first_day = datetime.strptime(start_date, day_format)
    last_day = datetime.strptime(end_date, day_format)

    # Pages whose archive URLs are printed for reference only, not scraped.
    reference_pages = (
        "https://www.realclearpolling.com/polls/president/general/2024/trump-vs-harris-vs-kennedy-vs-stein-vs-west",
        "https://www.nytimes.com/interactive/2024/us/elections/polls-president.html",
        "https://www.natesilver.net/p/nate-silver-2024-president-election-polls-model",
    )

    records = []
    for day in pd.date_range(start=first_day, end=last_day):
        stamp = day.strftime('%Y%m%d')
        day_label = day.strftime(day_format)

        print(f"Scraping data for {day_label}")
        for page in reference_pages:
            print(WAYBACK_URL_TEMPLATE.format(date=stamp, url=page))

        # Archived FiveThirtyEight snapshot for this day.
        fivethirtyeight_data = scrape_fivethirtyeight(
            WAYBACK_URL_TEMPLATE.format(date=stamp, url="https://projects.fivethirtyeight.com/polls/president-general/2024/national/")
        )

        # Archived RealClearPolling snapshot for this day.
        realclearpolling_data = scrape_realclearpolling(
            WAYBACK_URL_TEMPLATE.format(date=stamp, url="https://www.realclearpolling.com/polls/president/general/2024/trump-vs-harris")
        )

        records.append({
            'date': day_label,
            'fivethirtyeight': fivethirtyeight_data,
            'realclearpolling': realclearpolling_data,
        })

    return pd.DataFrame(records)

#historical_data = scrape_historical_data('2024-07-21', '2024-07-22')