In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import time
import re

# Site root; scraped hrefs are resolved against it with urljoin.
BASE = "https://www.pollofpolls.no/"
# Page that is scanned for links to the individual month pages.
ALL_MONTHS = f"{BASE}?cmd=Stortinget&do=visallesnitt"
# Politeness delay, in seconds, passed to time.sleep between page fetches.
DELAY = 1.5

# Party label as it appears in the first column of a results table
# -> key under which the value is stored in the parsed record.
party_map = dict(
    [
        ("Ap", "Ap"),
        ("H", "Høyre"),
        ("Frp", "Frp"),
        ("SV", "SV"),
        ("Sp", "Sp"),
        ("KrF", "KrF"),
        ("V", "Venstre"),
        ("MDG", "MDG"),
        ("R", "Rødt"),
        ("Andre", "Andre"),
    ]
)

def get_soup(url, timeout=30):
    """Fetch ``url`` with HTTP GET and parse the response body as HTML.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would block the whole scrape indefinitely.

    Returns:
        A ``(soup, html)`` tuple: the ``BeautifulSoup`` tree built with the
        ``html.parser`` backend, and the raw response text it was built from.

    Raises:
        requests.HTTPError: if the response status is an error status
            (raised by ``raise_for_status``).
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Read the body once so the returned text is exactly what was parsed.
    html = r.text
    return BeautifulSoup(html, "html.parser"), html

def collect_month_links():
    """Return the month-page URLs linked from ``ALL_MONTHS``.

    Every anchor whose href contains the month-average query marker is
    resolved against ``BASE``; the result is de-duplicated and sorted.
    """
    index_soup, _ = get_soup(ALL_MONTHS)
    marker = "cmd=Stortinget&do=snitt&yw="
    unique_urls = {
        urljoin(BASE, anchor["href"])
        for anchor in index_soup.find_all("a", href=True)
        if marker in anchor["href"]
    }
    return sorted(unique_urls)

def collect_poll_links_for_month(month_url):
    """Return the poll-page URLs linked from one month page.

    Args:
        month_url: Absolute URL of a month page.

    Returns:
        A list of absolute URLs, in page order and without duplicates, for
        every anchor whose href contains one of the two poll query markers.
        If the page links to no polls, the list contains only ``month_url``
        so that the month page itself is parsed as the single entry.

    Raises:
        Whatever ``get_soup`` raises when the page cannot be fetched.
    """
    soup, _ = get_soup(month_url)

    markers = (
        "cmd=Stortinget&do=maling&pid=",
        "cmd=Stortinget&do=snitt&dato=",
    )
    found = [
        urljoin(BASE, a["href"])
        for a in soup.find_all("a", href=True)
        if any(marker in a["href"] for marker in markers)
    ]

    # A page can link to the same poll more than once; dict.fromkeys drops
    # the repeats while keeping first-seen order, so no poll is fetched and
    # recorded twice.
    poll_links = list(dict.fromkeys(found))

    # If no sub-links -> treat the month page itself as a "poll" (monthly average)
    if not poll_links:
        poll_links = [month_url]

    return poll_links

def parse_poll_page(poll_url):
    """Fetch one poll page and flatten it into a single record.

    Args:
        poll_url: Absolute URL of the page to parse.

    Returns:
        A dict with these keys:
          * ``url``: the ``poll_url`` argument.
          * ``institute``: text of the first ``<h1>``, or ``"Monthly average"``
            when the page has none.
          * ``respondents``: int following "Antall spurte", or None.
          * ``fieldwork``: text following "Feltperiode", or None.
          * ``date``: text of the first ``<h2>``/``<p>`` that contains a
            "day. month year" pattern, or None.
          * one key per party (``party_map`` values) found in the first table
            whose header cells include a ``party_map`` key; the value is a
            float, or None when the cell is not a number.

    Raises:
        Whatever ``get_soup`` raises when the page cannot be fetched.
    """
    soup, _ = get_soup(poll_url)

    # Heading with institute (if present)
    h1 = soup.find("h1")
    data = {
        "url": poll_url,
        "institute": h1.get_text(strip=True) if h1 else "Monthly average",
        # Defaults, overwritten below when the page provides the value.
        "respondents": None,
        "fieldwork": None,
        "date": None,
    }

    # Metadata is searched for in the page's full visible text.
    meta_text = soup.get_text(" ", strip=True)

    respondents = re.search(r"Antall spurte:? (\d+)", meta_text)
    if respondents:
        data["respondents"] = int(respondents.group(1))

    fieldwork = re.search(r"Feltperiode:? ([0-9.\-– ]+)", meta_text)
    if fieldwork:
        data["fieldwork"] = fieldwork.group(1).strip()

    # Poll date: first heading/paragraph that looks like "12. januar 2020".
    for tag in soup.find_all(["h2", "p"]):
        txt = tag.get_text(" ", strip=True)
        if re.search(r"\d{1,2}\.\s*[A-Za-zæøåÆØÅ]+ \d{4}", txt):
            data["date"] = txt
            break

    # Party table: only the first table with a party label in its headers is read.
    for table in soup.find_all("table"):
        headers = [th.get_text(strip=True) for th in table.find_all("th")]
        if not any(h in party_map for h in headers):
            continue
        # The first row is skipped; the remaining rows are read as
        # "label, value" pairs.
        for row in table.find_all("tr")[1:]:
            cols = [c.get_text(strip=True) for c in row.find_all(["td", "th"])]
            if len(cols) >= 2 and cols[0] in party_map:
                party = party_map[cols[0]]
                try:
                    data[party] = float(cols[1].replace(",", "."))
                except ValueError:
                    # Non-numeric cell: record the party with no value
                    # instead of failing the whole page.
                    data[party] = None
        break

    return data

def main():
    """Scrape every month page and write all parsed polls to one CSV file.

    For each month URL the poll links are collected and each poll page is
    parsed. A failure on a month page or on a single poll page is printed and
    skipped, so one bad response does not discard what has already been
    collected. ``DELAY`` seconds are slept after every page fetch.

    Side effects:
        Prints progress to stdout and writes
        ``stortingsvalg_polls_pollofpolls_full.csv`` in the working directory.
    """
    month_links = collect_month_links()
    print(f"Found {len(month_links)} months")

    all_polls = []
    for i, mlink in enumerate(month_links, 1):
        try:
            poll_links = collect_poll_links_for_month(mlink)
        except Exception as e:
            print("Error fetching", mlink, e)
            time.sleep(DELAY)
            continue
        # The month page was just requested; wait before hitting the site again.
        time.sleep(DELAY)
        print(f"[{i}/{len(month_links)}] {mlink} → {len(poll_links)} polls")

        # Debug: show first 3 poll URLs for the first 2 months
        if i <= 2:
            print("  Sample poll links:", poll_links[:3])

        for plink in poll_links:
            try:
                all_polls.append(parse_poll_page(plink))
            except Exception as e:
                print("Error parsing", plink, e)
            time.sleep(DELAY)

    df = pd.DataFrame(all_polls)
    df.to_csv("stortingsvalg_polls_pollofpolls_full.csv", index=False, encoding="utf-8")
    print(f"Saved {len(df)} polls to stortingsvalg_polls_pollofpolls_full.csv")

if __name__ == "__main__":
    main()


Found 213 months
[1/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200801 → 1 polls
  Sample poll links: ['https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200801']
[2/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200802 → 1 polls
  Sample poll links: ['https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200802']
[3/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200803 → 1 polls
[4/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200804 → 1 polls
[5/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200805 → 1 polls
[6/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200806 → 1 polls
[7/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200807 → 1 polls
[8/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200808 → 1 polls
[9/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200809 → 1 polls
[10/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200810 → 1 polls
[11/213]

In [9]:
# %pip (rather than a bare `pip`) installs into the environment of the running kernel.
%pip install selenium pandas

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
    --------------------------------------- 0.2/9.6 MB 3.5 MB/s eta 0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.32.2 requires rich<14,>=10.14.0, but you have rich 14.1.0 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Site root and the page that is scanned for links to the month pages.
BASE = "https://www.pollofpolls.no/"
START_URL = f"{BASE}?cmd=Stortinget&do=visallesnitt"
# Seconds to sleep after each page load.
DELAY = 1.5

# Row labels that are accepted as party names when reading a results table.
PARTIES = [
    "Ap",
    "Høyre",
    "Frp",
    "SV",
    "Sp",
    "KrF",
    "Venstre",
    "MDG",
    "Rødt",
    "Andre",
]

def setup_driver():
    """Create a Chrome WebDriver configured with the headless, disable-gpu and no-sandbox flags."""
    opts = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

def parse_poll_page(driver, url):
    """Load one poll page in the browser and flatten it into a single record.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Absolute URL of the page to parse.

    Returns:
        A dict with these keys:
          * ``url``: the ``url`` argument.
          * ``institute``: text of the first ``<h1>``, or ``"Unknown"`` when
            the page has none.
          * ``respondents``: int following "Antall spurte" (spaces inside the
            number are removed), or None.
          * ``fieldwork``: text following "Feltperiode", or None.
          * ``date``: first "day. month year" match in the body text, or None.
          * one key per label in ``PARTIES`` found in the first cell of any
            table row; the value is the second cell as a float, or None when
            that cell is not a number.

    Raises:
        selenium.common.exceptions.WebDriverException: for browser errors
            other than a missing ``<h1>`` (for example a page with no body).
    """
    # Local import: the cell's import block does not bring this name in.
    from selenium.common.exceptions import NoSuchElementException

    driver.get(url)
    time.sleep(DELAY)
    data = {"url": url}

    # Heading
    try:
        data["institute"] = driver.find_element(By.TAG_NAME, "h1").text
    except NoSuchElementException:
        data["institute"] = "Unknown"

    # Metadata
    body_text = driver.find_element(By.TAG_NAME, "body").text

    # The capture must start with a digit: a whitespace-only capture would
    # reach int("") and raise ValueError, discarding the whole poll.
    m = re.search(r"Antall spurte[:\s]*([0-9][0-9 ]*)", body_text)
    data["respondents"] = int(m.group(1).replace(" ", "")) if m else None

    m = re.search(r"Feltperiode[:\s]*([0-9.\-– ]+)", body_text)
    data["fieldwork"] = m.group(1).strip() if m else None

    m = re.search(r"(\d{1,2}\.\s*[A-Za-zæøåÆØÅ]+ \d{4})", body_text)
    data["date"] = m.group(1) if m else None

    # Party table: every row of every table is inspected.
    for row in driver.find_elements(By.XPATH, "//table//tr"):
        cols = row.find_elements(By.XPATH, "./td|./th")
        if len(cols) < 2:
            continue
        party_name = cols[0].text.strip()
        if party_name not in PARTIES:
            continue
        try:
            data[party_name] = float(cols[1].text.strip().replace(",", "."))
        except ValueError:
            # Non-numeric cell: keep the party with no value.
            data[party_name] = None
    return data

def main():
    """Scrape every month page with a browser and write the parsed polls to CSV.

    The driver is always shut down, including when the run fails or is
    interrupted, so no browser process is left behind. In that case the
    exception propagates and no CSV is written.

    Side effects:
        Prints progress to stdout and writes
        ``stortingsvalg_polls_pollofpolls_full.csv`` in the working directory.
    """
    driver = setup_driver()
    all_polls = []
    try:
        driver.get(START_URL)
        time.sleep(DELAY)

        # Collect all month links (de-duplicated, sorted)
        month_links = sorted({
            a.get_attribute("href")
            for a in driver.find_elements(By.XPATH, "//a[contains(@href,'do=snitt&yw=')]")
        })
        print(f"Found {len(month_links)} months")

        for i, month_url in enumerate(month_links, 1):
            driver.get(month_url)
            time.sleep(DELAY)

            # Collect poll links in this month. The hrefs are copied into plain
            # strings here because the driver navigates away while parsing.
            poll_links = [
                a.get_attribute("href")
                for a in driver.find_elements(
                    By.XPATH,
                    "//a[contains(@href,'do=maling') or contains(@href,'do=snitt&dato')]",
                )
            ]

            print(f"[{i}/{len(month_links)}] {month_url} → {len(poll_links)} polls")
            if i <= 3:
                print("  Sample poll links:", poll_links[:5])

            for plink in poll_links:
                try:
                    all_polls.append(parse_poll_page(driver, plink))
                except Exception as e:
                    print("Error parsing", plink, e)
    finally:
        driver.quit()

    df = pd.DataFrame(all_polls)
    df.to_csv("stortingsvalg_polls_pollofpolls_full.csv", index=False, encoding="utf-8")
    print(f"Saved {len(df)} polls to CSV")

if __name__=="__main__":
    main()


Found 213 months
[1/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200801 → 0 polls
  Sample poll links: []
[2/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200802 → 0 polls
  Sample poll links: []
[3/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200803 → 0 polls
  Sample poll links: []
[4/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200804 → 0 polls
[5/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200805 → 0 polls
[6/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200806 → 0 polls
[7/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200807 → 0 polls
[8/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200808 → 0 polls
[9/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200809 → 0 polls
[10/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200810 → 0 polls
[11/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200811 → 0 polls
[12/213] https://www.pollofpo

KeyboardInterrupt: 

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import time
import re

# Month-page URL template; filled in with .format(year=..., month=...),
# the month being zero-padded to two digits.
BASE_URL = (
    "https://www.pollofpolls.no/"
    "?cmd=Stortinget&do=snitt&yw={year}{month:02d}"
)
# Seconds to sleep between requests.
DELAY = 1.5

# Party columns that every monthly frame is normalised to.
PARTIES = [
    "Ap",
    "Høyre",
    "Frp",
    "SV",
    "Sp",
    "KrF",
    "Venstre",
    "MDG",
    "Rødt",
    "Andre",
]

# One cleaned DataFrame is appended here per downloaded month.
all_dfs = list()

def extract_percent(value):
    """Extract the leading percentage from a cell such as '30,1 (58)'.

    Args:
        value: Cell content. May be a string, a number, or a missing value
            (None / NaN / pd.NA).

    Returns:
        The first number found in ``str(value)`` as a float, accepting either
        a comma or a dot as the decimal separator and also whole numbers
        ('30 (58)' -> 30.0, 30 -> 30.0). None when the value is missing or
        contains no number.
    """
    if pd.isna(value):
        return None
    # The decimal part is optional: a whole-number percentage, or a column
    # pandas already parsed as integers, must not be dropped as None.
    match = re.search(r"\d+(?:[.,]\d+)?", str(value))
    if match is None:
        return None
    return float(match.group(0).replace(",", "."))

# Seconds to wait for the server on each request. requests applies no timeout
# by default, so a stalled connection would otherwise hang the cell.
REQUEST_TIMEOUT = 30

# Download the poll export for each month and append one cleaned frame per
# month to all_dfs.
for year in range(2025, 2026):
    for month in range(1, 13):
        # Stop after September 2025 (presumably the last month with data when
        # this was written; adjust as needed).
        if year == 2025 and month > 9:
            break
        month_url = BASE_URL.format(year=year, month=month)
        try:
            resp = requests.get(month_url, timeout=REQUEST_TIMEOUT)
            if resp.status_code != 200:
                print(f"Failed to fetch {month_url}")
                continue

            soup = BeautifulSoup(resp.text, "html.parser")
            # Find the "Last ned målinger" link
            download_link = None
            for a in soup.find_all("a", href=True):
                if "Last ned målinger" in a.text or "Last ned data" in a.text:
                    download_link = a['href']
                    break

            if not download_link:
                print(f"No download link for {year}-{month:02d}")
                continue

            # Fix relative URLs
            if download_link.startswith("/"):
                download_link = "https://www.pollofpolls.no" + download_link
            elif not download_link.startswith("http"):
                download_link = "https://www.pollofpolls.no/" + download_link

            # Download CSV; a body under 50 characters is treated as a failed
            # download (presumably too short to be a real export).
            csv_resp = requests.get(download_link, timeout=REQUEST_TIMEOUT)
            if csv_resp.status_code != 200 or len(csv_resp.text) < 50:
                print(f"Failed to download CSV for {year}-{month:02d}")
                continue

            df = pd.read_csv(StringIO(csv_resp.text), delimiter=";")
            df.columns = df.columns.str.strip()

            # Determine date column: the first of "Dato" / "Periode" that exists.
            date_col = next((c for c in ["Dato", "Periode"] if c in df.columns), None)

            if date_col:
                df['Dato'] = pd.to_datetime(df[date_col], dayfirst=True, errors='coerce')
            else:
                df['Dato'] = pd.NaT

            # Extract percentages for each party
            for col in PARTIES:
                if col in df.columns:
                    df[col] = df[col].apply(extract_percent)
                else:
                    df[col] = pd.NA  # add missing party columns

            # Keep only relevant columns. The explicit copy makes the result an
            # independent frame, so the Year/Month assignments below do not
            # trigger pandas' SettingWithCopyWarning.
            keep_cols = ["Måling", "Dato"] + PARTIES
            df = df[[c for c in keep_cols if c in df.columns]].copy()

            # Add Year/Month
            df['Year'] = year
            df['Month'] = month

            all_dfs.append(df)
            print(f"Downloaded {year}-{month:02d} with {len(df)} polls")

        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")

        finally:
            # Runs on the `continue` paths too, so the politeness delay is
            # applied after every attempt, not only after successful ones.
            time.sleep(DELAY)

# Combine the per-month frames into one table and write it to disk;
# report when nothing was collected.
if not all_dfs:
    print("No data downloaded.")
else:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("stortingsvalg_polls_clean.csv", index=False)
    print(f"Saved {len(full_df)} rows to stortingsvalg_polls_clean.csv")


Downloaded 2025-01 with 9 polls
Downloaded 2025-02 with 10 polls
Downloaded 2025-03 with 8 polls
Downloaded 2025-04 with 8 polls
Downloaded 2025-05 with 9 polls
Downloaded 2025-06 with 8 polls
Downloaded 2025-07 with 4 polls
Downloaded 2025-08 with 15 polls
Downloaded 2025-09 with 10 polls
Saved 81 rows to stortingsvalg_polls_clean.csv
