In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import time
import re

BASE = "https://www.pollofpolls.no/"
ALL_MONTHS = BASE + "?cmd=Stortinget&do=visallesnitt"
DELAY = 1.5   # politeness delay

party_map = {
    "Ap": "Ap",
    "H": "Høyre",
    "Frp": "Frp",
    "SV": "SV",
    "Sp": "Sp",
    "KrF": "KrF",
    "V": "Venstre",
    "MDG": "MDG",
    "R": "Rødt",
    "Andre": "Andre"
}

def get_soup(url):
    r = requests.get(url)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser"), r.text

def collect_month_links():
    soup, _ = get_soup(ALL_MONTHS)
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "cmd=Stortinget&do=snitt&yw=" in href:
            links.append(urljoin(BASE, href))
    return sorted(set(links))

def collect_poll_links_for_month(month_url):
    soup, _ = get_soup(month_url)
    poll_links = []

    # Normal institute sub-links (modern)
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if (
            "cmd=Stortinget&do=maling&pid=" in href
            or "cmd=Stortinget&do=snitt&dato=" in href
        ):
            poll_links.append(urljoin(BASE, href))

    # If no sub-links → treat the month page itself as a "poll" (monthly average)
    if not poll_links:
        poll_links = [month_url]

    return poll_links

def parse_poll_page(poll_url):
    soup, html = get_soup(poll_url)
    data = {"url": poll_url}

    # Heading with institute (if present)
    h1 = soup.find("h1")
    if h1:
        data["institute"] = h1.get_text(strip=True)
    else:
        data["institute"] = "Monthly average"

    # Default values
    data["respondents"] = None
    data["fieldwork"] = None
    data["date"] = None

    # Metadata text
    meta_text = soup.get_text(" ", strip=True)

    respondents = re.search(r"Antall spurte:? (\d+)", meta_text)
    if respondents:
        data["respondents"] = int(respondents.group(1))

    fieldwork = re.search(r"Feltperiode:? ([0-9.\-– ]+)", meta_text)
    if fieldwork:
        data["fieldwork"] = fieldwork.group(1).strip()

    # Poll date
    for tag in soup.find_all(["h2", "p"]):
        txt = tag.get_text(" ", strip=True)
        if re.search(r"\d{1,2}\.\s*[A-Za-zæøåÆØÅ]+ \d{4}", txt):
            data["date"] = txt
            break

    # Party table
    for table in soup.find_all("table"):
        headers = [th.get_text(strip=True) for th in table.find_all("th")]
        if any(h in headers for h in party_map.keys()):
            rows = table.find_all("tr")
            for row in rows[1:]:
                cols = [c.get_text(strip=True) for c in row.find_all(["td", "th"])]
                if len(cols) >= 2 and cols[0] in party_map:
                    party = party_map[cols[0]]
                    try:
                        data[party] = float(cols[1].replace(",", "."))
                    except:
                        data[party] = None
            break

    return data

def main():
    month_links = collect_month_links()
    print(f"Found {len(month_links)} months")

    all_polls = []
    for i, mlink in enumerate(month_links, 1):
        poll_links = collect_poll_links_for_month(mlink)
        print(f"[{i}/{len(month_links)}] {mlink} → {len(poll_links)} polls")

        # Debug: show first 3 poll URLs for the first 2 months
        if i <= 2:
            print("  Sample poll links:", poll_links[:3])

        for plink in poll_links:
            try:
                poll = parse_poll_page(plink)
                if poll:
                    all_polls.append(poll)
            except Exception as e:
                print("Error parsing", plink, e)
            time.sleep(DELAY)

    df = pd.DataFrame(all_polls)
    df.to_csv("stortingsvalg_polls_pollofpolls_full.csv", index=False, encoding="utf-8")
    print(f"Saved {len(df)} polls to stortingsvalg_polls_pollofpolls_full.csv")

if __name__ == "__main__":
    main()


Found 213 months
[1/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200801 → 1 polls
  Sample poll links: ['https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200801']
[2/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200802 → 1 polls
  Sample poll links: ['https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200802']
[3/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200803 → 1 polls
[4/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200804 → 1 polls
[5/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200805 → 1 polls
[6/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200806 → 1 polls
[7/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200807 → 1 polls
[8/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200808 → 1 polls
[9/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200809 → 1 polls
[10/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200810 → 1 polls
[11/213]

KeyboardInterrupt: 

In [9]:
#pip install selenium pandas

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
    --------------------------------------- 0.2/9.6 MB 3.5 MB/s eta 0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.32.2 requires rich<14,>=10.14.0, but you have rich 14.1.0 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

BASE = "https://www.pollofpolls.no/"
START_URL = BASE + "?cmd=Stortinget&do=visallesnitt"
DELAY = 1.5

PARTIES = ["Ap","Høyre","Frp","SV","Sp","KrF","Venstre","MDG","Rødt","Andre"]

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def parse_poll_page(driver, url):
    driver.get(url)
    time.sleep(DELAY)
    data = {"url": url}

    # Heading
    try:
        h1 = driver.find_element(By.TAG_NAME,"h1")
        data["institute"] = h1.text
    except:
        data["institute"] = "Unknown"

    # Metadata
    body_text = driver.find_element(By.TAG_NAME,"body").text

    m = re.search(r"Antall spurte[:\s]*([0-9 ]+)", body_text)
    data["respondents"] = int(m.group(1).replace(" ","")) if m else None

    m = re.search(r"Feltperiode[:\s]*([0-9.\-– ]+)", body_text)
    data["fieldwork"] = m.group(1).strip() if m else None

    m = re.search(r"(\d{1,2}\.\s*[A-Za-zæøåÆØÅ]+ \d{4})", body_text)
    data["date"] = m.group(1) if m else None

    # Party table
    table_rows = driver.find_elements(By.XPATH,"//table//tr")
    for row in table_rows:
        cols = row.find_elements(By.XPATH,"./td|./th")
        if len(cols) >= 2:
            party_name = cols[0].text.strip()
            if party_name in PARTIES:
                try:
                    data[party_name] = float(cols[1].text.strip().replace(",","."))
                except:
                    data[party_name] = None
    return data

def main():
    driver = setup_driver()
    driver.get(START_URL)
    time.sleep(DELAY)

    # Collect all month links
    month_links = []
    for a in driver.find_elements(By.XPATH,"//a[contains(@href,'do=snitt&yw=')]"):
        month_links.append(a.get_attribute("href"))
    month_links = sorted(list(set(month_links)))
    print(f"Found {len(month_links)} months")

    all_polls = []

    for i, month_url in enumerate(month_links,1):
        driver.get(month_url)
        time.sleep(DELAY)

        # Collect poll links in this month
        poll_links = []
        for a in driver.find_elements(By.XPATH,"//a[contains(@href,'do=maling') or contains(@href,'do=snitt&dato')]"):
            poll_links.append(a.get_attribute("href"))

        print(f"[{i}/{len(month_links)}] {month_url} → {len(poll_links)} polls")
        if i <= 3:
            print("  Sample poll links:", poll_links[:5])

        for plink in poll_links:
            try:
                poll_data = parse_poll_page(driver, plink)
                all_polls.append(poll_data)
            except Exception as e:
                print("Error parsing", plink, e)

    driver.quit()
    df = pd.DataFrame(all_polls)
    df.to_csv("stortingsvalg_polls_pollofpolls_full.csv", index=False, encoding="utf-8")
    print(f"Saved {len(df)} polls to CSV")

if __name__=="__main__":
    main()


Found 213 months
[1/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200801 → 0 polls
  Sample poll links: []
[2/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200802 → 0 polls
  Sample poll links: []
[3/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200803 → 0 polls
  Sample poll links: []
[4/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200804 → 0 polls
[5/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200805 → 0 polls
[6/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200806 → 0 polls
[7/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200807 → 0 polls
[8/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200808 → 0 polls
[9/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200809 → 0 polls
[10/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200810 → 0 polls
[11/213] https://www.pollofpolls.no/?cmd=Stortinget&do=snitt&yw=200811 → 0 polls
[12/213] https://www.pollofpo

KeyboardInterrupt: 

In [14]:
import requests
import pandas as pd
from io import StringIO
import time
import re

BASE_CSV_URL = "https://www.pollofpolls.no/export?cmd=Stortinget&do=snitt&yw={year}{month:02d}&format=csv"
DELAY = 1.0  # seconds between requests
PARTIES = ["Ap","Høyre","Frp","SV","Sp","KrF","Venstre","MDG","Rødt","Andre"]

all_dfs = []

def extract_percent(value):
    if pd.isna(value):
        return None
    s = str(value).strip()
    if not s:
        return None
    match = re.match(r'^(\d+(?:,\d+)?)', s)
    if match:
        return float(match.group(1).replace(",", "."))
    return None

def normalize_columns(cols):
    mapping = {"H�yre":"Høyre","R�dt":"Rødt","M�ling":"Måling","Andre":"Andre"}
    return [mapping.get(c.strip(), c.strip()) for c in cols]

for year in range(2025, 2026):
    for month in range(1, 13):
        if year == 2025 and month > 9:
            break
        csv_url = BASE_CSV_URL.format(year=year, month=month)
        try:
            resp = requests.get(csv_url)
            if resp.status_code != 200 or len(resp.text) < 50:
                print(f"No CSV data for {year}-{month:02d}")
                continue

            df = pd.read_csv(StringIO(resp.text), delimiter=";", encoding="latin-1")
            df.columns = normalize_columns(df.columns)

            # Parse date
            date_col = next((c for c in ["Dato","Periode"] if c in df.columns), None)
            if date_col:
                df["Dato"] = pd.to_datetime(df[date_col], dayfirst=True, errors="coerce")
            else:
                df["Dato"] = pd.NaT

            # Extract percentages
            for col in PARTIES:
                if col in df.columns:
                    df[col] = df[col].apply(extract_percent)
                else:
                    df[col] = pd.NA

            # Keep relevant columns
            keep_cols = ["Måling","Dato"] + PARTIES
            df = df[[c for c in keep_cols if c in df.columns]]
            df["Year"] = year
            df["Month"] = month

            # Drop rows where all party percentages are NaN
            df = df.dropna(subset=PARTIES, how="all")
            if len(df) == 0:
                print(f"No valid rows in CSV for {year}-{month:02d}")
                continue

            all_dfs.append(df)
            print(f"Collected {len(df)} polls for {year}-{month:02d}")
        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")
        time.sleep(DELAY)

if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    # Remove stray unnamed columns
    full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
    print("\n=== Preview ===")
    print(full_df.head(15))
    full_df.to_csv("stortingsvalg_polls_clean.csv", index=False)
    print(f"\nSaved {len(full_df)} rows to stortingsvalg_polls_clean.csv")
else:
    print("No data collected.")


No CSV data for 2025-01
No CSV data for 2025-02
No CSV data for 2025-03
No CSV data for 2025-04
No CSV data for 2025-05
No CSV data for 2025-06
No CSV data for 2025-07
No CSV data for 2025-08
No CSV data for 2025-09
No data collected.


In [30]:
import requests
import pandas as pd
from io import StringIO
import time
import calendar

DELAY = 1.0  # seconds
START_YEAR, START_MONTH = 2025, 1
END_YEAR, END_MONTH = 2025, 9

all_dfs = []

# Function to compute last day of month
def last_day_of_month(year, month):
    return calendar.monthrange(year, month)[1]

# Loop through each month
for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        if year == END_YEAR and month > END_MONTH:
            break

        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day_of_month(year, month):02d}"

        csv_url = f"https://www.pollofpolls.no/lastned.csv?tabell=liste_galluper&type=riks&start={start_date}&slutt={end_date}&kommuneid=0"

        try:
            resp = requests.get(csv_url)
            if resp.status_code != 200 or len(resp.text) < 50:
                print(f"No CSV data for {year}-{month:02d}")
                continue

            # Skip the first two rows to get the correct header
            df = pd.read_csv(StringIO(resp.text), delimiter=";", skiprows=2, encoding="latin-1")
            df["Year"] = year
            df["Month"] = month
            all_dfs.append(df)
            print(f"Collected {len(df)} polls for {year}-{month:02d}")

        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")

        time.sleep(DELAY)

# Combine all months into one CSV
if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("stortingsvalg_all_polls.csv", index=False)
    print(f"\nSaved {len(full_df)} rows to stortingsvalg_all_polls.csv")
else:
    print("No data collected.")


Collected 8 polls for 2025-01
Collected 9 polls for 2025-02
Collected 7 polls for 2025-03
Collected 7 polls for 2025-04
Collected 8 polls for 2025-05
Collected 7 polls for 2025-06
Collected 3 polls for 2025-07
Collected 14 polls for 2025-08
Collected 13 polls for 2025-09

Saved 76 rows to stortingsvalg_all_polls.csv


In [34]:
import requests
import pandas as pd
from io import StringIO
import time
import calendar
import re

DELAY = 1.0  # seconds
START_YEAR, START_MONTH = 2025, 1
END_YEAR, END_MONTH = 2025, 9

PARTIES = ["Ap","Høyre","Frp","SV","Sp","KrF","Venstre","MDG","Rødt","Andre"]
all_dfs = []

def last_day_of_month(year, month):
    return calendar.monthrange(year, month)[1]

def extract_percent(value):
    """Extract percentage as float from strings like '29,5 (55)' or '28 4 (50)'"""
    if pd.isna(value):
        return None
    s = str(value)
    # Replace comma with dot
    s = s.replace(",", ".")
    # Extract first float number in string
    match = re.search(r"\d+\.?\d*", s)
    if match:
        return float(match.group())
    return None

# Loop through each month
for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        if year == END_YEAR and month > END_MONTH:
            break

        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day_of_month(year, month):02d}"

        csv_url = f"https://www.pollofpolls.no/lastned.csv?tabell=liste_galluper&type=riks&start={start_date}&slutt={end_date}&kommuneid=0"

        try:
            resp = requests.get(csv_url)
            if resp.status_code != 200 or len(resp.text) < 50:
                print(f"No CSV data for {year}-{month:02d}")
                continue

            # Skip first 2 rows to get the header
            df = pd.read_csv(StringIO(resp.text), delimiter=";", skiprows=2, encoding="utf-8-sig")

            # Clean percentages for each party
            for col in PARTIES:
                if col in df.columns:
                    df[col] = df[col].apply(extract_percent)
                else:
                    df[col] = pd.NA  # if column is missing, fill with NA

            # Keep only relevant columns
            keep_cols = ["Måling", "Dato"] + PARTIES
            df = df[[c for c in keep_cols if c in df.columns]]

            # Add Year/Month for filtering
            df["Year"] = year
            df["Month"] = month

            all_dfs.append(df)
            #print(df)
            print(f"Collected and cleaned {len(df)} polls for {year}-{month:02d}")

        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")

        time.sleep(DELAY)

# Combine all months
if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("stortingsvalg_all_polls_clean.csv", index=False)
    print(f"\nSaved {len(full_df)} rows to stortingsvalg_all_polls_clean.csv")
else:
    print("No data collected.")


        Dato    Ap Høyre   Frp   SV   Sp  KrF  Venstre  MDG  Rødt  Andre  \
0  31/1-2025  18.4  <NA>  26.1  9.1  8.2  3.0      4.7  3.4  <NA>    4.3   
1  22/1-2025  18.8  <NA>  26.2  8.3  5.5  3.5      2.8  3.1  <NA>    5.1   
2  15/1-2025  20.2  <NA>  23.7  8.7  4.9  3.9      5.8  3.3  <NA>    3.8   
3  14/1-2025  16.7  <NA>  24.5  7.7  6.1  3.1      5.9  3.8  <NA>    3.0   
4  13/1-2025  21.0  <NA>  24.3  6.9  5.6  3.9      3.4  3.6  <NA>    2.1   
5  11/1-2025  18.4  <NA>  21.1  8.1  6.6  3.2      4.3  3.2  <NA>    5.9   
6   9/1-2025  18.1  <NA>  25.0  8.7  6.3  3.2      4.7  2.8  <NA>    3.9   
7   7/1-2025  20.6  <NA>  24.2  7.5  6.5  3.2      4.5  3.0  <NA>    3.8   

   Year  Month  
0  2025      1  
1  2025      1  
2  2025      1  
3  2025      1  
4  2025      1  
5  2025      1  
6  2025      1  
7  2025      1  
Collected and cleaned 8 polls for 2025-01
        Dato    Ap Høyre   Frp   SV   Sp  KrF  Venstre  MDG  Rødt  Andre  \
0  19/2-2025  26.6  <NA>  22.4  7.2  6.0  3.

KeyboardInterrupt: 

In [38]:
import requests
import pandas as pd
from io import StringIO
import time
import calendar
import re

# Configuration
DELAY = 1.0  # seconds between requests
START_YEAR, START_MONTH = 2025, 1
END_YEAR, END_MONTH = 2025, 9

# Standardized party columns
PARTIES = ["Ap","Høyre","Frp","SV","Sp","KrF","Venstre","MDG","Rødt","Andre"]

# Helper functions
def last_day_of_month(year, month):
    return calendar.monthrange(year, month)[1]

def extract_percent(value):
    """Extract percentage as float from strings like '29,5 (55)' or '28 4 (50)'"""
    if pd.isna(value):
        return None
    s = str(value).replace(",", ".")
    match = re.search(r"\d+\.?\d*", s)
    if match:
        return float(match.group())
    return None

all_dfs = []

# Loop through each month
for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        if year == END_YEAR and month > END_MONTH:
            break

        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day_of_month(year, month):02d}"

        csv_url = (
            f"https://www.pollofpolls.no/lastned.csv?"
            f"tabell=liste_galluper&type=riks&start={start_date}&slutt={end_date}&kommuneid=0"
        )

        try:
            resp = requests.get(csv_url)
            if resp.status_code != 200 or len(resp.text) < 50:
                print(f"No CSV data for {year}-{month:02d}")
                continue

            # Read CSV skipping first 2 rows (metadata)
            df = pd.read_csv(StringIO(resp.text), delimiter=";", skiprows=2, encoding="utf-8-sig")
            # Strip column names
            df.columns = [c.strip() for c in df.columns]
            #print(df.columns)
            # Clean party percentages
            for col in PARTIES:
                if col in df.columns:
                    df[col] = df[col].apply(extract_percent)
                else:
                    df[col] = pd.NA

            # Keep only relevant columns
            keep_cols = ["Måling", "Dato"] + PARTIES
            df = df[[c for c in keep_cols if c in df.columns]]
            print(df)

            # Add Year/Month
            df["Year"] = year
            df["Month"] = month

            all_dfs.append(df)
            print(f"Collected and cleaned {len(df)} polls for {year}-{month:02d}")

        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")

        time.sleep(DELAY)

# Combine all months
if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("stortingsvalg_new.csv", index=False)
    print(f"\nSaved {len(full_df)} rows to stortingsvalg_all_polls_clean.csv")
else:
    print("No data collected.")


        Dato    Ap Høyre   Frp   SV   Sp  KrF  Venstre  MDG  Rødt  Andre
0  31/1-2025  18.4  <NA>  26.1  9.1  8.2  3.0      4.7  3.4  <NA>    4.3
1  22/1-2025  18.8  <NA>  26.2  8.3  5.5  3.5      2.8  3.1  <NA>    5.1
2  15/1-2025  20.2  <NA>  23.7  8.7  4.9  3.9      5.8  3.3  <NA>    3.8
3  14/1-2025  16.7  <NA>  24.5  7.7  6.1  3.1      5.9  3.8  <NA>    3.0
4  13/1-2025  21.0  <NA>  24.3  6.9  5.6  3.9      3.4  3.6  <NA>    2.1
5  11/1-2025  18.4  <NA>  21.1  8.1  6.6  3.2      4.3  3.2  <NA>    5.9
6   9/1-2025  18.1  <NA>  25.0  8.7  6.3  3.2      4.7  2.8  <NA>    3.9
7   7/1-2025  20.6  <NA>  24.2  7.5  6.5  3.2      4.5  3.0  <NA>    3.8
Collected and cleaned 8 polls for 2025-01
        Dato    Ap Høyre   Frp   SV   Sp  KrF  Venstre  MDG  Rødt  Andre
0  19/2-2025  26.6  <NA>  22.4  7.2  6.0  3.5      3.9  2.1  <NA>    3.7
1  11/2-2025  28.7  <NA>  25.3  7.4  6.0  2.1      3.1  2.7  <NA>    3.2
2   8/2-2025  24.7  <NA>  24.7  7.6  6.1  3.0      4.6  2.7  <NA>    2.8
3   7/2-2

In [43]:
import requests
import pandas as pd
from io import StringIO
import re
import time
import calendar

PARTIES = ["Ap","Høyre","Frp","SV","Sp","KrF","Venstre","MDG","Rødt","Andre"]
DELAY = 1.0
START_YEAR, START_MONTH = 2025, 1
END_YEAR, END_MONTH = 2025, 9

def last_day_of_month(year, month):
    return calendar.monthrange(year, month)[1]

def preprocess_csv_text(text):
    """Fix spaces in numbers like '18 3 (33)' → '18,3 (33)'"""
    def repl(match):
        return match.group(1) + "," + match.group(2) + match.group(3)
    # Matches digits space digits, keeps the following "(xx)"
    text = re.sub(r"(\d+)\s+(\d+)(\s*\(\d+\))", repl, text)
    return text

def extract_percent(value):
    if pd.isna(value):
        return None
    s = str(value).replace(",", ".")
    match = re.search(r"\d+\.?\d*", s)
    if match:
        return float(match.group())
    return None

all_dfs = []

for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        if year == END_YEAR and month > END_MONTH:
            break

        start_date = f"{year}-{month:02d}-01"
        end_date = f"{year}-{month:02d}-{last_day_of_month(year, month):02d}"
        csv_url = f"https://www.pollofpolls.no/lastned.csv?tabell=liste_galluper&type=riks&start={start_date}&slutt={end_date}&kommuneid=0"

        try:
            resp = requests.get(csv_url)
            if resp.status_code != 200 or len(resp.text) < 50:
                print(f"No CSV data for {year}-{month:02d}")
                continue

            text = preprocess_csv_text(resp.text)

            # Skip first 2 rows (metadata)
            df = pd.read_csv(StringIO(text), delimiter=";", skiprows=2, encoding="utf-8-sig")
            df.columns = [c.strip() for c in df.columns]

            print(df)
            for col in PARTIES:
                if col in df.columns:
                    df[col] = df[col].apply(extract_percent)
                else:
                    df[col] = pd.NA

            keep_cols = ["Måling", "Dato"] + PARTIES
            df = df[[c for c in keep_cols if c in df.columns]]
            print(df)
                    
            df["Year"] = year
            df["Month"] = month

            all_dfs.append(df)
            print(f"Processed {len(df)} polls for {year}-{month:02d}")

        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")

        time.sleep(DELAY)

if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("new3.csv", index=False)
    print(f"Saved {len(full_df)} rows to stortingsvalg_all_polls_clean.csv")
else:
    print("No data collected.")


                      M�ling       Dato         Ap      H�yre        Frp  \
0          InFact/Nettavisen  31/1-2025  18,4 (36)  18,3 (33)  26,1 (47)   
1      Opinion/DA / FF / ANB  22/1-2025  18,8 (37)  21,8 (41)  26,2 (49)   
2                Norstat/NRK  15/1-2025  20,2 (38)  19,8 (35)  23,7 (45)   
3  Respons/VG / Aftenp. / BT  14/1-2025  16,7 (32)  24,0 (44)  24,5 (45)   
4                 Verian/TV2  13/1-2025  21,0 (42)  23,8 (42)  24,3 (46)   
5         Norfakta/Nat. / KK  11/1-2025  18,4 (36)  23,2 (43)  21,1 (40)   
6           Opinion/ABC / AT   9/1-2025  18,1 (33)  21,6 (39)  25,0 (47)   
7          InFact/Nettavisen   7/1-2025  20,6 (38)  21,0 (39)  24,2 (44)   

         SV        Sp      KrF   Venstre      MDG      R�dt    Andre  
0  9,1 (16)  8,2 (15)  3,0 (2)   4,7 (9)  3,4 (3)   4,5 (8)  4,3 (0)  
1  8,3 (16)  5,5 (10)  3,5 (3)   2,8 (2)  3,1 (2)   5,0 (9)  5,1 (0)  
2  8,7 (16)   4,9 (9)  3,9 (3)  5,8 (10)  3,3 (2)  5,9 (11)  3,8 (0)  
3  7,7 (14)  6,1 (11)  3,1 (1) 

KeyboardInterrupt: 

In [44]:
import requests
import pandas as pd
from io import StringIO
import re
import time

# --- Configuration ---
PARTIES = ["Ap","Høyre","Frp","SV","Sp","KrF","Venstre","MDG","Rødt","Andre"]
BASE_CSV_URL = "https://www.pollofpolls.no/lastned.csv?tabell=liste_galluper&type=riks&start={start}&slutt={end}&kommuneid=0"
DELAY = 1.0  # seconds between requests
START_YEAR = 2025
END_YEAR = 2025
END_MONTH = 9  # stop at September 2025

# --- Helper functions ---
def preprocess_csv_text(text):
    """Fix numbers like '18 3 (33)' → '18,3 (33)'"""
    def repl(match):
        return match.group(1) + "," + match.group(2) + match.group(3)
    text = re.sub(r"(\d+)\s+(\d+)(\s*\(\d+\))", repl, text)
    return text

def extract_percent(value):
    """Extract float percentage from strings like '18,4 (36)'"""
    if pd.isna(value):
        return None
    s = str(value).replace(",", ".")
    match = re.search(r"\d+\.?\d*", s)
    if match:
        return float(match.group())
    return None

# --- Main loop ---
all_dfs = []

for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        if year == END_YEAR and month > END_MONTH:
            break
        start_date = f"{year}-{month:02d}-01"
        # Compute last day of month
        if month == 12:
            end_date = f"{year}-12-31"
        else:
            end_date = f"{year}-{month+1:02d}-01"
            # subtract one day
            end_date = pd.to_datetime(end_date) - pd.Timedelta(days=1)
            end_date = end_date.strftime("%Y-%m-%d")

        csv_url = BASE_CSV_URL.format(start=start_date, end=end_date)

        try:
            resp = requests.get(csv_url)
            if resp.status_code != 200 or len(resp.text) < 50:
                print(f"No CSV data for {year}-{month:02d}")
                continue

            text = preprocess_csv_text(resp.text)

            # Read CSV (header on row 3 → skip 2)
            df = pd.read_csv(StringIO(text), delimiter=";", skiprows=2, encoding="utf-8", engine="python")
            df.columns = [c.strip() for c in df.columns]

            # Fix column names for Norwegian letters
            df.columns = [c.encode('utf-8', errors='replace').decode('utf-8') for c in df.columns]

            # Parse date
            if "Dato" in df.columns:
                df['Dato'] = pd.to_datetime(df['Dato'], dayfirst=True, errors='coerce')
            else:
                df['Dato'] = pd.NaT

            # Extract percentages
            for col in PARTIES:
                if col in df.columns:
                    df[col] = df[col].apply(extract_percent)
                else:
                    df[col] = pd.NA

            # Keep relevant columns
            keep_cols = ["Måling", "Dato"] + PARTIES
            df = df[[c for c in keep_cols if c in df.columns]]

            # Add Year/Month columns
            df['Year'] = year
            df['Month'] = month

            all_dfs.append(df)
            print(f"Downloaded {year}-{month:02d}, {len(df)} polls")

        except Exception as e:
            print(f"Error for {year}-{month:02d}: {e}")

        time.sleep(DELAY)

# --- Combine all months ---
if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("stortingsvalg_polls_clean.csv", index=False)
    print(f"Saved {len(full_df)} rows to stortingsvalg_polls_clean.csv")
else:
    print("No data downloaded.")


Downloaded 2025-01, 8 polls
Downloaded 2025-02, 9 polls
Downloaded 2025-03, 7 polls
Downloaded 2025-04, 7 polls
Downloaded 2025-05, 8 polls
Downloaded 2025-06, 7 polls
Downloaded 2025-07, 3 polls
Downloaded 2025-08, 14 polls
Downloaded 2025-09, 13 polls
Saved 76 rows to stortingsvalg_polls_clean.csv
