In [24]:
import requests
import pandas as pd
from io import StringIO
import io
import re
import chardet
import time

# --- Configuration ---
# Party columns extracted from every downloaded table, in output order.
PARTIES = [
    "Ap",
    "Høyre",
    "Frp",
    "SV",
    "Sp",
    "KrF",
    "Venstre",
    "MDG",
    "Rødt",
    "Andre",
]
# CSV download endpoint; {start} and {end} are filled with YYYY-MM-DD dates.
BASE_CSV_URL = "https://www.pollofpolls.no/lastned.csv?tabell=liste_galluper&type=riks&start={start}&slutt={end}&kommuneid=0"
# Pause between consecutive requests, in seconds.
DELAY = 1.0
# First and last year to download (both inclusive).
START_YEAR = 2025
END_YEAR = 2025
# Last month fetched within END_YEAR (9 = September).
END_MONTH = 9

# --- Helper functions ---
def preprocess_csv_text(text):
    """Restore the decimal comma in values such as '18 3 (33)'.

    Wherever two digit runs separated by whitespace are directly followed
    by a parenthesised digit run, the separating whitespace is replaced by
    a comma, e.g. '18 3 (33)' becomes '18,3 (33)'. All other text is
    returned unchanged.
    """
    split_decimal = re.compile(r"(\d+)\s+(\d+)(\s*\(\d+\))")
    # Groups: integer part, fractional part, trailing "(n)" with its spacing.
    return split_decimal.sub(r"\1,\2\3", text)

def extract_percent(value):
    """Return the first number in a cell such as '18,4 (36)' as a float.

    Decimal commas are treated as decimal points. Returns None when the
    value is missing (as judged by ``pd.isna``) or contains no digits.
    """
    number = None
    if not pd.isna(value):
        normalised = str(value).replace(",", ".")
        # Only the first numeric token is used; anything after it is ignored.
        found = re.search(r"\d+\.?\d*", normalised)
        if found is not None:
            number = float(found.group())
    return number

# --- Main loop ---
# Seconds to wait on the server before giving up. requests applies no
# timeout by default, so a stalled connection would otherwise block the
# cell until it is interrupted by hand.
REQUEST_TIMEOUT = 30
# Encoding used when chardet cannot produce a guess.
FALLBACK_ENCODING = "ISO-8859-1"


def _month_bounds(year, month):
    """Return the first and last day of a month as 'YYYY-MM-DD' strings."""
    last_day = pd.Timestamp(year=year, month=month, day=1).days_in_month
    return f"{year}-{month:02d}-01", f"{year}-{month:02d}-{last_day:02d}"


def _fetch_month(year, month):
    """Download and tidy the poll table for one month.

    Returns a ``(frame, encoding)`` pair. ``frame`` holds the "Måling" and
    "Dato" columns (when present), one numeric column per entry in PARTIES,
    and the added Year/Month columns; ``encoding`` is the text encoding used
    to decode the download. Returns ``(None, None)`` when the server
    answered with a non-200 status or a body shorter than 50 bytes.

    Network and parsing errors propagate to the caller.
    """
    start_date, end_date = _month_bounds(year, month)
    csv_url = BASE_CSV_URL.format(start=start_date, end=end_date)

    resp = requests.get(csv_url, timeout=REQUEST_TIMEOUT)
    raw = resp.content
    # Measure the raw bytes: resp.text would decode the whole body only to
    # have it decoded again below.
    if resp.status_code != 200 or len(raw) < 50:
        return None, None

    det = chardet.detect(raw)
    print(det)
    # chardet reports encoding=None when it has no guess.
    encoding = det.get("encoding") or FALLBACK_ENCODING

    # The column header sits on the third row, so the first two are skipped.
    df = pd.read_csv(
        io.BytesIO(raw),
        delimiter=";",
        skiprows=2,
        encoding=encoding,
        engine="python",
    )
    df.columns = [c.strip() for c in df.columns]

    # Parse date
    if "Dato" in df.columns:
        df["Dato"] = pd.to_datetime(df["Dato"], dayfirst=True, errors="coerce")
    else:
        df["Dato"] = pd.NaT

    # Extract percentages; a party missing from the download becomes a NaN
    # column so every party column stays numeric after concatenation.
    for col in PARTIES:
        if col in df.columns:
            df[col] = df[col].apply(extract_percent)
        else:
            df[col] = float("nan")

    # Keep relevant columns, then add Year/Month on a fresh frame; assigning
    # into the column subset directly raises SettingWithCopyWarning.
    keep_cols = ["Måling", "Dato"] + PARTIES
    present = [c for c in keep_cols if c in df.columns]
    return df[present].assign(Year=year, Month=month), encoding


all_dfs = []
# Encoding of the most recently parsed month; reused when writing the
# combined file so the output matches the source encoding.
output_encoding = FALLBACK_ENCODING

for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        if year == END_YEAR and month > END_MONTH:
            break

        try:
            month_df, detected_encoding = _fetch_month(year, month)
            if month_df is None:
                print(f"No CSV data for {year}-{month:02d}")
            else:
                all_dfs.append(month_df)
                output_encoding = detected_encoding
                print(f"Downloaded {year}-{month:02d}, {len(month_df)} polls")
        except Exception as e:
            # Best effort: report the failed month and carry on with the rest.
            print(f"Error for {year}-{month:02d}: {e}")

        # Always pause, including after an empty or failed month.
        time.sleep(DELAY)

# --- Combine all months ---
if all_dfs:
    full_df = pd.concat(all_dfs, ignore_index=True)
    full_df.to_csv("all_polls.csv", index=False, encoding=output_encoding)
    print(f"Saved {len(full_df)} rows to all_polls.csv")
else:
    print("No data downloaded.")


{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Downloaded 2025-01, 8 polls
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Downloaded 2025-02, 9 polls
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Downloaded 2025-03, 7 polls
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Downloaded 2025-04, 7 polls
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Downloaded 2025-05, 8 polls
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
Downloaded 2025-06, 7 polls


KeyboardInterrupt: 

In [23]:
full_df

Unnamed: 0,Måling,Dato,Ap,Høyre,Frp,SV,Sp,KrF,Venstre,MDG,Rødt,Andre,Year,Month
0,Ipsos/Dagbladet,2008-01-30,28.0,18.8,20.6,10.1,8.0,5.5,6.0,0.0,2.4,0.0,2008,1
1,Opinion/ANB,2008-01-24,26.1,17.5,25.4,7.9,6.3,6.0,6.4,0.0,3.2,0.0,2008,1
2,Norstat/VL,2008-01-24,28.0,14.6,24.0,7.1,6.0,7.1,9.5,0.0,2.0,0.0,2008,1
3,InFact/VG,2008-01-21,28.4,20.6,24.7,5.6,6.1,6.7,5.2,0.0,1.6,0.0,2008,1
4,Respons/Aftenp,2008-01-21,30.7,18.3,24.2,7.1,5.4,6.2,5.4,0.0,1.1,0.0,2008,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1858,InFact/Nettavisen,2025-09-03,26.5,13.6,21.8,5.4,6.3,5.2,5.1,4.6,7.3,4.3,2025,9
1859,Opinion/ABC / AT,2025-09-03,26.7,13.4,22.4,5.2,6.5,3.7,4.6,7.0,6.7,4.0,2025,9
1860,Verian/TV2,2025-09-02,26.2,15.2,21.3,6.3,6.0,4.3,4.0,6.2,5.7,4.9,2025,9
1861,Respons/VG / Aftenp. / BT,2025-09-02,26.6,14.8,20.3,6.2,4.8,4.6,4.6,7.1,6.6,4.5,2025,9
