In [64]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import traceback
import pandas as pd

# configuration
dfs = []
candidateNumber = 72484
candidateEnd = 73000

# helper JS to query inside shadow roots too
QUERY_DEEP_FN = '''
function queryDeep(selector) {
  function search(root) {
    if (!root) return null;
    try {
      var el = root.querySelector(selector);
      if (el) return el;
    } catch (e) {}
    var children = root.querySelectorAll('*');
    for (var i = 0; i < children.length; i++) {
      var c = children[i];
      if (c.shadowRoot) {
        var found = search(c.shadowRoot);
        if (found) return found;
      }
    }
    return null;
  }
  return search(document);
}

function queryAllDeep(selector) {
  var results = [];
  function collect(root) {
    if (!root) return;
    try {
      var n = root.querySelectorAll(selector);
      for (var i = 0; i < n.length; i++) results.push(n[i]);
    } catch (e) {}
    var children = root.querySelectorAll('*');
    for (var i = 0; i < children.length; i++) {
      var c = children[i];
      if (c.shadowRoot) collect(c.shadowRoot);
    }
  }
  collect(document);
  return results;
}
'''

# start one browser instance and reuse it across candidates
options = Options()
# options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")

driver = None
try:
    driver = webdriver.Chrome(options=options)
    print('Started Chrome via Selenium Manager')
except Exception as e:
    print('Selenium Manager failed, falling back to webdriver-manager:', e)
    driver_path = ChromeDriverManager().install()
    driver = webdriver.Chrome(service=Service(driver_path), options=options)

# keep track of processed candidate numbers to avoid duplicates
processed = set()
previous_name = None

# main loop: navigate the single window to each candidate
while candidateNumber < candidateEnd:
    # if already processed (defensive), skip
    if candidateNumber in processed:
        candidateNumber += 1
        continue

    url = f"https://www.smartvote.ch/en/elections/26_st_winterthur_leg/candidacies/{candidateNumber}/answers"
    try:
        driver.get(url)

        wait = WebDriverWait(driver, 8)
        # wait for the page's main title to appear; if it doesn't, treat as no-answer/skip later
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))
        except Exception:
            pass

        # ensure we read a fresh name (wait up to a short time for the title to change from previous)
        name = None
        try:
            def _name_changed(d):
                try:
                    t = d.find_element(By.CSS_SELECTOR, 'h1').text.strip()
                    return t and (previous_name is None or t != previous_name)
                except Exception:
                    return False
            try:
                WebDriverWait(driver, 3).until(_name_changed)
            except Exception:
                # not critical
                pass
            # now read the h1 (if present)
            try:
                name_el = driver.find_element(By.CSS_SELECTOR, 'h1')
                name = name_el.text.strip()
            except Exception:
                name = None
        except Exception:
            name = None

        # quick check: are there any answer/question containers on the page? if not, skip candidate
        answer_indicators = driver.find_elements(By.CSS_SELECTOR, 'details.content-group, .content-group, .question-options, .question-options__value, [data-collapsible-section-id], .candidate-answers')
        if not answer_indicators:
            print(f"Candidate {candidateNumber}: no answers found, skipping")
            # update previous_name so wait logic won't hang on same title next candidate
            previous_name = name
            candidateNumber += 1
            continue

        # --- party extraction (explicit) ---
        party = None
        try:
            # prefer the object-heading__details span which contains 'Party | code'
            party_el = driver.find_element(By.CSS_SELECTOR, '.object-heading__details')
            if party_el and party_el.text:
                raw = party_el.text.strip()
                # split by '|' or newline and take left part
                party = raw.split('|')[0].strip() if '|' in raw else raw.split('\n')[0].strip()
        except Exception:
            # fallback to deep JS search
            try:
                party_js = QUERY_DEEP_FN + "\nvar el = queryDeep('.object-heading__details') || queryDeep('.candidate-header__party') || queryDeep('.party'); return el ? el.innerText.trim() : null;"
                ptxt = driver.execute_script(party_js)
                if ptxt:
                    party = ptxt.split('|')[0].strip() if '|' in ptxt else ptxt.split('\n')[0].strip()
            except Exception:
                party = None

        # --- expand all answers ---
        try:
            driver.execute_script(QUERY_DEEP_FN + "\nvar bs = queryAllDeep('button[data-collapsible-section-trigger]'); for (var i=0;i<bs.length;i++){ try{ bs[i].click(); }catch(e){} } ");
        except Exception:
            try:
                buttons = driver.find_elements(By.CSS_SELECTOR, "button[data-collapsible-section-trigger]")
                for b in buttons:
                    try:
                        driver.execute_script("arguments[0].click();", b)
                    except Exception:
                        pass
            except Exception:
                pass
        time.sleep(0.25)

        # --- collect question containers robustly and dedupe ---
        seen_html = set()
        containers = []

        container_selectors = ['details.content-group', '.content-group', '.question', '.question-block', '.qa-block', '.question-row', '.question-item']
        for sel in container_selectors:
            try:
                els = driver.find_elements(By.CSS_SELECTOR, sel)
                for e in els:
                    try:
                        outer = driver.execute_script('return arguments[0].outerHTML.slice(0,1200);', e) or ''
                        if outer and outer not in seen_html:
                            seen_html.add(outer)
                            containers.append(e)
                    except Exception:
                        continue
            except Exception:
                continue

        # also try mapping headings to nearby ancestor
        h_elems = driver.find_elements(By.CSS_SELECTOR, 'h3, h2, .question-title')
        for h in h_elems:
            try:
                el = h
                ancestor = None
                for _ in range(6):
                    try:
                        parent = el.find_element(By.XPATH, '..')
                    except Exception:
                        parent = None
                    if not parent:
                        break
                    try:
                        has_controls = parent.find_elements(By.CSS_SELECTOR, 'table, input, details, [data-collapsible-section-id], .question-options, .question-options__value')
                    except Exception:
                        has_controls = []
                    try:
                        cls = parent.get_attribute('class') or ''
                    except Exception:
                        cls = ''
                    if has_controls or 'question' in cls or 'content-group' in cls:
                        ancestor = parent
                        break
                    el = parent
                if ancestor is not None:
                    outer = driver.execute_script('return arguments[0].outerHTML.slice(0,1200);', ancestor) or ''
                    if outer and outer not in seen_html:
                        seen_html.add(outer)
                        containers.append(ancestor)
            except Exception:
                continue

        # fallback
        if not containers:
            try:
                containers = driver.find_elements(By.CSS_SELECTOR, 'details.content-group')
            except Exception:
                containers = []

        # dedupe
        unique_containers = []
        seen = set()
        for c in containers:
            try:
                o = driver.execute_script('return arguments[0].outerHTML.slice(0,1200);', c) or ''
                if o and o not in seen:
                    seen.add(o)
                    unique_containers.append(c)
            except Exception:
                continue

        questions = unique_containers

        # --- extract Q/A ---
        answers = []
        seen_questions = set()
        for idx, q in enumerate(questions):
            try:
                try:
                    driver.execute_script('arguments[0].scrollIntoView({block: "center"});', q)
                    driver.execute_script('try{ arguments[0].open = true; }catch(e){};', q)
                except Exception:
                    pass
                time.sleep(0.06)

                # question text
                question_text = None
                try:
                    qh = q.find_element(By.CSS_SELECTOR, 'h3')
                    question_text = qh.text.strip()
                except Exception:
                    try:
                        qh = q.find_element(By.CSS_SELECTOR, '.question-title, h2')
                        question_text = qh.text.strip()
                    except Exception:
                        try:
                            qh = q.find_element(By.XPATH, './preceding-sibling::*[self::h3 or self::h2][1]')
                            question_text = qh.text.strip()
                        except Exception:
                            question_text = None

                if not question_text:
                    try:
                        inner = driver.execute_script('return arguments[0].innerText;', q) or ''
                        first = inner.strip().split('\n')[0]
                        question_text = first[:120].strip() if first else f'question_{idx+1}'
                    except Exception:
                        question_text = f'question_{idx+1}'

                qkey = question_text.lower()
                if qkey in seen_questions:
                    continue

                answer_text = None

                # explicit option table extraction
                try:
                    tables = q.find_elements(By.CSS_SELECTOR, 'table.question-options, .question-options')
                    if tables:
                        for t in tables:
                            try:
                                header_ths = t.find_elements(By.CSS_SELECTOR, 'thead th')
                                if not header_ths:
                                    header_ths = t.find_elements(By.CSS_SELECTOR, 'tr:first-child th, th.question-options__label')
                                headers = [th.text.strip() for th in header_ths if th.text and th.text.strip()]

                                sel_tds = t.find_elements(By.CSS_SELECTOR, 'td.question-options__value--selected, td[class*="value--selected"], td[style*="--percentage: 100%"]')
                                if sel_tds:
                                    st = sel_tds[0]
                                    try:
                                        row = st.find_element(By.XPATH, './ancestor::tr[1]')
                                        all_tds = row.find_elements(By.CSS_SELECTOR, 'td')
                                        sel_idx = None
                                        souter = driver.execute_script('return arguments[0].outerHTML;', st)
                                        for i, td in enumerate(all_tds):
                                            o = driver.execute_script('return arguments[0].outerHTML;', td)
                                            if o == souter:
                                                sel_idx = i
                                                break
                                    except Exception:
                                        sel_idx = None

                                    label = None
                                    if headers and sel_idx is not None and sel_idx < len(headers):
                                        label = headers[sel_idx]

                                    data_val = st.get_attribute('data-value')
                                    aria = st.get_attribute('aria-label')
                                    if label:
                                        answer_text = f"{label} (data-value={data_val})" if data_val else label
                                    elif aria:
                                        answer_text = aria
                                    elif data_val:
                                        answer_text = f"value:{data_val}"
                                    else:
                                        answer_text = (st.text or '').strip()

                                    if answer_text:
                                        pass
                                    break
                            except Exception:
                                continue
                except Exception:
                    pass

                # fallbacks
                if not answer_text:
                    try:
                        ans_el = q.find_element(By.CSS_SELECTOR, '[data-collapsible-section-id]')
                        if ans_el and ans_el.text and ans_el.text.strip():
                            answer_text = ans_el.text.strip()
                    except Exception:
                        pass

                if not answer_text:
                    try:
                        sel = q.find_element(By.CSS_SELECTOR, "*[class*='value--selected'], *[class*='selected'], *[class*='option--selected']")
                        if sel and sel.text and sel.text.strip():
                            answer_text = sel.text.strip()
                    except Exception:
                        pass

                if not answer_text:
                    try:
                        checked = q.find_elements(By.CSS_SELECTOR, "input[type='radio']:checked, input[type='checkbox']:checked")
                        if checked:
                            texts = []
                            for c in checked:
                                try:
                                    idv = c.get_attribute('id')
                                    if idv:
                                        lab = q.find_elements(By.CSS_SELECTOR, "label[for='" + idv + "']")
                                        if lab and lab[0].text.strip():
                                            texts.append(lab[0].text.strip())
                                            continue
                                    parent_label = c.find_element(By.XPATH, 'ancestor::label')
                                    if parent_label and parent_label.text.strip():
                                        texts.append(parent_label.text.strip())
                                        continue
                                except Exception:
                                    try:
                                        v = c.get_attribute('value')
                                        if v:
                                            texts.append(v)
                                    except Exception:
                                        pass
                            if texts:
                                answer_text = '; '.join([t for t in texts if t])
                    except Exception:
                        pass

                if not answer_text:
                    try:
                        el = q.find_element(By.CSS_SELECTOR, "*[aria-pressed='true'], *[aria-selected='true']")
                        if el and el.text and el.text.strip():
                            answer_text = el.text.strip()
                    except Exception:
                        pass

                if not answer_text:
                    try:
                        txt = driver.execute_script('return arguments[0].innerText;', q) or ''
                        if txt:
                            if question_text and question_text in txt:
                                txt = txt.replace(question_text, '')
                            txt = txt.strip()
                            if txt:
                                answer_text = txt
                    except Exception:
                        pass

                if not answer_text:
                    answer_text = ''

                answers.append({'question': question_text, 'answer': answer_text})
                seen_questions.add(qkey)
            except Exception:
                continue

        # aggregate into dict, keep only non-empty answers
        qa_dict = {item['question']: item['answer'] for item in answers if item['answer']}

        row = {
            'Name': name,
            'Party': party,
            **qa_dict
        }

        df = pd.DataFrame([row])
        dfs.append(df)
        print(f"Processed candidate {candidateNumber}: {name}, {party}, {len(answers)} QAs")

        # mark processed
        processed.add(candidateNumber)
        previous_name = name

    except Exception as e:
        print(f"Error processing candidate {candidateNumber}: {e}")
        traceback.print_exc()

    # always increment candidateNumber and continue using the same driver
    candidateNumber += 1

# after loop, quit driver and concatenate results
try:
    driver.quit()
except Exception:
    pass

if dfs:
    result_df = pd.concat(dfs, ignore_index=True).fillna('')
    display(result_df.head())
else:
    print('No data collected')

Started Chrome via Selenium Manager
Processed candidate 72484: Simon Gonçalves, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72484: Simon Gonçalves, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72485: Tobias Kipfer, Parteilos, 65 QAs
Processed candidate 72485: Tobias Kipfer, Parteilos, 65 QAs
Processed candidate 72486: Daniel Suter, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72486: Daniel Suter, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72487: Andreas Graber, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72487: Andreas Graber, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72488: Markus Kasser, Eidgenössisch-Demokratische Union, 65 QAs
Processed candidate 72488: Markus Kasser, Eidgenössisch-Demokratische Union, 65 QAs
Candidate 72489: no answers found, skipping
Candidate 72489: no answers found, skipping
Candidate 72490: no answers found, skipping
Candidate 72490: no answers found, s

Unnamed: 0,Name,Party,Stiftung gemeinnütziger Wohnungsbau,Separativer Unterricht,Kündigung Bilaterale Verträge,Ausbau Stadion Schützenwiese,Kostenmiete statt Marktmiete,Public-Private-Partnerships,Verdichtetes Bauen,Bau von Windkraftanlagen,...,Massnahmen gegen Littering,Massnahmen gegen Drogenkonsum,Stärkeres Engagement gegen häusliche Gewalt,Erhöhte Polizeipräsenz,Staatliche Umverteilung,Bestrafung von Kriminellen,Umweltschutz,About us,Methods,Partners & media
0,Simon Gonçalves,Eidgenössisch-Demokratische Union,Yes (data-value=100),No (data-value=0),Rather no (data-value=25),Rather yes (data-value=75),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),...,Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),No (data-value=0),7 (data-value=100),1 (data-value=0),7 (data-value=100),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
1,Tobias Kipfer,Parteilos,Rather no (data-value=25),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),Yes (data-value=100),Rather no (data-value=25),...,Rather yes (data-value=75),Yes (data-value=100),Yes (data-value=100),Yes (data-value=100),2 (data-value=17),6 (data-value=83),2 (data-value=17),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
2,Daniel Suter,Eidgenössisch-Demokratische Union,Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),Rather no (data-value=25),Yes (data-value=100),No (data-value=0),...,Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),1 (data-value=0),6 (data-value=83),1 (data-value=0),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
3,Andreas Graber,Eidgenössisch-Demokratische Union,Rather yes (data-value=75),Rather yes (data-value=75),Yes (data-value=100),Rather no (data-value=25),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather no (data-value=25),...,Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),4 (data-value=50),5 (data-value=67),3 (data-value=33),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
4,Markus Kasser,Eidgenössisch-Demokratische Union,Rather yes (data-value=75),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),No (data-value=0),...,Yes (data-value=100),Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),2 (data-value=17),5 (data-value=67),4 (data-value=50),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...


In [65]:
result_df = pd.concat(dfs, ignore_index=True).fillna('')
result_df

Unnamed: 0,Name,Party,Stiftung gemeinnütziger Wohnungsbau,Separativer Unterricht,Kündigung Bilaterale Verträge,Ausbau Stadion Schützenwiese,Kostenmiete statt Marktmiete,Public-Private-Partnerships,Verdichtetes Bauen,Bau von Windkraftanlagen,...,Massnahmen gegen Littering,Massnahmen gegen Drogenkonsum,Stärkeres Engagement gegen häusliche Gewalt,Erhöhte Polizeipräsenz,Staatliche Umverteilung,Bestrafung von Kriminellen,Umweltschutz,About us,Methods,Partners & media
0,Simon Gonçalves,Eidgenössisch-Demokratische Union,Yes (data-value=100),No (data-value=0),Rather no (data-value=25),Rather yes (data-value=75),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),...,Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),No (data-value=0),7 (data-value=100),1 (data-value=0),7 (data-value=100),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
1,Tobias Kipfer,Parteilos,Rather no (data-value=25),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),Yes (data-value=100),Rather no (data-value=25),...,Rather yes (data-value=75),Yes (data-value=100),Yes (data-value=100),Yes (data-value=100),2 (data-value=17),6 (data-value=83),2 (data-value=17),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
2,Daniel Suter,Eidgenössisch-Demokratische Union,Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),Rather no (data-value=25),Yes (data-value=100),No (data-value=0),...,Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),1 (data-value=0),6 (data-value=83),1 (data-value=0),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
3,Andreas Graber,Eidgenössisch-Demokratische Union,Rather yes (data-value=75),Rather yes (data-value=75),Yes (data-value=100),Rather no (data-value=25),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather no (data-value=25),...,Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),4 (data-value=50),5 (data-value=67),3 (data-value=33),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
4,Markus Kasser,Eidgenössisch-Demokratische Union,Rather yes (data-value=75),Yes (data-value=100),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),No (data-value=0),...,Yes (data-value=100),Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),2 (data-value=17),5 (data-value=67),4 (data-value=50),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,Dominic Schmid,Sozialdemokratische Partei,Yes (data-value=100),Rather no (data-value=25),No (data-value=0),Yes (data-value=100),Yes (data-value=100),Rather yes (data-value=75),Yes (data-value=100),Rather yes (data-value=75),...,Rather no (data-value=25),No (data-value=0),Yes (data-value=100),Rather no (data-value=25),6 (data-value=83),1 (data-value=0),7 (data-value=100),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
381,Vincenzo Sartori,JungsozialistInnen,Yes (data-value=100),No (data-value=0),No (data-value=0),Rather yes (data-value=75),Yes (data-value=100),No (data-value=0),Rather yes (data-value=75),Rather yes (data-value=75),...,No (data-value=0),No (data-value=0),Yes (data-value=100),No (data-value=0),7 (data-value=100),1 (data-value=0),7 (data-value=100),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
382,Sarah Bolleter,Sozialdemokratische Partei,Yes (data-value=100),Rather no (data-value=25),No (data-value=0),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Rather yes (data-value=75),Yes (data-value=100),...,Rather no (data-value=25),Rather no (data-value=25),Yes (data-value=100),Rather no (data-value=25),6 (data-value=83),1 (data-value=0),5 (data-value=67),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...
383,Kim Eglin,Sozialdemokratische Partei,Yes (data-value=100),Rather no (data-value=25),No (data-value=0),No (data-value=0),Yes (data-value=100),Rather no (data-value=25),Rather yes (data-value=75),Yes (data-value=100),...,Rather yes (data-value=75),Rather no (data-value=25),Yes (data-value=100),No (data-value=0),7 (data-value=100),1 (data-value=0),7 (data-value=100),Project & team\n FAQ\n ...,Questionnaire\n Voting recomm...,Platform partners\n Media cov...


In [66]:
result_df.to_csv('../data/raw/candidates_answers.csv', index=False)