In [14]:
import pandas 
import requests
from bs4 import BeautifulSoup
import html2text
import time

In [21]:
import openai
from openai import OpenAI
import os
import json
from datetime import datetime

# Location of the local file holding the OpenAI API key (one line, no quotes).
KEY_PATH = "/Users/finn/Documents/keys/openai"

# The key file is the primary source. If it is missing, fall back to an
# OPENAI_API_KEY that is already set in the environment instead of crashing.
try:
    with open(KEY_PATH, "r") as f:
        key = f.read().strip()
except FileNotFoundError:
    key = os.environ.get("OPENAI_API_KEY", "").strip()

# Fail here rather than at the first API call if no usable key was found.
if not key:
    raise RuntimeError(
        f"No OpenAI API key found: {KEY_PATH} is missing or empty "
        "and OPENAI_API_KEY is not set"
    )

openai.api_key = key
# OpenAI() below is constructed without arguments, so expose the key through
# the OPENAI_API_KEY environment variable as well.
os.environ["OPENAI_API_KEY"] = key

client = OpenAI()

In [2]:
# Wikipedia page listing the 2024 elections; this is the page that gets parsed below.
url = "https://en.wikipedia.org/wiki/List_of_elections_in_2024#Europe"

# Use a timeout so a stalled connection cannot hang the kernel, and stop on an
# HTTP error so an error page is never parsed as if it were the election list.
req = requests.get(url, timeout=30)
req.raise_for_status()

soup = BeautifulSoup(req.content, 'html.parser')

In [None]:
# Identify all <li> which have a <span class="flagicon"> descendant; these are
# treated as the per-country entries.
li_elements = [
    li for li in soup.find_all('li')
    if li.find('span', class_='flagicon')
]
assert li_elements, "no flagged <li> entries found - has the page layout changed?"

elections = []
for li in li_elements:
    country_link = li.find('a')
    # The <ul> nested inside the country entry holds its elections.
    election_list = li.find('ul')
    if country_link is None or election_list is None:
        # A flagged entry without a link or a nested list has nothing to record.
        continue
    country_name = country_link.text

    # A separate loop variable keeps the outer `li` from being overwritten.
    for election_li in election_list.find_all('li'):
        election_link = election_li.find('a')
        link = election_link.get('href') if election_link is not None else None
        if not link:
            # No article link means there is no page to download later.
            continue
        elections.append({
            'country': country_name,
            'election': election_li.text,
            'url': link
        })

# Explicit columns keep the frame queryable even when nothing was scraped.
df = pandas.DataFrame(elections, columns=['country', 'election', 'url'])
df.query("country == 'United States'")

# TODO: Filter for just OECD elections - try using PyCountry or similar to get the country codes

# TODO: Pass this to ChatGPT or the API or the OpenAI playground and get it to identify just the national legislative/head of state/government elections


Unnamed: 0,country,election,url
44,United States,"2024 United States elections, 5 November\n2024...",/wiki/2024_United_States_elections
45,United States,2024 United States gubernatorial elections,/wiki/2024_United_States_gubernatorial_elections
46,United States,2024 United States House of Representatives el...,/wiki/2024_United_States_House_of_Representati...
47,United States,2024 United States presidential election,/wiki/2024_United_States_presidential_election
48,United States,2024 United States Senate elections,/wiki/2024_United_States_Senate_elections
49,United States,2024 United States state legislative elections,/wiki/2024_United_States_state_legislative_ele...


In [16]:
# Dump the files to /election_html_dump
# Create the target directory first so a fresh checkout can run this cell.
os.makedirs("election_html_dump", exist_ok=True)

for i, election in enumerate(elections):
    if i % 10 == 0:
        print(f"{i}/{len(elections)}")
    url = f"https://en.wikipedia.org{election['url']}"
    # Timeout guards against a hung connection; raise_for_status stops the run
    # instead of saving an error page (404, 429, ...) as if it were the article.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    # Write as UTF-8 explicitly so the result does not depend on the OS locale.
    with open(f"election_html_dump/{election['url'].replace('/wiki/', '')}.html", 'w', encoding='utf-8') as f:
        f.write(req.text)
    # Short pause between requests to avoid hammering the server.
    time.sleep(0.2)

0/181
10/181
20/181
30/181
40/181
50/181
60/181
70/181
80/181
90/181
100/181
110/181
120/181
130/181
140/181
150/181
160/181
170/181
180/181


In [18]:
# convert to markdown
# Create the output directory first so a fresh checkout can run this cell.
os.makedirs("election_md_dump", exist_ok=True)

for i, election in enumerate(elections):
    if i % 10 == 0:
        print(f"{i}/{len(elections)}")
    # Read and write as UTF-8 explicitly so the result does not depend on the OS locale.
    with open(f"election_html_dump/{election['url'].replace('/wiki/', '')}.html", 'r', encoding='utf-8') as f:
        html = f.read()
    # A fresh converter per page, configured to drop hyperlinks from the output.
    h = html2text.HTML2Text()
    h.ignore_links = True
    markdown = h.handle(html)
    with open(f"election_md_dump/{election['url'].replace('/wiki/', '')}.md", 'w', encoding='utf-8') as f:
        f.write(markdown)

0/181
10/181
20/181
30/181
40/181
50/181
60/181
70/181
80/181
90/181
100/181
110/181
120/181
130/181
140/181
150/181
160/181
170/181
180/181


In [30]:

# Send it off to the OpenAI API
# One request per election page; each raw API response is saved as JSON under openai_dump/.
os.makedirs("openai_dump", exist_ok=True)

for i, election in enumerate(elections):

    if i % 10 == 0:
        print(f"{i}/{len(elections)}")

    # Read as UTF-8 explicitly so the result does not depend on the OS locale.
    with open(f"election_md_dump/{election['url'].replace('/wiki/', '')}.md", 'r', encoding='utf-8') as f:
        markdown = f.read()

    # The doubled braces are literal braces in the f-string. The template is a
    # pseudo-JSON schema for the model to fill in, not strict JSON itself.
    prompt = f"""
        Following is markdown text for an election. Identify the swing that the incumbent party faced. Return data as JSON:
        {{
            "election": "2024 X presidential election",
            "party": "party name",
            "country": "country name",
            "iso3" : e.g. "USA",
            "democracy": free | partly free | not free,
            "economy": "developed | developing | emerging",
            "election_type": "national legislative | local | individual head of state | individual head of government | multinational (e.g. EU) | other",
            "swing": 0.04,
            "message": space for possible acknowledgements or notes (e.g. note if there isn't a comparable election)
        }}

        Page text:
        {markdown}
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt}
        ],
        temperature=0.2,  # Low temperature for more consistent (not strictly deterministic) output
        max_tokens=4095,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "json_object"}
    )

    # Keep the whole response (content plus usage metadata), not just the message text.
    with open(f"openai_dump/{election['url'].replace('/wiki/', '')}.json", 'w', encoding='utf-8') as f:
        json.dump(response.to_dict(), f)

0/181
10/181
20/181
30/181
40/181
50/181
60/181
70/181
80/181
90/181
100/181
110/181
120/181
130/181
140/181
150/181
160/181
170/181
180/181


In [27]:
# Debug: show the API payload currently held in `response` (assigned inside the request loop) as a plain dict.
response.to_dict()

{'id': 'chatcmpl-AQzqRRPEIzFdi4ppYZKjy7m418vD4',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': '{\n    "election": "2024 Tuvaluan general election",\n    "party": "Independents",\n    "swing": 0.0,\n    "message": "There are no political parties in Tuvalu and all candidates run as independents, so there is no comparable swing for a party."\n}',
    'refusal': None,
    'role': 'assistant'}}],
 'created': 1730997623,
 'model': 'gpt-4o-2024-08-06',
 'object': 'chat.completion',
 'system_fingerprint': 'fp_159d8341cc',
 'usage': {'completion_tokens': 66,
  'prompt_tokens': 7691,
  'total_tokens': 7757,
  'prompt_tokens_details': {'cached_tokens': 7552, 'audio_tokens': 0},
  'completion_tokens_details': {'reasoning_tokens': 0,
   'audio_tokens': 0,
   'accepted_prediction_tokens': 0,
   'rejected_prediction_tokens': 0}}}

In [24]:
# Debug: show the repr of the `response` object left over from the request loop.
response

ChatCompletion(id='chatcmpl-AQznWyL4LWEI9R42TsQ4TAzr0zQXM', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "election": "2024 Tuvaluan general election",\n    "party": "Independents",\n    "swing": 0.0,\n    "message": "There are no political parties in Tuvalu and all candidates run as independents, so there is no comparable swing for an incumbent party."\n}', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1730997442, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_159d8341cc', usage=CompletionUsage(completion_tokens=67, prompt_tokens=7691, total_tokens=7758, prompt_tokens_details={'cached_tokens': 0, 'audio_tokens': 0}, completion_tokens_details={'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}))

In [6]:
# Debug: display whatever element the loop variable `li` from the scraping cell is still bound to.
li

<li><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="1200" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/commons/thumb/3/38/Flag_of_Tuvalu.svg/23px-Flag_of_Tuvalu.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/3/38/Flag_of_Tuvalu.svg/35px-Flag_of_Tuvalu.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/38/Flag_of_Tuvalu.svg/46px-Flag_of_Tuvalu.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Tuvalu" title="Tuvalu">Tuvalu</a>
<ul><li><a href="/wiki/2024_Tuvaluan_general_election" title="2024 Tuvaluan general election">2024 Tuvaluan general election</a>, 26 January</li></ul></li>