By customizing this code, users can download the raw text and the speaker information of the UK House of Commons parliamentary debates, as published in the TheyWorkForYou online repository.

In this example, the code is set to scrape the debates from the 1970s (1970–1979).

In [None]:
## Load libraries

from bs4 import BeautifulSoup, SoupStrainer
import urllib.request 
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import xml.etree.ElementTree as ET

In [2]:
## Define the URL to scrape data from

# Directory listing from which the links to the debate XML files are read.
URL = "https://www.theyworkforyou.com/pwdata/scrapedxml/debates/"

# Read the listing inside a context manager so the HTTP connection is closed
# as soon as the page has been downloaded.
with urllib.request.urlopen(URL) as response:
    listing_html = response.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parsed result could
# differ from one machine to another.
soup = BeautifulSoup(listing_html, "html.parser")


In [3]:
# Build the absolute URL of every link on the index page whose target
# contains '.xml'.
all_urls = [
    URL + anchor['href']
    for anchor in soup.find_all('a', href=True)
    if '.xml' in anchor['href']
]

In [None]:
# Download every debate file from 1970-1979 into the working directory,
# saving each one under the last path component of its URL.
decade_markers = [f"debates{year}" for year in range(1970, 1980)]

for xml_url in all_urls:
    # Skip files that do not belong to the selected decade.
    if not any(marker in xml_url for marker in decade_markers):
        continue
    local_name = xml_url.split('/')[-1]
    urllib.request.urlretrieve(xml_url, local_name)

In [None]:
# Parse every downloaded 1970-1979 debate file and collect one record per
# <speech> element that carries a "speakername" attribute.
final_dataset = []

for u in all_urls:
    if any(f"debates{i}" in u for i in range(1970, 1980)):
        # The file is expected in the working directory under the last
        # path component of its URL.
        filename = u.split('/')[-1]

        try:
            tree = ET.parse(filename)
            root = tree.getroot()
        except ET.ParseError:
            # A malformed file is reported and skipped so that one bad
            # download does not abort the whole run.
            print("Error parsing file:", filename)
            continue

        for speech in root.findall('.//speech[@speakername]'):
            name = speech.get('speakername')
            speechid = speech.get('id')
            hansard_membership_id = speech.get("hansard_membership_id")
            speaker_id = speech.get('speakerid')

            # Element.text only holds the text that precedes the first child
            # element, so a paragraph containing inline markup would be cut
            # short. itertext() yields the text of the paragraph and of all
            # its descendants, in document order.
            paragraphs = (
                ''.join(p.itertext()).strip() for p in speech.findall('.//p')
            )
            # Paragraphs without any text are dropped so they do not leave
            # stray double spaces in the joined speech.
            text = ' '.join(paragraph for paragraph in paragraphs if paragraph)

            final_dataset.append((speechid, name, speaker_id, hansard_membership_id, text))

In [None]:
# Name the columns at construction time. With an empty final_dataset a bare
# pd.DataFrame(final_dataset) has zero columns, and assigning five column
# names to it afterwards raises a length-mismatch ValueError.
df = pd.DataFrame(
    final_dataset,
    columns=["speech_id_link", "speaker", "twfy_member_id", "hansard_id", "text"],
)

In [None]:
# Label the columns; the list has one name per column, in column order.
column_names = [
    "speech_id_link",
    "speaker",
    "twfy_member_id",
    "hansard_id",
    "text",
]
df.columns = column_names

The resulting dataframe contains the following information, based on the scraped XML tags:

- **speech_id_link**: the unique link identifier of the speech, which also contains the date on which the speech was made
- **speaker**: the speaker of the speech as recorded in the scraped XML tags
- **twfy_member_id**: TWFY member ID, a unique identifier for the speaker
- **hansard_id**: Hansard ID, a unique identifier for the speaker
- **text**: the text of the speech

In [None]:
# Persist the dataframe as a pickle (.pkl) file, which has the advantage of
# being faster to process than a CSV file.

output_path = "uk_debates_1970_1979.pkl"
df.to_pickle(output_path)