# Parliament Hackathon 2024 - Data Wrangling Jupyter Notebook
## Setup

In [16]:
import pandas
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime
import re

# Baseline namespace information
namespace = {'tei': 'http://www.tei-c.org/ns/1.0', "xml": "http://www.w3.org/XML/1998/namespace", "XInclude": "http://www.w3.org/2001/XInclude"}

## Getting ParlaMint Speeches

In [21]:
def get_parlamint_xml_speech(input_path, country_id=""):
    """
    Given an input path to an xml ParlaMint file, export a pandas dataframe with its speeches, speech id, and a person id, potentially modified per country.
    :param input_path: Input file path.
    :param country_id: Country_id (i.e. "GB") to prefix to the person_id (as "<country_id>-<person>"). If empty, the person_id is left unprefixed.
    :return: A pandas dataframe with the columns speech_id, speech, and person_id (one row per tei:u element).
    :raises KeyError: If a tei:u element has no xml:id attribute.
    """
    # Setup - Getting root and dict info
    root = ET.parse(input_path).getroot()
    xml_id_key = f"{{{namespace['xml']}}}id"
    export_speech_dict = {"speech_id": [], "speech": [], "person_id": []}

    # Searching through all tei speeches
    for elem in root.findall("tei:text/tei:body/tei:div/tei:u", namespace):
        export_speech_dict["speech_id"].append(elem.attrib[xml_id_key])  # Add speech_id

        # Add person_id. A speech may have no "who" attribute, in which case a blank id is stored.
        who = elem.attrib.get("who")
        if who is None:
            person_id = ""
        else:
            person_id = who[1:]  # Drops the first character of the reference (presumably a leading "#" - confirm against the data).
            if country_id:
                person_id = f"{country_id}-{person_id}"
        export_speech_dict["person_id"].append(person_id)

        # Sorting through speech segments.
        segments = []
        for seg in elem.findall("tei:seg", namespace):
            # Element.text is None when the segment has no leading text; skip such segments (and empty ones) instead of crashing on seg_text[-1].
            # NOTE: only the text before the first child element of the segment is read.
            seg_text = seg.text
            if not seg_text:
                continue
            # Not all segments end in a period. To ensure that splitting text by sentence goes well, and that sentences don't combine, a period is added to the end of each segment.
            if seg_text[-1] not in (".", "?", "!"):
                seg_text += "."
            segments.append(seg_text)

        # Segments are separated by a single space, with no space before the first one.
        export_speech_dict["speech"].append(" ".join(segments))  # Adding speech
    return pd.DataFrame(export_speech_dict)

def get_corpus_list_speeches(corpus_list, input_path, export_path):
    """
    Given ParlaMint corpuses, output a csv and parquet file containing all of their speeches.
    For each corpus, the root file <input_path>/<corpus>/<corpus>.TEI/<corpus>.xml is read and every file it includes directly is parsed with get_parlamint_xml_speech.
    :param corpus_list: The corpus that are used to get ParlaMint data (i.e. "ParlaMint-GB", since all files begin with this).
    :param input_path: Where the files come from.
    :param export_path: Where the files should be outputted (as <corpus>-RawSpeechesCSV.csv and <corpus>-RawSpeechesParquet.gzip).
    """
    speech_columns = ["speech_id", "speech", "person_id"]
    export_dir = Path(export_path)

    # Begin by looking through the corpus list.
    for corpus in corpus_list:
        # Get all of the xml file paths from the corpus root xml file.
        base_xml_path = Path(input_path) / corpus / f"{corpus}.TEI"
        xml_info_path = base_xml_path / f"{corpus}.xml"
        root = ET.parse(xml_info_path).getroot()
        # Only XInclude elements that are direct children of the root are followed.
        xml_file_paths = [
            base_xml_path / elem.attrib["href"]
            for elem in root.findall("XInclude:include", namespace)
        ]

        # With all of the xml file paths, search through all files to find all speeches, before concatenating them together.
        number_xmls = len(xml_file_paths)
        print(f"Beginning {corpus} search with {number_xmls} files.")
        # The last two characters of the corpus name are used as the country id (e.g. "GB"; note "ParlaMint-ES-CT" gives "CT").
        country_id = corpus[-2:]
        speech_frames = [get_parlamint_xml_speech(file_path, country_id) for file_path in xml_file_paths]

        # Concatenate once at the end: concatenating inside the loop re-copies all previous rows for every file.
        if speech_frames:
            parlamint_speeches = pd.concat(speech_frames, ignore_index=True)
        else:
            # No included files: still export an empty table with the expected columns.
            parlamint_speeches = pd.DataFrame(columns=speech_columns)

        # Export to csv and parquet.
        parlamint_speeches.to_csv(export_dir / f"{corpus}-RawSpeechesCSV.csv", index=False)
        parlamint_speeches.to_parquet(export_dir / f"{corpus}-RawSpeechesParquet.gzip", index=False, compression='gzip')


        
# Corpora handed to get_corpus_list_speeches by the (currently disabled) call at the end of this cell.
all_corpus = [
    "ParlaMint-BA", "ParlaMint-BE", "ParlaMint-BG",
    "ParlaMint-CZ", "ParlaMint-DK", "ParlaMint-EE",
    "ParlaMint-ES", "ParlaMint-ES-CT", "ParlaMint-ES-GA", "ParlaMint-ES-PV",
    "ParlaMint-FI", "ParlaMint-FR", "ParlaMint-GR",
    "ParlaMint-HR", "ParlaMint-HU", "ParlaMint-IS",
    "ParlaMint-IT", "ParlaMint-LV", "ParlaMint-NL",
    "ParlaMint-NO", "ParlaMint-PL", "ParlaMint-PT",
    "ParlaMint-RS", "ParlaMint-SE", "ParlaMint-SI",
    "ParlaMint-TR", "ParlaMint-UA",
]

# Locations of the GB metadata files (disabled; find_party and the person-parsing cell read base_legislative_path and person_path).
# base_legislative_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-taxonomy-parla.legislature.xml"
# person_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listPerson.xml"
# org_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listOrg.xml"

# Smaller corpus list for test runs (disabled).
#corpus_list_test = ["ParlaMint-AT", "ParlaMint-GB"]

# Folder holding the downloaded corpora, and folder receiving the raw speech exports.
base_path = "D:\\ParlaMint Data\\Files"
output_path = "D:\\ParlaMint Data\\Raw Data"

# Extracts the speeches of every corpus in all_corpus (currently off because it already finished in an earlier run).
# get_corpus_list_speeches(all_corpus, base_path, output_path)


## ParlaMint Data Cleaning

In [4]:
# Load the raw GB speeches from parquet (the file name follows the <corpus>-RawSpeechesParquet.gzip pattern written by get_corpus_list_speeches).
new_dataframe = pd.read_parquet("D:\\ParlaMint Data\\Raw Data\\ParlaMint-GB-RawSpeechesParquet.gzip")
# Bare expression so the notebook displays the dataframe as the cell output.
new_dataframe


Unnamed: 0,speech_id,speech,person_id
0,ParlaMint-GB_2015-01-05-commons.u1,1. What progress her Department has made on im...,GB-JenniferWillott
1,ParlaMint-GB_2015-01-05-commons.u2,The Government are on track to deliver their c...,GB-TheresaMay
2,ParlaMint-GB_2015-01-05-commons.u3,"It is clear that exit checks, which were scrap...",GB-JenniferWillott
3,ParlaMint-GB_2015-01-05-commons.u4,"As I indicated in my original answer, we are o...",GB-TheresaMay
4,ParlaMint-GB_2015-01-05-commons.u5,19. Given the situation at our border in Calai...,GB-AlexanderCunningham
...,...,...,...
670907,ParlaMint-GB_2022-07-21-lords.u209,I will have to check that point for the noble ...,GB-JoannaPenn
670908,ParlaMint-GB_2022-07-21-lords.u210,"My Lords, the Minister has referred a number o...",GB-PremSikka
670909,ParlaMint-GB_2022-07-21-lords.u211,The noble Lord is right that different aspects...,GB-JoannaPenn
670910,ParlaMint-GB_2022-07-21-lords.u212,I thank noble Lords for a very interesting deb...,GB-PremSikka


In [20]:
# Disabled: draw a 5% sample (with replacement, fixed seed) of the raw GB speeches for quick experiments.
#gb_data = new_dataframe.copy().sample(frac=0.05, replace=True, random_state=1)
# Settings for the cleaning step. NOTE: all_corpus and output_path rebind the names set in the extraction cell above.
all_corpus = ["ParlaMint-GB"]
input_path = "D:\\ParlaMint Data\\Raw Data"
output_path = "D:\\ParlaMint Data\\Cleaned Data"

def clean_data(corpus_list, input_path, export_path):
    """
    Clean the raw speeches of each corpus, split them into sentences, and export the result as a parquet file.
    Reads <input_path>/<corpus>-RawSpeechesParquet.gzip and writes <export_path>/<corpus>-CleanedSpeechesParquet.gzip.
    The exported table has one row per sentence longer than 30 characters, with the columns index, speech_id, speech, and person_id; each speech_id is the original speech id followed by "-<row index>".
    :param corpus_list: The corpus names to clean (i.e. "ParlaMint-GB").
    :param input_path: Folder containing the raw speech parquet files.
    :param export_path: Folder where the cleaned parquet files should be outputted.
    """
    # Patterns are compiled once, outside the corpus loop.
    interpellation_pattern = re.compile(r'\(.*?\)')
    # A ".", "!" or "?" followed by whitespace ends a sentence, unless it directly follows a digit (e.g. the "1." of a numbered question).
    sentence_end_pattern = re.compile(r'(?<!\d)[.!?]\s+')

    def clean_text(text):
        text = interpellation_pattern.sub('', text)  # Remove interpellations (text within parentheses)
        text = text.replace('-', '')  # Remove hyphens
        text = text.replace('_', '')  # Remove underscores
        return text.strip()

    def split_text(text):
        # The punctuation that ends a sentence is consumed by the split.
        return sentence_end_pattern.split(text)

    for corpus in corpus_list:
        speech_data = pd.read_parquet(Path(input_path) / f"{corpus}-RawSpeechesParquet.gzip")

        print(f"Part 1: Cleaning Speech at {datetime.datetime.now()}")
        speech_data['speech'] = speech_data['speech'].apply(clean_text)

        print(f"Part 2: Exploding Speech at {datetime.datetime.now()}")
        # One row per sentence; speech_id and person_id are repeated for every sentence of a speech.
        speech_data = speech_data.assign(speech=speech_data['speech'].apply(split_text)).explode('speech')

        print(f"Part 3: Removing Small Text at {datetime.datetime.now()}")
        speech_data = speech_data[speech_data['speech'].str.len() > 30]

        print(f"Part 4: Resetting Index at {datetime.datetime.now()}")
        # The second reset_index turns the fresh 0..n-1 index into an "index" column.
        speech_data = speech_data.reset_index(drop=True).reset_index()

        print(f"Part 5: Making New Speech ID at {datetime.datetime.now()}")
        speech_data["speech_id"] = speech_data["speech_id"] + "-" + speech_data["index"].astype(str)

        print(f"Final: Showing Info at {datetime.datetime.now()}")

        speech_data.to_parquet(Path(export_path) / f"{corpus}-CleanedSpeechesParquet.gzip", index=False, compression='gzip')
        


Part 1: Cleaning Speech at 2024-05-21 11:12:05.501402
Part 2: Exploding Speech at 2024-05-21 11:12:06.120787
Part 3: Removing Small Text at 2024-05-21 11:12:08.916071
Part 4: Resetting Index at 2024-05-21 11:12:09.163345
Part 5: Making New Speech ID at 2024-05-21 11:12:09.272992
Final: Showing Info at 2024-05-21 11:12:09.471333


Unnamed: 0,index,speech_id,speech,person_id
0,0,ParlaMint-GB_2020-07-15-commons.u212-0,Friend join me in congratulating Liberal Democ...,GB-JacobYoung
1,1,ParlaMint-GB_2020-05-06-lords.u32-1,"My Lords, the noble Baroness refers to the sus...",GB-ElizabethSugg
2,2,ParlaMint-GB_2020-05-06-lords.u32-2,I have already mentioned the £150 million goin...,GB-ElizabethSugg
3,3,ParlaMint-GB_2020-05-06-lords.u32-3,We are also working closely with the private s...,GB-ElizabethSugg
4,4,ParlaMint-GB_2020-07-14-commons.u459-4,"The Housing, Communities and Local Government ...",GB-RushanaraAli
...,...,...,...,...
256086,256086,ParlaMint-GB_2016-01-06-commons.u374-256086,I was amazed that residents affected were in c...,GB-RichardArkless
256087,256087,ParlaMint-GB_2016-01-06-commons.u374-256087,We should never take that resilience for grant...,GB-RichardArkless
256088,256088,ParlaMint-GB_2016-01-06-commons.u374-256088,"The weather is not going to get any better, so...",GB-RichardArkless
256089,256089,ParlaMint-GB_2016-09-15-lords.u2-256089,"My Lords, the Government have halted the decli...",GB-NicholasBourne


In [None]:
# Display gb_data with list-valued "speech" entries expanded to one row per element.
# NOTE(review): the only visible top-level assignment of gb_data is commented out - confirm it still exists in the kernel before running this cell.
gb_data.explode("speech")

In [55]:

# Getting the Party Information
def find_party():
    """
    Print the tag and attributes of every top-level element of the XML file at base_legislative_path.
    The path is read from the module-level variable base_legislative_path; nothing is returned.
    """
    taxonomy_root = ET.parse(base_legislative_path).getroot()
    for child in taxonomy_root:
        print(child.tag, child.attrib)
        
        

# Collect metadata for every person in the file at person_path.
# results maps each person's xml:id to a dict with their name, gender, term start, roles, parties, government references, and raw affiliations.
results = {}
tree = ET.parse(person_path)
root = tree.getroot()

#"{http://www.tei-c.org/ns/1.0}person"
for elem in root.findall("tei:person", namespace):
    # Getting name_ids
    name_id = elem.attrib[f"{{{namespace['xml']}}}id"]
    print(name_id)

    # Getting gender from the first tei:sex element (blank if the person has none).
    sex_elem = elem.find("tei:sex", namespace)
    gender = sex_elem.attrib.get("value", "") if sex_elem is not None else ""

    # Getting affiliations (has party, member role, year)
    affiliations = [affil.attrib for affil in elem.findall("tei:affiliation", namespace)]

    # The term start is the "from" of the first affiliation. It is set for every person so a value
    # can never carry over from the previous person when someone has no affiliations.
    term_start = affiliations[0].get("from", "") if affiliations else ""

    roles = set()
    parties = set()
    government = set()
    for affiliation in affiliations:
        # References containing "party" are parties; every other reference is filed under government.
        ref = affiliation.get("ref")
        if ref is not None:
            if "party" in ref:
                parties.add(ref)
            else:
                government.add(ref)
        if "role" in affiliation:
            roles.add(affiliation["role"])

    # Getting the full name of the individual (name parts without text are skipped).
    forename = ' '.join([name.text for name in elem.findall('tei:persName/tei:forename', namespace) if name.text])
    surname = ' '.join([name.text for name in elem.findall('tei:persName/tei:surname', namespace) if name.text])
    name = f"{forename}, {surname}"

    # Store one entry per person. Updating a single flat dict with the same keys would keep only the last person.
    results[name_id] = {"name_id": name_id, "name": name, "gender": gender, "term_start": term_start, "roles": roles, "parties": parties, "government": government, "affiliations": affiliations}
    
    #print("exception")
    #print(results)
    # print(affiliations)
    # print(name)
    # for person 
    # 
    # in list(elem):
    #     # print(person.tag, person.attrib)
    #     #print(person.tag, person.attrib)
    # #print(elem.tag, elem.attrib)
    #     if elem.tag == "{http://www.tei-c.org/ns/1.0}head":
    #         # The header contains the debate topic. Attribute notes which header it is.
    #         #print(elem.attrib)
    #         debate_topic = elem.text.strip()
    #         print(elem.text.strip())
    #     elif elem.tag == "{http://www.tei-c.org/ns/1.0}u":
    #         #print(elem.attrib)
    #         pass
    #     #print(elem.tag)
    #     try:
    #         pass
    #         #print(elem.attrib["{http://www.w3.org/XML/1998/namespace}id"])
    #     except Exception as e:
    #         pass
    #     # print(elem.tag, "hola",elem.attrib)
    #     try:
    #         if elem.text.strip():
    #             pass
    #             # print(elem.tag)
    #             # print(elem.attrib)
    #             # print(elem.text.strip())
    #             #results[elem.tag] = elem.text.strip()
    #     except Exception as e:
    #         pass
    #         #print(e)

MargaretProsser
JonathanMendelsohn
AlanMak
RuthHunt
JoannaCherry
WilliamJordan
MayBlood
TheresaVilliers
GeorgeGalloway
PaulWhite
JohnSpellar
BeebanKidron
AnnaTurley
GeraintDavies
TimothyYeo
PeterHeatonJones
GrevilleHoward
NicholasHoltam
DavidLaws
JohnGardiner
EleanorReeves
RobertMay
EmmaLewellBuck
GillianKeegan
MatthewWestern
MargotAline
SebastianCoe
BrendaDean
SueEllenBraverman
KatyClark
RobertFlello
GavinShuker
StewartJackson
MarkEastwood
RogerGale
MelanieOnn
AngelaHarris
DonaldMackay
CharlesDugdale
JulianSmith
JamesGray
ThomasBrake
ReginaldEmpey
GarryHart
TaniaMathias
LisaForbes
HilaryBenn
LaurenceRobertson
JeanBarker
ThomasCoke
DianaEccles
JaneHunt
MarkHunter
RobertSyms
TobiasEllwood
LeoDocherty
SallyAnnHart
TomGreatrex
AndrewGriffiths
ErnestOxburgh
BrooksNewmark
AlanBeith
DavidRichards
VictoriaAtkins
AnnMcKechin
RobertMaclennan
CharlesHendry
BrettElphicke
MiriamCates
TracyBrabin
MichaelMorris
PaulCondon
SallyOppenheimBarnes
PaulMurphy
ThangamDebbonaire
GavinBarwell
EwenCameron
Pat

In [7]:
find_party()

{http://www.tei-c.org/ns/1.0}desc {'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
{http://www.tei-c.org/ns/1.0}category {'{http://www.w3.org/XML/1998/namespace}id': 'parla.geo-political'}
{http://www.tei-c.org/ns/1.0}category {'{http://www.w3.org/XML/1998/namespace}id': 'parla.organization'}
{http://www.tei-c.org/ns/1.0}category {'{http://www.w3.org/XML/1998/namespace}id': 'parla.term'}
