## Setup

In [9]:
import pandas
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime
import re

# XML namespace prefixes passed to the ElementTree find/findall queries below.
namespace = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "XInclude": "http://www.w3.org/2001/XInclude",
}

# Locations of individual ParlaMint-GB files.
# NOTE(review): person_path is an absolute Windows path while the other two
# are relative to the working directory -- confirm which layout is intended.
org_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listOrg.xml"
person_path = "D:\\ParlaMint Data\\Files\\ParlaMint-GB\\ParlaMint-GB.TEI\\ParlaMint-GB-listPerson.xml"
base_legislative_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-taxonomy-parla.legislature.xml"


## Getting Base Taxonomy Files

In [89]:
# Machine-specific root folders used by the cells below.
taxonomy_base = "D:\\ParlaMint Data\\Taxonomy"
input_path = "D:\\ParlaMint Data\\Files"
output_path = "C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files"

# Corpora to process.
all_corpus = ["ParlaMint-GB"]

# Path template kept for reference (not used in this cell):
#f"{input_path}\\{corpus}-RawSpeechesParquet.gzip"
        
def get_political_orientation_taxonomy(input_path, export_path):
    """Load the ParlaMint political-orientation taxonomy as a DataFrame.

    Parses ``ParlaMint-taxonomy-politicalOrientation.xml`` located in
    ``input_path`` and, for every ``tei:category`` that is a direct child of
    the document root, records its ``xml:id`` together with the text of its
    first ``tei:catDesc/tei:term`` element.

    :param input_path: Directory containing the taxonomy XML file.
    :param export_path: Directory intended for a CSV export. It is currently
        unused (no file is written); the parameter is kept so existing
        callers keep working.
    :return: DataFrame with the columns ``parla_tag`` and
        ``party_orientation``. It is empty (but still has both columns) when
        the file contains no matching category.
    """
    taxonomy_file = Path(input_path) / "ParlaMint-taxonomy-politicalOrientation.xml"
    root = ET.parse(taxonomy_file).getroot()

    # Fully qualified attribute name of ``xml:id`` in ElementTree notation.
    xml_id = f"{{{namespace['xml']}}}id"

    categories = []
    for elem in root.findall("tei:category", namespace):
        term = elem.find("tei:catDesc/tei:term", namespace)
        # A category without a term yields None instead of raising.
        categories.append([elem.attrib[xml_id], term.text if term is not None else None])

    # Build the frame once, after the loop, so it is always defined.
    return pandas.DataFrame(categories, columns=["parla_tag", "party_orientation"])
        
# Preview the taxonomy table as this cell's output.
get_political_orientation_taxonomy(input_path=taxonomy_base, export_path=output_path)

Unnamed: 0,parla_tag,party_orientation
0,orientation.L,Left
1,orientation.C,Centre
2,orientation.R,Right
3,orientation.FL,Far-left
4,orientation.FR,Far-right
5,orientation.CL,Centre-left
6,orientation.CR,Centre-right
7,orientation.CCL,Centre to centre-left
8,orientation.CCR,Centre to centre-right
9,orientation.CLL,Centre-left to left


## Finding Country Metadata

In [127]:
# Root folder holding one sub-folder per corpus.
input_path = "D:\\ParlaMint Data\\Files"

# Earlier CSV-based loading, kept for reference:
#party_orientation_taxonomy = pd.read_csv("C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files\\party_orientation_taxonomy.csv")
#pd.read_csv("C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files\\legislature_taxonomy.csv")

# Taxonomy table that get_corpus_party_info merges against.
party_orientation_taxonomy = get_political_orientation_taxonomy(
    input_path=taxonomy_base,
    export_path=output_path,
)
def get_corpus_party_info(corpus, input_path, taxonomy):
    """Extract party and parliament metadata from a corpus's ``listOrg`` file.

    Parses ``<input_path>/<corpus>/<corpus>.TEI/<corpus>-listOrg.xml`` and
    walks the ``tei:org`` elements that are direct children of the root:

    * ``role="parliament"`` orgs contribute their ``xml:id`` and a house type
      derived from the ``ana`` attribute (``"Upper"`` if it contains
      ``"upper"``, ``"Lower"`` if it contains ``"lower"``, else ``"Unknown"``).
    * ``role="politicalParty"`` orgs contribute their ``xml:id``, full and
      abbreviated names, and a political-orientation tag that is translated
      to a readable label through ``taxonomy``.

    Orgs with any other role (or no ``role`` attribute) are ignored.

    :param corpus: Corpus name, e.g. ``"ParlaMint-GB"``.
    :param input_path: Root directory holding one sub-folder per corpus.
    :param taxonomy: DataFrame with the columns ``parla_tag`` and
        ``party_orientation``, as returned by
        ``get_political_orientation_taxonomy``.
    :return: Tuple ``(party_info, parliaments_info)``. ``party_info`` has the
        columns ``party_tag``, ``full_name``, ``abv_name`` and
        ``party_orientation`` (missing when the party has no orientation or
        its tag is not in ``taxonomy``); ``parliaments_info`` has the columns
        ``parliament_tag`` and ``house_type``.
    """
    org_file = Path(input_path) / corpus / f"{corpus}.TEI" / f"{corpus}-listOrg.xml"
    root = ET.parse(org_file).getroot()

    # Fully qualified attribute name of ``xml:id`` in ElementTree notation.
    xml_id = f"{{{namespace['xml']}}}id"

    def _text_or_none(parent, path):
        """Return the text of the first element matching ``path``, or None."""
        node = parent.find(path, namespace)
        return node.text if node is not None else None

    parties = {"party_tag": [], "full_name": [], "abv_name": [], "parla_tag": []}
    parliaments = {"parliament_tag": [], "house_type": []}

    for elem in root.findall("tei:org", namespace):
        role = elem.attrib.get("role")
        if role == "parliament":
            ana = elem.attrib.get("ana", "")
            if "upper" in ana:
                house_type = "Upper"
            elif "lower" in ana:
                house_type = "Lower"
            else:
                house_type = "Unknown"
            parliaments["parliament_tag"].append(elem.attrib[xml_id])
            parliaments["house_type"].append(house_type)
        elif role == "politicalParty":
            # Resolve every field before appending so the four lists can
            # never end up with different lengths.
            orientation = elem.find(".//tei:state[@type='politicalOrientation']/tei:state", namespace)
            ana = orientation.attrib.get("ana") if orientation is not None else None
            parties["party_tag"].append(elem.attrib[xml_id])
            parties["full_name"].append(_text_or_none(elem, ".//tei:orgName[@full='yes']"))
            parties["abv_name"].append(_text_or_none(elem, ".//tei:orgName[@full='abb']"))
            # ``ana`` is a reference whose first character (expected to be
            # '#') is dropped; "NONE" marks parties without an orientation.
            parties["parla_tag"].append(ana[1:] if ana is not None else "NONE")

    party_info = (
        pandas.DataFrame(parties)
        .merge(taxonomy, how="left", on="parla_tag")
        .drop(columns=["parla_tag"])
    )
    parliaments_info = pandas.DataFrame(parliaments)
    return party_info, parliaments_info
        
# Build both tables for the GB corpus and show them in the cell output.
party_info, parliaments_info = get_corpus_party_info(
    corpus="ParlaMint-GB",
    input_path=input_path,
    taxonomy=party_orientation_taxonomy,
)
display(party_info, parliaments_info)

Unnamed: 0,party_tag,full_name,abv_name,party_orientation
0,party.PAUB,Alba Party,PAUB,Centre-left
1,party.QHMP,Conservative Independent,QHMP,
2,party.SO0T,Other,SO0T,
3,party.LD,Liberal Democrat,LD,Centre to centre-left
4,party.64RT,Respect,64RT,Left
5,party.SDLP,Social Democratic & Labour Party,SDLP,Centre-left
6,party.L1QU,Liberal Democrat Independent,L1QU,Centre to centre-left
7,party.0UBS,Independent Conservative,0UBS,
8,party.BI,Bishops,BI,
9,party.LI,Labour Independent,LI,Left


Unnamed: 0,parliament_tag,house_type
0,parliament.HC,Lower
1,parliament.HL,Upper


In [138]:

# Inputs for the country-metadata extraction below.
input_path = "D:\\ParlaMint Data\\Files"  # root folder with one sub-folder per corpus
all_corpus = ["ParlaMint-GB"]  # corpora to process
def country_metadata(corpus_list, input_path):
    """Collect per-person metadata from each corpus's ``listPerson`` file.

    For every corpus in ``corpus_list`` the file
    ``<input_path>/<corpus>/<corpus>.TEI/<corpus>-listPerson.xml`` is parsed.
    Each ``tei:person`` that is a direct child of the root adds exactly one
    entry to every list of the result, so all lists have the same length and
    index ``i`` always describes the same person.

    :param corpus_list: Iterable of corpus names, e.g. ``["ParlaMint-GB"]``.
    :param input_path: Root directory holding one sub-folder per corpus.
    :return: Dict of equally long lists with the keys

        * ``name_id`` -- ``"<code>-<xml:id>"`` where ``<code>`` is the corpus
          name after its first ``"-"`` (``"ParlaMint-GB"`` -> ``"GB"``);
        * ``name`` -- ``"<forenames>, <surnames>"``;
        * ``gender`` -- ``value`` of the first ``tei:sex`` element, or None;
        * ``term_start`` -- ``from`` of the first ``tei:affiliation``, or
          None when there is no affiliation or it has no ``from``;
        * ``roles`` -- sorted unique ``role`` values of the affiliations;
        * ``parties`` -- sorted unique affiliation refs containing
          ``"party"``, without their first character;
        * ``government`` -- sorted unique remaining affiliation refs,
          without their first character;
        * ``affiliations`` -- ``role`` values of the affiliations in
          document order.
    """
    # Fully qualified attribute name of ``xml:id`` in ElementTree notation.
    # Computed up front: nesting same-style quotes inside an f-string is
    # only valid from Python 3.12 onwards.
    xml_id = f"{{{namespace['xml']}}}id"

    results = {"name_id": [], "name": [], "gender": [], "term_start": [], "roles": [], "parties": [], "government": [], "affiliations": []}

    for corpus in corpus_list:
        country_code = corpus.split("-", 1)[-1]
        person_file = Path(input_path) / corpus / f"{corpus}.TEI" / f"{corpus}-listPerson.xml"
        root = ET.parse(person_file).getroot()

        for elem in root.findall("tei:person", namespace):
            results["name_id"].append(f"{country_code}-{elem.attrib[xml_id]}")

            sex = elem.find("tei:sex", namespace)
            results["gender"].append(sex.attrib.get("value") if sex is not None else None)

            # Each affiliation carries a role and optionally a ref / from.
            affiliations = [affil.attrib for affil in elem.findall("tei:affiliation", namespace)]
            results["term_start"].append(affiliations[0].get("from") if affiliations else None)

            parties = set()
            government = set()
            for affiliation in affiliations:
                ref = affiliation.get("ref")
                if ref is None:
                    continue
                # Drop the first character of the reference (expected '#').
                if "party" in ref:
                    parties.add(ref[1:])
                else:
                    government.add(ref[1:])

            role_sequence = [affiliation["role"] for affiliation in affiliations if "role" in affiliation]

            # Append once per person (not once per affiliation) so every
            # list stays aligned with name_id.
            results["roles"].append(sorted(set(role_sequence)))
            results["parties"].append(sorted(parties))
            results["government"].append(sorted(government))
            results["affiliations"].append(role_sequence)

            # Name parts without text are skipped rather than breaking join.
            forename = ' '.join(name.text for name in elem.findall('tei:persName/tei:forename', namespace) if name.text)
            surname = ' '.join(name.text for name in elem.findall('tei:persName/tei:surname', namespace) if name.text)
            results["name"].append(f"{forename}, {surname}")

    return results
        
# Run the extraction for every configured corpus.
country_metadata(corpus_list=all_corpus, input_path=input_path)

{'name_id': ['GB-MargaretProsser', 'GB-JonathanMendelsohn', 'GB-AlanMak', 'GB-RuthHunt', 'GB-JoannaCherry', 'GB-WilliamJordan', 'GB-MayBlood', 'GB-TheresaVilliers', 'GB-GeorgeGalloway', 'GB-PaulWhite', 'GB-JohnSpellar', 'GB-BeebanKidron', 'GB-AnnaTurley', 'GB-GeraintDavies', 'GB-TimothyYeo', 'GB-PeterHeatonJones', 'GB-GrevilleHoward', 'GB-NicholasHoltam', 'GB-DavidLaws', 'GB-JohnGardiner', 'GB-EleanorReeves', 'GB-RobertMay', 'GB-EmmaLewellBuck', 'GB-GillianKeegan', 'GB-MatthewWestern', 'GB-MargotAline', 'GB-SebastianCoe', 'GB-BrendaDean', 'GB-SueEllenBraverman', 'GB-KatyClark', 'GB-RobertFlello', 'GB-GavinShuker', 'GB-StewartJackson', 'GB-MarkEastwood', 'GB-RogerGale', 'GB-MelanieOnn', 'GB-AngelaHarris', 'GB-DonaldMackay', 'GB-CharlesDugdale', 'GB-JulianSmith', 'GB-JamesGray', 'GB-ThomasBrake', 'GB-ReginaldEmpey', 'GB-GarryHart', 'GB-TaniaMathias', 'GB-LisaForbes', 'GB-HilaryBenn', 'GB-LaurenceRobertson', 'GB-JeanBarker', 'GB-ThomasCoke', 'GB-DianaEccles', 'GB-JaneHunt', 'GB-MarkHunter