## Setup

In [9]:
import pandas
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime
import re

# Prefix -> URI map handed to the ElementTree find/findall calls in this notebook.
namespace = dict(
    tei="http://www.tei-c.org/ns/1.0",
    xml="http://www.w3.org/XML/1998/namespace",
    XInclude="http://www.w3.org/2001/XInclude",
)

# Sample ParlaMint-GB file locations (one absolute Windows path, two relative paths).
org_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listOrg.xml"
person_path = "D:\\ParlaMint Data\\Files\\ParlaMint-GB\\ParlaMint-GB.TEI\\ParlaMint-GB-listPerson.xml"
base_legislative_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-taxonomy-parla.legislature.xml"


## Getting Base Taxonomy Files

In [200]:
# Machine-specific folders: adjust these before running the notebook elsewhere.
output_path = "C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files"
taxonomy_base = "D:\\ParlaMint Data\\Taxonomy"
input_path = "D:\\ParlaMint Data\\Files"

# Corpora to work on.
all_corpus = ["ParlaMint-GB"]
        
def get_political_orientation_taxonomy(input_path):
    """Read the ParlaMint political orientation taxonomy into a dataframe.

    Parses ``ParlaMint-taxonomy-politicalOrientation.xml`` inside ``input_path`` and collects, for every
    ``category`` element directly below the document root, its ``xml:id`` and the text of its first
    ``catDesc/term``. Could be adjusted for CHES data.

    :param input_path: Folder that contains the taxonomy file.
    :returns: Dataframe with the columns ``parla_tag`` (category id) and ``party_orientation`` (term text, or
        ``None`` when the category has no term element).
    """
    # Join with pathlib instead of a literal backslash so the file is found on any operating system.
    taxonomy_file = Path(input_path) / "ParlaMint-taxonomy-politicalOrientation.xml"
    root = ET.parse(taxonomy_file).getroot()
    id_key = f"{{{namespace['xml']}}}id"  # ElementTree stores xml:id under its fully qualified name.

    categories = []
    for elem in root.findall("tei:category", namespace):
        term = elem.find("tei:catDesc/tei:term", namespace)
        # A category without a term is kept with an empty orientation instead of aborting the whole import.
        categories.append([elem.attrib[id_key], term.text if term is not None else None])
    return pandas.DataFrame(categories, columns=["parla_tag", "party_orientation"])


def get_legislature_taxonomy(input_path):
    """Read the house categories of the ParlaMint legislature taxonomy into a dataframe.

    Parses ``ParlaMint-taxonomy-parla.legislature.xml`` inside ``input_path``, locates the first category
    whose ``xml:id`` is ``parla.organization`` and collects the categories nested two and three levels
    below it (all second-level matches first, then all third-level matches).

    :param input_path: Folder that contains the taxonomy file.
    :returns: Dataframe with the columns ``parla_tag`` (category id) and ``house_type`` (text of the first
        ``catDesc/term``, e.g. lower or upper house; ``None`` when a category has no term). The dataframe
        is empty when the file has no ``parla.organization`` category.
    """
    # Join with pathlib instead of a literal backslash so the file is found on any operating system.
    taxonomy_file = Path(input_path) / "ParlaMint-taxonomy-parla.legislature.xml"
    root = ET.parse(taxonomy_file).getroot()
    id_key = f"{{{namespace['xml']}}}id"  # ElementTree stores xml:id under its fully qualified name.
    columns = ["parla_tag", "house_type"]

    organization = root.find(".//tei:category[@xml:id='parla.organization']", namespace)
    if organization is None:
        # Always hand back a dataframe; callers index it by column and would fail on an implicit None.
        return pandas.DataFrame(columns=columns)

    nested = (
        organization.findall("tei:category/tei:category", namespace)
        + organization.findall("tei:category/tei:category/tei:category", namespace)
    )
    categories = []
    for category in nested:
        term = category.find("tei:catDesc/tei:term", namespace)
        categories.append([category.attrib[id_key], term.text if term is not None else None])
    return pandas.DataFrame(categories, columns=columns)


## Finding Country Conversion Metadata

In [272]:
# Root folder with one sub-folder per ParlaMint corpus.
input_path = "D:\\ParlaMint Data\\Files"

# Load both shared taxonomies from the taxonomy folder; they are reused for every corpus.
legislature_taxonomy = get_legislature_taxonomy(taxonomy_base)
party_orientation_taxonomy = get_political_orientation_taxonomy(taxonomy_base)
def get_corpus_party_info(corpus, input_path, party_taxonomy, legislature_taxonomy):
    """
    Given a corpus, input path, party taxonomy and legislature taxonomy, build two dataframes from the corpus'
    ``listOrg`` file: one with its parties and their orientations and one with its parliaments and their house
    types (i.e. lower, upper).

    :param corpus: Corpus of a country (i.e. "ParlaMint-GB").
    :param input_path: Base path where all of the corpora are ({input_path}/{corpus}/{corpus}.TEI/...).
    :param party_taxonomy: General ParlaMint party ideology taxonomy (columns ``parla_tag``, ``party_orientation``).
    :param legislature_taxonomy: General ParlaMint legislature taxonomy (columns ``parla_tag``, ``house_type``).
    :returns: Tuple ``(parties, parliaments)``. ``parties`` has the columns ``party_tag``, ``full_name``,
        ``abv_name`` and ``party_orientation``; missing names are the string "nan" and an unknown orientation is
        left empty by the left merge. ``parliaments`` has the columns ``parliament_tag`` and ``house_type`` with one
        row per annotation of a parliament that is found in ``legislature_taxonomy``; a parliament without any
        such annotation contributes no row.
    """
    # Setup. pathlib joins the parts with the separator of the current operating system.
    org_file = Path(input_path) / corpus / f"{corpus}.TEI" / f"{corpus}-listOrg.xml"
    root = ET.parse(org_file).getroot()
    id_key = f"{{{namespace['xml']}}}id"  # ElementTree stores xml:id under its fully qualified name.
    party_roles = {"politicalParty", "parliamentaryGroup", "representative"}

    # Tag -> house type lookup; setdefault keeps the first entry if a tag were listed twice.
    house_type_by_tag = {}
    for tag, house_type in zip(legislature_taxonomy["parla_tag"], legislature_taxonomy["house_type"]):
        house_type_by_tag.setdefault(tag, house_type)

    parties = {"party_tag": [], "full_name": [], "abv_name": [], "parla_tag": []}
    parliaments = {"parliament_tag": [], "house_type": []}

    # Looking through all orgs directly below the root of the xml.
    for elem in root.findall("tei:org", namespace):
        role = elem.attrib.get("role")
        if role == "parliament":
            # Identifiers are annotated together ("#a #b"), so split them and test each one.
            for identifier in elem.attrib.get("ana", "").replace("#", "").split():
                if identifier in house_type_by_tag:
                    # Tag and house type are appended as a pair so both columns always have the same length.
                    parliaments["parliament_tag"].append(elem.attrib[id_key])
                    parliaments["house_type"].append(house_type_by_tag[identifier])
        elif role in party_roles:
            full_name = elem.find(".//tei:orgName[@full='yes']", namespace)
            abbreviation = elem.find(".//tei:orgName[@full='abb']", namespace)
            orientation = elem.find(".//tei:state[@type='politicalOrientation']/tei:state", namespace)

            parties["party_tag"].append(elem.attrib[id_key])
            parties["full_name"].append(full_name.text if full_name is not None else "nan")
            parties["abv_name"].append(abbreviation.text if abbreviation is not None else "nan")
            if orientation is not None and "ana" in orientation.attrib:
                parties["parla_tag"].append(orientation.attrib["ana"][1:])  # Drop the leading "#".
            else:
                parties["parla_tag"].append("nan")

    party_frame = (
        pandas.DataFrame(parties)
        .merge(party_taxonomy, how="left", on="parla_tag")
        .drop(columns=["parla_tag"])
    )
    return party_frame, pandas.DataFrame(parliaments)
        
# Try the extraction on the Ukrainian corpus and show both resulting tables.
party_info, parliaments_info = get_corpus_party_info(
    corpus="ParlaMint-UA",
    input_path=input_path,
    party_taxonomy=party_orientation_taxonomy,
    legislature_taxonomy=legislature_taxonomy,
)
display(party_info, parliaments_info)

Unnamed: 0,party_tag,full_name,abv_name,party_orientation
0,pp.NRU,"Політична партія ""Народний Рух України""",РУХ,Centre-right
1,pp.UNP,Українська народна партія,УНП,Centre-right
2,pp.PP,Партія праці,ПП,Centre-left
3,pp.SDPU,Соціал-демократична партія України (об'єднана),СДПУ(о),Centre to centre-left
4,pp.Hrom,Всеукраїнське об'єднання «Громада»,Громада,Centre-left
...,...,...,...,...
143,fr.ud,"Фракція Політичної партії ""УДАР (Український Д...",фУДАР,Centre
144,fr.sv,"Фракція Всеукраїнське об'єднання ""Свобода""",фСвобода,Right to far-right
145,fr.kpu,Фракція Комуністичної партії України,фКПУ,Far-left
146,gr.eu,"Група ""Суверенна європейська Україна""",гСЄУ,Centre


Unnamed: 0,parliament_tag,house_type
0,ВРУ,Unicameralism


## Getting all Country MetaData Files

In [271]:
# Folder holding the shared ParlaMint taxonomy XML files (machine-specific absolute path).
taxonomy_base = "D:\\ParlaMint Data\\Taxonomy"
def _earliest_affiliation_start(affiliations):
    """Return the earliest ``from`` value (cut to 10 characters) among ``affiliations``, or "nan" if none has one."""
    starts = [affiliation["from"][:10] for affiliation in affiliations if affiliation.get("from")]
    # Assumes year-first, zero-padded dates (YYYY or YYYY-MM-DD), which sort chronologically as plain text.
    # Comparing text instead of digits-only integers also ranks a bare year such as "2015" correctly.
    return min(starts) if starts else "nan"


def _looks_like_party_ref(ref):
    """Substring heuristic used for references whose target org is not known to the corpus party table."""
    return "politicalParty" in ref or "party" in ref or "pp" in ref


def _joined_name_parts(person, part):
    """Join the text of every ``persName/<part>`` element of ``person`` with spaces, skipping empty elements."""
    elements = person.findall(f"tei:persName/tei:{part}", namespace)
    return " ".join(element.text for element in elements if element.text)


def country_metadata(corpus_list, input_path, export_path, taxonomy_path):
    """
    Given a corpus list, find the metadata of every speaker of each corpus in the ParlaMint dataset and write
    it to one csv per corpus.

    Every affiliation reference is classified by looking its target up in the corpus' own org tables: a known
    parliament becomes a legislative branch, a known party or parliamentary group becomes a party. Only
    references unknown to both tables fall back to the substring heuristic; if it matches they are listed as a
    party named "nan", otherwise they are dropped.

    :param corpus_list: A list of corpora to search through (i.e. ["ParlaMint-GB", "ParlaMint-AT"...])
    :param input_path: Base path where all of the corpora are ({input_path}/{corpus}/{corpus}.TEI/...)
    :param export_path: Existing folder where the person metadata is written (as {export_path}/{corpus}-metadata.csv)
    :param taxonomy_path: Base path where all taxonomy files are kept.
    :returns: Dict that maps each corpus name to the dataframe that was written for it. Columns: ``name_id``,
        ``name``, ``gender``, ``term_start``, ``roles``, ``parties``, ``party_orientation``, ``legislative_branch``.
    """
    # Getting taxonomy from base.
    legislature_taxonomy = get_legislature_taxonomy(taxonomy_path)
    political_orientation_taxonomy = get_political_orientation_taxonomy(taxonomy_path)

    # Built once outside any f-string: nesting the same quote type inside an f-string needs Python 3.12+.
    id_key = f"{{{namespace['xml']}}}id"
    columns = ["name_id", "name", "gender", "term_start", "roles", "parties", "party_orientation", "legislative_branch"]
    exported = {}

    # Searching through all corpus in a corpus list.
    for corpus in corpus_list:
        # Setup
        print(f"Getting the MetaData for corpus {corpus} at {datetime.datetime.now()}.")
        party_info, parliaments_info = get_corpus_party_info(corpus, input_path, political_orientation_taxonomy, legislature_taxonomy)
        person_path = Path(input_path) / corpus / f"{corpus}.TEI" / f"{corpus}-listPerson.xml"
        root = ET.parse(person_path).getroot()

        # Id -> value lookups; setdefault keeps the first row when an id occurs more than once.
        party_lookup = {}
        for tag, full_name, orientation in zip(party_info["party_tag"], party_info["full_name"], party_info["party_orientation"]):
            party_lookup.setdefault(tag, (full_name, orientation))
        house_lookup = {}
        for tag, house_type in zip(parliaments_info["parliament_tag"], parliaments_info["house_type"]):
            house_lookup.setdefault(tag, house_type)

        records = []
        # Searching through all people
        for elem in root.findall("tei:person", namespace):
            # Affiliations carry the party / parliament reference, the role and the dates.
            affiliations = [affil.attrib for affil in elem.findall("tei:affiliation", namespace)]

            # Sets keep only unique information.
            roles, party_ids, house_ids = set(), set(), set()
            for affiliation in affiliations:
                if "role" in affiliation:
                    roles.add(affiliation["role"])
                ref = affiliation.get("ref")
                if not ref:
                    continue
                org_id = ref[1:]  # Drop the leading "#".
                if org_id in house_lookup:
                    house_ids.add(org_id)
                elif org_id in party_lookup or _looks_like_party_ref(ref):
                    party_ids.add(org_id)
                # Anything else (e.g. a government body) has no house type and is left out.

            # Sorted ids give a stable order; names and orientations stay aligned index by index.
            party_ids = sorted(party_ids)
            party_names = [party_lookup.get(party_id, ("nan", "nan"))[0] for party_id in party_ids]
            party_orientation = [party_lookup.get(party_id, ("nan", "nan"))[1] for party_id in party_ids]

            sex = elem.find("tei:sex", namespace)
            forename = _joined_name_parts(elem, "forename")
            surname = _joined_name_parts(elem, "surname")

            records.append({
                "name_id": f"{corpus[-2:]}-{elem.attrib[id_key]}",
                "name": f"{forename}, {surname}",
                "gender": sex.get("value", "nan") if sex is not None else "nan",
                "term_start": _earliest_affiliation_start(affiliations),
                "roles": sorted(roles),
                "parties": party_names,
                "party_orientation": party_orientation,
                "legislative_branch": [house_lookup[house_id] for house_id in sorted(house_ids)],
            })

        # Passing the column list keeps the header intact even for a corpus without any person.
        metadata = pd.DataFrame(records, columns=columns)
        metadata.to_csv(Path(export_path) / f"{corpus}-metadata.csv", index=False)
        exported[corpus] = metadata

    return exported

        
# Getting the Country MetaData Information
# Corpora are read from input_path; one csv per corpus is written to export_path.
input_path = "D:\\ParlaMint Data\\Files"
export_path = "D:\\ParlaMint Data\\MetaData"

# Only the Ukrainian corpus is processed here; the complete corpus list is kept below for reference.
# all_corpus = ["ParlaMint-AT", "ParlaMint-BA", "ParlaMint-BE", "ParlaMint-BG", "ParlaMint-CZ", "ParlaMint-DK", "ParlaMint-EE", "ParlaMint-ES", "ParlaMint-ES-CT", "ParlaMint-ES-GA", "ParlaMint-ES-PV", "ParlaMint-FI", "ParlaMint-FR", "ParlaMint-GR", "ParlaMint-HR", "ParlaMint-HU", "ParlaMint-IS", "ParlaMint-IT", "ParlaMint-LV", "ParlaMint-NL", "ParlaMint-NO", "ParlaMint-PL", "ParlaMint-PT", "ParlaMint-RS", "ParlaMint-SE", "ParlaMint-SI", "ParlaMint-TR", "ParlaMint-UA", "ParlaMint-GB"]
all_corpus = ["ParlaMint-UA"]

full_metadata = country_metadata(
    corpus_list=all_corpus,
    input_path=input_path,
    export_path=export_path,
    taxonomy_path=taxonomy_base,
)

Getting the MetaData for corpus ParlaMint-UA at 2024-05-22 18:00:22.941241.
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.NRU', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.UNP', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.PP', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.SDPU', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.Hrom', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.PRP', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.CPSU', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.Reg', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML/1998/namespace}id': 'pp.OB', 'role': 'politicalParty'}
False
True
False
{'{http://www.w3.org/XML

In [228]:
# Load the semicolon-separated csv files; their first column becomes the index.
test_set = pd.read_csv(
    "C:\\Users\\shad4\\Downloads\\test_set.csv",
    sep=";",
    index_col=0,
)
training_set = pd.read_csv(
    "C:\\Users\\shad4\\Downloads\\training_set.csv",
    sep=";",
    index_col=0,
)

# Store both sets as parquet next to the csv files; index=False leaves the index out of the files.
test_set.to_parquet(
    "C:\\Users\\shad4\\Downloads\\test_set.parquet",
    index=False,
)
training_set.to_parquet(
    "C:\\Users\\shad4\\Downloads\\training_set.parquet",
    index=False,
)

# Show the training data as the cell output.
training_set

Unnamed: 0,TEXT,CLASS
1,"як товар. я вважаю, що вищі посадові особи дер...",2
2,". тому ми проголосували так, щоб були всі прис...",3
3,"zadovoljni. razlika pa je v tem, v kakšni meri...",3
4,", the security of the process of calling an el...",2
5,", s sorazmerno majhnimi nakladami – primerjaln...",1
...,...,...
697,"nedopustno, če se s tem imenovanje sodnikov po...",2
698,"ministru oporekati? tisti, ki smo oporekali, s...",4
699,на місцеві вибори. це відбувається в колисці п...,3
700,kako uspešno zaključiti naloge hvala. gospod p...,3
