## Setup

In [9]:
import pandas
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime
import re

# Prefix -> URI map handed to ElementTree's find()/findall() so that XPath
# expressions can use the short "tei:" / "xml:" prefixes.
namespace = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "XInclude": "http://www.w3.org/2001/XInclude",
}

# Individual ParlaMint-GB source files. Note that person_path is an absolute
# Windows path while the other two are relative to the working directory.
base_legislative_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-taxonomy-parla.legislature.xml"
person_path = "D:\\ParlaMint Data\\Files\\ParlaMint-GB\\ParlaMint-GB.TEI\\ParlaMint-GB-listPerson.xml"
org_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listOrg.xml"


## Getting Base Taxonomy Files

In [200]:
# Corpora selected in this cell (only the GB corpus here).
all_corpus = ["ParlaMint-GB"]
# Root folder of the corpus data; later cells read
# <input_path>\<corpus>\<corpus>.TEI\... from it. Machine-specific absolute path.
input_path = "D:\\ParlaMint Data\\Files"
# Folder passed to the taxonomy loaders below, which look for the
# ParlaMint-taxonomy-*.xml files inside it.
taxonomy_base = "D:\\ParlaMint Data\\Taxonomy"
# Presumably the destination for exported CSV files; it only appears in
# commented-out code in this part of the notebook - TODO confirm it is still needed.
output_path = "C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files"
# Leftover path template (not executed):
#f"{input_path}\\{corpus}-RawSpeechesParquet.gzip"
        
def get_political_orientation_taxonomy(input_path):
    """Load the ParlaMint political-orientation taxonomy as a DataFrame.

    Parses ``ParlaMint-taxonomy-politicalOrientation.xml`` inside ``input_path``
    and, for every ``tei:category`` that is a direct child of the document
    root, collects its ``xml:id`` together with the text of its first
    ``tei:catDesc/tei:term``.

    :param input_path: Directory that contains the taxonomy XML file.
    :return: DataFrame with the columns ``parla_tag`` and
        ``party_orientation``. It is empty (but still has both columns) when
        the file contains no matching category.
    """
    # Path joins with the platform separator, so this also works off Windows.
    taxonomy_file = Path(input_path) / "ParlaMint-taxonomy-politicalOrientation.xml"
    root = ET.parse(taxonomy_file).getroot()
    id_attribute = f"{{{namespace['xml']}}}id"
    categories = [
        [elem.attrib[id_attribute], elem.find("tei:catDesc/tei:term", namespace).text]
        for elem in root.findall("tei:category", namespace)
    ]
    # Build the frame once, after collecting every row. Building it inside the
    # loop redid the work per category and left the result undefined when the
    # loop body never ran.
    return pandas.DataFrame(categories, columns=["parla_tag", "party_orientation"])


def get_legislature_taxonomy(input_path):
    """Load the house-type part of the ParlaMint legislature taxonomy.

    Parses ``ParlaMint-taxonomy-parla.legislature.xml`` inside ``input_path``,
    locates the first ``tei:category`` whose ``xml:id`` is
    ``parla.organization`` and collects the categories nested two and three
    levels below it (second level first, then third level). For each one the
    ``xml:id`` and the text of its first ``tei:catDesc/tei:term`` are kept.

    :param input_path: Directory that contains the taxonomy XML file.
    :return: DataFrame with the columns ``parla_tag`` and ``house_type``. It
        is empty (but still has both columns) when the file has no
        ``parla.organization`` category, instead of ``None``.
    """
    columns = ["parla_tag", "house_type"]
    # Path joins with the platform separator, so this also works off Windows.
    taxonomy_file = Path(input_path) / "ParlaMint-taxonomy-parla.legislature.xml"
    root = ET.parse(taxonomy_file).getroot()

    organization = root.find(".//tei:category[@xml:id='parla.organization']", namespace)
    if organization is None:
        # Always hand back a DataFrame so callers can index it unconditionally.
        return pandas.DataFrame(columns=columns)

    id_attribute = f"{{{namespace['xml']}}}id"
    nested_categories = (
        organization.findall("tei:category/tei:category", namespace)
        + organization.findall("tei:category/tei:category/tei:category", namespace)
    )
    rows = [
        [category.attrib[id_attribute], category.find("tei:catDesc/tei:term", namespace).text]
        for category in nested_categories
    ]
    return pandas.DataFrame(rows, columns=columns)

        
# get_political_orientation_taxonomy(taxonomy_base)  # example call; the function takes only the taxonomy folder

## Finding Country Metadata

In [225]:
# Root folder of the corpus data (machine-specific absolute Windows path).
input_path = "D:\\ParlaMint Data\\Files"
# Commented-out alternative: read the taxonomies from CSV exports instead of the XML files.
#party_orientation_taxonomy = pd.read_csv("C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files\\party_orientation_taxonomy.csv")
#pd.read_csv("C:\\Users\\shad4\\PycharmProjects\\ParliamentHackathon2024\\datawrangling\\files\\legislature_taxonomy.csv")

# Build both taxonomy tables from the XML files under taxonomy_base.
# NOTE: taxonomy_base is assigned in the "Getting Base Taxonomy Files" cell,
# so that cell has to run before this one.
party_orientation_taxonomy = get_political_orientation_taxonomy(taxonomy_base)
legislature_taxonomy = get_legislature_taxonomy(taxonomy_base)
def get_corpus_party_info(corpus, input_path, party_taxonomy, legislature_taxonomy):
    """Extract party and parliament information from a corpus' listOrg file.

    Reads ``<input_path>/<corpus>/<corpus>.TEI/<corpus>-listOrg.xml`` and
    inspects every ``tei:org`` element directly below the document root:

    * ``role="parliament"``: each identifier in the ``ana`` attribute that is
      present in ``legislature_taxonomy`` yields one ``(parliament_tag,
      house_type)`` row. Identifiers without a match are skipped, so a
      parliament with no matching identifier contributes no row.
    * ``role="politicalParty"`` or ``"parliamentaryGroup"``: one row with the
      ``xml:id``, the full and abbreviated ``tei:orgName`` and the political
      orientation looked up in ``party_taxonomy``. Missing pieces are stored
      as the string ``"nan"``.

    Organisations with any other (or no) role are ignored.

    :param corpus: Corpus name, e.g. ``"ParlaMint-GR"``.
    :param input_path: Root folder that contains one sub-folder per corpus.
    :param party_taxonomy: DataFrame with the columns ``parla_tag`` and
        ``party_orientation``.
    :param legislature_taxonomy: DataFrame with the columns ``parla_tag`` and
        ``house_type``.
    :return: Tuple ``(party_info, parliaments_info)``. ``party_info`` has the
        columns ``party_tag``, ``full_name``, ``abv_name`` and
        ``party_orientation``; ``parliaments_info`` has ``parliament_tag`` and
        ``house_type``.
    """
    org_file = Path(input_path) / corpus / f"{corpus}.TEI" / f"{corpus}-listOrg.xml"
    root = ET.parse(org_file).getroot()
    id_attribute = f"{{{namespace['xml']}}}id"

    # parla_tag -> house_type; setdefault keeps the first row for a tag, which
    # is what a boolean-mask lookup followed by ``.values[0]`` would return.
    house_types = {}
    for tag, house_type in zip(legislature_taxonomy["parla_tag"], legislature_taxonomy["house_type"]):
        house_types.setdefault(tag, house_type)

    parties = {"party_tag": [], "full_name": [], "abv_name": [], "parla_tag": []}
    parliaments = {"parliament_tag": [], "house_type": []}
    for elem in root.findall("tei:org", namespace):
        role = elem.attrib.get("role")
        if role == "parliament":
            parliament_tag = elem.attrib[id_attribute]
            for identifier in elem.attrib.get("ana", "").replace("#", "").split():
                if identifier in house_types:
                    # Append tag and house type together so both columns stay
                    # aligned even with zero or several matching identifiers.
                    parliaments["parliament_tag"].append(parliament_tag)
                    parliaments["house_type"].append(house_types[identifier])
        elif role in ("politicalParty", "parliamentaryGroup"):
            parties["party_tag"].append(elem.attrib[id_attribute])

            full_name = elem.find(".//tei:orgName[@full='yes']", namespace)
            parties["full_name"].append(full_name.text if full_name is not None else "nan")

            abbreviation = elem.find(".//tei:orgName[@full='abb']", namespace)
            parties["abv_name"].append(abbreviation.text if abbreviation is not None else "nan")

            orientation = elem.find(".//tei:state[@type='politicalOrientation']/tei:state", namespace)
            if orientation is not None and "ana" in orientation.attrib:
                # Drop the leading '#' of the reference.
                parties["parla_tag"].append(orientation.attrib["ana"][1:])
            else:
                parties["parla_tag"].append("nan")

    # Left join keeps parties without a known orientation (NaN in that column).
    party_info = (
        pandas.DataFrame(parties)
        .merge(party_taxonomy, how="left", on="parla_tag")
        .drop(columns=["parla_tag"])
    )
    parliaments_info = pandas.DataFrame(parliaments)
    return party_info, parliaments_info
        
# Try the extraction on the ParlaMint-GR corpus and render both resulting tables.
party_info, parliaments_info = get_corpus_party_info("ParlaMint-GR", input_path, party_orientation_taxonomy, legislature_taxonomy)
# display() is provided by IPython inside the notebook.
display(party_info)
display(parliaments_info)

Unnamed: 0,party_tag,full_name,abv_name,party_orientation
0,party.ΝΔ,Νέα Δημοκρατία,Ν.Δ.,Centre-right
1,party.ΑΝΕΛ,Ανεξάρτητοι έλληνες εθνική πατριωτική δημοκρατ...,ΑΝ.ΕΛ.,Right
2,party.ΔΗΣΥ,Δημοκρατική Συμπαράταξη,ΔΗ.ΣΥ.,Centre-left
3,party.Ε_Λ,Ελληνική Λύση,Ε.Λ.,Right to far-right
4,party.Ε_Κ,Ένωση Κεντρώων,Ε.Κ.,Centre
5,party.ΚΙΝΑΛ,Κίνημα Αλλαγής,ΚΙΝ.ΑΛ.,Centre-left
6,party.KKE,Κομμουνιστικό Κόμμα Ελλάδος,Κ.Κ.Ε.,
7,party.ΛΑ_Ε,Λαϊκή Ενότητα,ΛΑ.Ε.,Left to far-left
8,party.Χ_Α,Λαϊκός Σύνδεσμος - Χρυσή Αυγή,Χ.Α.,Far-right
9,party.ΜέΡΑ25,Μέτωπο Ευρωπαϊκής Ρεαλιστικής Ανυπακοής,ΜέΡΑ25,Left


Unnamed: 0,parliament_tag,house_type
0,PoGR,Unicameralism


In [226]:
# Folder with the ParlaMint taxonomy XML files (same value as assigned in the
# "Getting Base Taxonomy Files" cell); passed to country_metadata further down.
taxonomy_base = "D:\\ParlaMint Data\\Taxonomy"
def _first_value_lookup(frame, key_column, value_column):
    """Return a dict mapping each key to the value found in its first row of ``frame``."""
    lookup = {}
    for key, value in zip(frame[key_column], frame[value_column]):
        # setdefault keeps the first occurrence, like a mask lookup + ``.values[0]``.
        lookup.setdefault(key, value)
    return lookup


def country_metadata(corpus_list, input_path, export_path, taxonomy_path):
    """Export per-person metadata for every corpus to ``<corpus>-metadata.csv``.

    For each corpus the party/parliament tables are obtained through
    ``get_corpus_party_info`` and every ``tei:person`` directly below the root
    of ``<input_path>/<corpus>/<corpus>.TEI/<corpus>-listPerson.xml`` becomes
    one row with these columns:

    * ``name_id``: ``"GB-"`` followed by the person's ``xml:id``.
    * ``name``: ``"<forenames>, <surnames>"`` taken from ``tei:persName``.
    * ``gender``: ``value`` of the first ``tei:sex`` element.
    * ``term_start``: ``from`` attribute of the first ``tei:affiliation``.
    * ``roles``: the highest-ranked role present among the affiliations, in
      the order ``head``, ``minister``, ``member``.
    * ``parties`` / ``party_orientation``: full name and orientation of every
      affiliation whose ``ref`` contains ``"party"``, ordered by party id.
    * ``legislative_branch``: house types of the remaining affiliation refs
      that match a known parliament; unknown refs are dropped.

    Missing scalar values are stored as the string ``"nan"``.

    :param corpus_list: Iterable of corpus names, e.g. ``["ParlaMint-GB"]``.
    :param input_path: Root folder that contains one sub-folder per corpus.
    :param export_path: Existing folder the CSV files are written to.
    :param taxonomy_path: Folder that contains the ParlaMint taxonomy XML files.
    :return: Dict mapping each corpus name to the DataFrame that was written.
    """
    # Honour the taxonomy_path argument instead of silently relying on a
    # notebook-level global variable.
    legislature_taxonomy = get_legislature_taxonomy(taxonomy_path)
    political_orientation_taxonomy = get_political_orientation_taxonomy(taxonomy_path)
    id_attribute = f"{{{namespace['xml']}}}id"
    role_priority = ("head", "minister", "member")
    exported = {}

    for corpus in corpus_list:
        print(f"Getting the MetaData for corpus {corpus} at {datetime.datetime.now()}.")
        party_info, parliaments_info = get_corpus_party_info(corpus, input_path, political_orientation_taxonomy, legislature_taxonomy)

        # Build the lookup tables once per corpus instead of filtering the
        # DataFrames again for every person.
        party_names = _first_value_lookup(party_info, "party_tag", "full_name")
        party_orientations = _first_value_lookup(party_info, "party_tag", "party_orientation")
        house_types = _first_value_lookup(parliaments_info, "parliament_tag", "house_type")

        person_file = Path(input_path) / corpus / f"{corpus}.TEI" / f"{corpus}-listPerson.xml"
        root = ET.parse(person_file).getroot()

        results = {"name_id": [], "name": [], "gender": [], "term_start": [], "roles": [], "parties": [], "party_orientation": [], "legislative_branch": []}

        for elem in root.findall("tei:person", namespace):
            person_id = elem.attrib[id_attribute]
            # NOTE(review): the "GB-" prefix is applied to every corpus, not
            # only ParlaMint-GB. Kept as-is because changing it would alter the
            # exported identifiers - confirm whether a per-corpus prefix is intended.
            results["name_id"].append(f"GB-{person_id}")

            sex = elem.find("tei:sex", namespace)
            results["gender"].append(sex.attrib.get("value", "nan") if sex is not None else "nan")

            # Affiliations carry the party/parliament reference, the role and the dates.
            affiliations = [affil.attrib for affil in elem.findall("tei:affiliation", namespace)]
            results["term_start"].append(affiliations[0].get("from", "nan") if affiliations else "nan")

            roles = set()
            party_ids = set()
            branch_ids = set()
            for affiliation in affiliations:
                ref = affiliation.get("ref")
                if ref is not None:
                    # Refs containing "party" are treated as parties, all others
                    # as candidate parliaments; the leading '#' is removed.
                    (party_ids if "party" in ref else branch_ids).add(ref[1:])
                role = affiliation.get("role")
                if role is not None:
                    roles.add(role)

            results["roles"].append(next((role for role in role_priority if role in roles), "nan"))

            ordered_party_ids = sorted(party_ids)
            results["parties"].append([party_names.get(party_id, "nan") for party_id in ordered_party_ids])
            results["party_orientation"].append([party_orientations.get(party_id, "nan") for party_id in ordered_party_ids])

            # Refs that are not a known parliament are left out.
            results["legislative_branch"].append(
                [house_types[branch_id] for branch_id in sorted(branch_ids) if branch_id in house_types]
            )

            # Full name of the individual; empty name parts are skipped.
            forename = " ".join(name.text for name in elem.findall("tei:persName/tei:forename", namespace) if name.text)
            surname = " ".join(name.text for name in elem.findall("tei:persName/tei:surname", namespace) if name.text)
            results["name"].append(f"{forename}, {surname}")

        metadata = pd.DataFrame(results)
        metadata.to_csv(Path(export_path) / f"{corpus}-metadata.csv", index=False)
        exported[corpus] = metadata

    return exported

        
# Export the metadata CSV for every corpus listed below.
# Single-corpus alternative for a quick run:
#all_corpus = ["ParlaMint-GR"]
all_corpus = [
    "ParlaMint-AT",
    "ParlaMint-BA",
    "ParlaMint-BE",
    "ParlaMint-BG",
    "ParlaMint-CZ",
    "ParlaMint-DK",
    "ParlaMint-EE",
    "ParlaMint-ES",
    "ParlaMint-ES-CT",
    "ParlaMint-ES-GA",
    "ParlaMint-ES-PV",
    "ParlaMint-FI",
    "ParlaMint-FR",
    "ParlaMint-GR",
    "ParlaMint-HR",
    "ParlaMint-HU",
    "ParlaMint-IS",
    "ParlaMint-IT",
    "ParlaMint-LV",
    "ParlaMint-NL",
    "ParlaMint-NO",
    "ParlaMint-PL",
    "ParlaMint-PT",
    "ParlaMint-RS",
    "ParlaMint-SE",
    "ParlaMint-SI",
    "ParlaMint-TR",
    "ParlaMint-UA",
    "ParlaMint-GB",
]
# Source root (one sub-folder per corpus) and destination folder for the CSV files.
input_path = "D:\\ParlaMint Data\\Files"
export_path = "D:\\ParlaMint Data\\MetaData"

full_metadata = country_metadata(
    all_corpus,
    input_path,
    export_path,
    taxonomy_base,
)

Getting the MetaData for corpus ParlaMint-AT at 2024-05-22 16:53:15.803046.
Getting the MetaData for corpus ParlaMint-BA at 2024-05-22 16:53:17.185577.
Getting the MetaData for corpus ParlaMint-BE at 2024-05-22 16:53:17.539492.
Getting the MetaData for corpus ParlaMint-BG at 2024-05-22 16:53:18.187573.
Getting the MetaData for corpus ParlaMint-CZ at 2024-05-22 16:53:19.366351.
Getting the MetaData for corpus ParlaMint-DK at 2024-05-22 16:53:23.552652.
Getting the MetaData for corpus ParlaMint-EE at 2024-05-22 16:53:23.972862.
Getting the MetaData for corpus ParlaMint-ES at 2024-05-22 16:53:24.237659.
Getting the MetaData for corpus ParlaMint-ES-CT at 2024-05-22 16:53:25.044197.
Getting the MetaData for corpus ParlaMint-ES-GA at 2024-05-22 16:53:25.550425.
Getting the MetaData for corpus ParlaMint-ES-PV at 2024-05-22 16:53:25.766046.
Getting the MetaData for corpus ParlaMint-FI at 2024-05-22 16:53:26.308479.
Getting the MetaData for corpus ParlaMint-FR at 2024-05-22 16:53:26.669939.
Get

In [228]:
# Convert the semicolon-separated test/training CSV files to Parquet.
# The first CSV column is read as the index (index_col=0) and is not written
# to the Parquet files (index=False).
# NOTE(review): the paths are absolute and specific to one user's machine.
test_set = pd.read_csv("C:\\Users\\shad4\\Downloads\\test_set.csv", sep = ";", index_col=0)
training_set = pd.read_csv("C:\\Users\\shad4\\Downloads\\training_set.csv", sep = ";", index_col=0)

test_set.to_parquet("C:\\Users\\shad4\\Downloads\\test_set.parquet", index=False)
training_set.to_parquet("C:\\Users\\shad4\\Downloads\\training_set.parquet", index=False)
# Bare last expression: the notebook renders the training frame as the cell output.
training_set

Unnamed: 0,TEXT,CLASS
1,"як товар. я вважаю, що вищі посадові особи дер...",2
2,". тому ми проголосували так, щоб були всі прис...",3
3,"zadovoljni. razlika pa je v tem, v kakšni meri...",3
4,", the security of the process of calling an el...",2
5,", s sorazmerno majhnimi nakladami – primerjaln...",1
...,...,...
697,"nedopustno, če se s tem imenovanje sodnikov po...",2
698,"ministru oporekati? tisti, ki smo oporekali, s...",4
699,на місцеві вибори. це відбувається в колисці п...,3
700,kako uspešno zaključiti naloge hvala. gospod p...,3
