# Data import from the "Congreso de los Diputados" website

This process focuses on extracting the political debates that took place in the Congress and tagging every intervention with the name of the politician who made it. The main objective is to build a corpus of political profiles that can be used to train an ML Transformer.

In [26]:
# installers
%pip install alive_progress

3485.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [27]:
# Imports
from html.parser import HTMLParser
import re
import pandas as pd

# constants
#page = 'https://www.congreso.es/busqueda-de-publicaciones?p_p_id=publicaciones&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_publicaciones_mode=mostrarTextoIntegro&_publicaciones_legislatura=XII&_publicaciones_id_texto=(DSCD-12-PL-4.CODI.)#(P%C3%A1gina12)'



In [28]:
# functions

"""
A simple parser to extract all the text of a publication from the body.
It removes any internal script and removes special characters with the str.strip function.
It also gets rid of pagination (Página nnn)
"""
class PublicationParser(HTMLParser):
    """Collect the text found inside the ``<body>`` of a publication page.

    While the document is fed to the parser, every text chunk that is
    inside ``<body>`` and outside ``<script>`` is stripped of surrounding
    whitespace and appended to ``lsData``. Chunks that are blank after
    stripping and pagination markers (text starting with ``Página `` or
    ``(Página``) are dropped.

    The tag names and comments reported by the parser are also recorded in
    ``lsStartTags``, ``lsEndTags``, ``lsStartEndTags`` and ``lsComments``.
    """

    # Prefixes that identify a pagination marker such as "Página 12"
    _PAGINATION_PREFIXES = ('Página ', '(Página')

    def __init__(self):
        super().__init__()
        # Names of the tags/comments seen, in document order
        self.lsStartTags = []
        self.lsEndTags = []
        self.lsStartEndTags = []
        self.lsComments = []
        # Text chunks kept from the body, already stripped
        self.lsData = []
        # True while the parser is between <body> and </body>
        self.inBody = False
        # True while the parser is between <script> and </script>
        self.inScript = False

    # HTMLParser callbacks
    def handle_starttag(self, startTag, attrs):
        """Record the opening tag and raise the body/script markers."""
        self.lsStartTags.append(startTag)
        if startTag == "body":
            self.inBody = True
        if startTag == "script":
            self.inScript = True

    def handle_endtag(self, endTag):
        """Record the closing tag and lower the body/script markers."""
        self.lsEndTags.append(endTag)
        if endTag == "body":
            self.inBody = False
        elif endTag == "script":
            self.inScript = False

    def handle_startendtag(self, startendTag, attrs):
        """Record a self-closing tag; it never changes the markers."""
        self.lsStartEndTags.append(startendTag)

    def handle_comment(self, data):
        """Record the content of an HTML comment."""
        self.lsComments.append(data)

    def handle_data(self, data):
        """Keep a text chunk if it is visible body text and not pagination."""
        if not self.inBody or self.inScript:
            return
        # Normalise BEFORE filtering: whitespace-only nodes must not end up
        # as empty strings in lsData, and a page marker preceded by a line
        # break or indentation must still be recognised.
        if data.lstrip().startswith(self._PAGINATION_PREFIXES):
            return
        chunk = data.strip()
        if chunk:
            self.lsData.append(chunk)

           


## Main body extractor

Gets the body and finds the start of the debate by looking for the first occurrence of the word PRESIDENT*. When someone speaks, their name appears in capital letters, and for the president of the chamber the words PRESIDENTE or PRESIDENTA are used, so this first occurrence usually marks the start of the interventions.

In [29]:
def findStart(ls:list)->int:
    """Return the position of the first entry containing ``PRESIDENT``.

    Args:
        ls: the text chunks to scan, in document order.

    Returns:
        The zero-based position of the first matching entry, or -1 when no
        entry contains the marker.
    """
    matching_positions = (
        position for position, entry in enumerate(ls) if "PRESIDENT" in entry
    )
    return next(matching_positions, -1)

def getPublicationText(url):
    """Download a publication and return its text from the debate start on.

    The page is fetched with urllib3, decoded as UTF-8 and parsed with
    ``PublicationParser``. The text chunks from the first one containing
    ``PRESIDENT`` (see ``findStart``) to the end are joined with a single
    space.

    Args:
        url: address of the publication page.

    Returns:
        The joined text of the debate.

    Raises:
        ValueError: if no text chunk contains ``PRESIDENT``. Previously this
            case printed a message and then failed with ``UnboundLocalError``
            because the result variable was never assigned.
    """
    import urllib3

    # Get the publication
    http = urllib3.PoolManager()
    response = http.request('GET', url)

    # Parse the publication; close() flushes any text still buffered by the
    # parser when the document does not end with a tag.
    parser = PublicationParser()
    parser.feed(response.data.decode('utf-8'))
    parser.close()

    index = findStart(parser.lsData)
    if index < 0:
        raise ValueError(f"Error: PRESIDENT* not found in publication {url}")

    # lsData is a list of strings, so we join them all with a space
    return ' '.join(parser.lsData[index:])

# Finds the tag that introduces a speaker. The tag usually has the form
#   [text]... ALL CAPS SURNAME (the title if president or candidate):
# i.e. a run of capital letters (commas, hyphens and single spaces allowed),
# optionally followed by a parenthesised remark, and closed by a colon.
# The parenthesised part is a single character class on purpose: the nested
# form `(?:[letters]*\s?)*` matches the same strings but backtracks
# exponentially when the parenthesis is never closed.
_SPEAKER_TAG = re.compile(
    r'(?:(?:[A-ZÀ-Ü,])(?:-|\s)?)+(?:\s*\([A-ZÀ-Üa-zà-ü\s-]*\))?:',
    re.U | re.M,
)


def get_speeches(text:str)->pd.DataFrame:
    """Split the text of a publication into one row per intervention.

    Each speaker tag found in ``text`` opens an intervention. Its ``Name``
    is the tag without the trailing colon. Its ``Text`` starts one character
    after the colon and, for every intervention but the last, stops just
    before the last ``.`` or ``)`` that precedes the next speaker tag, which
    cuts off the words leading into the next tag. The last intervention runs
    to the end of ``text``.

    Args:
        text: the joined text of a publication.

    Returns:
        A DataFrame with the columns ``Name`` and ``Text``, one row per
        speaker tag, in document order. It is empty when no tag is found.
    """
    tag_spans = [match.span() for match in _SPEAKER_TAG.finditer(text)]

    rows = []
    for position, (tag_start, tag_end) in enumerate(tag_spans):
        # Drop the colon that closes the tag
        name = text[tag_start:tag_end - 1]
        # Skip the character (normally a space) that follows the colon
        body_start = tag_end + 1

        if position + 1 < len(tag_spans):
            next_tag_start = tag_spans[position + 1][0]
            # Last sentence/remark terminator inside this intervention.
            # The search is bounded to the intervention itself, so it can
            # never run off the start of the text; when there is no
            # terminator the intervention is empty.
            cut = max(
                text.rfind('.', body_start, next_tag_start),
                text.rfind(')', body_start, next_tag_start),
            )
            body = text[body_start:cut] if cut >= 0 else ''
        else:
            body = text[body_start:]

        rows.append((name, body))

    # Build the frame once; appending row by row copies it on every insert
    return pd.DataFrame(rows, columns=['Name', 'Text'])


In [31]:
from alive_progress import alive_bar
import os

# Import every publication listed for term 5: load its text from the page
# cache (downloading and caching it on a miss), split it into speeches and
# gather all the speeches in `speeches_ds`.
term5 = pd.read_csv('data/terms/term_5.csv')

# Stays None when the term file lists no publications
speeches_ds = None
# One DataFrame per publication; merged once after the loop instead of
# re-copying the accumulated frame on every iteration
speech_frames = []

with alive_bar(len(term5),title='importing Term 5',force_tty=True) as bar:
    for _, r in term5.iterrows():
        cache_dir = f'data/pagecache/{r["term"]}'
        cache_path = f'{cache_dir}/{r["fecha"]}.txt'

        if os.path.isfile(cache_path):
            # c- : the text comes from the cache
            bar.text=f'c-{r["term"]}-{r["fecha"]}'
            # Explicit encoding: the platform default is not UTF-8 everywhere
            # NOTE(review): cache files written earlier on a non-UTF-8
            # platform must be regenerated.
            with open(cache_path,'r',encoding='utf-8') as f:
                text=f.read()
        else:
            # u- : the text is downloaded from the URL
            bar.text=f'u-{r["term"]}-{r["fecha"]}'
            text=getPublicationText(r["url"])
            os.makedirs(cache_dir,exist_ok=True)
            with open(cache_path,mode='w',encoding='utf-8') as f:
                f.write(text)

        bar.text=f'Parse {r["term"]}-{r["fecha"]}'
        speeches=get_speeches(text)
        speeches["Date"]=r["fecha"]
        speeches["Term"]=r["term"]
        speech_frames.append(speeches)
        bar()

if speech_frames:
    speeches_ds=pd.concat(speech_frames)


importing Term 5 |⚠︎                                       | (!) 0/197 [0%] in 18.3s (0.00/s)                            


KeyboardInterrupt: 

In [None]:
# Write the collected speeches to a CSV file, leaving out the DataFrame index
speeches_ds.to_csv('data/speeches_termV.csv',index=False)

In [None]:
# term5 = pd.read_csv('data/terms/term_5.csv')

# for index,r in term5.iterrows():    
#     text=getPublicationText(r["url"])
#     speeches=get_speeches(text)
#     if(index>1):
#         break

# speeches.head()