In [10]:
import io
import re
from collections import Counter

import mysql.connector
import pdfplumber
import requests
from spacy import load

In [18]:
def get_text_from_pdf(url: str):
    """Extract text from a pdf from a given url. Returns a string"""

    response = io.BytesIO(requests.get(url).content)
    return get_text_from_bytes(response)

def get_text_from_bytes(bytes: io.BytesIO):
    """Extract text from a pdf in a BytesIO format. Returns a string"""

    text = ''

    with pdfplumber.open(bytes) as pdf:
        for page in pdf.pages:
            #print(page.page_number)
            text += page.extract_text()
    return text

def get_speech_positions(text: str):
    """Gets all positions of speeches in the text. A speech is defined as the text between two time specifications."""
    matches = re.finditer('[0-2]?[0-9]\.[0-5][0-9] \n', text)
    matches_list = [item for item in matches]
    # TODO: muss da -1?
    positions = [(matches_list[idx].end(), matches_list[idx + 1].start()) for idx in range(0, len(matches_list) - 1, 2)]
    return positions

def prepare_text(text: str, positions: list((int, int))):
    all_text = ''
    # Load spacy model for stopwords
    nlp = load('de_core_news_sm')
    for pos in positions:
        # Get speech between the specified time
        speech = text[pos[0]:pos[1]]

        # Remove headers
        speech = re.sub(' \d*? .* Nationalrat, ?.* ?\n?.*', '', speech)
        speech = re.sub('Nationalrat, ?.* ?\n?.*', '', speech)

        # Merge words separated by a line-break
        speech = re.sub('- ?\n', '', speech)
        speech = re.sub('\n', '', speech)

        # Remove the beginnging of the string e.g. "Abgeordneter Jon Doe (Blue party):"
        speech = re.sub('Abgeordneter \D*: ', '', speech)
        speech = re.sub('Abgeordnete \D*: ', '', speech)

        # Remove stop words
        result = [word for word in speech.split() if word not in nlp.Defaults.stop_words]
        speech = ' '.join(result)

        # Remove punctuation
        speech = re.sub(r'[^\w\s]+','', speech)

        all_text += speech

    return all_text

In [19]:
url = 'https://www.parlament.gv.at/PAKT/VHG/XXVII/NRSITZ/NRSITZ_00113/fname_1009194.pdf'

text = get_text_from_pdf(url)
positions = get_speech_positions(text)
all_text = prepare_text(text, positions)


In [12]:
db = mysql.connector.connect(
    host="localhost",
    user="htl",
    password="insy",
    database='politics'
)

cursor = db.cursor()

In [None]:
#cursor.execute('CREATE TABLE WORD_COUNTS (WORD VARCHAR(255), COUNT INTEGER)')

In [21]:
cursor.execute('TRUNCATE TABLE WORD_COUNTS')
db.commit()

In [22]:
c = Counter(all_text.split())
#list(c.items())

In [23]:
sql = 'INSERT INTO WORD_COUNTS (WORD, COUNT) VALUES (%s, %s)'
values = list(c.items())

cursor.executemany(sql, values)

db.commit()

print(cursor.rowcount, "lines were inserted")

6964 lines were inserted
