[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jonasengelmann/erinnerungsluecken-im-nsu-untersuchungsausschuss/blob/master/Scraping_and_parsing_of_transcripts.ipynb)

In [None]:
import re
import subprocess
import pickle
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path

In [0]:
# One working directory per pipeline stage, all located inside the
# current working directory. Existing directories are reused as-is.

# Stage 1: downloaded transcript PDFs.
pdf_folder = Path.cwd().joinpath('01_pdfs')
pdf_folder.mkdir(exist_ok=True)

# Stage 2: XML files produced by scraping the PDFs.
scarping_result_folder = Path.cwd().joinpath('02_scraping_result')
scarping_result_folder.mkdir(exist_ok=True)

# Stage 3: parsed speaker/text output.
parsing_result_folder = Path.cwd().joinpath('03_parsing_result')
parsing_result_folder.mkdir(exist_ok=True)

## 1. Download transcripts as PDFs

Below, I selected the URLs of all transcript files that contain witness interrogations. You can find all transcript files on the [Bundestag's website](http://dipbt.bundestag.de/doc/btd/17/CD14600/Protokolle/).

In [0]:
# URLs of the selected transcripts. All of them share the same location and
# differ only in the session identifier after 'Protokoll-Nr%20' (some
# sessions are split into an 'a' and a 'b' file), so the list is built from
# the identifiers.
urls = [
    f'http://dipbt.bundestag.de/doc/btd/17/CD14600/Protokolle/Protokoll-Nr%20{session}.pdf'
    for session in (
        '12 14 15 17 19 21 22a 22b 24a 24b 27 29a 29b 31 32 34a 34b '
        '36 39 41 43 44 47 49a 49b 51 53 54 56a 56b 57 59a 59b 60 62 '
        '64a 64b 65 66a 66b 68a 68b 70a 70b 72a 72b 74'
    ).split()
]


In [0]:
# Fetch every transcript into the PDF folder. The local file name is the
# last path segment of the URL, taken verbatim.
for url in urls:
    print(f'Downloading {url}')
    urllib.request.urlretrieve(url, pdf_folder / url.rpartition("/")[2])

## 2. Scraping with pdfminer

To extract the content of the PDFs as text, we need to scrape them first. There are different scraping tools available; I decided to go with pdfminer, since it can preserve most of the layout information by generating an XML representation of each page. We want to preserve some of the layout information because it helps us to differentiate more easily between speakers, footnotes, quotes and so on.


In [0]:
# Installs pdfminer.six via pip (IPython shell escape, so this line only
# works inside a notebook). The scraping step of this notebook calls the
# `pdf2txt.py` command, which is presumably provided by this package.
# For python2 use: pip install pdfminer
!pip install pdfminer.six

In [0]:
%%time

# Scraping all pdfs might take a while, it took me around 10 mins. 

# Convert every downloaded PDF into an XML file of the same base name in
# the scraping result folder by running the pdf2txt.py command-line tool
# ('-t xml' selects the output type, '-o' the output file; see the
# pdf2txt.py help for '-n').
for filename in pdf_folder.glob('*.pdf'):
    target = scarping_result_folder / f'{filename.stem}.xml'
    print(f'Scraping {filename}')
    # Paths are handed over as plain strings so the call does not depend on
    # subprocess accepting path-like arguments on the running platform.
    return_code = subprocess.call([
        'pdf2txt.py',
        '-t',
        'xml',
        '-n',
        '-o',
        str(target),
        str(filename),
    ])
    # Report failed conversions right away; otherwise a missing or partial
    # XML file would only show up later as an obscure parsing error.
    if return_code != 0:
        print(f'WARNING: pdf2txt.py exited with status {return_code} for {filename}')

## 3. Parsing XMLs

In [0]:
def crop_vertically(document, crop_percentage):
    '''
    Keeps only the text elements inside a vertical band of each page.

    For every page, the fourth value of its 'bbox' attribute is taken as the
    page height reference. A child is kept if its tag is 'text' and the
    fourth value of its own 'bbox' lies strictly between
    crop_percentage * reference and (1 - crop_percentage) * reference.
    crop_percentage is a fraction, e.g. 0.07 for 7 percent.

    Returns a list with one list of kept elements per page.
    '''
    def _y1(element):
        # bbox is a comma-separated string; the fourth entry is used here.
        return float(element.attrib['bbox'].split(',')[3])

    cropped_pages = []
    for page in document:
        reference = _y1(page)
        lower_limit = reference * crop_percentage
        upper_limit = reference * (1 - crop_percentage)
        cropped_pages.append([
            element for element in page
            # The tag test comes first so that non-text children are never
            # asked for a bbox.
            if element.tag == 'text'
            and lower_limit < _y1(element) < upper_limit
        ])
    return cropped_pages


def check_if_characters_match_style(n_characters, font, size):
    '''
    Checks if all characters match a specified font and
    font size. n_characters has to be a list of text elements.

    font is searched case-insensitively inside each element's 'font'
    attribute (pass it in lower case); size has to be a prefix of the
    element's 'size' attribute. Characters that consist only of whitespace
    are ignored, so blanks inside a styled run do not break the match.

    Returns True if no printable character deviates from the style
    (this includes an empty list), otherwise False.
    '''
    for single_char in n_characters:
        matches_style = (font in single_char.attrib['font'].lower()
                         and single_char.attrib['size'].startswith(size))
        # Only printable characters may veto the match. Previously the test
        # was inverted: every non-whitespace character was accepted no matter
        # which style it had, so e.g. a long non-bold word passed as bold.
        if not matches_style and (single_char.text or '').strip():
            return False
    return True


def clean_text(text):
    '''
    Removes double space and hyphens resulting from linebreaks.

    Runs of spaces are collapsed into a single space. Afterwards every
    hyphen that stands between two word characters (optionally followed by
    spaces) is removed together with those spaces, e.g. 'Wort- bruch'
    becomes 'Wortbruch'. Note that this also joins regular hyphenated
    compounds such as 'a-b'.

    Returns the cleaned string.
    '''
    text = re.sub(' +', ' ', text)
    # The flag has to be passed by keyword: the fourth positional parameter
    # of re.sub is `count`, so a positional re.U silently limited the
    # substitution to the first 32 matches.
    return re.sub(r'(\w)- *(\w)', r'\1\2', text, flags=re.U)
  

def find_next_speaker_and_text(characters):
    '''
    Finds the next speaker and text on the basis that speaker are always
    written with bold font and in font size 9.

    characters has to be a list of text elements with 'font' and 'size'
    attributes. This is a generator yielding (speaker, text) tuples of
    strings; text is passed through clean_text. Whenever a new speaker is
    detected, the pair collected up to that point is yielded, so the very
    first pair has an empty speaker. After the last character the final
    pair is yielded as well, provided a speaker was found at all.
    '''
    speaker, text = [], []
    record_speaker = False
    last_index = len(characters) - 1
    for idx, char in enumerate(characters):

        # A new speaker starts where the next 10 characters are bold and in
        # font size 9. The cheap flag is tested first so the window check is
        # skipped while a name is already being recorded.
        if (not record_speaker
                and check_if_characters_match_style(characters[idx:idx+10], 'bold', '9')):

            # Hand out what was collected for the previous speaker.
            yield ''.join(speaker), clean_text(''.join(text))

            record_speaker = True
            speaker = []

        if record_speaker:
            if char.attrib['size'].strip().startswith('9'):
                speaker.append(char.text)

            # At the very last character there is no successor, so the
            # character itself is inspected instead.
            next_char = characters[idx+1] if idx != last_index else char

            # Check if it is the end of the speaker's name:
            if 'bold' not in next_char.attrib['font'].lower() and next_char.text.strip():
                record_speaker = False
                text = []

        elif char.attrib['size'].strip().startswith('9'):
            text.append(char.text)

    # Without this, the last utterance of every document was lost, because
    # pairs were only yielded when a following speaker was detected.
    if speaker:
        # While a name is still being recorded, `text` still holds the
        # previous (already yielded) utterance, so it must not be repeated.
        final_text = '' if record_speaker else clean_text(''.join(text))
        yield ''.join(speaker), final_text




# Collects (speaker, text) tuples from all scraped transcripts, processed in
# sorted file-name order.
protocol = []
for filename in sorted(scarping_result_folder.glob('*.xml')):

    # Crop bottom and top by 7 percent to discard headers and footers
    document = crop_vertically(ET.parse(filename).getroot(), crop_percentage=0.07)

    # Flatten the cropped pages into one sequence of text characters
    text_characters = [
        char
        for page in document
        for char in page
        if char.tag == 'text'
    ]

    # Parse content of XMLs; pairs without a speaker name are dropped
    protocol.extend(
        (speaker, text)
        for speaker, text in find_next_speaker_and_text(text_characters)
        if speaker.strip()
    )

In [0]:
# Save extracted data:
# - parsed_dialog.txt: one "<speaker> <text>" entry per utterance, entries
#   separated by a blank line. The encoding is fixed to UTF-8 so the result
#   does not depend on the platform's default encoding.
with open(parsing_result_folder / 'parsed_dialog.txt', 'w', encoding='utf-8') as output:
    for speaker, text in protocol:
        output.write(f'{speaker.strip()} {text.strip()}\n\n')

# - parsed_dialog.p: the list of (speaker, text) tuples as a pickle. The
#   file is opened in a with-block so it is flushed and closed reliably.
with open(parsing_result_folder / 'parsed_dialog.p', 'wb') as pickle_file:
    pickle.dump(protocol, pickle_file)

In [0]:
# In Colab you can download the parsed_dialog file like this:
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere. The argument is a
# pathlib.Path; confirm that files.download accepts path objects.
from google.colab import files
files.download(parsing_result_folder / 'parsed_dialog.txt')