# Download protocols via Bundestag API

API documentation: https://dip.bundestag.de/%C3%BCber-dip/hilfe/api

Unfortunately, the XML returned by the API is limited: each protocol is delivered as plain text only, without the structured markup found in the XML files that can be [manually downloaded](https://www.bundestag.de/services/opendata). The team [mentioned having fixed this issue](https://dip.bundestag.de/%C3%BCber-dip/neu#content) in an update in December 2023, but the fix does not seem to be available yet. It might be worth contacting them about it.

In [None]:
import os
# xml libraries
import xml.etree.ElementTree as ET

import requests

In [None]:
def download_protocols(start_date, end_date, verbose=False, data_dir="data/protocols", format="xml",
                       api_key=None, timeout=30):
    """Download Bundestag plenary protocols from the DIP API and save one file per protocol.

    The ``plenarprotokoll-text`` endpoint is queried page by page (cursor based)
    for protocols with ``f.zuordnung = "BT"`` between ``start_date`` and
    ``end_date``. Each ``document`` element of the response is written to
    ``<data_dir>/<format>/protocol_<number>.<format>``; the output directory is
    created if it does not exist.

    Args:
        start_date: Value sent as the ``f.datum.start`` query parameter.
        end_date: Value sent as the ``f.datum.end`` query parameter.
        verbose: If true, print progress information for every request and
            every saved file.
        data_dir: Base directory for the output files.
        format: ``"xml"`` stores the whole ``document`` element, ``"txt"``
            stores only the content of its ``text`` child. Any other value
            prints a message and returns without making a request.
        api_key: Key sent as the ``apikey`` query parameter. Defaults to the
            key that was embedded in the original notebook.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection cannot hang the download forever.

    Returns:
        None. Results are written to disk.

    Raises:
        requests.exceptions.RequestException: On connection problems or timeouts.
        xml.etree.ElementTree.ParseError: If a response body is not valid XML.
        OSError: If the output directory or a file cannot be written.
    """
    # Validate once, before any network or disk access. Previously an invalid
    # format only aborted the inner loop, so every page was still requested.
    if format not in ("xml", "txt"):
        print("Invalid format. Please use 'xml' or 'txt'.")
        return

    request_url = "https://search.dip.bundestag.de/api/v1/plenarprotokoll-text"
    if api_key is None:
        api_key = "I9FKdCn.hbfefNWCY336dL6x62vfwNKpoN2RZ1gp21"

    # Make sure the target directory exists; open() does not create it.
    out_dir = os.path.join(data_dir, format)
    os.makedirs(out_dir, exist_ok=True)

    cursor = "*"
    previous_cursor = None

    # Keep requesting data until the cursor is the same as the previous cursor
    while cursor != previous_cursor:
        previous_cursor = cursor

        if verbose:
            print(f"Requesting data with cursor {cursor}.")

        response = requests.get(
            request_url,
            params={
                "apikey": api_key,
                "f.zuordnung": "BT",
                "f.datum.start": start_date,
                "f.datum.end": end_date,
                "cursor": cursor,
                # The response is always parsed as XML, independent of the
                # on-disk format chosen by the caller.
                "format": "xml",
            },
            timeout=timeout,
        )

        if verbose:
            print(f"status {response.status_code}: {response.url}")

        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break

        # Convert the response to XML
        xml_doc = ET.fromstring(response.content)

        for doc in xml_doc.findall(".//document"):
            doc_number = doc.findtext("dokumentnummer")
            if not doc_number:
                print("Skipping a document without a document number.")
                continue

            # Replace "/" with "-" in the document number to make it safe for file name
            safe_doc_number = doc_number.replace("/", "-")

            if format == "xml":
                content = ET.tostring(doc, encoding="unicode")
                label = "XML"
            else:
                content = doc.findtext("text")
                label = "Text"
                if not content:
                    # Skip before opening the file so no empty file is left behind.
                    print(f"No text available for protocol {safe_doc_number}; skipping.")
                    continue

            file_path = os.path.join(out_dir, f"protocol_{safe_doc_number}.{format}")
            # Explicit UTF-8: the platform default encoding may not be able to
            # represent every character of the German protocol text.
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(content)
            if verbose:
                print(f"{label} file saved for protocol {safe_doc_number}.")

        # Update the cursor. Stop if the response carries none: a None cursor
        # would be dropped from the query and restart from the first page.
        next_cursor = xml_doc.findtext(".//cursor")
        if not next_cursor:
            print("No cursor found in the response. Stopping.")
            break
        cursor = next_cursor


In [None]:
# Fetch every protocol in the given date range and store each one as plain text,
# printing progress for each request and saved file.
download_protocols(
    start_date="2021-10-26",
    end_date="2024-05-29",
    format="txt",
    verbose=True,
)