In [1]:
import pandas as pd
import numpy as np
import json

from pdfs_to_text import pdfs_downloader, pdfs_preprocessing

# Downloading PDFs

In the first notebooks, we obtained the metadata about the building plans, including the links to their PDFs. This notebook shows how to use the function that takes that metadata as input and downloads the PDFs.

First, read the metadata:

In [2]:
# Location of the building-plan metadata file that is read in the next cell.
filename = "../data/raw/geoservices_results/formatted_map_data_section_A.jsonl"

In [3]:
# Load the metadata file into `data`, a list with one dict per JSON object.
#
# An object may span several lines, so stripped lines are accumulated and a
# parse is attempted whenever a line ends with "}".
data = []
pending = []  # stripped lines of the record currently being assembled
with open(filename, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # blank lines carry no content
        pending.append(line)
        if not line.endswith("}"):
            continue  # this line cannot be the end of an object
        buffer = "".join(pending)
        try:
            data.append(json.loads(buffer))
        except json.JSONDecodeError as e:
            if e.pos >= len(buffer):
                # The decoder ran out of input, so this "}" closed a nested
                # object rather than the whole record: keep accumulating
                # instead of throwing the half-read record away.
                # NOTE(review): a record that never gets its closing brace
                # will absorb the following lines until a hard error occurs.
                continue
            print(f"Error decoding JSON object: {buffer}")
            print(f"Error: {e}")
        # Parsed, or reported as malformed: either way start a fresh record.
        pending = []

# Text left over at end of file never formed a complete object; report it
# rather than dropping it silently.
if pending:
    print(f"Error decoding JSON object: {''.join(pending)}")
    print("Error: file ended before the JSON object was complete")


In [4]:
# Turn the list of parsed records into a DataFrame (one row per record).
data = pd.DataFrame(data=data)

In [5]:
# List the distinct plan types present in the metadata.
data["Planart"].unique()

array(['Qualifizierter BPlan', 'Entwicklungssatzung', nan,
       'Einfacher BPlan', 'Vorhabenbezogener BPlan', 'Ergänzungssatzung',
       'BPlan_Innenentwicklung'], dtype=object)

In [6]:
# Give every row a numeric ID; this column is later passed to the downloader
# as `id_column`.
# A seeded permutation of 1..len(data) keeps the IDs reproducible and, unlike
# independent random draws from a fixed range, guarantees that no two rows
# share an ID and that the range always matches the size of the data.
# NOTE(review): the resulting IDs differ from those produced by earlier runs
# of this notebook — re-download if files were saved under the old IDs.
np.random.seed(seed=912)
data['id'] = np.random.permutation(len(data)) + 1

In [7]:
# Keep only the three building-plan (BPlan) types; rows with any other plan
# type, or with a missing one, are dropped.
# The explicit copy makes the subset an independent frame, so code that later
# assigns columns on it does not operate on a slice of `data`.
bplan_types = ['Qualifizierter BPlan', 'Einfacher BPlan', 'Vorhabenbezogener BPlan']
bplan_data = data.loc[data['Planart'].isin(bplan_types)].copy()

- Set `id_column` to the name of the ID column.
- Set `link_column` to the name of the column that contains the links.
- Set `date_column` to the name of the column that contains the date of the building plans.
- Set `output_folder` to the name of the folder you want to save the data to.

The function also accepts the optional parameter `sample_n`, which can be used to download only a sample; it defines the number of observations to take.

In [8]:
# Download the PDFs linked from the filtered building-plan metadata.
# sample_n restricts the run to a sample of that many observations.
# NOTE(review): start_date / end_date presumably filter the plans on
# date_column — confirm against pdfs_downloader.
pdfs_downloader.run_pdf_downloader(
    input_df=bplan_data,
    id_column='id',
    link_column='URL zur Legende',
    date_column='Datum des Inkrafttretens',
    start_date='2001-01-01',
    end_date='2024-01-01',
    output_folder="../data/raw/building_plan_sample/pdfs",
    sample_n=100,
)

  return pd.to_datetime(date_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[date_column] = data[date_column].apply(parse_date)
100%|██████████| 100/100 [02:24<00:00,  1.45s/it]


Then, we run the function `run_pdfs_split`, which converts the PDFs into JPG images for the OCR.

In [10]:
# Convert the downloaded PDFs into images for the OCR step.
# NOTE(review): the input path uses "building_plan_sample" while the output
# path uses "building_plans_sample" — confirm the differing folder names are
# intended.
pdfs_preprocessing.run_pdfs_split(
    input_folder='../data/raw/building_plan_sample/pdfs',
    output_folder='../data/proc/building_plans_sample/split_pdf/',
)

100%|██████████| 81/81 [02:05<00:00,  1.55s/it]
