In [1]:
import os
import pandas as pd 
import json 

from src.textual_features import exact_keyword_search

# Exact keyword search for paragraphs from BauNVO & BauGB

## Prepare data

- Change the input file path (`INPUT_FILE_PATH`) in the code block below so that it points to the data to read in.
- Specify the relevant column names. The search function used below expects the input data frame to have at least two columns: an id column and a text column. Here, these columns are called `filename` and `content`. If your columns are named differently, change `ID_COLUMN` and `TEXT_COLUMN` in the code below.


In [2]:
# File locations (relative paths, resolved against the current working directory).
INPUT_FILE_PATH = os.path.join("data", "proc", "building_plans", "bp_text.json")
OUTPUT_FILE_PATH = os.path.join("data", "nrw", "bplan", "features", "keywords", "exact_search", "exact_search.csv")

# Names of the id column and the text column in the input data;
# adjust these if the columns are named differently.
ID_COLUMN='filename'
TEXT_COLUMN='content'

# Read in the data.
input_df = pd.read_json(INPUT_FILE_PATH)

# Fail fast with a readable message if a configured column is absent,
# instead of hitting a less obvious error later inside the keyword search.
missing_columns = [col for col in (ID_COLUMN, TEXT_COLUMN) if col not in input_df.columns]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in {INPUT_FILE_PATH}; "
        f"available columns: {list(input_df.columns)}"
    )


## Define keyword dictionary

Keywords are specified in a separate JSON file so that the exact keyword search can easily be applied to different sets of keywords, simply by reading in the relevant dictionary. The dictionary is structured so that each keyword category (e.g., baunvo-1) contains one or more keywords, any of which marks the category as covered (e.g., "§1 baunvo", "1 baunvo", or "allgemeine vorschriften für bauflächen und baugebiete").


In [3]:
# Load the keyword dictionary (keyword category -> keywords).
# The encoding is passed explicitly because the keywords contain non-ASCII
# characters (e.g. "§", "ü"); without it, open() falls back to the platform's
# locale encoding, which is not UTF-8 everywhere and would garble the keywords.
# NOTE(review): assumes the file is stored as UTF-8, the JSON default - confirm.
with open('dictionaries/keyword_dict_baunvo.json', encoding='utf-8') as f:
    BAUNVO_KEYWORDS = json.load(f)


## Apply function

Run the exact keyword matching based on the input dictionary. The function returns a data frame showing, for each PDF and keyword category, which keyword appeared.

In [4]:
# Run the exact keyword search over the text column of every document,
# using the keyword dictionary loaded above the call site.
result_df = exact_keyword_search.search_df_for_keywords(
    input_df=input_df,
    text_column_name=TEXT_COLUMN,
    id_column_name=ID_COLUMN,
    keyword_dict=BAUNVO_KEYWORDS,
)

We can explore the results by looking at the first rows of the output data frame:

In [5]:
# Preview the first five rows of the search result.
result_df.head(n=5)

Unnamed: 0,filename,baunvo-1,baunvo-2,baunvo-3,baunvo-4,baunvo-4a,baunvo-5,baunvo-5a,baunvo-6,baunvo-6a,...,baunvo-14,baunvo-15,baunvo-16,baunvo-17,baunvo-18,baunvo-19,baunvo-20,baunvo-21,baunvo-21a,13b
0,10.pdf,,,,,,,,,,...,,,,,,,,,,
1,10131.pdf,,,,,,,,,,...,,,,,,,,,,
2,10197.pdf,,,,,,,,,,...,,,,,,,,,,
3,10272.pdf,,,,,,,,,,...,,,,,,,,,,
4,10273.pdf,,,,,,,,,,...,,,,,,,,,,


We can also inspect the keyword coverage across all files:

In [6]:
# Number of non-missing entries per column.
result_df.notna().sum()

filename      580
baunvo-1       32
baunvo-2       32
baunvo-3       33
baunvo-4       36
baunvo-4a       4
baunvo-5       32
baunvo-5a       3
baunvo-6       32
baunvo-6a       2
baunvo-7       32
baunvo-8       33
baunvo-9       32
baunvo-10      31
baunvo-11      31
baunvo-12      34
baunvo-13      22
baunvo-13a     10
baunvo-14      29
baunvo-15      28
baunvo-16      27
baunvo-17      22
baunvo-18      29
baunvo-19      38
baunvo-20      43
baunvo-21      26
baunvo-21a     34
13b             5
dtype: int64

In [9]:
# Show only the input rows that have text. dropna returns a new frame,
# so input_df itself is left unchanged. The configured TEXT_COLUMN is used
# rather than a hard-coded name so this cell keeps working when the text
# column is renamed in the configuration cell.
input_df.dropna(subset=[TEXT_COLUMN])

Unnamed: 0,filename,content,metadata
1,10131.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': 'KrapM', 'Content-Type': 'applicati..."
2,10197.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
3,10272.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
4,10273.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
6,10277.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
...,...,...,...
574,9570.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
575,961.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': 'RoGeb', 'Content-Type': 'applicati..."
576,9709.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
577,9738.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': '', 'Content-Type': 'application/pd..."
