In [1]:
import os
import pandas as pd 
import json 

from src.textual_features import exact_keyword_search

# Exact keyword search for paragraphs from BauNVO & BauGB

## Prepare data

- Change the file paths in the code block below to point to the input data (and to the desired output location).
- Specify the relevant column names. The search function used below expects the input data frame to have at least two columns: an id column and a text-content column. Here they are called `filename` and `content`; if your columns are named differently, change the column names in the code below.


In [4]:
# File locations, built from relative path components (resolved against the
# directory the notebook is run from).
INPUT_FILE_PATH = os.path.join("data", "proc", "building_plans", "bp_text.json")
OUTPUT_FILE_PATH = os.path.join("data", "nrw", "bplan", "features", "keywords", "exact_search", "exact_search.csv")

# Names of the id and text columns that are passed to the keyword search below.
ID_COLUMN = 'filename'
TEXT_COLUMN = 'content'

# Read the input data into a data frame.
input_df = pd.read_json(INPUT_FILE_PATH)

# Fail fast with a clear message if the configured columns are not present,
# instead of surfacing an opaque error later inside the keyword search.
_missing_columns = [col for col in (ID_COLUMN, TEXT_COLUMN) if col not in input_df.columns]
if _missing_columns:
    raise KeyError(f"{INPUT_FILE_PATH} is missing required column(s): {_missing_columns}")


In [5]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1946 non-null   object
 1   content   1063 non-null   object
 2   metadata  1886 non-null   object
dtypes: object(3)
memory usage: 45.7+ KB


In [6]:
# Display the loaded frame; for a long frame the notebook renders a truncated preview.
input_df

Unnamed: 0,filename,content,metadata
0,10.pdf,,"{'Content-Type': ['application/pdf', 'image/ti..."
1,10131.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': 'KrapM', 'Content-Type': ['applicat..."
2,10197.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
3,10272.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': ['application/pdf', 'image/jp..."
4,10273.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': ['application/pdf', 'image/jp..."
...,...,...,...
1941,961.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': 'RoGeb', 'Content-Type': ['applicat..."
1942,9709.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Content-Type': 'application/pdf', 'Creation-..."
1943,9738.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': '', 'Content-Type': ['application/p..."
1944,9810.pdf,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'Author': 'ROWE', 'Content-Type': ['applicati..."


## Define keyword dictionary

Keywords are specified in a separate JSON file so that the exact keyword search can easily be applied to different sets of keywords, simply by reading in the relevant dictionary. The dictionary is structured so that each keyword category (e.g. baunvo-1) contains one or more keywords, any of which marks the category as covered (e.g., "§1 baunvo", "1 baunvo", or "allgemeine vorschriften für bauflächen und baugebiete").


In [7]:
# Load the keyword dictionary (category -> keywords) from its JSON file.
# The encoding is given explicitly because the keywords include German text with
# umlauts and open() otherwise falls back to a platform-dependent default encoding.
# Assumes the file is saved as UTF-8, the standard encoding for JSON.
with open('dictionaries/keyword_dict_baunvo.json', encoding='utf-8') as f:
    BAUNVO_KEYWORDS = json.load(f)

## Apply function

The function performs exact keyword matching based on the input dictionary and returns a data frame showing, for each PDF, which keyword appeared in each category.

In [8]:
# Run the exact keyword search over the text column of the input frame, using the
# id column to identify each document and the loaded dictionary as the keyword set.
result_df = exact_keyword_search.search_df_for_keywords(
    input_df=input_df,
    text_column_name=TEXT_COLUMN,
    id_column_name=ID_COLUMN,
    keyword_dict=BAUNVO_KEYWORDS,
)

We can explore the results by looking at the output data frame:

In [9]:
# Preview the first rows of the search result (id column plus one column per keyword category).
result_df.head()

Unnamed: 0,filename,baunvo-1,baunvo-2,baunvo-3,baunvo-4,baunvo-4a,baunvo-5,baunvo-5a,baunvo-6,baunvo-6a,...,baunvo-14,baunvo-15,baunvo-16,baunvo-17,baunvo-18,baunvo-19,baunvo-20,baunvo-21,baunvo-21a,13b
0,10.pdf,,,,,,,,,,...,,,,,,,,,,
1,10131.pdf,,,,,,,,,,...,,,,,,,,,,
2,10197.pdf,,,,,,,,,,...,,,,,,,,,,
3,10272.pdf,,,,,,,,,,...,,,,,,,,,,
4,10273.pdf,,,,,,,,,,...,,,,,,,,,,


And inspect keyword coverage across all files.

In [10]:
# count() reports the number of non-null entries per column, so each category's value is
# the number of files with a match recorded for it (assumes unmatched cells stay null — verify).
result_df.count()

filename      1946
baunvo-1       247
baunvo-2       248
baunvo-3       248
baunvo-4       250
baunvo-4a       49
baunvo-5       249
baunvo-5a       15
baunvo-6       248
baunvo-6a       28
baunvo-7       233
baunvo-8       237
baunvo-9       235
baunvo-10      228
baunvo-11      219
baunvo-12      241
baunvo-13      193
baunvo-13a      62
baunvo-14      202
baunvo-15      206
baunvo-16      196
baunvo-17      175
baunvo-18      186
baunvo-19      262
baunvo-20      270
baunvo-21      156
baunvo-21a     219
13b             25
dtype: int64