In [1]:
import pandas as pd 
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
import time
from tqdm import tqdm 
import random

- el punto más a la izquierda: 4289024.0,5534720.0,4291072.0,5536768.0
- el punto más a la derecha: 4634304.0,5388480.0,4634560.0,5388736.0

- el punto más abajo: 4369408.0,5268480.0,4371456.0,5270528.0
- el punto más arriba: 4634304.0,5388480.0,4634560.0,5388736.0

In [2]:
def parse_url(url, timeout=None):
    '''
    Downloads a page with an HTTP GET request and parses the response body as HTML.

    Args:
        url : address to request.
        timeout : value forwarded to requests.get() as its timeout (seconds). The default
            of None places no limit on the wait, which is also what happens when the
            argument is omitted; pass a number to stop a stalled connection from
            blocking the caller forever.

    Returns:
        soup : BeautifulSoup object built from the response text with 'html.parser'.

    Raises:
        HTTPError : when raise_for_status() rejects the response status code.
    '''
    response = requests.get(url, timeout=timeout)

    # Fail loudly on error status codes instead of parsing an error page.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

def scrape_all_table_rows(soup):
    '''
    Collects the cell texts of every row of the table found in a parsed page.

    Args:
        soup : parsed document; soup.find('table') is used to locate the table.

    Returns:
        rows : list with one entry per 'tr' element of the table. Each entry is the
            list of the .text values of that row's 'td' elements (an empty list when
            the row has no 'td').

    Raises:
        AttributeError : when soup.find('table') returns None, because find_all() is
            then called on None.
    '''
    found_table = soup.find('table')

    return [
        [cell.text for cell in table_row.find_all('td')]
        for table_row in found_table.find_all('tr')
    ]

def split_list_by_value(lst,
                        value = 'Auskunft Bauleitplanung (Bebauungsplan)'):

    '''
    Cuts a list into consecutive groups, starting a new group at every separator item.

    An item counts as a separator when it is equal to the one-element list [value].
    Separator items themselves are dropped, and groups that would be empty
    (leading, trailing or consecutive separators) are not returned.

    Args:
        lst : list to cut into groups.
        value : value that, wrapped in a one-element list, marks a group boundary.

    Returns:
        sublists : list of the non-empty groups, in their original order.
    '''
    separator = [value]
    groups = [[]]

    for entry in lst:
        if entry == separator:
            # Open a fresh group; empty ones are filtered out below.
            groups.append([])
            continue
        groups[-1].append(entry)

    return [group for group in groups if group]

def convert_results_to_dictionary(result):
    '''
    Builds a dictionary from a list of rows, using each row's first item as the key
    and its second item as the value.

    Items after the second one are ignored. When a key appears more than once,
    the value of the last occurrence is kept.

    Args:
        result : list of indexable rows holding at least two items each.

    Returns:
        result_dict : dictionary mapping row[0] to row[1] for every row.

    Raises:
        IndexError : when a row holds fewer than two items.
    '''
    return {pair[0]: pair[1] for pair in result}

def convert_list_of_results_to_dataframe(list_of_rows):
    '''
    Turns groups of key-value rows into a dataframe with one row per group.

    Each group is converted with convert_results_to_dictionary() and the resulting
    dictionaries are handed to pd.DataFrame.

    Args:
        list_of_rows : list of groups, each group being a list of rows that
            convert_results_to_dictionary() accepts.

    Returns:
        result_data : dataframe built from the list of dictionaries.
    '''
    records = list(map(convert_results_to_dictionary, list_of_rows))

    return pd.DataFrame(records)

def scrape_geoservices_api(url):
    '''
    Runs the full scraping pipeline for one URL and returns the result as a dataframe.

    The page is fetched and parsed with parse_url(), its table rows are read with
    scrape_all_table_rows(), the rows are grouped with split_list_by_value() using
    'Auskunft Bauleitplanung (Bebauungsplan)' as separator, and the groups are
    converted with convert_list_of_results_to_dataframe().

    Args:
        url : address to scrape.

    Returns:
        data : dataframe with one row per group of table rows.
    '''
    table_rows = scrape_all_table_rows(parse_url(url))

    grouped_rows = split_list_by_value(table_rows, 'Auskunft Bauleitplanung (Bebauungsplan)')

    return convert_list_of_results_to_dataframe(grouped_rows)

In [3]:
def generate_ranges(bbox, step=1000):
    '''
    Takes as input coordinates of a bounding box and yields the step x step tiles that cover it.

    The box corners are truncated to integers with int(). Tiles are produced column by
    column (x outer, y inner); tiles on the upper edges are clipped to the box, so
    they can be smaller than step x step.

    Arguments:
        bbox : tuple of four coordinates in the order (min_x, min_y, max_x, max_y).
        step : edge length of each tile as a positive integer. Defaults to 1000.

    Returns:
        Generator object yielding sub-boxes as (x, y, x_end, y_end) integer tuples.

    Raises:
        ValueError : when step is not greater than zero (raised on first iteration).
    '''
    if step <= 0:
        raise ValueError('step must be a positive integer')

    min_x, min_y, max_x, max_y = bbox

    # Convert the corners once instead of on every iteration.
    x_start, y_start = int(min_x), int(min_y)
    x_stop, y_stop = int(max_x), int(max_y)

    for x in range(x_start, x_stop, step):
        x_end = min(x + step, x_stop)
        for y in range(y_start, y_stop, step):
            y_end = min(y + step, y_stop)
            yield (x, y, x_end, y_end)

# Overall bounding box to split into sub-boxes, written in the order
# generate_ranges() unpacks it: (min_x, min_y, max_x, max_y).
# NOTE(review): presumably in the EPSG:31468 coordinates used by the request URL — confirm.
bounding_box = (4195669.333333333, 4998144, 4724053.333333333, 5766144)

In [4]:
def generate_sub_bboxes(bounding_box):

    '''
    Splits a bounding box with generate_ranges() and formats every sub-box as a string.

    Arguments:
        bounding_box : tuple of coordinates accepted by generate_ranges().

    Returns:

        boxes : list of strings, one per sub-box, each holding the sub-box
            coordinates joined by commas.
    '''

    return [
        ",".join(str(coordinate) for coordinate in sub_bbox)
        for sub_bbox in generate_ranges(bounding_box)
    ]

In [6]:
# Build the candidate sub-boxes here so this cell runs on a fresh kernel instead of
# relying on a `boxes` variable that no cell in the notebook defines.
boxes = generate_sub_bboxes(bounding_box)

# Fixed seed so the same boxes are drawn on every run.
random.seed(12)

boxes_sample = random.sample(boxes, 1000)

In [7]:
max_retries = 5     # maximum number of attempts per bounding box
retry_delay = 30    # seconds to wait after an HTTP error before retrying the same box
request_pause = 10  # seconds to wait before moving on to the next box

# Boxes whose request still failed after max_retries attempts, kept for inspection / re-run.
failed_boxes = []

for box in tqdm(boxes_sample):

    url = 'https://geoservices.bayern.de/mapserver4bauleitbvv/bauleitplan_intern?VERSION=1.1.1&REQUEST=GetFeatureInfo&SRS=EPSG:31468&LAYERS=bplan_rechtskraft_lvg&STYLES=&BBOX='+box+'&WIDTH=4&HEIGHT=4&QUERY_LAYERS=bplan_rechtskraft_lvg&X=2&Y=2&FORMAT=image/png&INFO_FORMAT=text/html&FEATURE_COUNT=5000&EXCEPTIONS=application/vnd.ogc.se_xml'

    # The attempt counter restarts for every box, so one bad box cannot use up
    # the retry budget of the boxes that follow it.
    for attempt in range(1, max_retries + 1):

        try:

            data = scrape_geoservices_api(url)

        except AttributeError:

            # scrape_all_table_rows() raises this when the page contains no table:
            # there is nothing to save for this box and retrying would not help.
            break

        except HTTPError:

            if attempt == max_retries:

                # Out of attempts: remember the box instead of dropping it silently.
                failed_boxes.append(box)

            else:

                time.sleep(retry_delay)

            continue

        data.to_csv(f'data/geoservices_results/bounding_box_{box}.csv', index=False)

        break

    # Pause between boxes to keep the request rate low.
    time.sleep(request_pause)


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [3:16:20<00:00, 11.78s/it] 
