In [1]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import os
import time


In [2]:
def create_urls_table_blank(step = 10, overwrite=True,file_name="urls_table.csv"):
    '''
    Build the table of listing-page urls and save it as a CSV file.

    Every url requests `step` rows of the recipients list (``LaZ``) starting at
    one offset (``LsZ``); the offsets advance in steps of `step`. All rows start
    with DONE == 'NO' so that the download can be done bit by bit.

    INPUT:
        step: how many rows has the table displayed per url.
              For instance if you take step = 200 the pages will show 200 companies per page.
        overwrite: when False an already existing `file_name` is never replaced.
        file_name: path of the CSV file to write.
    RETURNS:
        DataFrame with the columns 'url' and 'DONE'.
    RAISES:
        ValueError: step is not positive.
        FileExistsError: overwrite is False and `file_name` already exists.
    '''
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step}")
    if not overwrite and os.path.exists(file_name):
        raise FileExistsError(f"There is already a file named {file_name}: delete it manually and run this again")

    total = 271605 # according to the web the total amount of entries is this one
    # Offsets 0, step, 2*step, ... below `total`, so the last (possibly shorter)
    # page is requested as well.
    # NOTE(review): assumes LsZ is the start offset of the page - confirm against the site.
    all_urls = [
        f"https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ={step}&LsZ={offset}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570"
        for offset in range(0, total, step)
    ]

    df_urls = pd.DataFrame({'url': all_urls, 'DONE': 'NO'})

    # index=False: otherwise reading the file back yields an extra "Unnamed: 0" column
    df_urls.to_csv(file_name, index=False)
    rows = len(df_urls)
    print(f"{file_name} created with {rows} rows, with every row being a table of {step} companies.")

    return df_urls


In [3]:
def get_remaining_urls(step=10,file_name="urls_table.csv"):
    '''
    Returns a list with all the urls still to do.

    INPUT:
        step: rows per page, only used when `file_name` does not exist yet and
              the url table has to be created first.
        file_name: CSV file with the columns 'url' and 'DONE'.
    RETURNS:
        list of the urls whose DONE value is 'NO', in file order.
    '''
    if not os.path.exists(file_name):
        # Create the table under the name that is read right below.
        create_urls_table_blank(step, file_name=file_name)

    df = pd.read_csv(file_name)
    rows = len(df)

    # only gives back the urls not marked as DONE YES
    result = df.loc[df['DONE'] == 'NO', 'url'].tolist()
    print(f"{file_name} read with {rows} rows with {len(result)} remaining to be dowloaded.")
    return result


In [4]:
def mark_one_url_as_done(query_value_for_column, 
                         col_input='url',
                         col_to_change = 'DONE',
                         value_for_change = 'YES',
                         file_name="urls_table.csv"):
    '''
    Since we dont want to make all the 270000 calls in one go this is a way to
    mark up bit by bit what was done.

    Reads `file_name`, sets `col_to_change` to `value_for_change` on every row
    whose `col_input` equals `query_value_for_column`, and saves the file again.

    RETURNS:
        None, the result is the rewritten file.
    RAISES:
        ValueError: `col_input` is not a column of the file.
    '''
    df = pd.read_csv(file_name)
    if col_input not in df.columns:
        # A plain string cannot be raised; use a real exception type.
        raise ValueError(f"wooow pass a col name that is present in the dataframe please. {col_input} is not in {df.columns}")

    # .loc[row_indexer, col_indexer] = value avoids chained-assignment problems
    df.loc[df[col_input] == query_value_for_column, col_to_change] = value_for_change
    df.to_csv(file_name, index=False)

In [5]:
def get_data_of_one_url_table(url, DEBUG=False, timeout=30):
    '''
    Make a GET request to fetch the raw HTML content of the url and parse its result table.
    INPUT:
        url: like: 'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=2&LsZ=2&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
        DEBUG: print the url, the table found and every parsed row.
        timeout: seconds handed to requests.get so a stalled connection cannot hang the run.
    RETURNS:
        list of dicts, one per table row that has <td> cells, in the form of:
        [{'Foerdersumme': '12.649.000,18',
          'link_total_sum': '',
          'Begünstigter': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
          'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
          'Landkreis': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
          'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
          'BundesLand': 'Mecklenburg-Vorpommern',
          'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'},
          {...
          ]
        Every cell contributes two values: its text and the href of its first
        link ('' when the cell has no link).
    RAISES:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no table with class "display dataTable".

    This lists of dicts corresponds to what you see when accesing one single page
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find("table",{"class":"display dataTable"})
    if DEBUG: print('>>>>>>>>>>>>>>>>>>>>>>',url, table)
    if table is None:
        # soup.find returns None when nothing matches; fail with a readable message
        raise ValueError(f"No table with class 'display dataTable' found at {url}")

    keys = ['Foerdersumme', 'link_total_sum','Begünstigter','link_company','Landkreis','link_location','BundesLand','link_lander']
    all_data=[]
    for row in table.find_all('tr'):
        if row.td is None:
            # rows without any <td> carry no company data
            continue
        elements_of_one_row=[]
        for td in row.find_all('td'):
            href = td.a.get('href') if td.a else None
            elements_of_one_row.extend((td.text, href or ''))
        if DEBUG: print(elements_of_one_row)
        # zip stops at the shorter sequence, so short rows simply get fewer keys
        all_data.append(dict(zip(keys,elements_of_one_row)))

    return all_data

In [6]:
# example:
DEBUG=False
if DEBUG:
    url='https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=2&LsZ=2&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
    print(url)
    # A bare expression inside an `if` block is not displayed by the notebook,
    # so print the parsed rows explicitly.
    print(get_data_of_one_url_table(url))

In [7]:
def get_basis_praemie_from_url(company_link,DEBUG=False, timeout=30):
    '''
    Fetch one company page and return the amount listed under its "Basisprämie" heading.

    The value is searched as follows: the <h3> whose text contains "Basisprämie",
    its next sibling tag, the next <p> sibling after that one, and the first
    <span> inside that <p>.

    INPUT:
        company_link: absolute url of the company page.
        DEBUG: print the url before requesting it.
        timeout: seconds handed to requests.get.
    RETURNS:
        the first child of that <span> (a text such as '322.347,04 €'),
        or '' when the page has no such heading/paragraph/span or the span is empty.
    '''
    if DEBUG: print("in soup:",company_link)
    html_content = requests.get(company_link, timeout=timeout).text
    soup = BeautifulSoup(html_content, "lxml")
    try:
        heading = soup.find(lambda tag:tag.name=="h3" and "Basisprämie" in tag.text)
        span_contents = heading.find_next_sibling().find_next_sibling("p").find("span").contents
    except AttributeError:
        # Each lookup returns None when nothing matches, so a missing element
        # shows up as an AttributeError on the next call in the chain.
        return ''

    if not span_contents:
        return ''
    basis_praemie = span_contents[0]
    # Store text as a plain str so the result does not keep the parsed page alive.
    if isinstance(basis_praemie, str):
        basis_praemie = str(basis_praemie)

    return basis_praemie

In [9]:
DEBUG=False
if DEBUG:
    # page without Basisprämie
    company_link1 = "https://proplanta.de/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html"
    # page with Basisprämie
    company_link2 = "https://www.proplanta.de/karten/agrarsubventionen_2021-empfaengerb23c81b03f98afededf2dce2fbb239ea.html"
    # switch to company_link2 to try the other case
    company_link= company_link1
    print(company_link)
    html_content = requests.get(company_link).text
    soup = BeautifulSoup(html_content, "lxml")
    print(get_basis_praemie_from_url(company_link))
    # full parsed page, to inspect the markup by eye
    print(soup)
    

In [10]:
def add_basispramie_to_one_company(list_of_company_dicts, DEBUG=False):
    '''
    Add the company page url and the Basisprämie to every company of one table.

    INPUT:
        list of dicts, one per company, e.g.
        [{'Foerdersumme': '12.649.000,18',
          'link_total_sum': '',
          'Begünstigter': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
          'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
          ...},
        ]
        Only 'link_company' is read.
    OUTPUT:
        new list holding a copy of every input dict with two more keys:
          'company_url': 'https://proplanta.de' + link_company
          'basis_praemie': result of get_basis_praemie_from_url(company_url),
                           or '' without any request when the row has no company link

    Explanation:
    Out of the dict corresponding to one company the url is built up and the
    data is fetched with requests and parsed with soup. The input dicts are
    left untouched.
    '''
    basis_link = 'https://proplanta.de'
    new_list_of_company_dicts = []
    for company_dict in list_of_company_dicts:
        enriched = dict(company_dict)  # copy, so the caller's data is not modified
        relative_link = enriched.get('link_company') or ''
        company_link = basis_link + relative_link
        enriched['company_url'] = company_link
        if relative_link:
            enriched['basis_praemie'] = get_basis_praemie_from_url(company_link, DEBUG)
        else:
            # no company page to visit
            enriched['basis_praemie'] = ''
        new_list_of_company_dicts.append(enriched)
    return new_list_of_company_dicts

In [11]:
# Manual check of add_basispramie_to_one_company with a one-element list; only runs when DEBUG is True.
DEBUG=False
if DEBUG:
    # NOTE(review): `example` is not used in this cell.
    example="https://www.proplanta.de/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html"
    # Sample row; of its keys the function under test reads 'link_company'.
    example2 = [{'total_sum': '12.649.000,18',
          'link_total_sum': '',
          'company': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
          'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
          'location': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
          'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
          'Lander': 'Mecklenburg-Vorpommern',
          'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'},
  ]
    print(add_basispramie_to_one_company(example2, DEBUG=True))
    

In [12]:
def add_dict_to_df(dict_to_add,
                   file_name = '',
                   DEBUG=False):
    '''
    Add the given records to the results CSV, creating the file when needed.

    INPUT:
        dict_to_add: data accepted by pd.DataFrame, e.g. a list of company dicts.
        file_name: CSV file to update; '' means 'result-DD-MM-YYYY.csv' with today's date.
        DEBUG: display the stored and the new data (needs IPython's `display`).
    RETURNS:
        None. The file is rewritten with the new rows placed before the rows
        it already contained, and the total number of rows is printed.
    '''
    df_to_add=pd.DataFrame(dict_to_add)

    if file_name == '':
        file_name = 'result' + time.strftime("-%d-%m-%Y") + ".csv"

    if os.path.exists(file_name):
        df= pd.read_csv(file_name)
        if DEBUG:
            display(df)
            display(df_to_add)
        # add the new data (skip the concat when there is nothing new)
        out = df if df_to_add.empty else pd.concat([df_to_add, df])
    else:
        out=df_to_add
    rows = len(out)
    print(f"total {rows} companies")
    if len(out.columns) == 0:
        # A frame without columns would be saved as a file that pd.read_csv
        # cannot parse on the next call, so nothing is written.
        return
    out.to_csv(file_name, index=False)

# main code

In [13]:
# The following are the only two parameters to modify before running this cell
step = 7 # number of rows per table in every call to the website
nr_of_tables_to_download_today = 5

first_time_running_the_process = False
if first_time_running_the_process:
    create_urls_table_blank(step=step)

# `step` only matters here when urls_table.csv does not exist yet and has to be created
all_urls_for_all_data_still_to_do = get_remaining_urls(step=step, file_name="urls_table.csv")

# keep only today's share; slicing past the end simply returns what is left
all_urls_for_all_data_still_to_do = all_urls_for_all_data_still_to_do[:nr_of_tables_to_download_today]

for i, url_table in enumerate(all_urls_for_all_data_still_to_do):
    print(i, ' of ', len(all_urls_for_all_data_still_to_do), ' >>>',url_table)
    # table of companies
    data_for_one_table = get_data_of_one_url_table(url_table)
    data_for_one_table_with_basis_premie = add_basispramie_to_one_company(data_for_one_table)

    # Save first and mark the url as done afterwards: if saving fails the url
    # stays 'NO' and is picked up again on the next run.
    add_dict_to_df(data_for_one_table_with_basis_premie)

    mark_one_url_as_done(url_table,
                         col_input='url',
                         col_to_change = 'DONE',
                         value_for_change = 'YES',
                         file_name="urls_table.csv")


urls_table.csv read with 27161 rows with 27156 remaining to be dowloaded.
0  of  5  >>> https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=10&LsZ=50&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570
total 10 companies
1  of  5  >>> https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=10&LsZ=60&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570
total 20 companies
2  of  5  >>> https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=10&LsZ=70&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570
total 30 companies
3  of  5  >>> https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=10&LsZ=80&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570
total 40 companies
4  of  5  >>> https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfae

# From here on: trial-and-error (scratch) code

In [None]:

# Scratch: listing page with 200 rows per page, starting at offset 600.
url="https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=200&LsZ=600&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570"

# Make a GET request to fetch the raw HTML content
# NOTE(review): html_content is not used by the remaining lines of this cell.
html_content = requests.get(url).text
# Company page expected to contain a Basisprämie amount (per the variable name).
example_with_praemie="""https://proplanta.de/karten/agrarsubventionen_2021-empfaenger12afd5006d05dc993dce273bf905c760.html"""
print(get_basis_praemie_from_url(example_with_praemie))

In [None]:
def add_company_data(nr=1, DEBUG=False):
    '''
    Visit the first `nr` urls of urls_table.csv and walk through the rows of
    the result table found on each page.

    Unfinished draft: the cells of every row are looked up, but nothing is
    stored or returned.
    '''
    df_urls = pd.read_csv("urls_table.csv")
    df_companies = pd.DataFrame()  # placeholder, never filled
    first_urls = df_urls['url'].tolist()[:nr]
    for position, listing_url in enumerate(first_urls):
        page = BeautifulSoup(requests.get(listing_url).text, "lxml")
        table = page.find("table", {"class": "display dataTable"})
        if DEBUG:
            print('>>>>>>>>>>>>>>>>>>>>>>', position, listing_url)
            print(table)
        for row in table.find_all('tr'):
            # all cells of this row
            columns = row.find_all('td')
            
# Try the draft on the first url (nr defaults to 1) with debug printing switched on.
add_company_data(DEBUG=True)

In [None]:
def get_data_of_one_url_table(url, DEBUG=False):
    '''
    Make a GET request to fetch the raw HTML content of the url and parse its result table.

    NOTE: this notebook defines a function with this name more than once; the
    definition that was executed last is the one in effect.

    INPUT:
        url: like: 'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=2&LsZ=2&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
        DEBUG: print the url and the cells of every row.
    RETURNS:
        list of dicts, one per row that has <td> cells, in the form of:
        [{'Foerdersumme': '12.649.000,18',
          'link_total_sum': '',
          'Begünstigter': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
          'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
          'Landkreis': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
          'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
          'BundesLand': 'Mecklenburg-Vorpommern',
          'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'},
          {...
          ]
    RAISES:
        ValueError: the page has no table with class "display dataTable".

    This lists of dicts corresponds to what you see when accesing one single page
    '''
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")

    table = soup.find("table",{"class":"display dataTable"})
    if DEBUG: print('>>>>>>>>>>>>>>>>>>>>>>',url)
    if table is None:
        raise ValueError(f"No table with class 'display dataTable' found at {url}")

    keys = ['Foerdersumme', 'link_total_sum','Begünstigter','link_company','Landkreis','link_location','BundesLand','link_lander']
    all_data = []  # was never initialised, which made the append below fail
    for row in table.find_all('tr'):
        columns = row.select('td')
        if DEBUG: print(columns)
        # text and link ('' when absent) of every cell, flattened into one list
        elements_of_one_row = []
        for td in columns:
            href = td.a.get('href') if td.a else None
            elements_of_one_row.extend((td.text, href or ''))
        # create a dict for easier understanding; rows without cells are skipped
        if elements_of_one_row:
            all_data.append(dict(zip(keys,elements_of_one_row)))

    return all_data

In [None]:
    

def add_basispramie_to_one_company(company_dict):
    '''
    Single-company variant: complete one company dict with its page url and Basisprämie.

    NOTE: this notebook also defines a function of the same name that takes a
    list of dicts; whichever definition was executed last is the one in effect.

    INPUT:
        company_dict: dict of one company. It must contain 'link_company', the
        site-relative path of the company page, e.g.
        '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html'
    OUTPUT:
        the same dict object, extended in place with
          'company_url': 'https://proplanta.de' followed by link_company
          'basis_praemie': value returned by get_basis_praemie_from_url for that url
    '''
    detail_url = 'https://proplanta.de' + company_dict['link_company']
    company_dict['company_url'] = detail_url
    company_dict['basis_praemie'] = get_basis_praemie_from_url(detail_url)
    return company_dict
    
    

In [None]:
# Scratch: parse the first listing page.
# NOTE(review): no cell above this one assigns a global `all_urls`; it has to come from a later cell having been run first.
data_one_page = get_data_of_one_url_table(all_urls[0])

In [None]:
# Scratch: sample company row used to try the Basisprämie lookup by hand.
example_dict = {'total_sum': '12.649.000,18',
  'link_total_sum': '',
  'company': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
  'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
  'location': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
  'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
  'Lander': 'Mecklenburg-Vorpommern',
  'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'}

# A single dict is passed, so this needs the single-dict definition of the function to be the active one.
adding_basis = add_basispramie_to_one_company(example_dict)
adding_basis

In [None]:
# This is a small example of two urls that list only 2 companies per table each one.
L = 2  # rows per page (LaZ)
N = 2  # LsZ value of the first page
# The second url advances LsZ by one page; before, both entries were the same url.
# NOTE(review): assumes LsZ is the start offset of the page - confirm against the site.
all_urls = [
    f'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ={L}&LsZ={offset}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
    for offset in (N, N + L)
]
all_urls



In [None]:
# NOTE(review): in this notebook `elements_of_one_row` is only assigned inside functions, so this
# cell relies on a global of that name left in the kernel by code that is no longer here.
len(elements_of_one_row)

In [None]:
# Minimal hand-written table to experiment with: one cell with bold text, two cells with links, one plain cell.
myhtml='''<table class="theone">
<td class="even" ><strong>12.649.000,18</strong></td>
 <td class="even" ><a href="/whatever/1.html" target="_blank">Brusels</a></td>
 <td class="even" ><a href="/whatever/2.html" target="_blank">Belgium</a></td>
 <td class="even" >blue</td></table>'''
soup = BeautifulSoup(myhtml, "lxml")
    
# Look the table up by its class and show it.
table = soup.find("table",{"class":"theone"})
table

In [None]:
# Scratch: two company pages; per its name the second one has no Basisprämie.
url="https://www.proplanta.de/karten/agrarsubventionen_2021-empfaengerdf84d31e9962677007bce3207ac3bee9.html"
url_no_basis = "https://proplanta.de/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html"
# NOTE(review): `url` is printed but `url_no_basis` is the page that gets fetched.
print(url)
html_content = requests.get(url_no_basis).text

# EXAMPLES: Beautiful Soup

In [66]:
# Sample with a Basisprämie section: an <h3> heading, a description <div>, then a <p> whose
# <span class="betrag"> holds the amount.
html_content1="""<h3 style="cear: both;"><abbr title="Europäischer Garantiefonds für Landwirtschaft">EGFL</abbr>: Basisprämie</h3>
<div class="massnahmenbeschreibung">
<p>Infolge der Reform der Gemeinsamen Agrarpolitik (GAP) wurde die bis einschließlich 2014 geltende Betriebsprämie durch ein System aus Direktzahlungen bestehend aus Basisprämie, Umverteilungsprämie, Greeningprämie und ggf. Junglandwirteprämie ersetzt.Die Basisprämie entspricht in ihrem Charakter im Grundsatz der bisherigen Betriebsprämie. Sie basiert ebenfalls auf einem System von Zahlungsansprüchen, die den Betriebsinhabern grundsätzlich im Jahr 2015 auf Antrag entsprechend dem Umfang der von ihnen angemeldeten beihilfefähigen Flächen neu zugewiesen wurden. Der Betriebsinhaber meldet in jedem Antragsjahr seine beihilfefähigen Flächen und Zahlungsansprüche an, wobei die Aktivierung eines Zahlungsanspruchs mit einem Hektar beihilfefähiger Fläche zur Auszahlung der Basisprämie führt.Alle Zahlungsansprüche in einer Region (in der Regel = Bundesland) hatten zu Beginn der derzeitigen Förderperiode in 2015 denselben Wert. Die je Region unterschiedlich hohen Werte der Zahlungsansprüche wurden zwischen 2017 und 2019 schrittweise abgebaut, so dass sie seit 2019 in ganz Deutschland einen einheitlichen Wert haben.Die Basisprämie dient der Einkommenssicherung und Risikoabsicherung der landwirtschaftlichen Betriebe sowie auch als finanzieller Ausgleich für die weit höheren Umweltschutz-, Tierschutz- und Verbraucherschutzstandards in der EU im Vergleich zu den Produktionsauflagen von Mitbewerbern auf dem Weltmarkt. Sie ist wie alle anderen Direktzahlungen unmittelbar an die Einhaltung zahlreicher weiterer Auflagen gebunden (sog. "Cross-Compliance-Instrument"). Neben 13 schon bestehenden EU-Verordnungen und Richtlinien des Natur-, Umwelt-, Tier- und Verbraucherschutzes, deren Einhaltung laufend und streng überprüft wird, sind Vorgaben zur Erosionsvermeidung als zusätzlich zu erbringende Leistungen ebenso vorgeschrieben worden wie Maßnahmen zur Erhaltung der Bodenfruchtbarkeit und zum Gewässerschutz. 
Ebenso ist die Beseitigung von Landschaftselementen wie Hecken, Baumreihen und Feldgehölzen verboten. Durch Mindestanforderungen an die Bodenbedeckung bei aus der Produktion genommenen Flächen und dem Erhalt von ökologisch wertvollen Strukturelementen als Rückzugsgebiete für wildlebende Tierarten in intensiv genutzten Agrarlandschaften leisten die Direktzahlungen so einen Beitrag zum Erhalt landeskultureller Werte und zum Klimaschutz.</p></div>
<p style="margin-bottom: 0; text-align: right;">
<span class="betrag">322.347,04 €</span>
</p>
"""
# Sample without a Basisprämie section: the same layout with an "Investment" heading and a
# <span class="amount"> instead.
html_content2 ="""
<h3 style="cear: both;">
<abbr title="European Union">EU</abbr>Investment</h3>
<div class="conditions">
<p>bla bla bla
</p>
</div>
<p style="margin-bottom: 0;">
<span class="amount">66000 €</span>
</p>"""

In [73]:
# Parse the sample without Basisprämie; the cells below work on this `soup`.
html_content=html_content2
soup = BeautifulSoup(html_content, "lxml")

In [74]:
# When no <h3> contains "Basisprämie", soup.find returns None and the chained
# .find_next call raises the AttributeError recorded as this cell's output.
t3 = (soup.find(lambda tag: tag.name == "h3" and "Basisprämie" in tag.text).find_next("span", class_="amount").text)
t3

AttributeError: 'NoneType' object has no attribute 'find_next'

In [77]:
# Same lookup with a heading that exists in the parsed sample: the text of the
# first <span class="amount"> after the "Investment" <h3>.
t3 = (soup.find(lambda tag: tag.name == "h3" and "Investment" in tag.text).find_next("span", class_="amount").text)
t3

'66000 €'

In [57]:
# Keep the "Investment" heading itself for inspection.
t4=soup.find(lambda tag:tag.name=="h3" and 'Investment' in tag.text)

In [58]:
# Show the heading's combined text and the type of each of its direct children.
print(t4.text)
print([type(r) for r in t4.contents])


EUInvestment
[<class 'bs4.element.NavigableString'>, <class 'bs4.element.Tag'>, <class 'bs4.element.NavigableString'>]


In [None]:
#print(soup)<div id="agrar_subventions_empfaenger">
def my_span(tag):
    '''
    Filter for soup.find_all: True for <span> tags whose class attribute contains "betrag".
    '''
    # `"betrag" in tag` would look at the tag's children, not at its attributes,
    # so the class attribute is read explicitly (None when the tag has no class).
    classes = tag.get("class") or []
    return tag.name=='span' and "betrag" in classes

#tags = soup.find_all(my_span)
# Direct lookup of the first <span class="betrag">; dir() lists what the result offers
# (soup.find returns None when the parsed document has no such span).
result2 =soup.find("span", {"class":"betrag"})
dir(result2)

In [None]:
# Find the heading whose text contains ": Basisprämie". The text test was missing
# before, so the condition was true for every <h3> and the first one was returned.
# `soup` has to hold a page with that heading, otherwise `t` is None.
t =soup.find(lambda tag:tag.name=="h3" and ": Basisprämie" in tag.text)
# From the heading: next sibling tag -> following <p> sibling -> first <span> -> its children.
t2 =t.find_next_sibling().find_next_sibling("p").find("span").contents
t2

In [None]:
# Walk over the children of the tag that follows the heading `t`.
# NOTE(review): children that are plain text nodes presumably have no `.contents` - confirm before relying on this.
for element in t.find_next_sibling():
    print(element.contents)

In [None]:
parameters = [i*200 for i in range(5)] # here substitute 5 by 1359 to cover all pages

all_urls = [] 
for para in parameters:
    # building the url
    url = f"https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=200&LsZ={para}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570"
    all_urls.append(url)

all_tables_of_all_urls = []
for url in all_urls:
    all_tables_of_url = pd.read_html(url)
    good_tables_of_single_url = []
    # iterate over the tables of THIS page (the loop used an undefined name before)
    for table in all_tables_of_url:
        # decide if it is a good table
        cn=table.columns.tolist()
        if 'Begünstigter' in cn and 'Landkreis' in cn and 'Bundesland' in cn:
            good_tables_of_single_url.append(table)
    if not good_tables_of_single_url:
        # pd.concat raises on an empty list; skip pages without a matching table
        print(f"no matching table found at {url}")
        continue
    df_of_single_url_page = pd.concat(good_tables_of_single_url)
    all_tables_of_all_urls.append(df_of_single_url_page)

In [None]:
# Stack the per-page tables into one DataFrame (pd.concat raises if the list is empty).
df_all = pd.concat( all_tables_of_all_urls)

In [None]:
# Show the combined table.
df_all


In [None]:

# Today's date as YYYY-MM-DD; not used by any other visible cell.
today = time.strftime("%Y-%m-%d")
