In [None]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import os

# Sample listing URL with LaZ=200 and LsZ=600 (elsewhere in this notebook LaZ is filled with the rows-per-page value).
url="https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=200&LsZ=600&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570"

# Make a GET request to fetch the raw HTML content.
# NOTE(review): the functions below fetch their own pages; this module-level `html_content`
# is reassigned further down before it is ever read.
html_content = requests.get(url).text

In [None]:
def create_urls_table_blank(step = 10, overwrite=True,file_name="urls_table.csv", total=271605):
    '''
    Build the table of listing-page URLs that together cover every entry of the
    web site, save it as CSV and return it.

    INPUT:
        step: how many rows each listing page shows (sent as LaZ).
              For instance step = 200 gives pages with 200 companies each.
        overwrite: when False, refuse to replace an existing `file_name`.
        file_name: path of the CSV to write.
        total: total number of entries announced by the web site.
    RETURNS:
        DataFrame with one row per page: column 'url' and column 'DONE' (all 'NO').
    RAISES:
        ValueError: if step is not positive.
        FileExistsError: if overwrite is False and `file_name` already exists.
    '''
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    if not overwrite and os.path.exists(file_name):
        raise FileExistsError(f"There is already a file named {file_name}: delete it manually and run this again")

    # One start offset (LsZ) per page: 0, step, 2*step, ... up to the last value below
    # `total`, so the final, possibly partial, page is included.
    # NOTE(review): ROalAk is left as the literal taken from the site's own URL; it
    # presumably carries the total entry count - confirm before changing `total`.
    all_urls = [
        f"https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ={step}&LsZ={offset}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570"
        for offset in range(0, total, step)
    ]

    df_urls = pd.DataFrame({'url': all_urls})
    df_urls['DONE'] = 'NO'

    # index=False keeps a stray "Unnamed: 0" column out of the file when it is read back.
    df_urls.to_csv(file_name, index=False)
    return df_urls
# Build the URL table with the default arguments (this also writes urls_table.csv).
df_urls=create_urls_table_blank()

In [None]:
def get_remaining_urls(file_name="urls_table.csv"):
    '''
    Read the URL table stored in `file_name` and return, as a plain list,
    the 'url' value of every row whose 'DONE' column still reads 'NO'.
    '''
    urls_table = pd.read_csv(file_name)
    # keep only the rows that have not been marked as processed yet
    pending_rows = urls_table[urls_table['DONE'] == 'NO']
    return pending_rows['url'].tolist()


In [None]:
# Render the URL table built above.
display(df_urls)

In [None]:
# example of one url: the first entry of the 'url' column
one_url = df_urls['url'].iloc[0]
one_url

In [None]:
def mark_one_url_as_done(query_value_for_column,
                         col_input='url',
                         col_to_change = 'DONE',
                         value_for_change = 'YES',
                         file_name="urls_table.csv"):
    '''
    Mark rows of the URL table as processed and save the table back to disk.

    Since we do not want to make all the ~270000 calls in one go, this records
    bit by bit what was already done.

    INPUT:
        query_value_for_column: value to look for in `col_input` (by default the url).
        col_input: column used to select the rows.
        col_to_change: column that receives `value_for_change` on the selected rows.
        value_for_change: value written into `col_to_change`.
        file_name: CSV file that is read, modified and rewritten.
    RETURNS:
        None - the table is saved again instead of being returned.
    RAISES:
        ValueError: if `col_input` is not a column of the table.
    '''
    df = pd.read_csv(file_name)
    if col_input not in df.columns:
        # a plain string cannot be raised; use a real exception type
        raise ValueError(f"wooow pass a col name that is present in the dataframe please. {col_input} is not in {df.columns}")

    rows_to_mark = df[col_input] == query_value_for_column
    # .loc[row_indexer, col_indexer] = value avoids the chained-assignment caveat
    df.loc[rows_to_mark, col_to_change] = value_for_change
    df.to_csv(file_name, index=False)

In [None]:
def get_data_of_one_url_table(url, DEBUG=False, timeout=30):
    '''
    Fetch one listing page and return the rows of its results table.

    INPUT:
        url: like: 'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=2&LsZ=2&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
        DEBUG: print the url, the raw table and every parsed row.
        timeout: seconds to wait for the server before giving up.
    RETURNS:
        list of dicts, one per table row that has cells. For every cell the text
        and the href of its first link ('' when there is none) are stored:
        [{'Foerdersumme': '12.649.000,18',
          'link_total_sum': '',
          'Begünstigter': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
          'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
          'Landkreis': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
          'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
          'BundesLand': 'Mecklenburg-Vorpommern',
          'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'},
          {...
          ]
        Rows with fewer cells than expected yield dicts with fewer keys.
    RAISES:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no table with class "display dataTable".
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find("table",{"class":"display dataTable"})
    if DEBUG: print('>>>>>>>>>>>>>>>>>>>>>>',url, table)
    if table is None:
        # fail loudly: silently returning no rows would let the caller mark the url as done
        raise ValueError(f"no table with class 'display dataTable' found at {url}")

    keys = ['Foerdersumme', 'link_total_sum','Begünstigter','link_company','Landkreis','link_location','BundesLand','link_lander']
    all_data=[]
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # header rows carry no td cells
            continue
        elements_of_one_row=[]
        for td in cells:
            href = td.a.get('href') if td.a is not None else None
            elements_of_one_row.extend((td.text, href or ''))
        if DEBUG: print(elements_of_one_row)
        all_data.append(dict(zip(keys,elements_of_one_row)))

    return all_data

In [None]:
# example: fetch a small listing page (LaZ=2), only when DEBUG is switched on
DEBUG=False
if DEBUG:
    url='https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=2&LsZ=2&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
    print(url)
    # NOTE: the returned list is neither stored nor displayed here
    get_data_of_one_url_table(url)

In [None]:
def get_basis_praemie_from_url(company_link, timeout=30):
    '''
    Fetch the detail page of one company and return its "Basisprämie" amount.

    INPUT:
        company_link: absolute url of the company detail page.
        timeout: seconds to wait for the server before giving up.
    RETURNS:
        The first content item of the span found in the p element that follows
        the h3 heading containing ": Basisprämie", as a str.
        '' when the page has no such heading or the expected structure is missing.
    '''
    html_content = requests.get(company_link, timeout=timeout).text
    soup = BeautifulSoup(html_content, "lxml")

    # The heading text has to be tested explicitly: a bare string literal in the
    # condition is always truthy and would match the first h3 of any page.
    heading = soup.find(lambda tag: tag.name == "h3" and ": Basisprämie" in tag.get_text())
    try:
        contents = heading.find_next_sibling().find_next_sibling("p").find("span").contents
    except AttributeError:
        # one of the lookups returned None: this page has no Basisprämie entry
        return ''

    if not contents:
        return ''
    # str() detaches the value from the parse tree
    return str(contents[0])

In [None]:
def add_basispramie_to_one_company(list_of_company_dicts):
    '''
    Add the company detail url and the Basisprämie to every company dict.

    INPUT:
        list_of_company_dicts: list of dicts, one per company. Only the
        'link_company' entry is read, e.g.
            {'Begünstigter': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
             'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
             ...}

    OUTPUT:
        A new list holding the same dict objects (they are updated in place), each with
            'company_url': 'https://proplanta.de' + link_company
            'basis_praemie': result of get_basis_praemie_from_url(company_url)
        For a company without a link both values are '' and no request is made.

    Explanation:
        Out of the dict corresponding to one company the url is built up and the
        data is fetched and parsed by get_basis_praemie_from_url.
    '''
    basis_link = 'https://proplanta.de'
    new_list_of_company_dicts = []
    for company_dict in list_of_company_dicts:
        relative_link = company_dict.get('link_company') or ''
        if relative_link:
            company_link = basis_link + relative_link
            basis_praemie = get_basis_praemie_from_url(company_link)
        else:
            # without a link the url would be the site homepage: skip the lookup
            company_link = ''
            basis_praemie = ''
        company_dict['company_url'] = company_link
        company_dict['basis_praemie'] = basis_praemie
        new_list_of_company_dicts.append(company_dict)
    return new_list_of_company_dicts

In [None]:
def add_dict_to_df(dict_to_add,
                   file_name = 'final_result.csv',
                   DEBUG=False):
    '''
    Store the records of `dict_to_add` in the CSV file `file_name`.

    When the file does not exist yet it is created from the new records alone.
    Otherwise the stored rows are read, the new records are placed in front of
    them and the whole file is rewritten. Nothing is returned.
    With DEBUG the stored frame and the new frame are displayed.
    '''
    new_rows = pd.DataFrame(dict_to_add)

    if not os.path.exists(file_name):
        # first call: nothing to merge with
        new_rows.to_csv(file_name, index=False)
        return

    stored_rows = pd.read_csv(file_name)
    # new records first, previously stored ones after them
    combined = pd.concat([new_rows, stored_rows])
    if DEBUG:
        display(stored_rows)
        display(new_rows)
    combined.to_csv(file_name, index=False)

# main code

In [None]:

# All the listing urls still marked DONE == 'NO' in urls_table.csv.
# Adjust the slice [40:41] to choose which of the remaining urls are processed in this run.
all_urls_for_all_data_still_to_do = get_remaining_urls()[40:41]

for url_table in all_urls_for_all_data_still_to_do:
    print('>>>',url_table)
    # table of companies shown on this listing page
    data_for_one_table = get_data_of_one_url_table(url_table)
    # one extra request per company to fetch its Basisprämie
    data_for_one_table_with_basis_premie = add_basispramie_to_one_company(data_for_one_table)

    # Save the rows first ...
    add_dict_to_df(data_for_one_table_with_basis_premie)

    # ... and only then mark this url as done, so that a failure while saving
    # leaves the url queued for the next run instead of losing its data.
    mark_one_url_as_done(url_table,
                         col_input='url',
                         col_to_change = 'DONE',
                         value_for_change = 'YES',
                         file_name="urls_table.csv")


# from here on: all trial-and-error (scratch) code

In [None]:
# Example call on a hard-coded company detail page (the variable name says it has a Basisprämie entry).
example_with_praemie="""https://proplanta.de/karten/agrarsubventionen_2021-empfaenger12afd5006d05dc993dce273bf905c760.html"""
print(get_basis_praemie_from_url(example_with_praemie))

In [None]:
def add_company_data(nr=1, DEBUG=False):
    '''
    Scratch helper: fetch the first `nr` listing pages named in urls_table.csv
    and walk over the rows of their results table.

    NOTE: the cells of each row are looked up but not stored anywhere yet, so
    the function returns None; it is only useful with DEBUG=True to inspect
    the fetched table.
    '''
    df_urls=pd.read_csv("urls_table.csv")
    for i,url in enumerate(df_urls['url'].tolist()[0:nr]):
        html_content = requests.get(url).text
        soup = BeautifulSoup(html_content, "lxml")

        table = soup.find("table",{"class":"display dataTable"})
        if DEBUG: print('>>>>>>>>>>>>>>>>>>>>>>',i,url)
        if DEBUG: print(table)
        if table is None:
            # page without a results table: nothing to walk over
            continue
        for row in table.find_all('tr'):
            # Find all data for each column
            columns = row.find_all('td')
            
# Run the scratch helper on the first url (nr defaults to 1) with debug printing enabled.
add_company_data(DEBUG=True)

In [None]:
def get_data_of_one_url_table(url, DEBUG=False, timeout=30):
    '''
    Make a GET request for one listing page and return the rows of its results table.

    INPUT:
        url: like: 'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=2&LsZ=2&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570'
        DEBUG: print the url and the cells of every row.
        timeout: seconds to wait for the server before giving up.
    RETURNS:
        list of dicts, one per table row that has cells, in the form of:
        [{'Foerdersumme': '12.649.000,18',
          'link_total_sum': '',
          'Begünstigter': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
          'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
          'Landkreis': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
          'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
          'BundesLand': 'Mecklenburg-Vorpommern',
          'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'},
          {...
          ]
        This list of dicts corresponds to what you see when accessing one single page.
    RAISES:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no table with class "display dataTable".
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find("table",{"class":"display dataTable"})
    if DEBUG: print('>>>>>>>>>>>>>>>>>>>>>>',url)
    if table is None:
        # fail loudly: silently returning no rows would let the caller mark the url as done
        raise ValueError(f"no table with class 'display dataTable' found at {url}")

    keys = ['Foerdersumme', 'link_total_sum','Begünstigter','link_company','Landkreis','link_location','BundesLand','link_lander']
    # the result list has to exist before rows are appended to it
    all_data = []
    for row in table.find_all('tr'):
        columns = row.select('td')
        if DEBUG: print(columns)
        # flatten (text, link) of every cell into one list; '' when a cell has no link
        elements_of_one_row = []
        for td in columns:
            href = td.a.get('href') if td.a is not None else None
            elements_of_one_row.extend((td.text, href or ''))
        # rows without td cells (headers) produce nothing
        if elements_of_one_row:
            all_data.append(dict(zip(keys,elements_of_one_row)))

    return all_data

In [None]:
    

def add_basispramie_to_one_company(company_dict):
    '''
    Add the company detail url and the Basisprämie to one company dict.

    INPUT:
        company_dict: dict describing one company; only 'link_company' is read, e.g.
            {'total_sum': '12.649.000,18',
             'company': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
             'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
             ...}
        A list (or tuple) of such dicts is accepted as well, because an earlier
        cell defines a function of the same name that takes a list.

    OUTPUT:
        The same dict, updated in place, with two more entries:
            'company_url': 'https://proplanta.de' + link_company
            'basis_praemie': result of get_basis_praemie_from_url(company_url)
        For a company without a link both values are '' and no request is made.
        For a list input: a list with one such dict per element.

    Explanation:
        Out of the dict corresponding to one company the url is built up and the
        data is fetched and parsed by get_basis_praemie_from_url.
    '''
    if isinstance(company_dict, (list, tuple)):
        return [add_basispramie_to_one_company(single_company) for single_company in company_dict]

    basis_link = 'https://proplanta.de'
    relative_link = company_dict.get('link_company') or ''
    if relative_link:
        company_link = basis_link + relative_link
        basis_praemie = get_basis_praemie_from_url(company_link)
    else:
        # without a link the url would be the site homepage: skip the lookup
        company_link = ''
        basis_praemie = ''

    company_dict['company_url'] = company_link
    company_dict['basis_praemie'] = basis_praemie

    return company_dict
    
    

In [None]:
# Fetch the rows of the first example page.
# NOTE(review): `all_urls` is only assigned in cells further down this notebook, so on a
# fresh kernel this cell raises NameError unless one of those cells is run first.
data_one_page = get_data_of_one_url_table(all_urls[0])

In [None]:
# Hand-written record for one company; the helper called below only reads 'link_company'.
example_dict = {'total_sum': '12.649.000,18',
  'link_total_sum': '',
  'company': 'Land Mecklenburg-Vorpommern Ministerium für – 19061 Schwerin, Landeshauptstadt',
  'link_company': '/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html',
  'location': 'Kreisfreie Stadt Schwerin, Landeshauptstadt',
  'link_location': '/karten/kreisfreie_stadt_schwerin,_landeshauptstadt-agrarsubventionen_2021-landkreis_13004.html',
  'Lander': 'Mecklenburg-Vorpommern',
  'link_lander': '/karten/mecklenburg-vorpommern-agrarsubventionen_2021-bundesland_5a61ca66ef23199eb6afa5e9bb4779f1.html'}

# Enrich the example record and show the result (the helper writes into example_dict itself).
adding_basis = add_basispramie_to_one_company(example_dict)
adding_basis

In [None]:
# This is a small example of two urls that list only 2 companies per table each one.
# NOTE(review): both f-strings below are identical (same L and N), so the list holds the
# same url twice - confirm whether the second entry was meant to use a different LsZ.
L = 2
N=2
all_urls = [f'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ={L}&LsZ={N}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570',
           f'https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ={L}&LsZ={N}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570']
all_urls



In [None]:
# NOTE(review): in the visible notebook `elements_of_one_row` is only assigned inside
# function bodies, so this cell depends on leftover kernel state and raises NameError
# on a fresh run.
len(elements_of_one_row)

In [None]:
# Minimal hand-written table (one cell without a link, two cells with a link, one plain
# text cell) used to experiment with the parsing offline.
myhtml='''<table class="theone">
<td class="even" ><strong>12.649.000,18</strong></td>
 <td class="even" ><a href="/whatever/1.html" target="_blank">Brusels</a></td>
 <td class="even" ><a href="/whatever/2.html" target="_blank">Belgium</a></td>
 <td class="even" >blue</td></table>'''
soup = BeautifulSoup(myhtml, "lxml")

# Look the table up by its class and display it.
table = soup.find("table",{"class":"theone"})
table

In [None]:
# Two company detail pages; the second variable name suggests a page without a Basisprämie entry.
url="https://www.proplanta.de/karten/agrarsubventionen_2021-empfaengerdf84d31e9962677007bce3207ac3bee9.html"
url_no_basis = "https://proplanta.de/karten/agrarsubventionen_2021-empfaenger22becefcfe9631bdec63c6f251515d00.html"
# NOTE: `url` is printed, but the page fetched below is `url_no_basis`.
print(url)
html_content = requests.get(url_no_basis).text
soup = BeautifulSoup(html_content, "lxml")
# NOTE(review): the string literal ": Basisprämie" in the lambda is always truthy, so the
# condition reduces to tag.name == "h3" and the first h3 of the page is matched.
t3 = soup.find(lambda tag:tag.name=="h3" and ": Basisprämie").find_next_sibling().find_next_sibling("p").find("span").contents
t3

In [None]:
#print(soup)<div id="agrar_subventions_empfaenger">
def my_span(tag):
    '''
    Filter for soup.find / soup.find_all: True for span tags whose class
    attribute contains "betrag", False for everything else.

    The class attribute is read with tag.get("class"); testing `"betrag" in tag`
    would look at the children of the tag instead of its attributes.
    '''
    return tag.name=='span' and "betrag" in (tag.get("class") or [])

#tags = soup.find_all(my_span)
# First span with class "betrag" in the page currently held in `soup`.
result2 =soup.find("span", {"class":"betrag"})
# List the attributes available on the result, for interactive exploration.
dir(result2)

In [None]:
# Same lookup as above, split in two steps so the heading `t` can be inspected on its own.
# NOTE(review): the string literal in the lambda is always truthy, so `t` is simply the
# first h3 of the page.
t =soup.find(lambda tag:tag.name=="h3" and ": Basisprämie")
# From the heading: next sibling, then the next p sibling, then the contents of its first span.
t2 =t.find_next_sibling().find_next_sibling("p").find("span").contents
t2

In [None]:
# Loop over the element that follows the heading `t` and print the .contents of each item.
for element in t.find_next_sibling():
    print(element.contents)

In [None]:
# Start offsets of the first 5 pages of 200 rows each; use range(1359) to cover all 271605 entries.
parameters = [i*200 for i in range(5)]

all_urls = []
for j,para in enumerate(parameters):
    # building the url
    url = f"https://www.proplanta.de/Agrarsubventionen-2021-Liste-der-Empfaenger/proplanta_karten.php?ROalAk=271605&LaZ=200&LsZ={para}&ROalAk=271605&SELECTID=1653383570&SEARCH_SHOWBEGS=1653383570"
    all_urls.append(url)

all_tables_of_all_urls = []
for url in all_urls:
    # every table pandas can find on the page
    all_tables_of_url = pd.read_html(url)
    good_tables_of_single_url = []
    for i,table in enumerate(all_tables_of_url):
        # decide if it is a good table: it must carry the three expected column headers
        cn=table.columns.tolist()
        if 'Begünstigter' in cn and 'Landkreis' in cn and 'Bundesland' in cn:
            good_tables_of_single_url.append(table)
    if not good_tables_of_single_url:
        # pd.concat refuses an empty list: skip pages without a matching table
        continue
    df_of_single_url_page = pd.concat(good_tables_of_single_url)
    all_tables_of_all_urls.append(df_of_single_url_page)

In [None]:
# Stack the per-page frames collected in all_tables_of_all_urls into a single DataFrame.
df_all = pd.concat( all_tables_of_all_urls)

In [None]:
# Display the combined frame.
df_all
