In [1]:
import re
import numpy as np
from datetime import datetime
import pandas as pd
import requests
from tqdm import tqdm
pd.set_option('max_colwidth', None) # show all text in a column
pd.set_option('display.max_columns', None) # show all columns

tqdm.pandas()

import aiohttp
import asyncio

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from utils import detect_language,remove_html_tags, remove_url



In [2]:
# Load the cleaned BMBF grant table; its "FKZ" column is used further down to match grants to idw texts.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the CSV was written
# together with its index — confirm whether index_col=0 is wanted here.
BMBF = pd.read_csv("../data/cleanedBMBF.csv")

In [3]:
# Preview the first two rows to check which columns were loaded
BMBF.head(2)

Unnamed: 0,FKZ,Ressort,Referat,PT,Arb.-Einh.,Zuwendungsempfänger,Gemeindekennziffer,Stadt/Gemeinde,Ort,Bundesland,Staat,Ausführende Stelle,Gemeindekennziffer.1,Stadt/Gemeinde.1,Ort.1,Bundesland.1,Staat.1,Thema,Leistungsplansystematik,Klartext Leistungsplansystematik,Laufzeit von,Laufzeit bis,Fördersumme in EUR,Förderprofil,Verbundprojekt,Förderart
0,03F0212C/7,BMBF,724,PT-J,MGS1,Universität Rostock,13003000.0,Rostock,Rostock,Mecklenburg-Vorpommern,Deutschland,Universität Rostock - Fachbereich Elektrotechnik und Informationstechnik,13003000.0,Rostock,Rostock,Mecklenburg-Vorpommern,Deutschland,"Verbundprojekt MESSIN: Entwicklung und Systemintegration der Komponenten Navigation, Automatische Steuerung, Kommunikation und Energieversorgung für den Meßdelphin MESSIN",FB4010,"Technik-/Infrastrukturentwicklung und -bereitstellung (Überwachungssysteme, Begleitforschung zum Monitoring, innovative Technik und Geräteträger)",01.01.1998,31.08.2000,"648.462,00",Forschung und Entwicklung zur Daseinsvorsorge,MESSIN,PDIR
1,D251800/0,BMBF,314,BIBB,A3.3,"Ministerium für Wirtschaft, Arbeit und Wohnungsbau Baden-Württemberg",8111000.0,Stuttgart,Stuttgart,Baden-Württemberg,Deutschland,KACO new energy GmbH,8121000.0,Heilbronn,Heilbronn,Baden-Württemberg,Deutschland,Prozeßorientierte Entwicklungsplanung und Qualifizierung im Betrieb,OB1050,Versuchs- und Modelleinrichtungen und -programme im Bereich der beruflichen Bildung,01.07.1997,30.06.2000,"96.634,00",Forschung und Entwicklung zur Daseinsvorsorge,,PDIR


### To retrieve publications and press releases from idw, the FKZ alone cannot be used as a search term to find the press releases and publications related to BMBF grants. Therefore, all press releases and publications that mention the BMBF are retrieved first. Afterwards, the FKZ is used to find matches.

In [4]:
def extract_pub_web_address(soup):
    """Extract the result links from one parsed idw search result page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a search result page.

    Returns
    -------
    list of str
        The ``href`` of the first ``<a>`` inside every ``<div class="nine columns">``,
        in document order. Divs without an ``<a>``, and anchors without a usable
        ``href``, are skipped so the result never contains ``None``.
    """
    web_address = []
    for div in soup.find_all("div", class_="nine columns"):
        # Only the first <a> element within the div is considered
        a_element = div.find("a")
        if a_element is None:
            continue

        href_value = a_element.get("href")
        # An anchor without href would otherwise put None into the list and break
        # later string operations on the collected addresses.
        if href_value:
            web_address.append(href_value)

    return web_address

In [28]:
def retrieve_pub_web_address(search_term, n_pages, timeout=30):
    """Walk over the idw search result pages and collect all result links.

    Parameters
    ----------
    search_term : str
        Term sent to the idw "simplesearch" form as the ``words`` parameter.
    n_pages : int
        Number of result pages to request (pages 1 .. n_pages).
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a single
        stalled connection would block the whole crawl.

    Returns
    -------
    list
        Links returned by ``extract_pub_web_address`` for every page that
        answered with HTTP 200, in page order. Pages that fail are reported
        on stdout and skipped.
    """
    url = "https://idw-online.de/de/simplesearch"
    pub_web_addresses = []

    for i in tqdm(range(1, n_pages + 1)):
        # Let requests build the query string so that search terms containing
        # spaces, "&" or "%" are encoded correctly.
        query = [
            ("words", search_term),
            ("_form_", "InputForm"),
            ("scope", "press_release"),
            ("scope", "event"),
            ("page", i),
        ]
        try:
            response = requests.get(url, params=query, timeout=timeout)
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")
                pub_web_addresses.extend(extract_pub_web_address(soup))
            else:
                print("Error: ", f"page {i} for '{search_term}' returned HTTP status {response.status_code}")

        except Exception as e:
            # Deliberately best-effort: one failing page must not abort a crawl
            # over more than a thousand pages.
            print("Error: ", e)

    return pub_web_addresses


In [29]:
# Searching for a full name such as "Bundesministerium für Bildung und Forschung" is not effective,
# because the search engine does not look for the whole string but for each single word.
# Search results on 15/11/2023 -> BMBF: 21994, BMWi: 3374, BMU: 791, BMVI: 411, BMEL: 1382, BMJV_BLE: 0
# Each value is the number of result pages that is crawled for the search term.
# NOTE(review): the counts above list "BMVI" while the key below is "BMV" — confirm which term is intended.
search_result_pages = {
    "BMBF": 1100,
    "BMWi": 169,
    "BMU": 40,
    "BMV": 21,
    "BMEL": 70,
    # "BMJV_BLE": 0
}

In [None]:
%%time
total_web_addresses = []
for search_term, pages in list(search_result_pages.items()):
    total_web_addresses.extend(retrieve_pub_web_address(search_term,n_pages = pages))

In [None]:
# The web addresses have the following structure, which shows whether the press release / publication
# is written in German or English:
# /en/news823662
# /de/news820498
# /de/news818640
# /en/news818518
# Only publications in German are of interest, so only the "/de/" links are kept.
# Entries that are not strings (e.g. None) are skipped, and a link that was returned for more than one
# search term is kept only once — otherwise it would be downloaded and matched several times.
# dict.fromkeys removes duplicates while preserving the original order.
total_web_addresses = list(dict.fromkeys(
    web_adresse
    for web_adresse in total_web_addresses
    if isinstance(web_adresse, str) and "/de/" in web_adresse
))

In [None]:
# Number of German press releases / publications that mention the values in column "Ressort".
# The run documented here found 25926; expect a different figure when the search is repeated at a later date.
print(len(total_web_addresses))

In [None]:
# Persist the link list with IPython's %store magic; `%store -r total_web_addresses` restores it in a later session
%store total_web_addresses

In [None]:
async def fetch_url(session, press_web_address, timeout=20):
    """Download one idw page asynchronously.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used for the GET request.
    press_web_address : str
        Path of the page, e.g. "/de/news820498"; it is appended to
        "https://idw-online.de".
    timeout : float, optional
        Timeout passed on to ``session.get``.

    Returns
    -------
    tuple
        ``(press_web_address, html_content)`` when the server answers with
        HTTP 200, otherwise ``(None, None)``. A 2-tuple is returned on every
        path so the results can always be loaded into a two-column DataFrame.
    """
    url = "https://idw-online.de" + press_web_address

    try:
        async with session.get(url, timeout=timeout) as response:
            if response.status == 200:
                html_content = await response.text()
                return press_web_address, html_content
            # Report non-200 answers instead of silently falling through
            print("Error:", f"HTTP status {response.status} for {url}")

    except Exception as e:
        # Best-effort download: report the problem and carry on with the other pages
        print("Error:", url, repr(e))

    return None, None


async def fetch_data_from_urls(urls):
    """Download all given pages concurrently within a single aiohttp session.

    Returns the ``fetch_url`` results in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        # The tqdm bar advances while the coroutines are created; the downloads
        # themselves run when gather awaits them.
        pending = [fetch_url(session, address) for address in tqdm(urls)]
        return await asyncio.gather(*pending)

In [None]:
async def merge_search_content(web_addresses, chunk_size=100):
    """Download all pages in consecutive batches and collect the results.

    Parameters
    ----------
    web_addresses : list
        Page paths that are handed to ``fetch_data_from_urls``.
    chunk_size : int, optional
        Number of pages requested concurrently per batch (default 100).

    Returns
    -------
    list
        The concatenated batch results, in the order of ``web_addresses``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    contents = []
    for start in range(0, len(web_addresses), chunk_size):
        start_time = datetime.now()
        # Slicing past the end of the list is safe, so the last batch is simply shorter
        batch = web_addresses[start:start + chunk_size]
        contents.extend(await fetch_data_from_urls(batch))
        # Print how long the batch took
        elapsed = datetime.now() - start_time
        print(elapsed)
    return contents


In [None]:
# Download every collected page batch by batch (top-level await relies on IPython/Jupyter's autoawait support)
contents = await merge_search_content(total_web_addresses)

In [None]:
# Create a DataFrame from the retrieved data.
# A failed download comes back as None or (None, None); those entries are dropped here so that
# the DataFrame can be built and the text cleaning further down only ever sees real HTML.
fetched_pages = [item for item in contents if item is not None and item[1] is not None]
idw = pd.DataFrame(fetched_pages, columns=["web_address", "content"])


In [None]:
def clean_text(text):
    """Reduce the raw HTML of an idw page to the plain text of its main paragraph.

    The relevant part starts after the first '<h5 class="subheader">' marker and
    ends at the first occurrence of the text "style" or, if present, at the
    '<div class="widescreenhide"' block. HTML tags, line breaks and URLs are
    removed afterwards.

    Parameters
    ----------
    text : str
        Raw HTML of a page.

    Returns
    -------
    str
        The cleaned text, or an empty string when ``text`` is not a string
        (e.g. a failed download) or does not contain the subheader marker.
        Returning "" instead of raising keeps one unusual page from aborting
        a whole ``apply`` run.
    """
    if not isinstance(text, str):
        return ""

    # select relevant paragraph
    sections = text.split('<h5 class="subheader">')
    if len(sections) < 2:
        return ""
    text = sections[1].split("style")[0]

    split_word = "<div class=\"widescreenhide\""
    split_index = text.find(split_word)
    if split_index != -1:
        text = text[:split_index].strip()

    # remove html tag
    text = remove_html_tags(text)
    # remove line break
    text = text.replace('\n', ' ')
    # remove urls
    text = remove_url(text)

    return text
    

In [None]:
# Replace the raw HTML with the cleaned text; progress_apply (enabled by tqdm.pandas() in the import cell) shows a progress bar.
# NOTE: the column is overwritten in place, so re-running this cell feeds already-cleaned text back into clean_text — run it once per download.
idw["content"]= idw["content"].progress_apply(clean_text)

### Running the following cell takes about 4 hours

In [None]:
# This cell determines which FKZs are mentioned in the retrieved press releases / publications.
# The texts are joined with a line break: clean_text has already replaced every line break by a space,
# so a match can never span the end of one document and the start of the next.
total_contents = "\n".join(idw["content"].dropna().astype(str))

# Start from an empty list so the cell also works on a fresh kernel and can be re-run safely
FKZs_with_pubs = []

# Missing FKZs are skipped, and each distinct FKZ is searched (and recorded) only once
for FKZ in tqdm(BMBF["FKZ"].dropna().unique()):

    if FKZ in total_contents:
        print(f"found publication of FKZ {FKZ}")
        FKZs_with_pubs.append(FKZ)

A closer examination of the FKZs reveals that many of them contain a space. So let's try removing the spaces to see whether we can find press releases/publications for those FKZs.

In [None]:
# Second pass: FKZs that contain spaces are searched again with the spaces removed.
# Missing FKZs are skipped, and an FKZ that is already in the list is not added again,
# so the cell can be re-run without producing duplicates.
for FKZ in tqdm(BMBF["FKZ"].dropna().unique()):
    compact_FKZ = FKZ.replace(" ", "")

    if FKZ != compact_FKZ and compact_FKZ in total_contents and FKZ not in FKZs_with_pubs:
        print(f"found publication of FKZ {FKZ}")
        FKZs_with_pubs.append(FKZ)

In [None]:
# save the list for late use sothat we dont have to run the notebook from beginning
%store -r FKZs_with_pubs

Found 1 more FKZ with a related press release/publication after removing the spaces from the FKZs. Total: 350 FKZs.

In [None]:
# This cell maps the press releases / publications to the related FKZs
BMBF.loc[:,"idw_enrichment"] = ""

# Materialise the texts once; iterating over the values does not depend on the index labels of idw
idw_texts = [text for text in idw["content"] if isinstance(text, str)]

for FKZ in tqdm(FKZs_with_pubs):
    # An FKZ that was only found after removing its spaces appears in the texts in the
    # compact spelling, so both spellings are searched.
    spellings = {FKZ, FKZ.replace(" ", "")}
    matching_texts = [
        text.replace("  ","").replace("\n","")
        for text in idw_texts
        if any(spelling in text for spelling in spellings)
    ]
    # The column was reset to "" above, so assigning the joined texts gives the same result
    # as appending them one by one, and stays correct if an FKZ is listed twice.
    BMBF.loc[BMBF["FKZ"]==FKZ,"idw_enrichment"] = "".join(matching_texts)
                       

In [None]:
# Write the FKZs that have at least one publication, together with their enrichment text, to a CSV file
has_publication = BMBF["FKZ"].isin(FKZs_with_pubs)
BMBF.loc[has_publication, ['FKZ','idw_enrichment']].to_csv('../data/BMBF_idw_enrichments.csv', index=False)

In [None]:
# Read the saved file back so that the preview shows exactly what was written to disk
enrichments = pd.read_csv('../data/BMBF_idw_enrichments.csv')
enrichments.head(5)