# European Central Bank Interview Scraper

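This notebook contains a function named `scrape_ecb_interviews()`, which serves as a pipeline to automate opening the browser, navigating to ecb.europa.eu and collecting the metadata of the published interviews. A second stage then downloads the text of each interview from the collected links.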

In [1]:
#!pip install selenium

In [2]:
import json
import pandas as pd
import numpy as np
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from concurrent.futures import ThreadPoolExecutor
import requests
import os
import warnings

# Suppress UserWarning and FutureWarning messages raised from pandas.
# NOTE(review): SettingWithCopyWarning may not fall under these two categories — confirm.
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")

# Download geckodriver from: https://github.com/mozilla/geckodriver/releases

  from pandas.core import (


### Initialization functions:

In [3]:
def start_up(link, geko_path, profile_path=None, browser=None, wait_seconds=2):
    """
    Set up a Firefox browser (unless one is supplied) and open the selected link.

    Args:
        link (str): The URL to open.
        geko_path (str): Path to the Gecko driver executable. Only used when a
            new browser has to be created.
        profile_path (str): Path to the Firefox profile to be used, if any.
            A falsy value (None or '') means a new FirefoxProfile is used.
        browser: Optional existing webdriver instance to reuse.
        wait_seconds (float): Pause after navigation, giving the page time to
            load. Defaults to 2 seconds.

    Returns:
        browser: The webdriver instance after navigating to `link`.

    Raises:
        Any exception raised by the webdriver during start-up or navigation.
        A browser created by this call is closed before a navigation error
        propagates; a browser supplied by the caller is left open.
    """
    created_here = not browser
    if created_here:
        service = Service(geko_path)
        if profile_path:
            firefox_options = webdriver.FirefoxOptions()
            firefox_options.add_argument(f'--profile={profile_path}')
        else:
            firefox_options = Options()
            firefox_options.profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(service=service, options=firefox_options)

    try:
        browser.get(link)
    except BaseException:
        # Do not leave an orphaned Firefox process behind when navigation
        # fails; only browsers created by this call are ours to close.
        if created_here:
            browser.quit()
        raise
    time.sleep(wait_seconds)
    return browser


### Paths:

In [4]:
# Set the path to the geckodriver executable. The GECKODRIVER_PATH environment
# variable takes precedence, so the notebook can run on another machine without
# editing this cell; the literal below is only the fallback.
geko_path = os.environ.get(
    'GECKODRIVER_PATH',
    'C:/Users/School/Downloads/geckodriver-v0.34.0-win64/geckodriver.exe',
)

# If a Firefox profile is needed, set the path to the profile:
profile_path = ''

### Scrape Metadata:

This first part scrapes the content of the selection panel listing the available interviews and their metadata, namely the date, the media outlet where the interview was published, the ECB member who gave the interview, and the link to that interview.

In [5]:
def scroll_to_end(driver, pause_time=2):
    """
    Keep scrolling to the bottom of the page until its height stops changing.

    Args:
        driver: Webdriver instance whose current page is scrolled.
        pause_time (float): Seconds to wait after each scroll so that new
            content can load before the height is measured again.
    """
    height_query = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_query)
    reached_bottom = False
    while not reached_bottom:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)  # Wait for new content to load
        current_height = driver.execute_script(height_query)
        # An unchanged height means the last scroll loaded nothing new
        reached_bottom = current_height == previous_height
        previous_height = current_height

In [6]:
# Years to scrape, as strings: "2005" through "2024" inclusive
years = [str(year) for year in range(2005, 2025)]

def scrape_ecb_interviews(geko_path, years_to_scrape=None):
    """
    Collect the metadata of the ECB interviews listed for the requested years.

    A separate browser is opened for every year, because the listing page only
    loads a limited number of interviews when all years are selected at once.

    Args:
        geko_path (str): Path to the Gecko driver executable.
        years_to_scrape (iterable): Years to visit. Defaults to the
            module-level `years` list when omitted.

    Returns:
        list[dict]: One dictionary per interview with the keys "Date",
        "Media", "Member" and "Link". "Member" is None when the entry lists
        no author; "Date" is None when no date precedes the entry.
    """
    if years_to_scrape is None:
        years_to_scrape = years

    data = []

    for year in years_to_scrape:
        link_ = f'https://www.ecb.europa.eu/press/pubbydate/html/index.en.html?name_of_publication=Interview&year={year}'

        # There is a maximum number of interviews loaded per page (if all years are selected) -> need to go year by year!
        # The TOPICS on the ECB webpage only go back until 2020
        browser = start_up(link=link_, geko_path=geko_path)
        try:
            # Scroll to the end of the page to load all content
            scroll_to_end(browser)

            interviews_placeholders = browser.find_elements('xpath','/html/body/div[3]/main/div[2]/div[3]/div[2]/div[2]/dl/*')
            print("Number of interviews for year: ", year," -> ",len(interviews_placeholders)/2)

            # Reset per year so a <dd> can never pick up a stale or unbound date
            date = None
            for interview in interviews_placeholders:
                if interview.tag_name == 'dt':
                    # This is the date
                    date = interview.text
                elif interview.tag_name == 'dd':
                    # This contains the link, title, and authors
                    link_element = interview.find_element(By.TAG_NAME, 'a')
                    url = link_element.get_attribute('href')
                    content = link_element.text

                    # An entry without an author block must not abort the whole run
                    try:
                        authors_element = interview.find_element(By.CLASS_NAME, 'authors')
                        authors_list = authors_element.find_elements(By.TAG_NAME, 'li')
                    except NoSuchElementException:
                        authors_list = []
                    # Keep the first author's name only
                    author_name = authors_list[0].text.strip() if authors_list else None

                    # Store the data in a dictionary
                    data.append({
                        "Date": date,
                        "Media": content,
                        "Member": author_name,
                        "Link": url
                    })
        finally:
            # Always close the browser, even when scraping this year failed
            browser.quit()

    return data

# Run the metadata pipeline and collect the scraped rows in a dataframe
data_scrapped = scrape_ecb_interviews(geko_path=geko_path)
df = pd.DataFrame(data=data_scrapped)
print("Shape of the dataframe: ", df.shape)

Number of interviews for year:  2005  ->  18.0
Number of interviews for year:  2006  ->  6.0
Number of interviews for year:  2007  ->  8.0
Number of interviews for year:  2008  ->  9.0
Number of interviews for year:  2009  ->  12.0
Number of interviews for year:  2010  ->  17.0
Number of interviews for year:  2011  ->  20.0
Number of interviews for year:  2012  ->  8.0
Number of interviews for year:  2013  ->  16.0
Number of interviews for year:  2014  ->  33.0
Number of interviews for year:  2015  ->  42.0
Number of interviews for year:  2016  ->  39.0
Number of interviews for year:  2017  ->  29.0
Number of interviews for year:  2018  ->  28.0
Number of interviews for year:  2019  ->  38.0
Number of interviews for year:  2020  ->  74.0
Number of interviews for year:  2021  ->  46.0
Number of interviews for year:  2022  ->  37.0
Number of interviews for year:  2023  ->  39.0
Number of interviews for year:  2024  ->  9.0
Shape of the dataframe:  (528, 4)


In [7]:
# Preview the first rows of the scraped metadata
df.head()

Unnamed: 0,Date,Media,Member,Link
0,23 December 2005,Interview with Der Spiegel,Jean-Claude Trichet,https://www.ecb.europa.eu/press/inter/date/200...
1,19 December 2005,Interview with Hospodářské Noviny,Otmar Issing,https://www.ecb.europa.eu/press/inter/date/200...
2,19 December 2005,Interview with Financial Times and Financial T...,Lucas Papademos,https://www.ecb.europa.eu/press/inter/date/200...
3,15 December 2005,Interview with Paris Match,Jean-Claude Trichet,https://www.ecb.europa.eu/press/inter/date/200...
4,9 December 2005,Interview in Il Giornale,Lorenzo Bini Smaghi,https://www.ecb.europa.eu/press/inter/date/200...


In [8]:
# Check for NA values (count of missing entries per column)
df.isna().sum()

Date      0
Media     0
Member    0
Link      0
dtype: int64

In [9]:
# Save the metadata to 'data_metadata.csv', without writing the index column
df.to_csv('data_metadata.csv', index=False)

### Scrape the interviews

Using the links scraped in the previous step, we will scrape the content of each interview, making sure to separate the reporter's questions from the ECB member's answers.

In [10]:
# Load the metadata from 'data_metadata.csv'
df = pd.read_csv('data_metadata.csv')
print("Shape of the dataframe: ",df.shape)
# Preview the first rows of the loaded metadata
df.head()

Shape of the dataframe:  (528, 4)


Unnamed: 0,Date,Media,Member,Link
0,23 December 2005,Interview with Der Spiegel,Jean-Claude Trichet,https://www.ecb.europa.eu/press/inter/date/200...
1,19 December 2005,Interview with Hospodářské Noviny,Otmar Issing,https://www.ecb.europa.eu/press/inter/date/200...
2,19 December 2005,Interview with Financial Times and Financial T...,Lucas Papademos,https://www.ecb.europa.eu/press/inter/date/200...
3,15 December 2005,Interview with Paris Match,Jean-Claude Trichet,https://www.ecb.europa.eu/press/inter/date/200...
4,9 December 2005,Interview in Il Giornale,Lorenzo Bini Smaghi,https://www.ecb.europa.eu/press/inter/date/200...


In [11]:
# HTTP headers sent with every interview request; the User-Agent is a desktop browser string
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# Function to scrape the descriptions using Beautiful Soup
def scrape_interviews(url):
    """
    Download one interview page and split it into questions and answers.

    Args:
        url (str): Link to the interview page.

    Returns:
        tuple or None: (information, questions, answers), all strings, or
        None when the request fails. `information` is the text of the
        <h2 class="ecb-pressContentSubtitle"> element, or
        "Information not found". Questions are joined with "[end_question] "
        and answers with "[end_answer] ".
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        time.sleep(0.5)
    except requests.exceptions.RequestException as e:
        print(f"Error processing {url}: {e}")
        return None

    page = BeautifulSoup(response.text, 'html.parser')

    # The subtitle <h2> carries the "Information" line of the interview
    subtitle = page.find('h2', class_='ecb-pressContentSubtitle')
    information_text = subtitle.get_text(strip=True) if subtitle else "Information not found"

    # Ignore the first and last five <p> tags, they contain copyrights elements;
    # pages with ten paragraphs or fewer yield no content at all
    all_paragraphs = page.find_all('p')
    body_paragraphs = all_paragraphs[5:-5] if len(all_paragraphs) > 10 else []

    questions, answers = [], []
    for body_paragraph in body_paragraphs:
        bold_part = body_paragraph.find('strong')
        if bold_part:
            # A paragraph containing <strong> is a question: keep the bold text
            questions.append(bold_part.get_text(strip=True))
        else:
            # Any other paragraph is treated as part of an answer
            answers.append(body_paragraph.get_text(strip=True))

    joined_questions = "[end_question] ".join(questions)
    joined_answers = "[end_answer] ".join(answers)
    return information_text, joined_questions, joined_answers


In [12]:
# Number of concurrent threads used to download the interviews
# (adjust this based on the processing power of your computer)
num_threads = 16

information_data, question_data, answer_data = [], [], []

# Apply scrape_interviews to every link in parallel; executor.map yields the
# results in the order of df['Link'], so the three lists stay aligned with the rows
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    for scraped in executor.map(scrape_interviews, df['Link']):
        if scraped is None:
            # The download failed: store placeholders for this row
            scraped = ("Information not found", "Questions not found", "Answers not found")
        information, questions, answers = scraped
        information_data.append(information)
        question_data.append(questions)
        answer_data.append(answers)

# Attach the results to the dataframe as new columns
df['Information'] = information_data
df['Questions'] = question_data
df['Answers'] = answer_data

# Print count after all threads have completed
print(f"Scraped {len(df)} links")
print("Shape of the dataframe: ",df.shape)
df.head()

Error processing https://www.ecb.europa.eu/press/inter/date/2013/html/sp130916.en.html: 404 Client Error: Not Found for url: https://www.ecb.europa.eu/press/inter/date/2013/html/sp130916.en.html
Error processing https://www.ecb.europa.eu/press/inter/date/2014/html/sp141216.en.html: 404 Client Error: Not Found for url: https://www.ecb.europa.eu/press/inter/date/2014/html/sp141216.en.html
Error processing https://www.ecb.europa.eu/press/inter/date/2014/html/sp140629.en.html: 404 Client Error: Not Found for url: https://www.ecb.europa.eu/press/inter/date/2014/html/sp140629.en.html
Error processing https://www.ecb.europa.eu/press/inter/date/2015/html/sp150110_1.en.html: 404 Client Error: Not Found for url: https://www.ecb.europa.eu/press/inter/date/2015/html/sp150110_1.en.html
Error processing https://www.ecb.europa.eu/press/inter/date/2016/html/sp160505.en.html: 404 Client Error: Not Found for url: https://www.ecb.europa.eu/press/inter/date/2016/html/sp160505.en.html
Error processing http

Unnamed: 0,Date,Media,Member,Link,Information,Questions,Answers
0,23 December 2005,Interview with Der Spiegel,Jean-Claude Trichet,https://www.ecb.europa.eu/press/inter/date/200...,Information not found,"SPIEGEL: Monsieur Trichet, any concrete uttera...",The publication of the translation was authori...
1,19 December 2005,Interview with Hospodářské Noviny,Otmar Issing,https://www.ecb.europa.eu/press/inter/date/200...,Information not found,The new EU member states want to adopt the eur...,These questions are all closely related to eac...
2,19 December 2005,Interview with Financial Times and Financial T...,Lucas Papademos,https://www.ecb.europa.eu/press/inter/date/200...,Information not found,"Mr Papademos, you have responsibility as ECB V...",A comparison of the risks involved when short-...
3,15 December 2005,Interview with Paris Match,Jean-Claude Trichet,https://www.ecb.europa.eu/press/inter/date/200...,Information not found,Paris Match. After two uneventful years at the...,The publication of the translation was authori...
4,9 December 2005,Interview in Il Giornale,Lorenzo Bini Smaghi,https://www.ecb.europa.eu/press/inter/date/200...,Information not found,"However, Europe’s politicians, with few except...","By Angelo Allegri, our correspondent in Frankf..."


In [13]:
# Check for NA values (count of missing entries per column)
df.isna().sum()

Date           0
Media          0
Member         0
Link           0
Information    0
Questions      0
Answers        0
dtype: int64

In [14]:
# Keep only the rows whose "Answers" column does not hold the
# "Answers not found" placeholder
has_answers = df['Answers'].ne("Answers not found")
df = df.loc[has_answers]
print("Shape of the dataframe: ", df.shape)

Shape of the dataframe:  (519, 7)


In [15]:
# Save the complete dataframe to 'data_complete.csv', without writing the index column
df.to_csv('data_complete.csv', index=False)