In [1]:
###############################################################################
##################### CODE FOR THE BCCP WEB SCRAPING COURSE ###################
############################## JUNE 24 TO 26, 2019 ############################
############################ SECTION ON HTML PARSING ##########################
###############################################################################

### Where to save file?
# Target path of the CSV that the last cell writes with df.to_csv().
# NOTE(review): this is an absolute, user-specific path; adjust it before
# running the notebook on another machine.
savefile = \
   "C:/Users/kevin/Documents/GitHub/web_scraping_course/results/bccp_events.csv"

###############################################################################
############################## LOAD NEEDED MODULES ############################
###############################################################################

# Show everything in Jupyter notebooks (not just last result)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# requests to load URLs
import requests
# BeautifulSoup to turn source code into navigable Python object
from bs4 import BeautifulSoup
# Pandas to convert to DataFrame
import pandas as pd


In [2]:
# 1. Load page with list of events
# 2. Find individual events in source code
# 3. Loop through events and save details available
# 4. Turn to DataFrame
# 5. Loop through events and load detailed event page 
    # to save additional information

In [3]:
###############################################################################
####################### 1. LOAD PAGE WITH FUTURE EVENTS #######################
###############################################################################

# URL to BCCP events page
url = "http://www.bccp-berlin.de/events/all-events"
# Load URL
# Note: requests does not apply a timeout by default, so a server that
# never answers would block this cell forever. Give up after 30 seconds.
r = requests.get(url, timeout = 30)


In [4]:
# Can check if successful (Code 200 means it worked without errors)
# Evaluating the response object displays it together with its status code
r

<Response [200]>

In [5]:
# Get source code
# (.text is the body of the response as a Python string)
srccode = r.text
# This is now a string containing the entire source code:
srccode

'<!DOCTYPE html>\n<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]-->\n<!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]-->\n<!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]-->\n<!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]-->\n<!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]-->\n<!--[if !IE]><!--> <html class="no-js" lang="de"><!--<![endif]-->\n<head>\n\n<meta charset="utf-8" />\n<!-- \n\tCPS-IT GmbH http://www.cps-it.de/\n\n\tThis website is powered by TYPO3 - inspiring people to share!\n\tTYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.\n\tTYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.\n\tInformation and contribution at http://typo3.org/\n-->\n\n<base href="http://www.bccp-berlin.de/" />\n<link rel="shortcut icon" href="/Tem

In [6]:
# Use BeautifulSoup 4 to turn into soup object that allows navigation
# The second argument names the parser that reads the HTML (here: lxml)
soup = BeautifulSoup(srccode, "lxml")
# Look at the soup object:
soup

<!DOCTYPE html>
<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]--><!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]--><!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]--><!--[if !IE]><!--><html class="no-js" lang="de"><!--<![endif]-->
<head>
<meta charset="utf-8"/>
<!-- 
	CPS-IT GmbH http://www.cps-it.de/

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
<base href="http://www.bccp-berlin.de/"/>
<link href="/Templates/Master/Resources/Public/Images/favicon.ico" rel="

In [7]:
# This object now has several useful properties and functions that we can use.
# The documentation is also very good: 
# (the link is printed so that it shows up in the cell output)
print("https://www.crummy.com/software/BeautifulSoup/bs4/doc/")

https://www.crummy.com/software/BeautifulSoup/bs4/doc/


In [8]:
###############################################################################
######################### 2. FIND INDIVIDUAL LISTINGS #########################
###############################################################################

# Search soup for all "div" tags whose "class" attribute 
# contains "event-list-item"
divs = soup.find_all("div", class_ = "event-list-item")
# Note: normally, the attribute can just be used as a keyword argument,
# e.g. if it were "id": soup.find_all("div", id = "event-list-item")
# but because "class" coincides with the reserved Python keyword `class`,
# you have to use "class_" for class attributes
# Alternatively: 
# divs = soup.find_all("div", attrs = {"class": "event-list-item"})

# This returns a list containing all div elements that match this structure
# Check the number of events:
len(divs)

25

In [9]:
# Take a look at the first element in the list
# (the following cells extract the details of this one event step by step)
div = divs[0]
div

<div class="event-list-item event-type1">
<div class="top-bar">
<span class="date single">
						June 17, 2019
					</span>
<span class="b-events__item__type">Seminar</span>
</div>
<div class="b-events__item__inner">
<div class="content">
<div class="genres">
						
							Berlin Applied Micro Seminar
						
					</div>
<h2 class="eventHeader">
<a href="/events/all-events/events-detail/tba-27/">
					Sandra McNally (University of Surrey and LSE)
				</a>
</h2>
<div class="teaser">Closing the Gap Between Vocational and General Education? - Evidence from University Technical Colleges in England</div>
<div class="location">
<strong class="label">Location</strong>
<div class="address">
<span class="name">Humboldt-Universität zu Berlin</span>
<span class="address">Spandauer Str. 1, Room 22</span>
<span class="zip">10178</span>
<span class="place">Berlin</span>
</div>
</div>
<div class="time">
<strong class="label">Time</strong>
<span>16:00–17:15</span>
</div>
</div>
<div class="button detai

In [10]:
# Collect the details of this event in a dictionary
divdict = {}
# Date:
# .find() hands back only the first tag that matches the search
date = div.find("span", class_ = "date")
# When it is not certain that exactly one tag matches, the more careful
# route is to collect every match with .find_all() ...
date_tags = div.find_all("span", class_ = "date")
n_date_tags = len(date_tags)
# ... and to verify that there is exactly one of them. Any other count
# raises an AssertionError and stops the cell.
assert n_date_tags == 1, \
    "%d results for //span[@class='date']" % n_date_tags
# .find_all() yields a list, so pick its single element and clean the text
date = date_tags[0].text.strip()

# Store in the dictionary
divdict["date"] = date

# Show the extracted date:
date

'June 17, 2019'

In [11]:
# Event type: locate the tag first, then clean up its text
evtype_tag = div.find("span", class_ = "b-events__item__type")
evtype = evtype_tag.text.strip()
divdict["event_type"] = evtype
evtype

'Seminar'

In [12]:
# Seminar series: text of the "genres" div without surrounding whitespace
series_tag = div.find("div", class_ = "genres")
series = series_tag.text.strip()
divdict["event_series"] = series
series

'Berlin Applied Micro Seminar'

In [13]:
# Save URL and title
# Both are taken from the <h2 class="eventHeader"> tag of the event
header = div.find("h2", class_ = "eventHeader")
header

<h2 class="eventHeader">
<a href="/events/all-events/events-detail/tba-27/">
					Sandra McNally (University of Surrey and LSE)
				</a>
</h2>

In [14]:
# Get URL
# The link target is not part of the tag content; it is stored as the
# value of the "href" attribute of the <a> tag inside the header
link_tag = header.find("a")
url = link_tag["href"]
url
# Prepend the base URL to turn the relative path into a full address
url = "".join(["http://www.bccp-berlin.de", url])
url

'/events/all-events/events-detail/tba-27/'

'http://www.bccp-berlin.de/events/all-events/events-detail/tba-27/'

In [15]:
# Title: text of the header without surrounding whitespace
title = header.get_text().strip()
title
# Store URL and title together (URL first, then title)
divdict.update({"url": url, "title": title})

'Sandra McNally (University of Surrey and LSE)'

In [16]:
# Topic: text of the "teaser" div without surrounding whitespace
topic_tag = div.find("div", class_ = "teaser")
topic = topic_tag.text.strip()
topic
divdict["topic"] = topic

'Closing the Gap Between Vocational and General Education? - Evidence from University Technical Colleges in England'

In [17]:
# Get address
# The <div class="address"> tag holds one <span> per address component
addtag = div.find("div", class_ = "address")
addtag

<div class="address">
<span class="name">Humboldt-Universität zu Berlin</span>
<span class="address">Spandauer Str. 1, Room 22</span>
<span class="zip">10178</span>
<span class="place">Berlin</span>
</div>

In [18]:
# Store every <span> below the address tag: the key is the first class of
# the span with a "loc_" prefix, the value is the raw text of the span
divdict.update(
    ("loc_" + span["class"][0], span.text)
    for span in addtag.find_all("span")
)
divdict
# Note: Deriving the variable names from the page itself automates their
# creation and keeps the code flexible (e.g. when events carry different
# address elements)
# The price of this flexibility is a higher chance that errors go
# unnoticed (e.g. if a listing was not loaded correctly, no variable is
# created and the code simply carries on)

{'date': 'June 17, 2019',
 'event_type': 'Seminar',
 'event_series': 'Berlin Applied Micro Seminar',
 'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/tba-27/',
 'title': 'Sandra McNally (University of Surrey and LSE)',
 'topic': 'Closing the Gap Between Vocational and General Education? - Evidence from University Technical Colleges in England',
 'loc_name': 'Humboldt-Universität zu Berlin',
 'loc_address': 'Spandauer Str. 1, Room 22',
 'loc_zip': '10178',
 'loc_place': 'Berlin'}

In [19]:
# Time: first <span> inside the "time" div (the <strong> label is skipped)
time_tag = div.find("div", class_ = "time")
time = time_tag.find("span").text.strip()
time
divdict["time"] = time

'16:00–17:15'

In [20]:
# Look at divdict
# (it now holds every detail collected for the first event)
divdict

{'date': 'June 17, 2019',
 'event_type': 'Seminar',
 'event_series': 'Berlin Applied Micro Seminar',
 'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/tba-27/',
 'title': 'Sandra McNally (University of Surrey and LSE)',
 'topic': 'Closing the Gap Between Vocational and General Education? - Evidence from University Technical Colleges in England',
 'loc_name': 'Humboldt-Universität zu Berlin',
 'loc_address': 'Spandauer Str. 1, Room 22',
 'loc_zip': '10178',
 'loc_place': 'Berlin',
 'time': '16:00–17:15'}

In [21]:
###############################################################################
########################### 3. LOOP THROUGH LISTINGS ##########################
###############################################################################

# Now put this inside a loop to loop through all events
# Loop through events and save details in dictionary
# resdict maps the position of each event in divs (0, 1, 2, ...) to the
# dictionary holding the details of that event
resdict = {}
for idx, div in enumerate(divs):

    # Get details and save in dict
    divdict = {}

    # Date:
    date = div.find_all("span", class_ = "date")
    # Make sure the result is unique
    # Note: the message names the search that was actually run (class
    # "date"); the date tag can carry further classes such as "single"
    assert len(date) == 1, "%d results for //span[@class='date']" \
        % len(date)
    # If there is not exactly one result, this will raise an Exception.
    # If it is a unique result, the code continues:
    # Take the result (remember the .find_all() yields a list of result)
    date = date[0].text.strip()
    # Save in dict
    divdict["date"] = date

    # Save event type
    evtype = div.find("span", class_ = "b-events__item__type").text.strip()
    divdict["event_type"] = evtype

    # Save seminar series
    series = div.find("div", class_ = "genres").text.strip()
    divdict["event_series"] = series

    # Save URL and title
    header = div.find("h2", class_ = "eventHeader")
    # Get URL
    # Note that the URL is not saved as content but as the value of the "href" attribute
    url = header.find("a")["href"]
    # Add base URL
    url = "http://www.bccp-berlin.de" + url

    # Get title
    title = header.text.strip()

    # Save both
    divdict["url"] = url
    divdict["title"] = title

    # Save topic
    topic = div.find("div", class_ = "teaser").text.strip()
    divdict["topic"] = topic

    # Get address
    addtag = div.find("div", class_ = "address")

    # Loop through sub "span" elements and save
    for span in addtag.find_all("span"):
        # Take content
        val = span.text
        # Use class as variable name (add prefix)
        varname = "loc_" + span["class"][0]
        divdict[varname] = val

    # Get time
    time = div.find("div", class_ = "time").find("span").text.strip()
    divdict["time"] = time

    # Save as new entry in resdict (key = position of the event in divs)
    resdict[idx] = divdict

# Look at resdict
resdict

{0: {'date': 'June 17, 2019',
  'event_type': 'Seminar',
  'event_series': 'Berlin Applied Micro Seminar',
  'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/tba-27/',
  'title': 'Sandra McNally (University of Surrey and LSE)',
  'topic': 'Closing the Gap Between Vocational and General Education? - Evidence from University Technical Colleges in England',
  'loc_name': 'Humboldt-Universität zu Berlin',
  'loc_address': 'Spandauer Str. 1, Room 22',
  'loc_zip': '10178',
  'loc_place': 'Berlin',
  'time': '16:00–17:15'},
 1: {'date': 'June 17, 2019',
  'event_type': 'Seminar',
  'event_series': 'Berlin Micro Theory Seminar',
  'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/ricardo-alonso-london-school-of-economics-1/',
  'title': 'Ricardo Alonso (London School of Economics)',
  'topic': 'Tampering with Information',
  'loc_name': 'WZB',
  'loc_address': 'Reichpietschufer 50, Room B001',
  'loc_zip': '10785',
  'loc_place': 'Berlin',
  'time': '17:15–18:

In [22]:
###############################################################################
############################# 4. TURN TO DATAFRAME ############################
###############################################################################

# Build the DataFrame: each entry of resdict becomes a column at first,
# so transpose to end up with one row per event
df = pd.DataFrame(resdict).transpose()
# Display the result
df

Unnamed: 0,date,event_series,event_type,loc_address,loc_name,loc_place,loc_zip,time,title,topic,url
0,"June 17, 2019",Berlin Applied Micro Seminar,Seminar,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,16:00–17:15,Sandra McNally (University of Surrey and LSE),Closing the Gap Between Vocational and General...,http://www.bccp-berlin.de/events/all-events/ev...
1,"June 17, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Ricardo Alonso (London School of Economics),Tampering with Information,http://www.bccp-berlin.de/events/all-events/ev...
2,"June 20, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B 002/003",WZB,Berlin,10785.0,16:45–18:00,Martin Sefton (University of Nottingham),Communication with partially verifiable inform...,http://www.bccp-berlin.de/events/all-events/ev...
3,"June 21, 2019",BCCP Conference,Conference & Events,"Reichpietschufer 50, Room A300",WZB,Berlin,10785.0,09:50–16:30,BCCP Conference and Policy Forum 2019,Regulatory Challenges in Digital Markets: the ...,http://www.bccp-berlin.de/events/all-events/ev...
4,"June 24, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Matthias Lang (LMU Munich),Bilateral Trade with Justification,http://www.bccp-berlin.de/events/all-events/ev...
5,"June 24, 2019 - June 26, 2019",Other events,Conference & Events,"Mohrenstr. 58, Room Anna J. Schwartz",DIW,Berlin,10117.0,09:30–12:30,DIW Graduate Center/BCCP Short Course on Web S...,,http://www.bccp-berlin.de/events/all-events/ev...
6,"June 27, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,16:45–18:00,Felix Holzmeister (University of Innsbruck),Delegated decision making in finance,http://www.bccp-berlin.de/events/all-events/ev...
7,"July 01, 2019",Berlin Applied Micro Seminar,Seminar,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,16:00–17:15,Bettina Siflinger (Tilburg University),TBA,http://www.bccp-berlin.de/events/all-events/ev...
8,"July 01, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Ariel Rubinstein (Tel Aviv University),TBA,http://www.bccp-berlin.de/events/all-events/ev...
9,"July 08, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Antonio Rosato (UT Sydney),TBA,http://www.bccp-berlin.de/events/all-events/ev...


In [23]:
###############################################################################
######################### 5. LOAD EVENT DETAIL PAGES ##########################
###############################################################################

# Load url and turn to soup
# Take the detail page URL of the first event in df
url = df["url"].values[0]

# Note: requests does not apply a timeout by default, so a server that
# never answers would block this cell forever. Give up after 30 seconds.
r = requests.get(url, timeout = 30)
soup = BeautifulSoup(r.text, "lxml")

In [24]:
# Look at the soup object of the event detail page
soup

<!DOCTYPE html>
<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]--><!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]--><!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]--><!--[if !IE]><!--><html class="no-js" lang="de"><!--<![endif]-->
<head>
<meta charset="utf-8"/>
<!-- 
	CPS-IT GmbH http://www.cps-it.de/

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
<base href="http://www.bccp-berlin.de/"/>
<link href="/Templates/Master/Resources/Public/Images/favicon.ico" rel="

In [25]:
# Take contents of infobox
# Make it flexible, so we capture various types of fields
infobox = soup.find("div", class_ = "info-box")

# Save in dictionary
event_dict = {}
for child in infobox.children:
    ##### Skip text nodes
    ### Note: The children of a tag are either tags or pieces of text
    # (mostly the whitespace between two tags). Text nodes are subclasses
    # of str, so isinstance() identifies them explicitly.
    ### Further note: Calling child.strip() and catching the TypeError
    # would also separate the two, but only as a side effect: bs4 treats
    # an unknown attribute of a tag as a search for a sub-tag of that name
    # and returns None, and calling None raises the TypeError.
    if isinstance(child, str):
        # continue loop with next iteration
        continue

    # From here on, child is a tag. Use its first class to identify it.
    # If the tag has no "class" attribute (child["class"] would raise a
    # KeyError), fall back to the name of the tag.
    child_class = (child.get("class") or [child.name])[0]

    ### Special case for the "location" element: Take the children of it
    if child_class == "location":
        # Take all span elements with address info
        spans = child.find("div", class_ = "address").find_all("span")
        # Loop and save
        for span in spans:
            # Create variable name using class name
            varname = "address_" + span["class"][0]
            # Get value
            # Note: This removes multiple whitespaces and replaces them
            # by a single space
            value = " ".join(span.text.strip().split())
            # Save
            event_dict[varname] = value
    # If the class is called "label", skip the element
    elif child_class == "label":
        continue
    # Else, take the class as variable name and take the contents
    # as values
    else:
        varname = child_class
        value = " ".join(child.text.strip().split())
        event_dict[varname] = value
# Look at the event_dict
event_dict

{'date': 'June 17, 2019',
 'time': 'Time 16:00–17:15',
 'address_name': 'Humboldt-Universität zu Berlin',
 'address_address': 'Spandauer Str. 1, Room 22',
 'address_zip': '10178',
 'address_place': 'Berlin'}

In [26]:
# Next, take contents of main page
cont = soup.find("div", class_ = "content") \
    .find("div", class_ = "description")
# Loop through children
for child in cont.children:
    ##### Skip text nodes
    ### Note: The children of a tag are either tags or pieces of text
    # (mostly the whitespace between two tags). Text nodes are subclasses
    # of str, so isinstance() identifies them explicitly.
    ### Further note: Calling child.strip() and catching the TypeError
    # would also separate the two, but only as a side effect: bs4 treats
    # an unknown attribute of a tag as a search for a sub-tag of that name
    # and returns None, and calling None raises the TypeError.
    if isinstance(child, str):
        # continue loop with next iteration
        continue

    # From here on, child is a tag. Use its first class to identify it.
    # If the tag has no "class" attribute (child["class"] would raise a
    # KeyError), fall back to the name of the tag.
    child_class = (child.get("class") or [child.name])[0]

    ### Skip elements that only hold a label
    if child_class == "label":
        continue

    # Else, take the class as variable name and take the contents
    # as values
    varname = child_class
    value = " ".join(child.text.strip().split())
    event_dict[varname] = value
    # Check if the element contains links, if so save
    links = child.find_all("a")
    # Loop through links and save
    for link in links:
        # Take content in lowercase and with underscore as variable name
        varname = "_".join(link.text.strip().lower().split())
        # URL of the link
        # Note: kept in its own variable so that `url` (the address of the
        # event page that was loaded) is not overwritten
        linkurl = link["href"]
        # Save
        event_dict[varname] = linkurl

In [27]:

# Look at the event_dict
# (infobox fields plus the fields and links of the description)
event_dict

{'date': 'June 17, 2019',
 'time': 'Time 16:00–17:15',
 'address_name': 'Humboldt-Universität zu Berlin',
 'address_address': 'Spandauer Str. 1, Room 22',
 'address_zip': '10178',
 'address_place': 'Berlin',
 'headline--desktop': 'Sandra McNally (University of Surrey and LSE)',
 'teaser': 'Topic:Closing the Gap Between Vocational and General Education? - Evidence from University Technical Colleges in England',
 'description__bodytext': 'Go to speaker website.',
 'speaker_website': 'https://www.surrey.ac.uk/people/sandra-mcnally'}

In [28]:
### Now do this for all URLs

def _normalize_ws(text):
    """Strip ``text`` and replace every run of whitespace by one space."""
    return " ".join(text.strip().split())


def _first_class(tag):
    """Return the first class of ``tag``.

    Falls back to the name of the tag if it has no "class" attribute
    (``tag["class"]`` would raise a KeyError in that case).
    """
    return (tag.get("class") or [tag.name])[0]


def _parse_infobox(soup):
    """Return the fields of the info box of an event detail page as a dict.

    Each child tag of ``<div class="info-box">`` is stored under its first
    class name. The "location" element is expanded into one "address_*"
    entry per ``<span>`` of its address block; "label" elements are skipped.
    Raises AttributeError if the page has no info box.
    """
    fields = {}
    infobox = soup.find("div", class_ = "info-box")
    for child in infobox.children:
        # Children are either tags or pieces of text (mostly the whitespace
        # between two tags). Text nodes are subclasses of str: skip them.
        if isinstance(child, str):
            continue
        child_class = _first_class(child)
        ### Special case for the "location" element: Take the children of it
        if child_class == "location":
            # Take all span elements with address info
            spans = child.find("div", class_ = "address").find_all("span")
            for span in spans:
                # Create variable name using class name
                varname = "address_" + span["class"][0]
                fields[varname] = _normalize_ws(span.text)
        # Skip elements that only hold a label
        elif child_class == "label":
            continue
        # Else, take the class as variable name and the contents as value
        else:
            fields[child_class] = _normalize_ws(child.text)
    return fields


def _parse_description(soup):
    """Return the fields of the description of an event detail page.

    Each child tag of the "description" div inside the "content" div is
    stored under its first class name ("label" elements are skipped).
    Every link inside such a child is stored in addition, under "link_"
    plus the lowercased link text joined by underscores.
    Raises AttributeError if the page has no such div.
    """
    fields = {}
    cont = soup.find("div", class_ = "content") \
        .find("div", class_ = "description")
    for child in cont.children:
        # Skip text nodes (see _parse_infobox)
        if isinstance(child, str):
            continue
        child_class = _first_class(child)
        # Skip elements that only hold a label
        if child_class == "label":
            continue
        fields[child_class] = _normalize_ws(child.text)
        # Check if the element contains links, if so save
        for link in child.find_all("a"):
            # Take content in lowercase and with underscore as variable name
            varname = "link_" + "_".join(link.text.strip().lower().split())
            fields[varname] = link["href"]
    return fields


# Save in large dictionary
resdict = {}
for url in df["url"].values:

    # Message to let us know where we are
    print("Loading %s" % url)

    # Note: requests does not apply a timeout by default, so a server that
    # never answers would block the loop forever. Give up after 30 seconds.
    r = requests.get(url, timeout = 30)
    # Stop with a clear error instead of parsing an error page
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    # Take contents of infobox first, then contents of main page
    event_dict = {}
    event_dict.update(_parse_infobox(soup))
    event_dict.update(_parse_description(soup))

    # Save in resdict and use url as index (we use this later to merge to
    # previously created df)
    resdict[url] = event_dict
# Create DataFrame
df_details = pd.DataFrame(resdict).T
# Look at it
df_details

Loading http://www.bccp-berlin.de/events/all-events/events-detail/tba-27/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/ricardo-alonso-london-school-of-economics-1/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/martin-sefton-university-of-nottingham/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/bccp-conference-and-policy-forum-2019/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/matthias-lang-lmu-munich-1/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/felix-holzmeister-university-of-innsbruck/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/bettina-siflinger-tilburg-university/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/ariel-rubinstein-tel-aviv-university-2/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/antonio-rosato-ut-sydney-1/
Lo

Unnamed: 0,address_address,address_name,address_place,address_zip,b-events__multi-performance,b-events__multi-performance__label,date,description__bodytext,headline--desktop,link_amelia_fletcher,...,link_marit_hansen,link_matthew_gentzkow,link_paul_nemitz,link_speaker_website,link_stefan_hunt,link_tomaso_duso,link_twitter_feed:_#bccpconf,performance__details,teaser,time
http://www.bccp-berlin.de/events/all-events/events-detail/tba-27/,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,,,"June 17, 2019",Go to speaker website.,Sandra McNally (University of Surrey and LSE),,...,,,,https://www.surrey.ac.uk/people/sandra-mcnally,,,,,Topic:Closing the Gap Between Vocational and G...,Time 16:00–17:15
http://www.bccp-berlin.de/events/all-events/events-detail/ricardo-alonso-london-school-of-economics-1/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"June 17, 2019",Go to speaker website.,Ricardo Alonso (London School of Economics),,...,,,,https://sites.google.com/site/ricardoalonsoweb...,,,,,Topic:Tampering with Information,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/martin-sefton-university-of-nottingham/,"Reichpietschufer 50, Room B 002/003",WZB,Berlin,10785.0,,,"June 20, 2019",Go to speaker website.,Martin Sefton (University of Nottingham),,...,,,,https://www.nottingham.ac.uk/economics/people/...,,,,,Topic:Communication with partially verifiable ...,Time 16:45–18:00
http://www.bccp-berlin.de/events/all-events/events-detail/bccp-conference-and-policy-forum-2019/,"Reichpietschufer 50, Room A300",WZB,Berlin,10785.0,,,"June 21, 2019",The tremendous growth of digital transactions ...,BCCP Conference and Policy Forum 2019,https://people.uea.ac.uk/amelia_fletcher,...,https://www.hansen-kronshagen.de/marit/,https://gentzkow.people.stanford.edu/,https://www.coleurope.eu/whoswho/person/paul.n...,,https://competitionandmarkets.blog.gov.uk/auth...,https://sites.google.com/site/tomasoduso/home,https://twitter.com/hashtag/bccpconf?f=tweets&...,,Topic:Regulatory Challenges in Digital Markets...,Time 09:50–16:30
http://www.bccp-berlin.de/events/all-events/events-detail/matthias-lang-lmu-munich-1/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"June 24, 2019",Go to speaker website.,Matthias Lang (LMU Munich),,...,,,,https://lang.userweb.mwn.de/,,,,,Topic:Bilateral Trade with Justification,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/,,,,,"Dates June 24, 2019 - June 26, 2019",Date Details,,This short course by BCCP Doctoral Students Ju...,DIW Graduate Center/BCCP Short Course on Web S...,,...,,,,,,,,"June 24, 2019 Open Details Time 09:30–12:30 Lo...",,
http://www.bccp-berlin.de/events/all-events/events-detail/felix-holzmeister-university-of-innsbruck/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"June 27, 2019",Go to speaker website.,Felix Holzmeister (University of Innsbruck),,...,,,,https://www.holzmeister.biz/,,,,,Topic:Delegated decision making in finance,Time 16:45–18:00
http://www.bccp-berlin.de/events/all-events/events-detail/bettina-siflinger-tilburg-university/,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,,,"July 01, 2019",Go to speaker website.,Bettina Siflinger (Tilburg University),,...,,,,https://sites.google.com/site/bettinasiflinger/,,,,,Topic:TBA,Time 16:00–17:15
http://www.bccp-berlin.de/events/all-events/events-detail/ariel-rubinstein-tel-aviv-university-2/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"July 01, 2019",Go to speaker website.,Ariel Rubinstein (Tel Aviv University),,...,,,,https://en-social-sciences.tau.ac.il/profile/r...,,,,,Topic:TBA,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/antonio-rosato-ut-sydney-1/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"July 08, 2019",Go to speaker website.,Antonio Rosato (UT Sydney),,...,,,,https://sites.google.com/site/rosatoeconomics/...,,,,,Topic:TBA,Time 17:15–18:30


In [29]:
### Merge back to df
# The key is the "url" column on the df side and the index on the
# df_details side; every event has to match at most one detail row
# ("1:1"), and all rows of df are kept ("left")
# Note: The suffixes option leaves the names of df untouched and appends
# "_details" to every variable of df_details whose name already exists
# in df (e.g. date)
df = df.merge(
    df_details,
    how = "left",
    left_on = "url",
    right_index = True,
    suffixes = ("", "_details"),
    validate = "1:1",
)

# Look at it
df

Unnamed: 0,date,event_series,event_type,loc_address,loc_name,loc_place,loc_zip,time,title,topic,...,link_marit_hansen,link_matthew_gentzkow,link_paul_nemitz,link_speaker_website,link_stefan_hunt,link_tomaso_duso,link_twitter_feed:_#bccpconf,performance__details,teaser,time_details
0,"June 17, 2019",Berlin Applied Micro Seminar,Seminar,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,16:00–17:15,Sandra McNally (University of Surrey and LSE),Closing the Gap Between Vocational and General...,...,,,,https://www.surrey.ac.uk/people/sandra-mcnally,,,,,Topic:Closing the Gap Between Vocational and G...,Time 16:00–17:15
1,"June 17, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Ricardo Alonso (London School of Economics),Tampering with Information,...,,,,https://sites.google.com/site/ricardoalonsoweb...,,,,,Topic:Tampering with Information,Time 17:15–18:30
2,"June 20, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B 002/003",WZB,Berlin,10785.0,16:45–18:00,Martin Sefton (University of Nottingham),Communication with partially verifiable inform...,...,,,,https://www.nottingham.ac.uk/economics/people/...,,,,,Topic:Communication with partially verifiable ...,Time 16:45–18:00
3,"June 21, 2019",BCCP Conference,Conference & Events,"Reichpietschufer 50, Room A300",WZB,Berlin,10785.0,09:50–16:30,BCCP Conference and Policy Forum 2019,Regulatory Challenges in Digital Markets: the ...,...,https://www.hansen-kronshagen.de/marit/,https://gentzkow.people.stanford.edu/,https://www.coleurope.eu/whoswho/person/paul.n...,,https://competitionandmarkets.blog.gov.uk/auth...,https://sites.google.com/site/tomasoduso/home,https://twitter.com/hashtag/bccpconf?f=tweets&...,,Topic:Regulatory Challenges in Digital Markets...,Time 09:50–16:30
4,"June 24, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Matthias Lang (LMU Munich),Bilateral Trade with Justification,...,,,,https://lang.userweb.mwn.de/,,,,,Topic:Bilateral Trade with Justification,Time 17:15–18:30
5,"June 24, 2019 - June 26, 2019",Other events,Conference & Events,"Mohrenstr. 58, Room Anna J. Schwartz",DIW,Berlin,10117.0,09:30–12:30,DIW Graduate Center/BCCP Short Course on Web S...,,...,,,,,,,,"June 24, 2019 Open Details Time 09:30–12:30 Lo...",,
6,"June 27, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,16:45–18:00,Felix Holzmeister (University of Innsbruck),Delegated decision making in finance,...,,,,https://www.holzmeister.biz/,,,,,Topic:Delegated decision making in finance,Time 16:45–18:00
7,"July 01, 2019",Berlin Applied Micro Seminar,Seminar,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,16:00–17:15,Bettina Siflinger (Tilburg University),TBA,...,,,,https://sites.google.com/site/bettinasiflinger/,,,,,Topic:TBA,Time 16:00–17:15
8,"July 01, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Ariel Rubinstein (Tel Aviv University),TBA,...,,,,https://en-social-sciences.tau.ac.il/profile/r...,,,,,Topic:TBA,Time 17:15–18:30
9,"July 08, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Antonio Rosato (UT Sydney),TBA,...,,,,https://sites.google.com/site/rosatoeconomics/...,,,,,Topic:TBA,Time 17:15–18:30


In [30]:
###############################################################################
################################# 6. SAVE CSV #################################
###############################################################################

# Write the merged table to savefile as a semicolon-separated file
# ("utf-8-sig" is UTF-8 with a byte order mark at the start of the file)
df.to_csv(
    savefile,
    encoding = "utf-8-sig",
    sep = ";",
)