In [62]:
###############################################################################
##################### CODE FOR THE BCCP WEB SCRAPING COURSE ###################
############################## JUNE 24 TO 26, 2019 ############################
######################### SECTION ON BROWSER AUTOMATION #######################
###############################################################################

# Location to your browser driver and program file
browser_driver = \
    "C:/Users/kevin/Dropbox/Coding_Templates/Python/selenium/chromedriver.exe"
browser_app = "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe"

###############################################################################
############################## LOAD NEEDED MODULES ############################
###############################################################################

# Show everything in Jupyter notebooks (not just last result)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load different functions/classes from selenium
# webdriver to start an instance of a webdriver
from selenium import webdriver
# Options to set Chrome options
from selenium.webdriver.chrome.options import Options
# BeautifulSoup to turn source code into navigable Python object
from bs4 import BeautifulSoup
# Pandas to convert to DataFrame
import pandas as pd
# ActionChains to interact with a website through selenium
from selenium.webdriver.common.action_chains import ActionChains


<h1>Approach</h1>
<ol>
    <li> Load events page
    <li> Loop through elements
        <ol>
            <li> Save date
            <li> Save events details
            <li> Click 'Next' button (If no 'Next' button, exit)
        </ol>
     <li> Turn to DataFrame and save
</ol>

<h2>1. Loading the events page</h2>
<a href="https://www.berlin-econ.de/events">https://www.berlin-econ.de/events</a>

In [64]:
###############################################################################
####################### 1. LOAD PAGE WITH FUTURE EVENTS #######################
###############################################################################
### Start a selenium instance
# Set Chrome options for webdriver
chrome_options = Options()
# Location of browser executable
chrome_options.binary_location = browser_app
### Some other options that might be useful:
## Window size
# chrome_options.add_argument("--window-size=1200,900")
## Headless (Does not show the browser)
# chrome_options.set_headless(headless=True)
# Start webdriver (need to include path to driver)

### Start the driver (this should open an empty browser windows)
driver = webdriver.Chrome(browser_driver, options = chrome_options)

In [65]:
# Load the events page
url = "https://www.berlin-econ.de/events"
driver.get(url)
# Get source code
html = driver.page_source
# Turn source code to soup
soup = BeautifulSoup(html, "lxml")

<h2>2. Loop through the results elements</h2>

In [66]:
# Take element with results
results = soup.find("div", class_ = "event-results")
# Loop through children
# Save in dictionary
resdict = {}
for el in results.children:
    # Skip white spaces
    try:
        if el.strip() == "":
            # continue loop with next iteration
            pass
    # Do something else if TypeError
    except TypeError:
        ### If there is a TypeError, the element should be one of three:
        # A date: <div class='event-date-separator'>
        # A list of events: <div class='ui segments'>
        # The page buttons: <div class='ui pagination menu'>
        
        # Take the element class to evaluate what type it is
        divclass = " ".join(el["class"])
        
        ### Date element
        if divclass == "event-date-separator":
            # Take the value in <span class='mobile hidden tablet hidden'>
            date = el.find("span", class_ = "mobile hidden tablet hidden") \
                .text.strip()
        ### Element containing all events for this date
        elif divclass == "ui segments":
            # Take list of events
            eventlist = el.find_all("div", class_ = "ui segment")
            # Loop through events and save
            for event in eventlist:
                # Location
                location = event.find("div", class_ = "ui red ribbon label") \
                    .text.strip()
                # Speaker (some don't have one)
                speaker = event.find("div", class_ = "speaker")
                if speaker != None:
                    speaker = speaker.text.strip()
                # Link and title
                link = event.find("div", class_ = "content").find("a")["href"] \
                    .strip()
                title = event.find("div", class_ = "content").find("a")["title"] \
                    .strip()
                # Other details (some don't have one)
                desc = event.find("div", class_ = "description mobile hidden") 
                if desc != None:
                    desc = desc.text.strip()
                
                # Event type
                evtype = event.find("div", class_ = "ui bottom right attached label") \
                    ["title"].strip()
                
                # Save in dict
                resdict[len(resdict)] = {
                    "date": date,
                    "location": location,
                    "speaker": speaker,
                    "link": link,
                    "title": title,
                    "desc": desc,
                    "evtype": evtype,
                }
        ### If arrived at buttons, press next
        elif divclass == "ui pagination menu":
            # This now requires selenium
            # Lets stop here the first time and see how to do it
            raise

TypeError: 'NoneType' object is not callable

In [67]:
resdict

{0: {'date': 'Monday, 17. June 2019',
  'location': 'DIW Berlin im Quartier 110',
  'speaker': None,
  'link': 'https://www.berlin-econ.de/event/workshop-for-women-in-macroeconomics-finance-and-economic-history',
  'title': 'Workshop for Women in Macroeconomics, Finance, and Economic History',
  'desc': 'The 1st annual Workshop for Women in Macroeconomics, Finance and Economic History is being organized by the DIW Berlin.\xa0The aim is to bring together',
  'evtype': 'Workshop'},
 1: {'date': 'Monday, 17. June 2019',
  'location': 'WZB Berlin',
  'speaker': 'Ricardo Alonso, LSE',
  'link': 'https://www.berlin-econ.de/event/to-be-announced-6',
  'title': 'Tampering with Information',
  'desc': None,
  'evtype': 'Berlin Micro Theory Seminar Series'},
 2: {'date': 'Monday, 17. June 2019',
  'location': 'HU Berlin',
  'speaker': 'Sandra McNally, University of Surrey and London School of Economics',
  'link': 'https://www.berlin-econ.de/event/to-be-announced-235',
  'title': 'Closing the Ga

In [73]:
### Find which one will be the next page
# Take the very last button and check if the contents contain "Next"
# If so, use the value of the "data-request-data" attribute of this
# tag
# If it does not contain "Next", then we just loaded the last page and we are done
next_content = el.find_all("a", class_ = "item")[-1]
if "Next" in next_content.text:
    next_page = next_content["data-request-data"]
else:
    pass
next_page

'page:2'

In [69]:
### Scroll the buttons into view
# Find the element for the next page in selenium using XPATH
xpathfind = "//div[@class='ui pagination menu']/" \
    "a[@data-request-data='%s']" % next_page
element = driver.find_element_by_xpath(xpathfind)
# Start ActionChain to control the browser
actions = ActionChains(driver)

In [70]:
# Scroll into view
actions.move_to_element(element).perform()

In [71]:
# Click on the button
actions.click(element).perform()

In [72]:
# Could also have done in one line:
# actions.move_to_element(element).click(element).perform()

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=75.0.3770.90)
  (Driver info: chromedriver=74.0.3729.6 (255758eccf3d244491b8a1317aa76e1ce10d57e9-refs/branch-heads/3729@{#29}),platform=Windows NT 10.0.17763 x86_64)


Now we can put all this inside a loop that runs until 
there is no more 'Next' button. <p>
<br>
Note that after each click and loading of the new content,
we need to save the source code and convert it into a soup again.