# Scraping Barcelona cultural agenda


***
<br/>

This code scrapes Barcelona's agenda website (https://guia.barcelona.cat/ca) and gets the main information of each event for a specified set of time.

calling the main function:

>scrape_agenda(agenda,ndays,nr,threading = True):

>agenda: _dataframe_ with the columns as the agenda.csv file explained below.
>
>ndays: _integer_ starting from current date, number of days you want to retrieve the agenda for.
>
>nr: _integer_ number of events per day
>
>threading: True concurrency enabled. False otherwise.

>returns: _dataframe_  with the detailed information of each event for the ndays after current date with the same    format as agenda   

The returned data frame is stored as "agenda.csv" file with the following attributes:
 
 - Event_Name: Title of the event
 - Starting_from: Starting date or Permanent event
 - Ending: Ending date or "-" if Permanent event or day event
 - Location: Venue
 - Address: Address of the venue
 - Description: Short description of the event
 - Link: Link to the specific event page from which you can get more information.

In [1]:
# Packages

import time
import datetime as dt
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor

In [2]:
############################################# Initialization #############################################

#Guia Barcelona Website ordered by popularity
website = "https://guia.barcelona.cat/ca/llistat?pg=search&cerca=*:*&tr=619&af=code_prop&c=00619*&dt=2018-11-02,2018-11-02&nr=10&sort=popularity,desc"

#Number of selected days 
ndays = 7

#Number of events per day [10,15,20,25...] only multiple of 5 allowed
nr = 15

#Defining agenda format
agenda = pd.DataFrame(columns = ["Event_Name","Starting_from","Ending","Location","Address","Description","Link"])

The url from which the code starts is a random selected day agenda whose results are ordered by popularity. This url is then customized for the selected set of days and the number of events per day.

By default, number of days for which we extract data is 7 and number of events per day is 15 but this can be customized.

In [3]:
############################################# Functions #############################################


def get_period(ndays = 7):
    """
    ndays is an integer meaning the number of days you want to get the agenda for.
    returns a list of ndays dates as string with format 'YYYY-MM-DD'.
    """
    period = []
    for i in range(ndays):
        period.append((dt.date.today() + dt.timedelta(days =i)).isoformat())
    return period


#---------------------------------------------#

def geturl_day (day,nr = 15):
    """
    day is a string with format 'DD-MM-YYYY'.
    nr is an integer defining how many evens we want per day.
    returns the url of barcelona agenda webpage for the especific day as string.    
    """
    webparams = website.split("&")
    webparams[5] = "dt=" + day + "," + day
    webparams[6] = "nr=" + str(nr)
    return "&".join(webparams)


#---------------------------------------------#

# Get period urls
def get_period_urls (period,nr):
    """ 
    period is a list of strings with format 'DD-MM-YYYY'.
    returns a list of urls for each day/date within the period. 
    """
    period_urls = []
    for day in period:
        period_urls.append(geturl_day(day,nr))
    
    return period_urls
        
#---------------------------------------------#
    
def get_parsed_html(url):    
    """ returns the parsed html url """
    r = requests.get(url)
    return BeautifulSoup(r.text, 'html.parser')

#---------------------------------------------#

def find_dates (any_str):
    """
    any_str is a string 
    returns a list of strings with format DD/MM/YYY
    """
    return re.findall(r'\d{2}/\d{2}/\d{4}',any_str)


#---------------------------------------------#


def get_description_link (item):
    """
    item is a b4s object containing all the information of one event
    returns the link to the webpage with futher information and a short description of the event
    """
    desc_url = "https://guia.barcelona.cat/ca/"+item.h3.a['href']
    soup_desc = get_parsed_html(desc_url)
    item_desc = soup_desc.select(".entradeta")
    return desc_url , item_desc[0].get_text()


#---------------------------------------------#

def get_event (item):
    """
    item is a b4s object containing all the information of one event
    returns a dictionary with event information:  Event_Name, Location, Starting_from date, Ending date, 
                                                  Address, Description ,Link
    """
    event = {}
    event["Event_Name"] =  item.h3.a.get_text()
    
    # The html code for event details is coded as a html description list.
    # Depending on the list term, the discription is retrieved from next sibling.
    for sibling in item.dl.children:
        if sibling.get_text() == "On:":
            event["Location"] = sibling.next_sibling.get_text()
        elif sibling.get_text() == "Quan:":
            dates = find_dates(sibling.next_sibling.get_text())
            if len(dates) == 1:
                event["Starting_from"] = dates[0]
                event["Ending"] = "-"
            elif len(dates) == 2:
                event["Starting_from"] = dates[0]
                event["Ending"] = dates[1]
            else:
                event["Starting_from"] = sibling.next_sibling.get_text()
                event["Ending"] = "-"
        elif sibling.get_text() == "Adreça:":
            event["Address"] = sibling.next_sibling.get_text()
        else:
            pass
        # The link and Description of the event are retrieved from a subpage 
        event["Link"],event["Description"] = get_description_link(item)
    
    return event


#---------------------------------------------#


def event_in_agenda (agenda,event):
    """
    agenda is a data frame with columns Event_Name, Location, Starting_from date, Ending date, 
                                                  Address, Description ,Link
    event is a dictionary with event information: Event_Name, Location, Starting_from date, Ending date, 
                                                  Address, Description ,Link
    returns true if the event already in the agenda.
    """
    if event["Event_Name"] in agenda.Event_Name.values:
        return True
    else:
        return False

#---------------------------------------------#
    
def add_day_agenda(agenda,url):
    """
    agenda is a data frame with columns Event_Name, Location, Starting_from date, Ending date, Address and description
    url from which we extract agenda of a specific day
    returns the agenda with the new events included if the event is not in the df yet. Returns the agenda as is otherwise.
    """
    
    soup = get_parsed_html(url)
    items = soup.select(".dades")
    
    # Implementing concurrent threads for getting event information.
    # get_event function takes some information from da agenda url and goes to the detailed url for each event
    # to retrieve further event data.
    # The html requests for each event have a lot of waiting time involved, that's the reason 
    # concurrent thread was implemented.
    
    with ThreadPoolExecutor() as pool:
        evs = list(pool.map(get_event, items))
    
    # Adds the event to the agenda df if event is not already there.    
    for ev in evs:     
        if not event_in_agenda(agenda,ev) :
            agenda = agenda.append(other = ev,ignore_index = True)
    return agenda

def add_day_agenda_wo_scraping(agenda,url):
    """
    same function as the above one but without threading implemented
    """
    soup = get_parsed_html(url)
    items = soup.select(".dades")
                        
    for item in items:
        ev = get_event(item)
        if not event_in_agenda(agenda,ev):
            agenda = agenda.append(other = ev,ignore_index = True)
    return agenda

def scrape_agenda(agenda,ndays,nr,threading = True):
    """
    agenda dataframe 
    ndays integer number of days
    nr integer number of events per day
    returns an agenda dframe with the detailed information of each event for the ndays after current date.    
    """
    urls = get_period_urls(get_period(ndays),nr)
    
    #Threading enabled
    if threading :
        for url in urls:
            agenda = add_day_agenda(agenda,url)
    
    #Threading not enabled
    else:
        for url in urls:
            agenda = add_day_agenda_wo_scraping(agenda,url)
    return agenda

In order to compare algorithm performance with / with out threading, is has been implemented the option to enable/disable threading when calling the main function.

Below, as an example, can be found the comparison in time between the two.

In [4]:
# Scraping agenda Barcelona with threading
start_time = time.time()

agenda = scrape_agenda(agenda,ndays,nr,threading = True)
    
end_time = time.time()

print("Process time: ", round(end_time-start_time),  " seconds")

agenda.to_csv("./agenda.csv")

Process time:  60  seconds


In [6]:
# Scraping agenda Barcelona without threading
agenda = pd.DataFrame(columns = ["Event_Name","Starting_from","Ending","Location","Address","Description","Link"])

start_time = time.time()

agenda = scrape_agenda(agenda,ndays,nr,threading = False)
    
end_time = time.time()

print("Process time: ", round(end_time-start_time),  " seconds")

agenda.to_csv("./agenda.csv")

Process time:  229  seconds
