# Barcelona_agenda_sc

code

In [8]:
import datetime as dt
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor

In [13]:
### Initialization ###

#Guia Barcelona Website ordered by popularity
website = "https://guia.barcelona.cat/ca/llistat?pg=search&cerca=*:*&tr=619&af=code_prop&c=00619*&dt=2018-11-02,2018-11-02&nr=10&sort=popularity,desc"

#Selecting how many days do I want to scrape agenda
ndays = 2

#Defining agenda format
agenda = pd.DataFrame(columns = ["Event_Name","Starting_from","Ending","Location","Adress","Description","Link"])

In [14]:
def get_period(ndays):
    """
    ndays is an integer meaning the number of days you want to get the agenda for.
    returns a list of ndays dates as string with format 'YYYY-MM-DD'.
    """
    period = []
    for i in range(ndays):
        period.append((dt.date.today() + dt.timedelta(days =i)).isoformat())
    return period


#---------------------------------------------#

def geturl_day (day):
    """
    day is a string with format 'YYYY-MM-DD'
    returns the url of barcelona agenda webpage for the especific day as string.    
    """
    webparams = website.split("&")
    webparams[5] = "dt=" + day + "," + day
    return "&".join(webparams)


#---------------------------------------------#

# Get period urls
def get_period_urls (period):
    """ 
    period is a list of strings with format 'YYYY-MM-DD'
    returns a list of urls for each day/date within the period 
    """
    period_urls = []
    for day in period:
        period_urls.append(geturl_day(day))
    
    return period_urls
        
#---------------------------------------------#
    
def get_parsed_html(url):    
    """ returns the parsed html url """
    r = requests.get(url)
    return BeautifulSoup(r.text, 'html.parser')

#---------------------------------------------#

def find_dates (any_str):
    """
    any_str is a string 
    returns a list of strings with format DD/MM/YYY
    """
    return re.findall(r'\d{2}/\d{2}/\d{4}',any_str)


#---------------------------------------------#


def get_description_link (item):
    """
    item is a b4s object containing all the information of one event
    returns the link to the webpage with futher information and a short description of the event
    """
    desc_url = "https://guia.barcelona.cat/ca/"+item.h3.a['href']
    soup_desc = get_parsed_html(desc_url)
    item_desc = soup_desc.select(".entradeta")
    return desc_url , item_desc[0].get_text()


#---------------------------------------------#

def get_event (item):
    """
    item is a b4s object containing all the information of one event
    returns a dictionary with event information:  Event_Name, Location, Starting_from date, Ending date, 
                                                  Adress, Description ,Link
    """
    event = {}
    event["Event_Name"] =  item.h3.a.get_text()
    
    for sibling in item.dl.children:
        if sibling.get_text() == "On:":
            event["Location"] = sibling.next_sibling.get_text()
        elif sibling.get_text() == "Quan:":
            dates = find_dates(sibling.next_sibling.get_text())
            if len(dates) == 1:
                event["Starting_from"] = dates[0]
                event["Ending"] = "-"
            elif len(dates) == 2:
                event["Starting_from"] = dates[0]
                event["Ending"] = dates[1]
            else:
                event["Starting_from"] = sibling.next_sibling.get_text()
                event["Ending"] = "-"
        elif sibling.get_text() == "Adreça:":
            event["Adress"] = sibling.next_sibling.get_text()
        else:
            pass
        
        event["Link"],event["Description"] = get_description_link(item)
    
    return event


#---------------------------------------------#


def event_in_agenda (agenda,event):
    """
    agenda is a data frame with columns Event_Name, Location, Starting_from date, Ending date, 
                                                  Adress, Description ,Link
    event is a dictionary with event information: Event_Name, Location, Starting_from date, Ending date, 
                                                  Adress, Description ,Link
    returns true if the event already in the agenda.
    """
    if event["Event_Name"] in agenda.Event_Name.values:
        return True
    else:
        return False

#---------------------------------------------#
    
def add_day_agenda(agenda,url):
    """
    agenda is a data frame with columns Event_Name, Location, Starting_from date, Ending date, Adress and description
    url from which we extract agenda of a specific day
    returns the agenda with the new events included if the event is not in the df yet. Returns the agenda as is otherwise.
    """
    
    soup = get_parsed_html(url)
    items = soup.select(".dades")
    
    with ThreadPoolExecutor() as pool:
        evs = list(pool.map(get_event, items))
    
    for ev in evs:     
        if not event_in_agenda(agenda,ev) :
            agenda = agenda.append(other = ev,ignore_index = True)
    return agenda

In [18]:

### WO Threading

import time

start_time = time.time()

urls = get_period_urls(get_period(ndays))
for url in urls:
    agenda = add_day_agenda(agenda,url)
    
end_time = time.time()

print(end_time-start_time)

agenda.to_csv("./agenda.csv")

11.886796474456787


AttributeError: module 'pandas' has no attribute 'agenda'