In [1]:
import requests
from bs4 import BeautifulSoup

### Extract single activity

In [3]:
def _duration_to_minutes(text):
    """Convert a duration label such as ``"2.30 ore"`` into whole minutes.

    The first whitespace-separated token is parsed as a number whose integer
    part is hours and whose two decimal digits are minutes (``2.30`` means
    2 h 30 min), which is how the original conversion treated it.
    NOTE(review): if the site actually uses decimal hours (``1.5`` meaning
    90 min) this conversion must change - confirm against the live page.

    Returns ``None`` when ``text`` is missing or does not start with a number.
    """
    if not text:
        return None
    tokens = text.split()
    if not tokens:
        return None
    try:
        value = float(tokens[0])
    except ValueError:
        return None
    # Work on rounded hundredths: truncating ``(value % 1) * 100`` loses a
    # minute on values such as 2.3, where the float product is 29.999...
    hours, minutes = divmod(round(value * 100), 100)
    return hours * 60 + minutes


def extract_activity(activity):
    """Collect the fields of one activity card into a dictionary.

    Parameters
    ----------
    activity : bs4.element.Tag
        Element that wraps a single activity in the listing page.

    Returns
    -------
    dict
        Empty when the card has no start/duration elements (the caller uses
        this to skip it). Otherwise it holds:

        * ``time``     - string of the first info element
        * ``duration`` - minutes as ``int``, or ``None`` if absent/unparseable
        * ``title`` / ``href`` - text and link of the title anchor (only set
          when a title anchor is found; ``href`` is ``None`` if the anchor
          has no such attribute)
        * ``descr``    - description string, or ``None`` if the element is
          missing
        * ``types``    - list of the ``title`` attributes of the type icons
    """
    activity_info = {}

    # Find Start + Duration
    info_tags = activity.find_all("div", {"class": "item-attributes__item__content__item--text"})
    if not info_tags:
        # Nothing to extract from this element: signal it with an empty dict
        return activity_info

    # Extract Start Time
    activity_info['time'] = info_tags[0].string

    # Extract duration and convert string to number of minutes; a card with a
    # single info element gets ``None`` instead of raising IndexError
    duration_text = info_tags[1].string if len(info_tags) > 1 else None
    activity_info['duration'] = _duration_to_minutes(duration_text)

    # Find title: only anchors that are direct children of the heading count
    for tag in activity.find_all("h3", {"class": "atgrid__item__title"}):
        for link in tag.find_all("a", recursive=False):
            activity_info['title'] = link.string
            activity_info['href'] = link.get('href')

    # Find description (the element may be absent on some cards)
    description_tag = activity.find('div', {'class': 'atgrid__item__description'})
    activity_info['descr'] = description_tag.string if description_tag is not None else None

    # Find activity type: each icon carries its label in the ``title`` attribute
    types_list = []
    for tag in activity.find_all("div", {"class": "atgrid__item__icons"}):
        for item in tag.find_all("i"):
            label = item.get('title')
            if label is not None:
                types_list.append(label)

    activity_info['types'] = types_list

    return activity_info

### Extract all pages

In [10]:
# Accumulator for the activity dictionaries gathered from the listing pages
activities_list = list()

In [11]:
# Start from an empty list so that re-running this cell does not append the
# same activities a second time.
activities_list = []

# Listing URL; the placeholder receives the page number
PAGE_URL = 'https://www.greenlinetours.com/it/page/{}/?toursearch=1&lang=it'
# Seconds to wait for the server before giving up on a request
REQUEST_TIMEOUT = 30

# NOTE(review): numbering starts at 0 - verify that page 0 and page 1 are not
# the same listing, otherwise the first page is collected twice.
for pagina in range(4):
    # Construct URL
    url = PAGE_URL.format(pagina)

    # Retrieve page; without a timeout a stalled connection would block forever
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # An HTTP error body is not a listing: report it and try the next page
        print('Skipping {}: HTTP {}'.format(url, response.status_code))
        continue

    # Convert to beautifulsoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract activities
    activities = soup.find_all('div', {'class': 'atgrid__item'})

    # Examine one by one all activities
    for activity in activities:
        # extract content
        content = extract_activity(activity)

        # extract_activity returns an empty dict for elements to be skipped
        if content:
            activities_list.append(content)

In [13]:
# Sanity check: how many activity dictionaries are in activities_list
len(activities_list)

39