In [1]:
# IMPORT ALL RELEVANT LIBRARIES

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from urllib.parse import urljoin
import pandas as pd
import time
import re


In [2]:
### DRIVER SETUP

service = Service(r"C:\Users\MattiaDevescovi\Desktop\chromedriver.exe")
driver = webdriver.Chrome(service=service)

# open up a specific web page
driver.get("https://www.vivino.com/IT/en/")

In [5]:
# Scroll down until the list of product links reaches a length of 1000
base_url = 'https://www.vivino.com'
prod_links = []

lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
while len(prod_links) < 712:
    html_tag = driver.find_elements(By.XPATH, '//a[@data-testid="vintagePageLink"]')
    for tag in html_tag:
        rel_url = tag.get_attribute('href')
        full_url = urljoin(base_url, rel_url)
        if len(prod_links) < 712:
            prod_links.append(full_url)
        else:
            break
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    new_len = driver.execute_script("return document.querySelectorAll('a[data-testid=\"vintagePageLink\"]').length;")
    if new_len == lenOfPage:
        break
    lenOfPage = new_len

In [6]:
len(prod_links)

712

In [7]:
# Define a function to check if the end of the page has been reached
def is_end_of_page(driver):
    return driver.execute_script("return window.scrollY + window.innerHeight >= document.body.scrollHeight")

## Start iterating on every product
for index_products, url_products in enumerate(prod_links):
    urls_list = "'"+url_products+"'"
    driver.get(url_products)
    time.sleep(5)

    # PRODUCT NAME
    try:
        product_name_tag = driver.find_element(By.XPATH, '//a[@data-cartitemsource="wine-page-master-link"]')
        product_name = product_name_tag.text
    except NoSuchElementException:
        continue

    # GRAPE
    try:
        grape_xpath = driver.find_element(By.XPATH, "//a[@data-cy='breadcrumb-grape']")
        grape = grape_xpath.text
    except NoSuchElementException:
        continue

    # PRODUCT RATING
    try:
        product_rating_tag = driver.find_element(By.XPATH, '//div[@class="vivinoRating_averageValue__uDdPM"]')
        product_rating = product_rating_tag.text
    except NoSuchElementException:
        continue

    # PRODUCT NUMBER OF RATINGS
    try:
        number_ratings_xpath = driver.find_element(By.XPATH, "//div[@class='vivinoRating_caption__xL84P']")
        number_ratings = number_ratings_xpath.text
    except NoSuchElementException:
        continue

    # PRODUCT TYPE
    try:
        product_type_xpath = driver.find_element(By.XPATH, "//a[@data-cartitemsource='breadcrumb-explore']")
        product_type = product_type_xpath.text
    except NoSuchElementException:
        continue

    # COUNTRY OF ORIGIN
    try:
        country_of_origin_xpath = driver.find_element(By.XPATH, "//a[@data-cy='breadcrumb-country']")
        country_of_origin = country_of_origin_xpath.text
    except NoSuchElementException:
        continue

    # PRODUCT PRICE
    try:
        price_xpath = driver.find_element(By.XPATH, "//span[@class='purchaseAvailability__currentPrice--3mO4u']")
        price = price_xpath.text
    except NoSuchElementException:
        continue

    ## EXTRACTION OF WINE CHARACTERISTICS
    bold_value = None
    tannic_value = None
    sweet_value = None
    acidic_value = None

    max_retries = 5 # Maximum number of retries for finding the elements
    retry_count = 0 # Counter for retries

    # Scroll down until the elements are found or the end of the page is reached
    while (bold_value is None or tannic_value is None or sweet_value is None or acidic_value is None or food_pairing is None) and not is_end_of_page(driver):
        try:
            # Bold value, tannic value, sweet value, and acidic value
            bold_elements = driver.find_elements(By.XPATH, "//span[@class='indicatorBar__progress--3aXLX']")
            for i, element in enumerate(bold_elements):
                style_attribute = element.get_attribute("style")
                percentage_value = re.search(r"left:\s*(\d+\.\d+)%", style_attribute).group(1)
                if i == 0:
                    bold_value = percentage_value + "%"
                elif i == 1:
                    tannic_value = percentage_value + "%"
                elif i == 2:
                    sweet_value = percentage_value + "%"
                elif i == 3:
                    acidic_value = percentage_value + "%"
                    
            # Food pairing
            food_pairing_xpath = driver.find_element(By.XPATH, "//a[@class='anchor_anchor__m8Qi- foodPairing__imageContainer--2CtYR']")
            food_pairing = food_pairing_xpath.text
            
            # If all elements are found, break out of the loop
            if bold_value is not None and tannic_value is not None and sweet_value is not None and acidic_value is not None and food_pairing is not None:
                break
                
        except:
            pass
        
        # Increment retry counter
        retry_count += 1
        # If maximum number of retries has been reached, move on to the next link
        if retry_count == max_retries:
            break
        
        # Scroll down and wait for the page to load
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        time.sleep(1)
    
    
    if index_products == 0:
        vivino_catalogue = pd.DataFrame(
            {'name': product_name,
            'grape': grape,
            'product rating': product_rating,
            'number ratings': number_ratings,
            'product type': product_type,
            'country of origin': country_of_origin, 
            'price': price,
            'bold_value': bold_value,
            'tannic value': tannic_value,
            'sweet_value': sweet_value,
            'acidic_value': acidic_value,
            'food_pairing': food_pairing
            },
            index=[index_products]
        )
    if index_products != 0:
        temp_vivino_catalogue = pd.DataFrame(
            {'name': product_name,
            'grape': grape,
            'product rating': product_rating,
            'number ratings': number_ratings,
            'product type': product_type,
            'country of origin': country_of_origin, 
            'price': price,
            'bold_value': bold_value,
            'tannic value': tannic_value,
            'sweet_value': sweet_value,
            'acidic_value': acidic_value,
            'food_pairing': food_pairing
            },
            index=[index_products]
        )
        data = [vivino_catalogue, temp_vivino_catalogue]
        vivino_catalogue = pd.concat(data)

NameError: name 'food_pairing' is not defined

In [14]:
vivino_catalogue

Unnamed: 0,name,grape,product rating,number ratings,product type,country of origin,price,bold_value,tannic value,sweet_value,acidic_value,food_pairing
0,Fico Susumaniello,Susumaniello,4.7,27 ratings,Red wine,Italy,€19.90,75.8029%,36.3297%,31.3312%,24.9804%,Vegetarian
1,60 Sessantanni Old Vines Primitivo di Manduria,Primitivo,4.5,15979 ratings,Red wine,Italy,€22.90,69.663%,22.9139%,46.7745%,12.1949%,Vegetarian
2,Limited Edition 10 Vendemmie,Montepulciano,4.5,8905 ratings,Red wine,Italy,€24.30,50.6136%,36.753%,19.4152%,59.7082%,Vegetarian
3,Fanova Riserva,Primitivo,4.5,135 ratings,Red wine,Italy,€17.40,79.3122%,38.7329%,30.1212%,31.5391%,Vegetarian
4,Amarone della Valpolicella Classico,Blend,4.5,58 ratings,Red wine,Italy,€27.70,,,,,Vegetarian
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Equinoxe Cabernet - Syrah,Blend,4.2,121 ratings,Red wine,France,€17.43,68.067%,56.3093%,19.2438%,55.2823%,Beef
1996,Classico Cabernet Sauvignon,Cabernet Sauvignon,4.2,114 ratings,Red wine,South Africa,€19.87,73.4789%,58.4657%,11.0102%,55.2869%,Beef
1997,Merlot,Merlot,4.2,108 ratings,Red wine,Italy,€22.80,55.2495%,42.9979%,12.743%,49.4293%,Beef
1998,Ca' Carnocchio Veronese,Blend,4.2,108 ratings,Red wine,Italy,€25.90,64.7983%,35.6674%,18.7797%,28.0679%,Beef


In [15]:
# Convert partner_menu DataFrame into a csv
path_csv = (r"C:\Users\MattiaDevescovi\Desktop\Python Scripts\Scraping Vivino\\")
vivino_catalogue.to_csv(path_csv + 'Vini Rossi.csv')