# Step 1: Scraping the search page 

To log this process we use the `Connector` class from `scraping_class`. To use it for our purpose, some changes had to be made.

The changes:

1) Change Connector_type to 'selenium'

2) Insert path2selenium

3) Insert code to scroll down on page to load more html content

4) Return the connector.browser.page_source
    
    

In [109]:
import requests,os,time

def ratelimit(dt):
    """Throttle the caller by blocking for ``dt`` seconds before it continues."""
    pause_seconds = dt
    time.sleep(pause_seconds)

class Connector():
    """Logged access to web pages through either ``requests`` or Selenium.

    Every call made through :meth:`get` appends one ``;``-separated row to the
    logfile with the columns: id, project, connector_type, t, delta_t, url,
    redirect_url, response_size, response_code, success, error.
    """

    def __init__(self, logfile, overwrite_log=False, connector_type='selenium', session=False,
                 path2selenium='/Users/user/Downloads/chromedriver', n_tries=5, timeout=30,
                 waiting_time=0.5):
        """Open (or create) the logfile and, for Selenium, start a Chrome browser.

        Keyword arguments:
        logfile -- path to the logfile.
        overwrite_log -- bool, if true an existing logfile is cleared (rarely wanted).
        connector_type -- 'requests' or 'selenium'. Any value other than 'requests'
            makes *get* use the Selenium browser.
        session -- requests.session object, e.g. for custom headers and proxies.
            A new session is created when this is falsy.
        path2selenium -- str, path to the chromedriver executable used by Selenium.
        n_tries -- int, number of attempts *get* makes in 'requests' mode.
        timeout -- int, seconds a 'requests' call waits for the server to respond.
        waiting_time -- seconds to sleep before each call (simple rate limit).
        """
        self.n_tries = n_tries
        self.timeout = timeout
        self.waiting_time = waiting_time

        if connector_type == 'selenium':
            # The messages name chromedriver because webdriver.Chrome is what is started below.
            assert path2selenium != '', "You need to specify the path to your chromedriver if you want to use Selenium"
            from selenium import webdriver

            assert os.path.isfile(path2selenium), 'You need to insert a valid path2selenium: the path to your chromedriver. You can download chromedriver here: https://chromedriver.chromium.org/downloads'
            self.browser = webdriver.Chrome(executable_path=path2selenium)

        self.connector_type = connector_type
        self.session = session if session else requests.session()
        self.logfilename = logfile

        header = ['id', 'project', 'connector_type', 't', 'delta_t', 'url', 'redirect_url',
                  'response_size', 'response_code', 'success', 'error']
        if os.path.isfile(logfile) and not overwrite_log:
            # Continue numbering after the last logged call, then append.
            self.id = self._next_id(logfile)
            self.log = open(logfile, 'a')
        else:
            self.log = open(logfile, 'w')
            self.log.write(';'.join(header))
            self.log.flush()  # make the header visible on disk straight away
            self.id = 0

    @staticmethod
    def _next_id(logfile):
        """Return the id following the last logged call in ``logfile`` (0 if none)."""
        with open(logfile, 'r') as f:
            lines = f.read().split('\n')
        # Walk backwards so trailing blank lines or a damaged last row are skipped.
        # The whole first field is parsed, not just its first character.
        for line in reversed(lines):
            first_field = line.split(';', 1)[0].strip()
            if first_field.isdigit():
                return int(first_field) + 1
        return 0

    def _log_call(self, project_name, t, dt, url, redirect_url, size, response_code, success, err):
        """Write one row to the log, advance the call counter and return the call id."""
        call_id = self.id
        self.id += 1
        row = [call_id, project_name, self.connector_type, t, dt, url, redirect_url,
               size, response_code, success, err]
        # Line breaks inside a field (typically an error message) would split the row.
        fields = [str(value).replace('\r', ' ').replace('\n', ' ') for value in row]
        self.log.write('\n' + ';'.join(fields))
        self.log.flush()
        return call_id

    def close(self):
        """Close the logfile and, if one was started, quit the Selenium browser."""
        self.log.close()
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.quit()

    def get(self, url, project_name, n_scrolls=2000, scroll_step=1000, scroll_pause=3):
        """Fetch ``url`` and log the call.

        Keyword arguments:
        url -- str, url
        project_name -- str, name used for analyzing the log, e.g. 'Mapping of domain',
            'Meta_data_collection', 'main data collection'.
        n_scrolls -- int, Selenium only: number of scroll steps made after loading.
        scroll_step -- int, Selenium only: pixels added to the scroll position per step.
        scroll_pause -- seconds slept after each scroll step so new content can load.

        Returns:
        'requests' mode: ``(response, call_id)`` on the first successful attempt, or
        ``None`` when all ``n_tries`` attempts raised.
        Otherwise: ``(html, call_id)`` where html is the browser's page source
        after scrolling.
        """
        project_name = project_name.replace(';', '-')  # keep the csv separator out of the field
        if self.connector_type == 'requests':
            for _ in range(self.n_tries):
                ratelimit(self.waiting_time)
                t = time.time()
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    redirect_url = response.url  # url after potential redirects
                    dt = time.time() - t  # elapsed seconds (positive)
                    size = len(response.text)
                    response_code = response.status_code
                except Exception as e:
                    # Log the failed attempt and try again.
                    self._log_call(project_name, t, time.time() - t, url, '', 0, '', False, str(e))
                    continue
                call_id = self._log_call(project_name, t, dt, url, redirect_url, size,
                                         response_code, True, '')
                return response, call_id
            # Every attempt failed; each failure has its own row in the log.
            return None

        t = time.time()
        ratelimit(self.waiting_time)
        self.browser.get(url)

        # Scroll down step by step so the page loads more content.
        y = scroll_step
        for _ in range(n_scrolls):
            self.browser.execute_script("window.scrollTo(0, " + str(y) + ")")
            y += scroll_step
            time.sleep(scroll_pause)

        redirect_url = self.browser.current_url
        dt = time.time() - t  # NOTE: not necessarily the complete load time.
        # Selenium gives no response object, so success and response code stay blank.
        # Read the page source from this instance's own browser.
        html = self.browser.page_source
        # NOTE: size may be inaccurate, the browser could still be loading.
        call_id = self._log_call(project_name, t, dt, url, redirect_url, len(html), '', '', '')
        return html, call_id

#### Selenium request

The Vivino site has a search page with options to filter on different variables. We chose to filter on red wines from Italy, with the price range set to 0-2500+. This gave us 99 841 wines, but we were far from able to collect information on all of them, since scrolling usually stopped after about 3000 wines and the process was time-consuming. 
The results page has to be sorted by either rating, price, discount or popularity.
So we scraped several links, each with a different sort order, to get wines across a range of prices and ratings. 

In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup

# Every call made through the connector is logged to this CSV file.
logfile = 'logfile_wine.csv'
connector = Connector(logfile)

# URL of the search page to scrape
url = 'https://www.vivino.com/explore?e=eJwNijEOgCAQBH-zNZpYXmfHE4wxJyK5RMDABfX30swUM7HQgCiJDCK_NE7GwH00W7gOi7vncFLjIl75Qt6psEoKdePmCwePTIevDo8ua5_dV7tFfwGHHeQ='

start_time = time.time()

# The connector returns the page html together with the id of the logged call
html, call_id = connector.get(url, "Searchpage")

# Parse the html into a soup
soup = BeautifulSoup(html, 'lxml')

elapsed = round(time.time() - start_time, 2)
print(f"--- {elapsed} seconds ---")

#### Extracting from soup

In [107]:
import pandas as pd
# Find the needed information in the soup and build a dataframe from it.

def _card_text(card, tag, css_class, index=0):
    """Return the text of the index-th ``tag`` with ``css_class`` in ``card``, or None if absent."""
    matches = card.find_all(tag, {'class': css_class})
    return matches[index].text if len(matches) > index else None


base_url = 'https://www.vivino.com'
soup_select = soup.find_all('div', {'class': 'explorerCard__explorerCard--3Q7_0 explorerPageResults__explorerCard--3q6Qe'})
urls = []
names = []
ratings = []
region = []

for card in soup_select:
    # All a tags (hyperlinks) of the card; the first one with an href is the wine's link.
    anchors = card.find_all('a', {'class': 'anchor__anchor--2QZvA'})
    hrefs = [base_url + anchor['href'] for anchor in anchors if anchor.has_attr('href')]
    if not hrefs:
        # The link is what identifies a wine later on, so a card without one is skipped.
        continue
    urls.append(hrefs[0])
    # Wine title
    names.append(_card_text(card, 'span', 'vintageTitle__wine--U7t9G'))
    # Total rating
    ratings.append(_card_text(card, 'div', 'vivinoRatingWide__averageValue--1zL_5'))
    # Region of the wine: the third location anchor on the card
    region.append(_card_text(card, 'a', 'anchor__anchor--2QZvA vintageLocation__anchor--T7J3k', index=2))

# The lists are zipped and saved to a dataframe for later use. Duplicates are dropped by link
# to avoid having two of the same wines in our dataset. The dataframe is then saved if needed.
df = pd.DataFrame(zip(names, ratings, region, urls), columns=['name', 'rating', 'region', 'link'])
vivino = df.drop_duplicates(subset=['link']).reset_index(drop=True)
#vivino.to_csv('vivino_random.csv')

vivino

Unnamed: 0,name,rating,region,link
0,Toscana 2008,4.7,Toscana,https://www.vivino.com/masseto-toscana/w/24467...
1,Toscana 2006,4.8,Toscana,https://www.vivino.com/masseto-toscana/w/24467...
2,Toscana 2015,4.7,Toscana,https://www.vivino.com/masseto-toscana/w/24467...
3,Toscana 2009,4.7,Toscana,https://www.vivino.com/masseto-toscana/w/24467...
4,Toscana 2007,4.7,Toscana,https://www.vivino.com/masseto-toscana/w/24467...
...,...,...,...,...
1932,Farnito Cabernet Sauvignon Toscana 2010,4.0,Toscana,https://www.vivino.com/carpineto-farnito-caber...
1933,Morellino di Scansano 2017,3.6,Morellino di Scansano,https://www.vivino.com/fattoria-le-pupille-mor...
1934,Barbera d'Alba 2016,3.8,Barbera d'Alba,https://www.vivino.com/pio-cesare-barbera-d-al...
1935,Bricco Ambrogio Barolo 2014,4.2,Barolo,https://www.vivino.com/lodali-barolo-bricco-am...


In [None]:
# Optional step: combine a previously saved dataframe with the current one.

# File to read the earlier dataframe from, and file to write the combined result to
input_csv = 'Insert file name.csv'
output_csv = 'Insert filename of your choice.csv'

# Load the other dataframe
df_loaded = pd.read_csv(input_csv, index_col=[0])

# Stack the two dataframes
merged = pd.concat([df_loaded, vivino])

# Remove duplicate rows and renumber the index
df3 = merged.drop_duplicates().reset_index(drop=True)

# Save the combined dataframe
df3.to_csv(output_csv)

# Step 2: Scraping each link

### Collecting data with selenium
In this part we need to call different libraries, most importantly selenium and random_user_agent.

We created a class called `Request` in which the function `get_selenium_res` is defined. The function uses the imported random_user_agent module to automatically obtain a random user agent for each selenium call we make. We then set some Chrome options, such as `--headless`, which means that the Chrome browser won't physically open on our machines; this reduces the load on the machine's CPU and makes it possible to loop over more than one URL. Another option that had to be set is `--window-size`: the window size must be stretched, since selenium will only return the DOM that it was required to load.  
   When opening a browser, we use WebDriverWait with a time frame of 90 seconds, during which Chrome will not close or return the html before a particular element is rendered. The element we first chose has the class 'indicatorBar__progress--3aXLX', which gives us the information on the different taste profiles. Later in the process we discovered that a few pages do not have this class name, so it had to be changed to 'vivinoRatingWide__averageValue--1zL_5'. The function returns the html of the page if it succeeds; if not, either a TimeoutException or a WebDriverException will occur. In case of an exception the request is automatically retried. 


In [266]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.common.exceptions import TimeoutException, WebDriverException,ElementNotVisibleException
from random_user_agent.user_agent import UserAgent 
from random_user_agent.params import SoftwareName, OperatingSystem 
from time import sleep
import logging
import traceback
from self import self
 
class Request:
    """Fetch the rendered html of one URL with headless Chrome, retrying on failure.

    Defining the class configures the root logger to write to 'TheScraping.log'
    (file mode 'w') and creates the shared 'wine_project.requests' logger.
    """

    # These are used to keep track of progress in the log
    selenium_retries = 0
    nr = 1

    # Configuring a new file to save the log
    logging.basicConfig(filename="TheScraping.log",
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        filemode='w')
    # Creating a logger object, in order to reference the same logger object
    logger = logging.getLogger('wine_project.requests')

    # Setting the logging level to info
    logger.setLevel(logging.INFO)

    def __init__(self, url, chromedriver_path='/Users/lilly/Desktop/chromedriver'):
        """Store the target url and the path to the chromedriver executable.

        url -- str, the page to fetch.
        chromedriver_path -- str, path to chromedriver; change it to match your machine.
        """
        self.url = url
        self.chromedriver_path = chromedriver_path

    def get_selenium_res(self, class_name, max_retries=10, time_to_wait=90):
        """Return the page html once an element with ``class_name`` is present.

        class_name -- str, class of the element that must be present before the
            html is returned.
        max_retries -- int or None, number of retries after the first attempt.
            ``None`` retries until the request succeeds.
        time_to_wait -- seconds WebDriverWait waits for the element per attempt.

        Raises TimeoutException or WebDriverException (the last one caught) when
        the retries are used up.
        """
        # This part is to generate various user agents
        software_names = [SoftwareName.CHROME.value]
        operating_systems = [OperatingSystem.WINDOWS.value,
                             OperatingSystem.LINUX.value]
        user_agent_rotator = UserAgent(software_names=software_names,
                                       operating_systems=operating_systems,
                                       limit=100)

        failures = 0
        # A loop instead of recursion: failures cannot exhaust the call stack,
        # and each attempt's browser is shut down before the next one starts.
        while True:
            # Keeping track of how many requests have been made to the same URL
            self.logger.info('sending request #: '+str(self.nr)+' to '+str(self.url))
            self.nr += 1

            browser = None
            try:
                user_agent = user_agent_rotator.get_random_user_agent()

                # Options for chrome are defined
                chrome_options = Options()
                chrome_options.add_argument("--headless")  # Chrome won't open a browser every time
                chrome_options.add_argument("--no-sandbox")  # The only way to get chrome to open headlessly
                chrome_options.add_argument("--window-size=1420,1080")  # Stretching the window size
                chrome_options.add_argument(f'user-agent={user_agent}')  # Inserting a random user-agent

                # Setting driver with the specific options and path to the chromedriver
                browser = webdriver.Chrome(self.chromedriver_path, options=chrome_options)

                # Sending request for the html
                browser.get(self.url)

                # WebDriverWait sets a time frame for the page to load; the DOM is not
                # returned until an element with class_name is present.
                WebDriverWait(browser, time_to_wait).until(
                    EC.presence_of_element_located((By.CLASS_NAME, class_name)))

            # If the request got no response within the time frame it is a sign of
            # getting blocked and a TimeoutException is raised; the attempt is retried.
            except (TimeoutException, WebDriverException):
                # Logging the error and counting selenium retries
                self.logger.error(traceback.format_exc())
                self.selenium_retries += 1
                self.logger.info('Selenium retry #: '+ str(self.selenium_retries))
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise

            # If no exception is raised this code runs and returns the page's html DOM.
            else:
                browser.maximize_window()
                page_html = browser.page_source
                # If succeeded the log will print success
                self.logger.info('Success')
                return page_html

            finally:
                # quit() ends the whole driver session; it runs on success and on failure.
                if browser is not None:
                    browser.quit()
        

#### Loading the csv file containing the links

In [344]:
import pandas as pd

# Read the saved csv into a pandas dataframe, keep one row per link
# and renumber the index.
vivino = (
    pd.read_csv('vivino_v7.csv', index_col=[0])
    .drop_duplicates(['link'])
    .reset_index(drop=True)
)

#### Looping through all links 

The last part of the loop extracts the remaining information from a script tag. Since these values have no attributes to select on, we need to use regex to find them. We tried to convert the script to JSON, but we weren't able to do so. Examining the script revealed some patterns that we could use to find and extract the information needed.  

In [337]:
from bs4 import BeautifulSoup
import re
import numpy as np
from tqdm.notebook import tqdm

def _first_text(node, tag, css_class):
    """Return the text of the first ``tag`` with ``css_class`` in ``node``, or None if absent."""
    found = node.find(tag, {'class': css_class})
    return None if found is None else found.text


def _price(soup):
    """Return the average price text, else the current price text, else NaN."""
    price = _first_text(soup, 'span', 'purchaseAvailabilityPPC__amount--2_4GT')
    if price is None:
        # There might not be an average price, but a current price.
        price = _first_text(soup, 'span', 'purchaseAvailability__currentPrice--3mO4u')
    return np.nan if price is None else price


def _total_ratings(soup):
    """Return the digits of the 'based on' text, or NaN when it is missing or empty."""
    text = _first_text(soup, 'div', 'vivinoRatingWide__basedOn--s6y0t')
    if not text:
        return np.nan
    # Remove anything that's not a digit, leaving the number.
    return re.sub(r'[^\d]', '', text)


def _total_reviews(soup):
    """Return the characters between 'på' and the next ASCII letter, or NaN when not found."""
    text = _first_text(soup, 'div', 'tasteCharacteristics__averageOfReviews--kut7x')
    if text is None:
        return np.nan
    # The number needed always follows the word 'på'.
    marker = re.search(r'(på)', text)
    if marker is None:
        return np.nan
    number = re.search(r'^(.+?)[a-zA-Z]', text[marker.end():])
    return number[1] if number else np.nan


def _taste_profile(soup):
    """Return four taste values taken from the indicator bars; NaN for each missing bar."""
    bars = soup.find_all('span', {'class': 'indicatorBar__progress--3aXLX'})
    values = []
    for position in range(4):
        if position < len(bars) and bars[position].has_attr('style'):
            # Keep only digits and '.' from the style text after its first 18 characters.
            values.append(re.sub(r'[^\d\.]', '', bars[position]['style'][18:]))
        else:
            values.append(np.nan)
    return values


def _alcohol(script):
    """Return the value after the second '"alcohol":' in ``script``, or NaN."""
    matches = list(re.finditer(r'("alcohol":)', script))
    # The script usually contains two of these and it's the second one we need;
    # with 0 or 1 match no alcohol % is declared.
    if len(matches) < 2:
        return np.nan
    value = re.search(r'^(.+?),', script[matches[1].end():])
    # Some turned out not to be the right alc. pct, so NaN is used for those.
    if value is None or re.match(r'(minimum)', value[1]):
        return np.nan
    return re.sub(r'\}', '', value[1])


def _value_after_key(script, key):
    """Return the text between ``key`` (skipping the 2 characters after it) and the next comma, or NaN."""
    start = script.find(key)
    if start == -1:
        return np.nan
    value = re.search(r'^(.+?),', script[start + len(key) + 2:])
    return value[1] if value else np.nan


def _winery(soup):
    """Return the winery link text with newlines removed, or NaN when there is no such link."""
    text = _first_text(soup, 'a', 'winery')
    return np.nan if text is None else re.sub("(\n)", "", text)


B = []

for a in tqdm(vivino['link'][0:180]):
    # Create a soup from the html returned by Request.get_selenium_res.
    # The class name to wait for is passed as a string.
    page_html = Request(a).get_selenium_res('vivinoRatingWide__averageValue--1zL_5')
    soup = BeautifulSoup(page_html, 'lxml')

    # The rest of the variables can all be found in a script tag.
    script = str(soup.find_all("script"))

    # One row per wine, in the column order used when the dataframe is created.
    A = [_price(soup), _total_ratings(soup), _total_reviews(soup)]
    A.extend(_taste_profile(soup))
    A.append(_alcohol(script))
    A.append(_value_after_key(script, 'residual_sugar_grams_per_liter'))
    A.append(_value_after_key(script, 'acidity_grams_per_liter'))
    A.append(_winery(soup))

    B.append(A)

HBox(children=(FloatProgress(value=0.0, max=180.0), HTML(value='')))




#### Creating dataframe

The B retrieved from the loop is a list of lists, and is easily converted to a dataframe. The columns are:

- price
- total_ratings
- total_reviews
- light-bold
- smooth-tannic
- dry-sweet
- soft-acidic
- alcohol_pct
- residual_sugar_g/l
- acidity_g/l
- winery


In [345]:
# Column labels, in the order the values were collected for each wine
columns = [
    'price', 'total_ratings', 'total_reviews', 'light-bold',
    'smooth-tannic', 'dry-sweet', 'soft-acidic', 'alcohol_pct',
    'residual_sugar_g/l', 'acidity_g/l', 'winery',
]

# Turn the list of lists B into a dataframe
df_new = pd.DataFrame(B, columns=columns)
#df_new.head()

In the beginning we were only able to loop through a few links at a time, so we had to merge dataframes regularly. 
The code has since been improved so that it can loop through as many links as we wish. 

In [340]:
# Append the newly scraped rows (df_new) to the previous dataframe (df_pre)
# and renumber the index.
merged = pd.concat([df_pre, df_new], ignore_index=True)

# Keep the combined dataframe as the new "previous" one and save it to disk.
df_pre = merged
df_pre.to_csv('Vivino_dataset.csv')

In [63]:
# Optional step: combine a previously saved dataframe with the current one.

# File to read the earlier dataframe from, and file to write the combined result to
input_csv = 'Insert file name.csv'
output_csv = 'Insert filename of your choice.csv'

# Load the other dataframe
df_loaded = pd.read_csv(input_csv, index_col=[0])

# Stack the two dataframes
merged = pd.concat([df_loaded, vivino])

# Remove duplicate rows and renumber the index
df3 = merged.drop_duplicates().reset_index(drop=True)

# Save the combined dataframe
df3.to_csv(output_csv)
