# Step 2: Scraping each link

### Collecting data with selenium
In this part we import several libraries, most importantly `selenium` (to drive a headless Chrome browser) and `random_user_agent` (to rotate the user agent between requests).

In [266]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.common.exceptions import TimeoutException, WebDriverException,ElementNotVisibleException
from random_user_agent.user_agent import UserAgent 
from random_user_agent.params import SoftwareName, OperatingSystem 
from time import sleep
import logging
import traceback
from self import self
 
class Request:
    """Fetch the rendered HTML of one URL with headless Chrome, retrying on failure.

    Each attempt starts a fresh Chrome session with a randomly chosen user
    agent, loads ``url`` and waits until an element with a given class name
    is present. Attempts that time out or hit a WebDriver error are logged
    and repeated.

    The counters ``nr`` and ``selenium_retries`` start from the class-level
    defaults and are incremented on the instance, so they are tracked per
    ``Request`` object.
    """

    # These are used to keep track of progress in the log.
    selenium_retries = 0
    nr = 1

    # Path to the chromedriver executable. NB: change this to the local path.
    chromedriver_path = '/Users/lilly/Desktop/chromedriver'

    # Seconds to wait for the awaited element before an attempt is given up.
    time_to_wait = 90

    # Configuring a new file to save the log (overwritten on every run).
    logging.basicConfig(filename="TheScraping.log",
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        filemode='w')
    # Creating a logger object, so every instance refers to the same logger.
    logger = logging.getLogger('wine_project.requests')

    # Setting the logging level to info.
    logger.setLevel(logging.INFO)

    def __init__(self, url):
        """Store the URL that this request object will fetch."""
        self.url = url

    def _new_browser(self):
        """Start a headless Chrome session that uses a random user agent."""
        # Pick a random Chrome user agent for Windows or Linux.
        software_names = [SoftwareName.CHROME.value]
        operating_systems = [OperatingSystem.WINDOWS.value,
                             OperatingSystem.LINUX.value]
        user_agent_rotator = UserAgent(software_names=software_names,
                                       operating_systems=operating_systems,
                                       limit=100)
        user_agent = user_agent_rotator.get_random_user_agent()

        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Chrome won't open a browser window
        chrome_options.add_argument("--no-sandbox")  # Needed to get Chrome to start headlessly
        chrome_options.add_argument("--window-size=1420,1080")  # Stretching the window size
        chrome_options.add_argument(f'user-agent={user_agent}')  # Inserting the random user agent

        return webdriver.Chrome(self.chromedriver_path, options=chrome_options)

    def get_selenium_res(self, class_name, max_retries=None):
        """Return the page source of ``self.url`` once ``class_name`` is present.

        Args:
            class_name: CSS class name of the element whose presence signals
                that the page has loaded.
            max_retries: Maximum number of failed attempts to retry within
                this call. ``None`` (the default) retries until an attempt
                succeeds.

        Returns:
            The HTML of the loaded page as a string.

        Raises:
            TimeoutException, WebDriverException: re-raised from the last
                failed attempt once ``max_retries`` retries have been used.
        """
        failures = 0
        # A loop instead of a recursive call: a URL that keeps failing can no
        # longer exhaust the interpreter's recursion limit.
        while True:
            # Keeping track of how many requests have been made to the same URL.
            self.logger.info('sending request #: %s to %s', self.nr, self.url)
            self.nr += 1

            browser = None
            try:
                browser = self._new_browser()

                # Sending request for the html.
                browser.get(self.url)

                # WebDriverWait sets a time frame for the page to load; it
                # returns only when an element with class_name is present.
                WebDriverWait(browser, self.time_to_wait).until(
                    EC.presence_of_element_located((By.CLASS_NAME, class_name)))

            # If no response is obtained within the time frame, that is taken
            # as a sign of being blocked; the attempt is logged and repeated.
            except (TimeoutException, WebDriverException):
                # Logging the error and counting selenium retries.
                self.logger.error(traceback.format_exc())
                self.selenium_retries += 1
                self.logger.info('Selenium retry #: %s', self.selenium_retries)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise

            # If no exception is raised, return the page's html dom.
            else:
                browser.maximize_window()
                page_html = browser.page_source
                self.logger.info('Success')
                return page_html

            finally:
                # Always end the session, also after a failed attempt, so
                # that retries do not pile up Chrome/chromedriver processes.
                if browser is not None:
                    try:
                        browser.quit()
                    except WebDriverException:
                        self.logger.warning('Could not quit the browser cleanly')
        

#### Loading the csv file containing the links

In [344]:
import pandas as pd

# Read the link table into a pandas DataFrame (the first CSV column is the
# index), keep only the first row for each distinct 'link', and renumber
# the rows from zero.
vivino = (
    pd.read_csv('vivino_v7.csv', index_col=[0])
    .drop_duplicates(['link'])
    .reset_index(drop=True)
)

#### Looping through all links 

In the last part of the loop, the remaining information is contained in `script` tags. Because that content has no attributes to select on, we use regular expressions to find it. We tried to parse the script content as JSON but were not able to do so. Examining the script text revealed recurring patterns, which we use to locate and extract the values we need.

In [337]:
from bs4 import BeautifulSoup
import re
import numpy as np
from tqdm.notebook import tqdm

def _price(soup):
    """Return the price text of the page, or NaN when no price is shown.

    The average-price element is preferred; if it is absent, the
    current-price element is used instead.
    """
    for class_name in ('purchaseAvailabilityPPC__amount--2_4GT',
                       'purchaseAvailability__currentPrice--3mO4u'):
        node = soup.find('span', {'class': class_name})
        if node is not None:
            return node.text
    return np.nan


def _total_ratings(soup):
    """Return the digits of the total-ratings text, or NaN if unavailable."""
    node = soup.find('div', {'class': 'vivinoRatingWide__basedOn--s6y0t'})
    # Check the element itself before reading its text; a page without the
    # element would otherwise raise instead of yielding NaN.
    if node is None or not node.text:
        return np.nan
    # Remove anything that is not a digit, leaving only the number.
    return re.sub(r'[^\d]', '', node.text)


def _total_reviews(soup):
    """Return the total number of reviews as text, or NaN if unavailable.

    The number is taken from the text that follows the word 'på' in the
    reviews element, up to (not including) the next ASCII letter.
    """
    node = soup.find('div', {'class': 'tasteCharacteristics__averageOfReviews--kut7x'})
    if node is None:
        return np.nan
    text = str(node.text)
    marker = re.search(r'(på)', text)
    if marker is None:
        return np.nan
    # Only the part after 'på' is searched for the number.
    number = re.search(r'^(.+?)[a-zA-Z]', text[marker.end():])
    return number[1] if number is not None else np.nan


def _taste_profile(soup):
    """Return the four taste-profile values, or four NaNs if unavailable.

    When fewer than four indicator bars are found, all four values are NaN:
    it cannot be told from the bars alone which characteristic is missing,
    so filling in positions could put values in the wrong columns.
    """
    bars = soup.find_all('span', {'class': 'indicatorBar__progress--3aXLX'})
    if len(bars) < 4:
        return [np.nan] * 4
    # Skip the first 18 characters of the style attribute and keep only
    # digits and '.' from the rest.
    # NOTE(review): the offset 18 presumably skips a fixed style prefix - confirm.
    return [re.sub(r'[^\d\.]', '', bar['style'][18:]) for bar in bars[:4]]


def _alcohol_pct(script):
    """Return the alcohol percentage found in the script text, or NaN.

    The script usually contains two '"alcohol":' keys and the value after
    the second one is the one needed; with fewer than two occurrences no
    alcohol percentage is declared.
    """
    matches = list(re.finditer(r'("alcohol":)', script))
    if len(matches) < 2:
        return np.nan
    # Text right after the second '"alcohol":', up to the next comma.
    value = re.search(r'^(.+?),', script[matches[1].end():])
    # Some values turned out not to be the alcohol percentage (they start
    # with 'minimum'), so NaN is used for those.
    if value is None or re.match(r'(minimum)', value[1]):
        return np.nan
    return re.sub(r'\}', '', value[1])


def _value_after_key(script, key):
    """Return the text following ``key`` in the script up to the next comma.

    Two characters after the key are skipped before the value starts.
    Returns NaN when the key, or a comma on the same line after it, is not
    found.
    """
    start = script.find(key)
    if start == -1:
        return np.nan
    value = re.search(r'^(.+?),', script[start + len(key) + 2:])
    return value[1] if value is not None else np.nan


def _winery(soup):
    """Return the winery name with newlines removed, or NaN if not present."""
    node = soup.find('a', {'class': 'winery'})
    if node is None:
        return np.nan
    # The name is surrounded by newline characters, which are removed.
    return re.sub("(\n)", "", node.text)


# One row per scraped link; converted to a DataFrame afterwards.
B = []

for link in tqdm(vivino['link'][0:180]):
    # Fetch the rendered page with the Request class and parse it with
    # BeautifulSoup. The class name passed as a string is the element that
    # must be present before the page source is returned.
    page_html = Request(link).get_selenium_res('vivinoRatingWide__averageValue--1zL_5')
    soup = BeautifulSoup(page_html, 'lxml')

    # The rest of the variables can all be found in script tags, which are
    # searched as one string.
    script = str(soup.find_all("script"))

    # Column order: price, total ratings, total reviews, the four taste
    # profile values, alcohol pct, residual sugar g/l, acidity g/l, winery.
    B.append([
        _price(soup),
        _total_ratings(soup),
        _total_reviews(soup),
        *_taste_profile(soup),
        _alcohol_pct(script),
        _value_after_key(script, 'residual_sugar_grams_per_liter'),
        _value_after_key(script, 'acidity_grams_per_liter'),
        _winery(soup),
    ])

HBox(children=(FloatProgress(value=0.0, max=180.0), HTML(value='')))




#### Creating dataframe

The B retrieved from the loop is a list of lists, and is easily converted to a dataframe. The columns are:

- price
- total_ratings
- total_reviews
- light-bold
- smooth-tannic
- dry-sweet
- soft-acidic
- alcohol_pct
- residual_sugar_g/l
- acidity_g/l
- winery


In [345]:
# Build the dataframe from the list of rows collected in the loop; the
# column names follow the order in which the values were appended.
df_new = pd.DataFrame(
    data=B,
    columns=[
        'price',
        'total_ratings',
        'total_reviews',
        'light-bold',
        'smooth-tannic',
        'dry-sweet',
        'soft-acidic',
        'alcohol_pct',
        'residual_sugar_g/l',
        'acidity_g/l',
        'winery',
    ],
)
# Uncomment to preview the first rows:
#df_new.head()

In the beginning we were only able to loop through a few links at a time, so we had to merge dataframes regularly.
The code has since been improved so that it can loop through as many links as we wish in one run.

In [340]:
# Append the newly scraped rows to the previously collected dataframe
# df_pre and renumber the rows from zero.
merged = pd.concat([df_pre, df_new], ignore_index=True)
# Remember to save the merged result: it becomes the new df_pre for the
# next batch and is written to disk.
df_pre = merged
df_pre.to_csv('Vivino_dataset.csv')