In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import pandas as pd
import pickle

In [2]:
def get_style_links():
    """
    Build the list of style-page URLs from the Beer Advocate styles index.

    Takes no arguments; the styles index URL is fixed inside the function.

    Every anchor whose markup contains 'beer/styles' is collected. The first two
    matches are skipped, the fourth '/'-separated token of each remaining
    anchor's markup is taken as the style number, the tokens are sorted as
    strings (so '10' sorts before '2'), and the first sorted entry is dropped.
    NOTE(review): the two skips and the dropped entry presumably remove
    navigation links that carry no style number - confirm against the live page.

    Returns:
        list of str: the styles index URL with each remaining token appended.
    """
    url = 'https://www.beeradvocate.com/beer/styles/'
    soup = BeautifulSoup(requests.get(url).text, "lxml")

    style_anchors = [anchor for anchor in soup.find_all('a') if 'beer/styles' in str(anchor)]

    # Token 3 of '<a href="/beer/styles/NN/">' split on '/' is the NN part.
    style_ids = [str(anchor).split('/')[3] for anchor in style_anchors[2:]]
    style_ids.sort()

    # Lexicographic order; the first sorted entry is discarded.
    return [url + style_id for style_id in style_ids[1:]]

In [32]:
# Fetch the style links once for the cells below; index 28 is the position
# compile_beer_info resumes from.
style_links = get_style_links()
style_links[28]

'https://www.beeradvocate.com/beer/styles/16'

In [5]:
def get_brew_beer_links(url):
    """
    Collect the beer-profile URLs linked from one style page.

    The function is called 'get_brew_beer_links' rather than 'get_beer_links'
    because every beer page lives under its brewery:
    'https://www.beeradvocate.com/beer/profile/' + Brewery_Number + '/' + Beer_Number

    Args:
        url: URL of a style page.

    Returns:
        list of str: one profile URL (with trailing slash) per qualifying
        anchor, in page order. Duplicates are not removed.
    """
    prof_url = 'https://www.beeradvocate.com/beer/profile/'
    soup = BeautifulSoup(requests.get(url).text, "lxml")

    followable_links = []
    for anchor in soup.find_all('a'):
        markup = str(anchor)
        if '/beer/profile/' not in markup:
            continue

        pieces = markup.split('/')
        # Only anchors whose markup splits into more than six parts are kept;
        # shorter ones (e.g. a brewery-only link) are ignored.
        if len(pieces) <= 6:
            continue

        # Tokens 3 and 4 are the brewery number and the beer number.
        brew, beer = pieces[3:5]
        followable_links.append(prof_url + brew + '/' + beer + '/')

    return followable_links

In [13]:
# Smoke test: collect the beer links on the first style page and keep the first
# one as the sample input for get_beer_info further down.
brew_beer_links = get_brew_beer_links(style_links[0])
brew_beer_link = brew_beer_links[0]
print(brew_beer_link)

https://www.beeradvocate.com/beer/profile/388/5281/


In [23]:
def get_style_max(url):
    """
    Return the highest pagination offset ("start" value) of a style page.

    compile_beer_info uses this value as the upper bound of its paging loop so
    it never requests a page beyond the last one for a style.

    Args:
        url: URL of a style page.

    Returns:
        int: the number found at the end of the selected link, or 0 when that
        link is missing or does not end in a number (per the original notes,
        this is the case for styles with fewer than 50 beers).
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    try:
        # The link is selected purely by position: the 137th anchor from the end.
        # NOTE(review): this index depends on the page layout - confirm it still
        # points at the "last page" link.
        style_max_ref = soup.find_all('a')[-137]
        # Splitting the markup on '=' and '"' leaves the final attribute value
        # (the start offset) as the second-to-last token.
        style_max = int(re.split(r'=|"', str(style_max_ref))[-2])
    except (IndexError, ValueError):
        # Too few anchors, or the selected anchor does not end in a number.
        # Only these expected failures are mapped to 0; anything else propagates
        # instead of being silently hidden by a bare except.
        style_max = 0

    return style_max

In [15]:
# Smoke test: pagination maximum reported for the first style page.
style_max = get_style_max(style_links[0])
style_max

500

**TODO:** Decide whether to also track each beer's score relative to other beers of the same style. I don't think the scraper captures that yet.

In [16]:
def _parse_beer_stats(soup):
    """Return the seven numeric page stats, or seven NaNs when they cannot be read reliably."""
    missing = [np.nan] * 7
    stats = []

    # The stats are read by position from <dd> elements 5..12; blank entries are skipped.
    for item in soup.find_all('dd')[5:13]:
        text = str(item.text).strip()
        if not text:
            continue
        cleaned = re.sub(r'#|%|,', '', text)
        try:
            stats.append(float(cleaned) if '.' in cleaned else int(cleaned))
        except ValueError:
            # A non-numeric entry means the positional layout is not what we expect.
            return missing

    # The slice can hold eight entries but exactly seven are unpacked by the caller,
    # so both too few and too many values fall back to NaN.
    if len(stats) != 7:
        return missing
    return stats


def get_beer_info(url):
    """
    Scrape one beer page and return its title plus a list of extracted fields.

    Every field is extracted on a best-effort basis: when a piece of markup is
    missing or has an unexpected shape, string fields fall back to 'NA' and
    numeric fields fall back to NaN instead of raising.

    Args:
        url: URL of a beer profile page.

    Returns:
        tuple: (key, values)
            key (str): text of the page's 'titleBar' element with newlines
                removed (e.g. "Beer Name | Brewery Name"), or 'NA'.
            values (list): 17 items, in this order:
                score: average rating (float).
                beer_class: categorical rating such as Outstanding, Good, Okay.
                ranking: rank of the beer on Beer Advocate.
                reviews: number of reviews shown on the page.
                ratings: number of ratings shown on the page (ratings are
                    numeric only, reviews also include text).
                pDev: percent deviation of the ratings.
                wants: number of users who want the beer.
                gots: number of users who have the beer.
                trade: number of users willing to trade for the beer.
                brew: name of the brewery.
                region: where the brewery is located.
                site: website of the brewery.
                style: style of the beer.
                abv: alcohol by volume (float).
                availability: e.g. seasonal, winter, year-round.
                comm_desc: notes / commercial description.
                date_added: date the beer was added to Beer Advocate.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    try:
        beer_brew_names = str(soup.find(class_='titleBar').text).replace('\n', '')
    except AttributeError:  # no titleBar element on the page
        beer_brew_names = 'NA'

    try:
        score_obj = soup.find('span', {'class': 'BAscore_big'})
        # The number is the fifth token once the markup is split on angle brackets.
        score = float(re.split(r'>|<', str(score_obj))[4])
    except (IndexError, ValueError):
        score = np.nan

    try:
        # NOTE(review): the fifth <b> element is assumed to hold the class label - layout dependent.
        beer_class = re.split(r'>|<', str(soup.find_all('b')[4]))[2]
    except IndexError:
        beer_class = 'NA'

    ranking, reviews, ratings, pDev, wants, gots, trade = _parse_beer_stats(soup)

    # NOTE(review): anchors 119..121 are assumed to be brewery, region and website - layout dependent.
    brew_texts = [anchor.text for anchor in soup.find_all('a')[119:122]]
    if len(brew_texts) != 3:
        # Pages with fewer anchors previously crashed the unpacking below.
        brew_texts = ['NA', 'NA', 'NA']
    brew, region, site = brew_texts

    # The remaining fields are all cut out of the info box markup.
    info_html = str(soup.find('div', attrs={'id': 'info_box'}))

    try:
        info_sub_div = re.split(r'</b></a>\n<br/><br/>\n<b>', info_html)[-2]
        style = info_sub_div.split('/"><b>')[-1]
    except IndexError:
        style = 'NA'

    # Read the whole number after the ABV label. A fixed-width slice would
    # truncate values such as 10.25 to 10.2.
    abv = np.nan
    abv_parts = info_html.split('(ABV):</b>')
    if len(abv_parts) > 1:
        abv_match = re.match(r'\s*(\d+(?:\.\d+)?)', abv_parts[1])
        if abv_match:
            abv = float(abv_match.group(1))

    try:
        availability = info_html.split('Availability:</b> ')[1].split('\n')[0]
    except IndexError:
        availability = 'NA'

    try:
        comm_desc = info_html.split('Description:</b>\n<br/>\n')[1].split('<br')[0]
    except IndexError:
        comm_desc = 'NA'

    try:
        # The date is the last word of the second-to-last '<br/><br/>' separated section.
        date_added = info_html.split('<br/><br/>')[-2].split()[-1]
    except IndexError:
        date_added = 'NA'

    values = [score, beer_class, ranking, reviews, ratings, pDev,
              wants, gots, trade, brew, region, site, style, abv,
              availability, comm_desc, date_added]

    return beer_brew_names, values

In [18]:
# Smoke test: scrape the sample beer page and show the returned key and value list.
key, values = get_beer_info(brew_beer_link)
print(key)
print(values)

Cantillon Fou' Foune | Brasserie Cantillon
['4.65', 'World-Class', '23', '562', '3079', '7.31', '2811', '669', '58', 'Brasserie Cantillon', 'Belgium', 'cantillon.be', 'Belgian Fruit Lambic', '5.00', 'Rotating', 'Apricot Lambic', '08-05-2002']


In [19]:
def open_new_style_page(url,counter):
    """
    Build the URL of one results page of a style.

    Args:
        url: URL of a style page (string).
        counter: the current sort-by offset; it is converted with str() and
            used as the 'start' value.

    Returns:
        str: url followed by the sort query string and the offset.
    """
    query_suffix = '/?sort=revsD&start=' + str(counter)
    return url + query_suffix
    

In [20]:
# Smoke test: URL of the first results page (start=0) of the first style.
open_new_style_page(style_links[0],0)

'https://www.beeradvocate.com/beer/styles/10/?sort=revsD&start=0'

## Master Function

In [None]:
def compile_beer_info(start_index=28):
    """
    Crawl the style pages and pickle one DataFrame per results page.

    For every style link from get_style_links (starting at start_index) the
    function looks up the style's highest pagination offset with get_style_max
    and then walks the results pages in steps of 50. For each results page it
    scrapes every linked beer with get_beer_info, builds a DataFrame with one
    row per beer (indexed by the beer's key) and pickles it to the current
    working directory.

    Args:
        start_index: position in the style link list at which to start. The
            default of 28 keeps the original "pick up where we left off"
            behaviour; pass 0 to crawl every style.

    Returns:
        None. The output is the set of pickle files, each named
        '<first number in the page URL>_<last number in the page URL>'
        (the style number and the start offset).
    """
    style_links = get_style_links()[start_index:]

    for style_link in style_links:

        style_max = get_style_max(style_link)

        # Results are paged in groups of 50; counter is the 'start' offset of
        # the page being processed.
        counter = 0

        while counter <= style_max:

            url = open_new_style_page(style_link, counter)

            # Beers that share a key overwrite each other within a page.
            dic = {}
            for brew_beer_link in get_brew_beer_links(url):
                key, values = get_beer_info(brew_beer_link)
                dic[key] = values

                # Short random pause (about 0.1 s on average) between beer requests.
                time.sleep(np.random.poisson(10)/100)

            string = re.findall(r'\d+', url)
            name = str(string[0]) + '_' + str(string[-1])

            # Keys become the index and the value lists become the rows.
            df = pd.DataFrame(dic).transpose()
            df.to_pickle(name)

            counter += 50

    return

In [35]:
%%time
# Run the full crawl; %%time reports how long the cell took.
compile_beer_info()