In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import pandas as pd
import pickle

In [2]:
def get_style_links():
    """Build the list of per-style listing URLs from the styles index page.

    The styles index is used as the entry point because it is the easiest
    page from which every beer can be reached.

    Returns
    -------
    list of str
        The styles index URL with one extracted path segment appended per
        entry, ordered by a plain string sort of those segments.
    """
    url = 'https://www.beeradvocate.com/beer/styles/'
    soup = BeautifulSoup(requests.get(url).text, "lxml")

    # Keep only anchors whose rendered tag mentions the styles path.
    style_anchors = [a for a in soup.find_all('a') if 'beer/styles' in str(a)]

    # The first two matching anchors are skipped, the fourth '/'-separated
    # piece of each remaining tag is taken as the style number, and the
    # first entry of the string-sorted result is dropped.
    # NOTE(review): presumably these skips discard non-style navigation
    # links; confirm against the live page layout.
    styles_nums = sorted(str(a).split('/')[3] for a in style_anchors[2:])[1:]

    return [url + num for num in styles_nums]

In [28]:
# Fetch every style listing URL, then display the entry at index 20.
style_links = get_style_links()
style_links[20]

'https://www.beeradvocate.com/beer/styles/15'

In [5]:
def get_brew_beer_links(url):
    """Collect the beer profile URLs linked from one style listing page.

    Parameters
    ----------
    url : str
        Address of a style listing page.

    Returns
    -------
    list of str
        One 'https://www.beeradvocate.com/beer/profile/<brewery>/<beer>/'
        URL per qualifying anchor, in page order. Empty when the page has
        no such anchors.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    prof_url = 'https://www.beeradvocate.com/beer/profile/'
    followable_links = []

    for link in soup.find_all('a'):
        link_str = str(link)
        if '/beer/profile/' not in link_str:
            continue

        parts = link_str.split('/')
        # Only tags with more than six '/'-separated pieces carry both a
        # brewery number and a beer number; shorter ones are skipped.
        if len(parts) > 6:
            brew, beer = parts[3:5]
            followable_links.append(prof_url + brew + '/' + beer + '/')

    # The previous version also indexed the 118th anchor here to compute a
    # "style max" that was never returned; that lookup raised IndexError on
    # short pages. get_style_max() is the function that provides that value.
    return followable_links

In [13]:
# Collect the beer profile links from the first style page and print the first one.
brew_beer_links = get_brew_beer_links(style_links[0])
brew_beer_link = brew_beer_links[0]
print(brew_beer_link)

https://www.beeradvocate.com/beer/profile/388/5281/


In [23]:
def get_style_max(url):
    """Return the highest paging offset advertised on a style listing page.

    Parameters
    ----------
    url : str
        Address of a style listing page.

    Returns
    -------
    int
        The integer parsed from the anchor 137 positions from the end of
        the page, or 0 when that anchor is missing or does not end in a
        number.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    try:
        # NOTE(review): the -137 offset presumably points at the "last"
        # pagination link in the current page layout; confirm if the site
        # markup changes.
        style_max_ref = soup.find_all('a')[-137]
        # Split the rendered tag on '=' and '"'; the second-to-last piece
        # is expected to be the numeric offset.
        return int(re.split(r'=|"', str(style_max_ref))[-2])
    except (IndexError, ValueError):
        # Too few anchors, or the piece was not numeric. Only these two
        # failures are treated as "no paging"; interrupts and other
        # unexpected errors propagate instead of being silently swallowed.
        return 0

In [15]:
# Look up the paging upper bound for the first style and display it.
style_max = get_style_max(style_links[0])
style_max

500

**TODO:** Should the score within each beer style also be tracked separately? It does not appear to be captured yet.

In [16]:
def get_beer_info(url):
    """Scrape one beer profile page and return its title and feature values.

    Parameters
    ----------
    url : str
        Address of a beer profile page.

    Returns
    -------
    key : str
        Text of the page's ``titleBar`` element with newlines removed, or
        'NA' when that element is absent.
    values : list
        Seventeen entries, in this order: score, beer_class, ranking,
        reviews, ratings, pDev, wants, gots, trade, brew, region, site,
        style, abv, availability, comm_desc, date_added. A field that
        cannot be located is 'NA' (``np.nan`` for abv). All located fields
        are returned as strings.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")

    # --- Title ("beer | brewery") -------------------------------------
    try:
        title_div = soup.find(class_='titleBar')
        beer_brew_names = str(title_div.text).replace('\n', '')
    except AttributeError:
        # soup.find returned None: no title bar on this page.
        beer_brew_names = 'NA'

    # --- Overall score and its verbal class ---------------------------
    try:
        score_obj = soup.find('span', {'class': 'BAscore_big'})
        score = re.split(r'>|<', str(score_obj))[4]
    except IndexError:
        score = 'NA'

    try:
        beer_class_obj = soup.find_all('b')[4]
        beer_class = re.split(r'>|<', str(beer_class_obj))[2]
    except IndexError:
        beer_class = 'NA'

    # --- Stats block --------------------------------------------------
    # Eight <dd> cells are inspected; blank ones are dropped and '#', '%'
    # and ',' are stripped from the rest.
    stats_list = []
    for item in soup.find_all('dd')[5:13]:
        text = str(item.text).strip()
        if text:
            stats_list.append(re.sub(r'#|%|,', '', text))

    # Exactly seven values are needed. Any other count (too few, or all
    # eight cells populated) cannot be mapped onto the seven fields, so
    # the whole group is reported as missing instead of raising.
    if len(stats_list) != 7:
        stats_list = ['NA'] * 7

    ranking, reviews, ratings, pDev, wants, gots, trade = stats_list

    # --- Brewery name, region and website -----------------------------
    # NOTE(review): the 119:122 anchor offsets presumably match the
    # current profile page layout; confirm if the site markup changes.
    brew_objs_list = [obj.text for obj in soup.find_all('a')[119:122]]
    if len(brew_objs_list) != 3:
        # Short page: report the three fields as missing instead of
        # failing on the unpacking below.
        brew_objs_list = ['NA'] * 3
    brew, region, site = brew_objs_list

    # --- Fields parsed out of the rendered info box -------------------
    # When the info box is absent this is the string 'None', and every
    # lookup below falls through to its missing-value default.
    info_html = str(soup.find('div', attrs={'id': 'info_box'}))

    try:
        info_sub_div = re.split(r'</b></a>\n<br/><br/>\n<b>', info_html)[-2]
        style = info_sub_div.split('/"><b>')[-1]
    except IndexError:
        style = 'NA'

    try:
        # Four characters after the separator following the ABV label.
        abv = info_html.split('(ABV):</b>')[1][1:5]
    except IndexError:
        # ABV is not provided for every beer.
        abv = np.nan

    try:
        availability_str = info_html.split('Availability:</b> ')[1]
        availability = availability_str.split('\n')[0]
    except IndexError:
        availability = 'NA'

    try:
        comm_desc_str = info_html.split('Description:</b>\n<br/>\n')[1]
        comm_desc = comm_desc_str.split('<br')[0]
    except IndexError:
        comm_desc = 'NA'

    try:
        date_added_str = info_html.split('<br/><br/>')[-2]
        date_added = date_added_str.split()[-1]
    except IndexError:
        date_added = 'NA'

    key = beer_brew_names

    values = [score, beer_class, ranking, reviews, ratings, pDev,
              wants, gots, trade, brew, region, site, style, abv,
              availability, comm_desc, date_added]

    return key, values

In [18]:
# Try the scraper on a single beer profile page and print what it returns.
key, values = get_beer_info(brew_beer_link)
print(key)
print(values)

Cantillon Fou' Foune | Brasserie Cantillon
['4.65', 'World-Class', '23', '562', '3079', '7.31', '2811', '669', '58', 'Brasserie Cantillon', 'Belgium', 'cantillon.be', 'Belgian Fruit Lambic', '5.00', 'Rotating', 'Apricot Lambic', '08-05-2002']


In [19]:
def open_new_style_page(url,counter):
    """Return the URL of one results page within a style listing.

    Parameters
    ----------
    url : str
        Base URL of the style listing.
    counter : int
        Value placed in the ``start`` query parameter.

    Returns
    -------
    str
        ``url`` followed by the ``sort=revsD`` query string and the offset.
    """
    pieces = [url, '/?sort=revsD&start=', str(counter)]
    return ''.join(pieces)
    

In [20]:
# Build and display the URL of the start=0 results page for the first style.
open_new_style_page(style_links[0],0)

'https://www.beeradvocate.com/beer/styles/10/?sort=revsD&start=0'

## Master Function

In [29]:
def compile_beer_info(start_index=20):
    """Scrape every beer of every style and pickle one DataFrame per results page.

    Parameters
    ----------
    start_index : int, optional
        Position in the list returned by get_style_links() at which to
        start. Styles before it are skipped, which allows an interrupted
        run to be resumed. Defaults to 20, the offset that was previously
        hard-coded.

    Returns
    -------
    None
        Results are written to disk: for each results page a DataFrame
        (one row per beer, one column per feature) is pickled to a file in
        the current working directory named '<first>_<last>', where the
        two parts are the first and last runs of digits in the page URL
        (for example '10_0' for '.../styles/10/?sort=revsD&start=0').
    """
    style_links = get_style_links()[start_index:]

    for style_link in style_links:

        style_max = get_style_max(style_link)

        # Each style link has multiple pages. The sort suffix is appended
        # to the style link, and the counter tells the site where the next
        # loaded page starts. Pages are requested in steps of 50.
        counter = 0

        while counter <= style_max:

            # NOTE(review): beers are keyed by their title text, so two
            # beers with the same title on one page overwrite each other.
            dic = {}

            url = open_new_style_page(style_link, counter)

            for brew_beer_link in get_brew_beer_links(url):

                key, values = get_beer_info(brew_beer_link)
                dic[key] = values

                # Short randomized pause between profile requests.
                time.sleep(np.random.poisson(10)/100)

            string = re.findall(r'\d+', url)
            name = str(string[0]) + '_' + str(string[-1])

            # Keys become the row index after the transpose.
            df = pd.DataFrame(dic).transpose()
            df.to_pickle(name)

            counter += 50

    return

In [None]:
%%time
# Run the full scrape; one pickle file is written per results page.
compile_beer_info()

In [144]:
# Count the entries in test2.
# NOTE(review): test2 is not defined in any visible cell; presumably a dict
# of scraped results left over in the kernel. Confirm where it comes from.
len(list(test2.keys()))

524

In [145]:
# Build a frame from test2 (not defined in any visible cell; see NOTE(review) on its origin).
df2 = pd.DataFrame(test2)

In [146]:
# Flip rows and columns; re-running this cell alone flips the frame back.
df2 = df2.transpose()

In [147]:
# Summarize the index, column dtypes and non-null counts.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 524 entries, Cantillon Fou' Foune | Brasserie Cantillon to Amuse Gooseberry | The Wild Beer Co.
Data columns (total 17 columns):
0     524 non-null object
1     524 non-null object
2     524 non-null object
3     524 non-null object
4     524 non-null object
5     524 non-null object
6     524 non-null object
7     524 non-null object
8     524 non-null object
9     524 non-null object
10    524 non-null object
11    524 non-null object
12    524 non-null object
13    516 non-null object
14    524 non-null object
15    524 non-null object
16    524 non-null object
dtypes: object(17)
memory usage: 73.7+ KB


In [148]:
# Display the frame.
# NOTE(review): this renders the whole frame; df2.head() would keep the output short.
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Cantillon Fou' Foune | Brasserie Cantillon,4.65,World-Class,23,562,3079,7.31,2811,669,58,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Rotating,Apricot Lambic,08-05-2002
Lindemans Framboise | Brouwerij Lindemans,3.88,Very Good,15295,1065,2884,14.43,81,437,4,Brouwerij Lindemans,Belgium,lindemans.be,Belgian Fruit Lambic,2.50,Year-round,2.5-4.0% ABV,11-01-2000
Cantillon Kriek 100% Lambic | Brasserie Cantillon,4.36,Outstanding,569,737,2512,9.63,668,625,64,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Year-round,No notes at this time.,10-06-2001
Cantillon Rosé De Gambrinus | Brasserie Cantillon,4.32,Outstanding,830,773,2444,9.49,515,638,60,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Year-round,"""It has the colour of onion skin"", said a voic...",02-16-2002
Cantillon Saint Lamvinus | Brasserie Cantillon,4.54,World-Class,108,490,2189,7.49,1422,456,44,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Rotating,Everyone knows some of our best friends are wi...,03-20-2003
Cantillon Lou Pepe - Kriek | Brasserie Cantillon,4.62,World-Class,38,363,1802,7.36,1788,350,18,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Rotating,No notes at this time.,05-12-2002
Cantillon Vigneronne | Brasserie Cantillon,4.42,Outstanding,323,326,1456,7.69,667,273,28,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Rotating,The name Vigneronne Cantillon was given in 198...,07-23-2002
Cantillon Lou Pepe - Framboise | Brasserie Cantillon,4.58,World-Class,69,307,1368,6.99,1202,228,21,Brasserie Cantillon,Belgium,cantillon.be,Belgian Fruit Lambic,5.00,Rotating,No notes at this time.,11-03-2003
Lindemans Kriek | Brouwerij Lindemans,3.85,Very Good,17634,547,1303,13.51,53,203,2,Brouwerij Lindemans,Belgium,lindemans.be,Belgian Fruit Lambic,4.00,Year-round,Also known as Kriek Foudroyante,07-28-2001
Drie Fonteinen Oude Kriek | Brouwerij 3 Fonteinen,4.31,Outstanding,931,354,1249,7.66,203,286,34,Brouwerij 3 Fonteinen,Belgium,3fonteinen.be,Belgian Fruit Lambic,6.00,Year-round,No notes at this time.,09-27-2002
