In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import pandas as pd
import pickle
import os

In [2]:
# Work from inside the data folder so the relative file names used below resolve.
# The guard makes the cell safe to re-run: a second bare chdir('Data2/') would
# look for Data2/Data2 and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'Data2':
    os.chdir('Data2/')

1. Gets a list of all of the names of files in the Data2/ folder.  
* Other files have been saved to this location so we check to make sure that the file is actually an output of our beer scraper function. 
    * All of the outputted pickles have been saved with integer names. 
    * We check whether the file name can be converted to an int and, if so, append it to our `style_nums` list.  
* That list is used to open and concatenate all of the pickles for scraped beer data.  
* Finally, we take the unique brewery numbers from the `brewery_nums` column of our DataFrame. These numbers identify every brewery that appears in the scraped beer data; the function below appends each one to the Beer Advocate profile URL to reach that brewery's page.

In [3]:
# Everything currently in the working directory (scraper outputs and other files).
dir_files = os.listdir()

# Scraper outputs were saved under integer file names, so keep only the names
# that parse as an int. Only ValueError is caught: a bare `except` would also
# hide unrelated problems (and swallow KeyboardInterrupt).
style_nums = []
for fname in dir_files:
    try:
        int(fname)
    except ValueError:
        continue
    style_nums.append(fname)

# Empty frame that fixes the expected columns even when no pickle is found.
df = pd.DataFrame(columns = ['beer_name','brewery_name','abv',
                             'ratings','score','brewery_nums'])

# Read every pickle first and concatenate once; calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration (quadratic work).
# NOTE: pd.read_pickle can execute arbitrary code, so only load files this
# project produced itself.
style_frames = [pd.read_pickle(num) for num in style_nums]
if style_frames:
    df = pd.concat([df] + style_frames, sort=True)

In [4]:
# Preview the first rows of the concatenated beer data.
df.head()

Unnamed: 0,abv,beer_name,brewery_name,brewery_nums,ratings,score,style
0,7.5,Guinness Foreign Extra Stout,Guinness Ltd.,209,2468,4.06,Foreign / Export Stout
1,8.8,Lion Stout,Ceylon / Lion Brewery Limited,389,1655,3.91,Foreign / Export Stout
2,7.0,Indra Kunindra,Ballast Point Brewing Company,199,1285,3.67,Foreign / Export Stout
3,8.5,Fade To Black - Volume 1,Left Hand Brewing Company,418,1023,4.08,Foreign / Export Stout
4,6.3,Best Extra Stout,Coopers Brewery Limited,491,691,3.84,Foreign / Export Stout


In [5]:
# Distinct brewery numbers, in order of first appearance; these are the inputs
# handed to the brewery scraper.
brewery_nums = pd.unique(df['brewery_nums'])

In [6]:
def _parse_count(dd_tags, position):
    """Return the integer held in ``dd_tags[position]``, or NaN if it is missing or unparseable."""
    try:
        # The main stats parser strips ',' from these fields, so do the same
        # here; otherwise a count written like "1,234" is lost as NaN.
        return int(str(dd_tags[position].text).replace(',', ''))
    except Exception:
        return np.nan


def _parse_brewery_page(soup):
    """Extract ``(brew_name, values)`` from one parsed brewery profile page.

    Every field is parsed best-effort: a field that cannot be located falls
    back to 'NA' (text), NaN (numeric) or False (has_phone) instead of
    failing the whole page.
    """
    try:
        brew_name = str(soup.find(class_='titleBar').text).replace('\n', '')
    except Exception:
        brew_name = 'NA'

    try:
        score_obj = soup.find('span', {'class': 'BAscore_big'})
        # The number is nested inside the span, so split the raw markup on
        # angle brackets and take the text fragment at index 4.
        score = float(re.split(r'>|<', str(score_obj))[4])
    except Exception:
        score = np.nan

    try:
        # NOTE(review): purely positional lookup (5th <b> tag); pages without
        # a class label will pick up whatever <b> text sits there instead.
        brew_class = re.split(r'>|<', str(soup.find_all('b')[4]))[2]
    except Exception:
        brew_class = 'NA'

    dd_tags = soup.find_all('dd')
    try:
        stats_list = []
        for tag in dd_tags[5:13]:
            item = str(tag.text).strip()
            if item:
                clean_item = re.sub(r'#|%|,|/5', '', item)
                stats_list.append(float(clean_item) if '.' in clean_item
                                  else int(clean_item))
        # Exactly seven non-empty stats are expected; anything else raises
        # and drops to the reduced parse below.
        (num_beers, num_beer_reviews, num_beer_ratings,
         brew_score, reviews, ratings, pDev) = stats_list
    except Exception:
        # Reduced parse: only the three beer counts are read; the
        # brewery-level stats are recorded as missing.
        num_beers = _parse_count(dd_tags, 5)
        num_beer_reviews = _parse_count(dd_tags, 6)
        num_beer_ratings = _parse_count(dd_tags, 7)
        brew_score = reviews = ratings = pDev = np.nan

    # Note that lots of data floats freely within the info_box div.
    # To access this data we convert the whole info_box div to a
    # string and split on the unique values surrounding the strings
    # we're looking for.
    info_box = soup.find('div', attrs={'id': 'info_box'})
    info_html = str(info_box)

    try:
        # The attribute value must be quoted: an unquoted value containing
        # '/' is rejected by current Beautiful Soup / Soup Sieve selectors,
        # which previously turned every town into 'NA' without warning.
        town = soup.select('a[href*="/place/list"]')[0].text
    except Exception:
        town = 'NA'

    try:
        state_region = info_html.split('/">')[1].split('</a')[0]
    except Exception:
        state_region = 'NA'

    if info_box is None:
        # Without this guard the split below would yield the literal string
        # 'None' rather than the 'NA' placeholder used by the other fields.
        country = 'NA'
    else:
        country = info_html.split('/">')[-1].split('</a')[0]

    try:
        b_type = re.split(r'INFO</b>\n<br/><br/>\n<b>Type:</b> |\n<br/><br/>\n',
                          info_html, maxsplit=4)[1]
    except Exception:
        b_type = 'NA'

    try:
        phone_string = re.split(r'</a><br/><br/>', info_html, maxsplit=4)
        phone_num_string = phone_string[1].split(' |')[0]
        phone_num = re.sub(r'\(|\)| |-', '', phone_num_string)
        # Counts as a phone number only if nothing but digits remains.
        int(phone_num)
        has_phone = True
    except Exception:
        has_phone = False

    try:
        comm_desc = info_html.split('Notes:</b>\n<br/>\n')[1].split('<br')[0]
    except Exception:
        comm_desc = 'NA'

    values = [score, brew_class, num_beers, town, state_region, country,
              num_beer_reviews, num_beer_ratings, brew_score,
              reviews, ratings, pDev, b_type, has_phone, comm_desc]

    return brew_name, values


def get_brewery_info(brew_nums, timeout=30, out_path='breweries_other'):

    """
    Scrapes the https://www.beeradvocate.com profile page of each brewery
    number in `brew_nums` and stores the contents in a dictionary.
    The function will attempt to save the dictionary as a pickled pandas
    DataFrame (one row per brewery, columns 0-14 in the order listed under
    Returns).

    Parameters
    ----------
    brew_nums : iterable
        Brewery numbers; each one is appended to
        'https://www.beeradvocate.com/beer/profile/' to build the page URL.
    timeout : number, optional
        Seconds to wait for each HTTP request before giving up on that brewery.
    out_path : str, optional
        File the pickled DataFrame is written to.

    Returns
    -------
    dict
        Maps brewery name to a list of 15 values, in this order:

        score: The average of all beers with ratings for that brewery
        brew_class: Categorical ranking of Brewery. Outstanding, Good, Okay, etc.
        num_beers: The number of beers produced by the brewery and listed on beer advocate.
        town: The town where the brewery is located. (This will return city for international breweries).
        state_region: The state where the brewery is located. (This will return country for international breweries).
        country: The country where the brewery is located.
        num_beer_reviews: Total number of reviews for all beers for a brewery.
        num_beer_ratings: Total number of ratings for all beers for a brewery. (Note that reviews are different from ratings in that reviews include a text response while ratings are only numeric).
        brew_score: Our target. The average score across all ratings for a brewery.
        reviews: The number of reviews for a brewery.
        ratings: The number of ratings for a brewery. (Note that reviews are different from ratings in that reviews include a text response while ratings are only numeric).
        pDev: The percent deviation of ratings for a brewery.
        b_type: The brewery type. Can include: Homebrew, Beer-to-go, Eatery, etc.
        has_phone: Boolean indicating whether a phone number is included on the website.
        comm_desc: The notes provided about the brewery. Converted to boolean later for whether or not notes are included. May contain information regarding when it was acquired by another brewery, hours of operation, etc.

        If a brewery name is already present (two breweries sharing a name, or
        several pages whose name could not be read and fell back to 'NA'), the
        later entry is stored under 'name (brew_num)' instead of overwriting
        the earlier one.

    Notes
    -----
    A brewery whose request fails (connection error, timeout, ...) is reported
    and skipped so one bad request does not discard the rest of the run.
    Failing to save the DataFrame is reported but does not raise; the
    dictionary is still returned.
    """

    dic = {}

    for brew_num in brew_nums:
        url = 'https://www.beeradvocate.com/beer/profile/' + str(brew_num) + '/'

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            print('Skipping brewery {}: request failed ({})'.format(brew_num, err))
        else:
            soup = BeautifulSoup(response.text, "lxml")
            brew_name, values = _parse_brewery_page(soup)

            key = brew_name
            if key in dic:
                # Keep both records instead of silently dropping the first.
                key = '{} ({})'.format(brew_name, brew_num)
            dic[key] = values

        # Short randomised pause between requests (about 0.1 s on average).
        time.sleep(np.random.poisson(10)/100)

    try:
        df = pd.DataFrame(dic).transpose()
        df.to_pickle(out_path)
    except Exception as err:
        # Saving is best-effort: report the problem, still return the data.
        print('Could not save brewery data to {!r}: {}'.format(out_path, err))

    return dic

In [7]:
%%time
# Scrape every brewery number collected above. This is the slow step of the
# notebook: the saved timing output below reports a wall time of over two hours.
dic = get_brewery_info(brewery_nums)

CPU times: user 17min 1s, sys: 23.6 s, total: 17min 25s
Wall time: 2h 8min 10s


In [8]:
# Load the saved brewery table. This rebinds `df`, which until now held the
# beer data.
# NOTE(review): get_brewery_info writes its pickle as 'breweries_other', but
# this cell reads 'breweries_with_loc' — confirm which step produces this file.
df = pd.read_pickle('breweries_with_loc')

In [9]:
# Preview the first ten breweries; the index is the brewery name and the
# columns are the unnamed positions 0-14.
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Guinness Ltd.,3.45,PLACE INFO,48,Dublin,Ireland,Ireland,8736.0,29534.0,3.92,2.0,6.0,5.61,"Brewery, Bar, Eatery, Beer-to-go",False,No notes at this time.
Ceylon / Lion Brewery Limited,3.11,PLACE INFO,6,Biyagama,Sri Lanka,Sri Lanka,,,,,,,Brewery,True,No notes at this time.
Ballast Point Brewing Company,3.91,Very Good,175,San Diego,California,United States,14505.0,67006.0,4.34,70.0,405.0,11.75,"Brewery, Bar",True,Acquired by Constellation Brands in 2015
Left Hand Brewing Company,3.78,Very Good,70,Longmont,Colorado,United States,7752.0,26511.0,4.25,47.0,179.0,12.71,"Brewery, Bar, Beer-to-go",True,No notes at this time.
Coopers Brewery Limited,2.99,PLACE INFO,16,Leabrook,Australia,Australia,,,,,,,Brewery,True,No notes at this time.
Ridgeway Brewing,3.49,PLACE INFO,29,South Stoke,England,United Kingdom,,,,,,,Brewery,True,No notes at this time.
Pike Brewing Company,3.68,Good,46,Seattle,Washington,United States,1722.0,3761.0,3.89,107.0,197.0,11.57,"Brewery, Bar, Eatery, Beer-to-go",True,No notes at this time.
Brouwerij De Dolle Brouwers,4.17,Exceptional,9,Esen,Belgium,Belgium,2525.0,4850.0,4.47,15.0,26.0,8.95,"Brewery, Bar, Eatery",True,Only open to the public on Saturday and Sunday
Modern Times Beer,4.13,PLACE INFO,289,San Diego,California,United States,2779.0,14263.0,4.38,39.0,245.0,7.53,"Brewery, Bar, Beer-to-go",True,No notes at this time.
Desnoes & Geddes Limited,2.96,Poor,15,Kingston,Jamaica,Jamaica,,,,,,,Brewery,True,No notes at this time.


In [10]:
# Check row count, dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14979 entries, Guinness Ltd. to The Order of Yoni
Data columns (total 15 columns):
0     14979 non-null object
1     14979 non-null object
2     14979 non-null object
3     14979 non-null object
4     14979 non-null object
5     14979 non-null object
6     14868 non-null object
7     14741 non-null object
8     8631 non-null object
9     8631 non-null object
10    8631 non-null object
11    8631 non-null object
12    14979 non-null object
13    14979 non-null object
14    14979 non-null object
dtypes: object(15)
memory usage: 1.8+ MB


In [7]:
# Summary statistics per column (count / unique / top / freq, since every
# column is object dtype).
# NOTE(review): the execution counter restarts at this cell (In [7] follows
# In [10]), so this cell and the ones after it were not run in sequence with
# the cells above — confirm the notebook still works with Restart & Run All.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,14979,14979,14979,14979,14979,14979,14979,14979,14979.0,14979.0,14979.0,14979.0,14979,14979,14979
unique,338,10,216,7559,258,195,986,1594,262.0,129.0,253.0,1649.0,17,2,1407
top,0,Good,1,Portland,Germany,United States,0,1,,,,,Brewery,True,No notes at this time.
freq,890,4211,1795,95,930,6857,3027,1478,6349.0,6349.0,6349.0,6349.0,6345,13116,13389


In [8]:
# Re-check non-null counts.
# NOTE(review): the saved output here reports no nulls in any column, unlike
# the earlier df.info() output; presumably a fill/cleaning step ran that is
# not shown in this notebook — confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14979 entries, Guinness Ltd. to The Order of Yoni
Data columns (total 15 columns):
0     14979 non-null object
1     14979 non-null object
2     14979 non-null object
3     14979 non-null object
4     14979 non-null object
5     14979 non-null object
6     14979 non-null object
7     14979 non-null object
8     14979 non-null object
9     14979 non-null object
10    14979 non-null object
11    14979 non-null object
12    14979 non-null object
13    14979 non-null object
14    14979 non-null object
dtypes: object(15)
memory usage: 1.8+ MB


In [9]:
# Column 0 is stored as object dtype; convert each entry to a Python float so
# it can be treated as numeric.
df[0] = df[0].map(float)

In [10]:
# Distribution of column 0 in 50 bins.
# NOTE(review): column 0 is presumably the brewery `score` (the first entry of
# the values list built by get_brewery_info) — confirm, and consider naming
# the columns.
df[0].hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x11ea48320>