In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import pandas as pd
import pickle
import os

In [3]:
# Work from the shared data folder so that the bare file names used in the
# cells below resolve there. NOTE(review): the path is relative, so re-running
# this cell without restarting the kernel resolves it against the *new*
# working directory and will most likely fail or land somewhere else.
os.chdir('../../Beergression_Data_and_misc/Data/')

1. Gets a list of all of the names of files in the Data2/ folder.  
* Other files have been saved to this location so we check to make sure that the file is actually an output of our beer scraper function. 
    * All of the outputted pickles have been saved with integer names. 
    * We check whether the file name can be converted to an int and, if so, append it to our `style_nums` list.  
* That list is used to open and concatenate all of the pickles for scraped beer data.  
* Finally, we take the unique brewery numbers from the `brewery_nums` column of our DataFrame. Each number identifies one brewery profile page on Beer Advocate; the list is passed to the function below, which builds the profile URL from each number.

In [3]:
# Every entry in the working directory; only some of them are scraper output.
dir_files = os.listdir()

# The beer scraper saved its pickles under purely integer names, so a name
# that int() accepts is treated as scraper output. Only ValueError is caught
# so that unrelated problems (or a Ctrl-C) are not silently swallowed.
style_nums = []
for file in dir_files:
    try:
        int(file)
    except ValueError:
        continue
    style_nums.append(file)

# os.listdir() returns names in arbitrary order; sorting numerically makes the
# row order of the combined frame reproducible from run to run.
style_nums.sort(key=int)

df = pd.DataFrame(columns = ['beer_name','brewery_name','abv',
                             'ratings','score','brewery_nums'])

# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by our own scraper.
frames = [pd.read_pickle(num) for num in style_nums]

if frames:
    # Concatenate once rather than growing `df` inside a loop, which re-copies
    # every previously loaded row on each iteration. The empty template frame
    # stays first in the list, exactly as in the step-by-step version.
    df = pd.concat([df] + frames, sort=True)

In [5]:
# Distinct brewery identifiers, in order of first appearance; each one is
# turned into a brewery profile URL by the scraping function.
brewery_nums = pd.unique(df['brewery_nums'])

In [6]:
def _parse_stat(text):
    """Turn one stats-box string into a number.

    Every '#', '%', ',' and '/5' is removed first, so '1,234' -> 1234,
    '4.12/5' -> 4.12 and '12.5%' -> 12.5. Text containing a '.' becomes a
    float, anything else an int. Raises ValueError for non-numeric text.
    """
    cleaned = re.sub(r'#|%|,|/5', '', str(text).strip())
    return float(cleaned) if '.' in cleaned else int(cleaned)


def _stat_or_nan(tags, index):
    """Parse the text of tags[index] with _parse_stat; NaN if that is not possible."""
    try:
        return _parse_stat(tags[index].text)
    except Exception:
        return np.nan


def _parse_brewery_page(soup):
    """Extract (brewery name, list of the 15 feature values) from one parsed page.

    Every field is extracted on a best-effort basis: when the page does not
    have the expected layout the field falls back to 'NA' (strings), NaN
    (numbers) or False (has_phone) instead of raising.
    """
    try:
        title_div = soup.find(class_='titleBar')
        brew_name = str(title_div.text).replace('\n','')
    except Exception:
        brew_name = 'NA'

    try:
        score_obj = soup.find('span', {'class': 'BAscore_big'})
        score = float(re.split(r'>|<', str(score_obj))[4])
    except Exception:
        score = np.nan

    try:
        brew_class_obj = soup.find_all('b')[4]
        brew_class = re.split(r'>|<', str(brew_class_obj))[2]
    except Exception:
        brew_class = 'NA'

    # The <dd> tags at positions 5..12 hold the statistics; empty ones are
    # skipped and exactly seven numeric values are expected.
    dd_tags = soup.find_all('dd')
    try:
        stat_texts = [str(tag.text).strip() for tag in dd_tags[5:13]]
        stats_list = [_parse_stat(text) for text in stat_texts if text]
        (num_beers, num_beer_reviews, num_beer_ratings,
         brew_score, reviews, ratings, pDev) = stats_list
    except Exception:
        # Incomplete stats box: recover the three beer counts by position,
        # using the same cleaning as above so that values such as '1,234'
        # are not lost, and mark the brewery-level stats as missing.
        num_beers = _stat_or_nan(dd_tags, 5)
        num_beer_reviews = _stat_or_nan(dd_tags, 6)
        num_beer_ratings = _stat_or_nan(dd_tags, 7)
        brew_score = reviews = ratings = pDev = np.nan

    # Note that lots of data floats freely within the info_box div.
    # To access this data we convert the whole info_box div to a
    # string and split on the unique values surrounding the strings
    # we're looking for.
    info_box = soup.find('div',attrs={'id':'info_box'})
    info_html = str(info_box)

    try:
        # The attribute value must be quoted: it contains '/', which is not
        # valid in an unquoted CSS attribute value, and the resulting selector
        # error used to be swallowed, leaving town permanently 'NA'.
        town = soup.select('a[href*="/place/list"]')[0].text
    except Exception:
        town = 'NA'

    # Link texts follow each '/">'. Index -1 never raises, so without the
    # length check an info box with no links would return the whole HTML
    # string as the "country".
    link_parts = info_html.split('/">')
    if len(link_parts) > 1:
        state_region = link_parts[1].split('</a')[0]
        country = link_parts[-1].split('</a')[0]
    else:
        state_region = country = 'NA'

    try:
        b_type = re.split(r'INFO</b>\n<br/><br/>\n<b>Type:</b> |\n<br/><br/>\n',
                          info_html, maxsplit=4)[1]
    except Exception:
        b_type = 'NA'

    try:
        after_links = re.split(r'</a><br/><br/>', info_html, maxsplit=4)
        phone_num_string = after_links[1].split(' |')[0]
        phone_num = re.sub(r'\(|\)| |-','',phone_num_string)
        int(phone_num)  # only validates that what is left is all digits
        has_phone = True
    except Exception:
        has_phone = False

    try:
        comm_desc_str = info_html.split('Notes:</b>\n<br/>\n')[1]
        comm_desc = comm_desc_str.split('<br')[0]
    except Exception:
        comm_desc = 'NA'

    values = [score, brew_class, num_beers, town, state_region, country,
              num_beer_reviews, num_beer_ratings, brew_score,
              reviews, ratings, pDev, b_type, has_phone, comm_desc]

    return brew_name, values


def get_brewery_info(brew_nums, timeout=30, save_path='breweries_other'):

    """
    Scrapes the https://www.beeradvocate.com profile page of every brewery
    number in brew_nums and collects the results in a dictionary.

    Parameters:

    brew_nums: Iterable of brewery numbers. Each one is inserted into
        'https://www.beeradvocate.com/beer/profile/<number>/'.
    timeout: Seconds to wait for each HTTP request before giving up on it.
    save_path: File the result is pickled to as a pandas DataFrame (one row
        per brewery). Pass None to skip saving. A failed save is reported
        but does not raise.

    Returns:

    A dict mapping the brewery name (page title with newlines removed, or 'NA'
    when it cannot be read) to a list of the values below, in this order.
    Breweries that end up with the same name overwrite each other. A brewery
    whose page cannot be downloaded is reported and skipped.

    score: The average of all beers with ratings for that brewery
    brew_class: Categorical ranking of Brewery. Outstanding, Good, Okay, etc.
    num_beers: The number of beers produced by the brewery and listed on beer advocate.
    town: The town where the brewery is located. (This will return city for international breweries).
    state_region: The state where the brewery is located. (This will return country for international breweries).
    country: The country where the brewery is located.
    num_beer_reviews: Total number of reviews for all beers for a brewery.
    num_beer_ratings: Total number of ratings for all beers for a brewery. (Note that reviews are different from ratings in that reviews include a text response while ratings are only numeric).
    brew_score: Our target. The average score across all ratings for a brewery.
    reviews: The number of reviews for a brewery.
    ratings: The number of ratings for a brewery. (Note that reviews are different from ratings in that reviews include a text response while ratings are only numeric).
    pDev: The percent deviation of ratings for a brewery.
    b_type: The brewery type. Can include: Homebrew, Beer-to-go, Eatery, etc.
    has_phone: Boolean indicating whether a phone number is included on the website.
    comm_desc: The notes provided about the brewery. Converted to boolean later for whether or not notes are included. May contain information regarding when it was acquired by another brewery, hours of operation, etc.
    """

    dic = {}

    for brew_num in brew_nums:
        url = 'https://www.beeradvocate.com/beer/profile/' + str(brew_num) + '/'

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable page must not throw away hours of scraping.
            print('Skipping brewery {}: {}'.format(brew_num, err))
        else:
            soup = BeautifulSoup(response.text, "lxml")
            key, values = _parse_brewery_page(soup)
            dic[key] = values

        # Short randomised pause (about 0.1 s on average) between requests.
        time.sleep(np.random.poisson(10)/100)

    if save_path is not None:
        try:
            pd.DataFrame(dic).transpose().to_pickle(save_path)
        except Exception as err:
            # Saving is best-effort; the scraped data is still returned.
            print('Could not save brewery data to {!r}: {}'.format(save_path, err))

    return dic

In [7]:
%%time
# Scrape every brewery profile page. This is network-bound: the recorded run
# shown below took a little over two hours of wall time.
dic = get_brewery_info(brewery_nums)

CPU times: user 17min 1s, sys: 23.6 s, total: 17min 25s
Wall time: 2h 8min 10s
