### Beer Engine

Notebook 1: Webscraper

In [None]:
import csv
import pickle
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Part 1 - Get links for each beer

1. Compile dictionaries of beer categories/subcategories
2. Create dictionary of beer links per subcategory

In [None]:
def get_list(link, timeout=30):
    """
    Return the review-page URL prefix of every beer listed on a 'top 250' page.

    The page at 'link' is downloaded, its first <table> is read, the first two
    rows are skipped (presumably header rows - confirm against the live page)
    and the href of the first anchor in each remaining row is collected.

    INPUT:
        link: str, URL of a 'top 250' beer list webpage
        timeout: seconds to wait for the server before requests raises
                 (without a timeout a stalled connection blocks forever)
    OUTPUT: list of str; every entry ends in '?view=beer&sort=&start=' so the
            caller can append a page offset
    """
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")
    tables = soup.findAll("table")
    data_rows = tables[0].findAll('tr')[2:]

    # Retrieve the site-relative url segment for each beer in the "top" list
    beer_paths = []
    for row in data_rows:
        anchor = row.find('a')
        # a row without a usable link cannot point at a beer page; skip it
        # instead of failing on None['href']
        if anchor is None or not anchor.get('href'):
            continue
        beer_paths.append(anchor['href'])

    # concatenate first part and second part of url to get to reviews page of each beer
    return ['https://www.beeradvocate.com' + path + '?view=beer&sort=&start=' for path in beer_paths]

In [None]:
def compile_list(dic):
    """
    Compile the list of beer links for every subcategory and pickle the result.

    Input: dictionary mapping a subcategory name to the url of its list page
    Output: dictionary mapping each subcategory name to the list returned by
            get_list for that url

    Side effect: the output dictionary is also pickled to
    '<timestamp>_dict.pkl' in the current working directory.
    """
    # note: timestamp used for name of each pickled dict; it is taken before
    # the scraping starts
    ts = time.time()

    beers_dict = {subcategory: get_list(url) for subcategory, url in dic.items()}

    # pickle dictionary (the with-block closes the file, no explicit close needed)
    filename = str(ts) + '_dict.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(beers_dict, f)

    return beers_dict

#### Part 2 - Scrape every beer link

After getting the link for each unique beer, this section scrapes the following information for each beer:

- beer name
- brewery name
- beer subcategory
- overall rating
- number of ratings
- number of reviews
- alcohol percentage (ABV)
- reviews

In [None]:
def access_url(beer_type, timeout=30):
    """
    access url and retrieve bs4 lxml object

    Input:
        beer_type: str, url of the page to download
        timeout: seconds to wait for the server before requests raises; without
                 it a single stalled connection would block the scrape forever
    Output: BeautifulSoup object built from the response text with the lxml parser
    """
    response = requests.get(beer_type, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")


def get_beer_info(new_soup):
    """
    retrieve name of beer, name of brewery

    The text of the first element with class 'titleBar' has its newline
    characters removed and is split on '|'. The first two pieces are returned
    unchanged (surrounding spaces are not trimmed).

    Input: bs4 object
    Output: (beer name, brewery name) tuple of str
    """
    title_bar = new_soup.findAll(class_='titleBar')[0]
    pieces = title_bar.text.replace('\n', '').split('|')
    return pieces[0], pieces[1]


def get_rating(new_soup):
    """
    retrieve rating of beer

    Input: bs4 object
    Output: str, text of the first element with class 'ba-ravg'
    """
    first_score = new_soup.findAll(class_='ba-ravg')[0]
    return first_score.text


def get_num_rating(new_soup):
    """
    retrieve no. of ratings

    Input: bs4 object
    Output: str, text of the first element with class 'ba-ratings'
            (returned as found on the page, not converted to a number)
    """
    first_match = new_soup.findAll(class_='ba-ratings')[0]
    return first_match.text


def get_num_reviews(new_soup):
    """
    retrieve no. of reviews

    Input: bs4 object
    Output: str, text of the first element with class 'ba-reviews'
            (returned as found on the page, not converted to a number)
    """
    first_match = new_soup.findAll(class_='ba-reviews')[0]
    return first_match.text


def get_abv(new_soup):
    """
    retrieve abv

    Searches every div with id 'info_box' for the first line that contains
    'Alcohol by volume' and parses the last whitespace-separated token on it.

    Input: bs4 object
    Output:
        float rounded to 2 decimals when the token is a number (a leading or
        trailing '%' is ignored);
        'N/A' when the token contains letters or cannot be parsed as a number;
        None when no 'Alcohol by volume' line is found at all.
    """
    for box in new_soup.findAll('div', {'id': 'info_box'}):
        for line in box.text.split('\n'):
            if 'Alcohol by volume' not in line:
                continue
            # split() without arguments ignores trailing blanks, so the last
            # token is never empty for a line that matched above
            token = line.split()[-1].strip('%')
            # words such as "listed" mean the ABV is not published
            if any(ch.isalpha() for ch in token):
                return 'N/A'
            try:
                return round(float(token), 2)
            except ValueError:
                # punctuation-only or otherwise malformed value
                return 'N/A'
    return None

In [None]:
def get_info(beer_dict):
    """
    function to scrape key info for every beer and store it in MongoDB

    For each link in each subcategory the first review page is downloaded and
    the subcategory, beer name, brewery name, rating, number of ratings,
    number of reviews and abv are collected. Each record is printed and
    inserted with client.beer_db.beer_collection.insert_one.

    NOTE(review): 'client' is not defined in this function - presumably a
    MongoDB client created elsewhere in the notebook; confirm before running.

    Input: dictionary mapping a subcategory name to a list of beer links
    Output: list of [sub_cat, beer_name, brewery_name, rating, num_ratings,
            num_reviews, alc] lists, one per beer
    """
    scraped_rows = []

    for beer_sub_cat, base_links in beer_dict.items():
        # add 0 to end of website link to get to first page
        first_pages = [base + '0' for base in base_links]

        for page_link in first_pages:
            soup = access_url(page_link)
            beer_name, brewery_name = get_beer_info(soup)
            rating = get_rating(soup)
            num_ratings = get_num_rating(soup)
            num_reviews = get_num_reviews(soup)
            alc = get_abv(soup)

            # the dictionary key tells which category of beer this is
            row = [beer_sub_cat, beer_name, brewery_name, rating, num_ratings, num_reviews, alc]
            print(row)
            scraped_rows.append(row)

            # insert data into mongoDB
            document = {
                'sub_cat': beer_sub_cat,
                'beer_name': beer_name,
                'brewery_name': brewery_name,
                'rating': rating,
                'num_ratings': num_ratings,
                'num_reviews': num_reviews,
                'alc': alc,
            }
            client.beer_db.beer_collection.insert_one(document)
            print('mongo insert complete')

    return scraped_rows

#### Part 3 - Scrape User Ratings

Scrape all user reviews (ratings) for each unique beer. This differs from Part 2: Part 2 extracts the relevant information from the first page of each unique beer, whereas Part 3 iterates through every page to extract all user-rating pairs.

In [None]:
# Look up the 'user_reviews_assorted' entry of beer_db and keep a handle to it.
# NOTE(review): beer_db is not defined in this section - presumably a MongoDB
# database object created elsewhere in the notebook; confirm before running.
user_reviews_assorted = beer_db['user_reviews_assorted'] 

In [None]:
def find_rate_count(new_soup, page_size=25):
    """
    find number of ratings to determine number of iterations

    Reads the rating count from the first element with class 'ba-ratings'
    (thousands separators are removed) and returns the start offset of the
    last page that still contains a rating.

    input:
        new_soup: bs4 object
        page_size: number of ratings per page (default 25)
    output: integer, a multiple of page_size; 0 when everything fits on the
            first page or there are no ratings
    """
    rating_text = new_soup.findAll(class_='ba-ratings')[0].text
    rating_count = int(rating_text.replace(',', ''))

    # The last rating has zero-based index rating_count - 1. Dividing the raw
    # count instead would point one page past the end whenever the count is an
    # exact multiple of page_size.
    last_index = max(rating_count - 1, 0)
    return (last_index // page_size) * page_size


def get_rating_reviewer(soup_):
    """
    retrieve all of the reviewer - rating pairs on each webpage

    Usernames come from elements with class 'username' (empty ones are
    ignored and the first remaining one is dropped); ratings come from
    elements with class 'BAscore_norm' and are formatted with two decimals.
    Users and ratings are paired by position; if the two lists differ in
    length the surplus entries are left out.

    Input: bs4 object
    Output: list of (username, rating) tuples, both str
    """
    score_tags = soup_.findAll(class_='BAscore_norm')
    user_tags = soup_.findAll(class_='username')

    # get list of users
    # NOTE(review): the first username is skipped - presumably it is not a
    # reviewer (e.g. a page-level name); confirm against the live page
    user_list = [tag.text for tag in user_tags if tag.text != ''][1:]

    # get list of ratings
    rating_list = [format(float(tag.text), '.2f') for tag in score_tags]

    # pair users and ratings; zip stops at the shorter list, so a page with
    # fewer ratings than usernames no longer raises IndexError
    return list(zip(user_list, rating_list))

In [None]:
def rating_reviewers(beer_dict):
    '''
    Get ratings and reviewers

    For every beer link, the first page (offset 0) is downloaded after a one
    second pause; it supplies the beer name, the offset of the last page and
    the first batch of (user, rating) pairs. The remaining pages are then
    fetched at offsets 25, 50, ... up to and including the last-page offset.
    When all beers are done, one document {'Beer': name, 'UserReviews': pairs}
    per beer is inserted with client.beer_db.user_reviews_assorted.insert_one.

    NOTE(review): 'client' is not defined in this function - presumably a
    MongoDB client created elsewhere in the notebook; confirm before running.

    Input: dictionary mapping a subcategory name to a list of beer links
    Output: dictionary mapping each (sanitised) beer name to its list of
            (user, rating) pairs; a later beer with the same name replaces
            an earlier one
    '''
    # initiate master dictionary
    user_reviews_beer_dict = {}

    for links in beer_dict.values():
        for link in links:
            time.sleep(1)
            first_page = access_url(link + '0')

            # find number of ratings to determine number of iterations
            page_end = find_rate_count(first_page)

            # get name of beer; '.' and '$' are removed from it
            # (presumably because of MongoDB naming restrictions - confirm)
            beer_name_, _brewery_name = get_beer_info(first_page)
            beer_name = beer_name_.replace('.', '').replace('$', '')

            # initiate new dict key-value with the pairs of the first page
            reviews = list(get_rating_reviewer(first_page))
            user_reviews_beer_dict[beer_name] = reviews

            # remaining pages: one request per 25-rating offset
            for offset in range(25, page_end + 1, 25):
                other_page = access_url(link + str(offset))
                reviews.extend(get_rating_reviewer(other_page))

    # extract data and insert into MongoDB
    for beer, beer_reviews in user_reviews_beer_dict.items():
        client.beer_db.user_reviews_assorted.insert_one({'Beer': beer, 'UserReviews': beer_reviews})

    # return dictionary object
    return user_reviews_beer_dict
