In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
import time
import pickle
import math
from tqdm import tqdm
from dotenv import find_dotenv, load_dotenv
import os

load_dotenv(find_dotenv())

True

In [2]:
def start_ba_session(data, headers=None, url='https://www.beeradvocate.com/community/login/login/'):
    """Open a ``requests`` session and POST the login form to BeerAdvocate.

    Args:
        data: Form fields sent in the body of the login POST.
        headers: Optional extra HTTP headers for the login request. ``None``
            (the default) sends no extra headers.
        url: Endpoint that receives the login POST.

    Returns:
        The session used for the POST when the response status is 200,
        otherwise ``None``.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if headers is None:
        headers = {}

    session = requests.Session()
    response = session.post(url, headers=headers, data=data)
    if response.status_code == 200:
        return session

    # Only the HTTP status is inspected here; report it so a failed login is
    # diagnosable, and make the "no session" result explicit for callers.
    print(f"error: login POST to {url} returned HTTP status {response.status_code}")
    return None

def get_beer_style_dict_from_json(filename):
    """Read the file at ``filename`` and return its parsed JSON content."""
    with open(filename) as style_file:
        raw_text = style_file.read()
    return json.loads(raw_text)
        

def get_beer_and_brewery_id(beer_style_dict, session=None, start_item=1000, page_size=50):
    """Collect (brewery_id, beer_id) pairs from the per-style listing pages.

    For every sub-style the listing URL
    ``/beer/styles/<style_id>?sort=revsD&start=<offset>`` is fetched
    repeatedly, advancing ``offset`` by ``page_size`` until it reaches the
    total parsed from the "(out of N) -" text on the first fetched page.

    Args:
        beer_style_dict: Mapping ``{beer_style: {beer_sub_style: style_id}}``.
            It is not modified. A ``{'id': style_id}`` value is also accepted
            in place of a plain ``style_id``.
        session: Object with a ``get(url)`` method returning a response with
            a ``.text`` attribute. When ``None`` a session is created with
            ``start_ba_session(data={})``.
        start_item: Value of the ``start`` query parameter for the first page
            of each sub-style. The default keeps the previously hard-coded
            1000.
            NOTE(review): if ``start`` is a zero-based offset this skips the
            first 1000 listing entries - confirm that is intended.
        page_size: Increment applied to ``start`` between requests.

    Returns:
        DataFrame with columns ``beer_style``, ``beer_sub_style``,
        ``beer_id`` and ``brewery_id`` (ids are the strings taken from each
        row's first link), one row per listed beer.
    """
    # Function-scope import: ``logger`` was never defined in this notebook, so
    # the original raised NameError on a fresh kernel.
    import logging
    logger = logging.getLogger(__name__)

    if session is None:
        session = start_ba_session(data={})

    columns = ['beer_style', 'beer_sub_style', 'beer_id', 'brewery_id']
    # Accumulate plain dicts and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    records = []

    for beer_style, beer_sub_style_dict in tqdm(beer_style_dict.items()):
        for beer_sub_style, style_id in tqdm(beer_sub_style_dict.items()):
            # Earlier versions of this function rewrote each entry of the
            # caller's dict to {'id': style_id}; tolerate that shape so a dict
            # touched by an old run still works, but never mutate the input.
            if isinstance(style_id, dict):
                style_id = style_id['id']

            total_beer_items = None  # unknown until the first page is parsed
            beer_item = start_item

            while total_beer_items is None or beer_item < total_beer_items:
                beer_style_url = ('https://www.beeradvocate.com/beer/styles/' + style_id
                                  + '?sort=revsD&start=' + str(beer_item))

                response = session.get(beer_style_url)
                soup = BeautifulSoup(response.text, 'html.parser')
                tables = soup.findAll("table")
                rows = tables[0].findAll('tr') if tables else []
                if not rows:
                    # Nothing to parse on this page; stop paging this sub-style.
                    logger.warning(f'no listing table found at {beer_style_url}')
                    break

                if total_beer_items is None:
                    # The first row's <b> element carries "... (out of N) - ...".
                    data_row = rows[0].find('b')
                    total_beer_items = int(str(data_row).split("(out of ")[1].split(") -")[0])
                    logger.info(f'total_beer_items: {total_beer_items}')

                # Rows 3..-2 are treated as the beer entries, as in the original
                # slice [3:-1].
                for row in rows[3:-1]:
                    link = row.find('a')
                    if link is None:
                        continue
                    # href looks like /beer/profile/<brewery_id>/<beer_id>/
                    brewery_id, beer_id = link['href'].replace('/beer/profile/', '')[:-1].split('/')
                    records.append({'beer_style': beer_style,
                                    'beer_sub_style': beer_sub_style,
                                    'beer_id': beer_id,
                                    'brewery_id': brewery_id})
                logger.info(f'length of beer list: {len(records)}')

                beer_item += page_size

    return pd.DataFrame(records, columns=columns)


In [3]:
def pickle_beer_style_dict(beer_style_dict, filename='final_beer_dict.pkl'):
    """Pickle ``beer_style_dict`` to ``filename`` (binary mode, overwriting).

    Args:
        beer_style_dict: Object to serialise with ``pickle.dump``.
        filename: Destination path. Defaults to the fixed name
            ``'final_beer_dict.pkl'`` in the working directory; no timestamp
            is added to the name.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'wb') as f:
        pickle.dump(beer_style_dict, f)
        
def csv_beer_style_dict(beer_style_dict, filename=None):
    """Convert ``beer_style_dict`` to a DataFrame and optionally save it as CSV.

    Args:
        beer_style_dict: Input passed to ``pd.DataFrame.from_dict``.
        filename: Path of the CSV file to write. When ``None`` (the default)
            no file is written, which matches the previous behaviour: the
            original called ``to_csv()`` without a path, which only builds a
            string that was then discarded.

    Returns:
        The DataFrame built from ``beer_style_dict``.
    """
    beer_style_df = pd.DataFrame.from_dict(beer_style_dict)
    if filename is not None:
        beer_style_df.to_csv(filename)
    return beer_style_df


In [4]:
# Login form fields, each read from the environment variable named next to it.
data = {
    field: os.getenv(env_name)
    for field, env_name in (
        ('login', "BEER_ADVOCATE_LOGIN"),
        ('register', "BEER_ADVOCATE_REGISTER"),
        ('password', "BEER_ADVOCATE_PASSWORD"),
        ('cookie_check', "BEER_ADVOCATE_COOKIE_CHECK"),
        ('_xfToken', "BEER_ADVOCATE_XFTOKEN"),
        ('redirect', "BEER_ADVOCATE_REDIRECT"),
    )
}

# Log in, then load the style -> sub-style -> id mapping that drives the crawl.
session = start_ba_session(data)
beer_style_dict = get_beer_style_dict_from_json('beer_styles_dict_test.json')

beer_df = get_beer_and_brewery_id(beer_style_dict, session)

# Last expression of the cell: the returned frame is what the notebook displays.
csv_beer_style_dict(beer_df)


  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:06<00:00,  6.33s/it][A
 50%|█████     | 1/2 [00:06<00:06,  6.33s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:02<00:00,  2.13s/it][A
100%|██████████| 2/2 [00:08<00:00,  4.24s/it]


Unnamed: 0,beer_style,beer_sub_style,beer_id,brewery_id
0,Bocks,Bock - Doppelbock *,387206,33007
1,Bocks,Bock - Doppelbock *,536455,48784
2,Bocks,Bock - Doppelbock *,504199,37717
3,Bocks,Bock - Doppelbock *,484489,13371
4,Bocks,Bock - Doppelbock *,298377,3768
...,...,...,...,...
719,Brown Ales,Altbier,404729,55221
720,Brown Ales,Altbier,361466,25435
721,Brown Ales,Altbier,467196,36410
722,Brown Ales,Altbier,524285,25836


In [5]:
beer_df.head()


Unnamed: 0,beer_style,beer_sub_style,beer_id,brewery_id
0,Bocks,Bock - Doppelbock *,387206,33007
1,Bocks,Bock - Doppelbock *,536455,48784
2,Bocks,Bock - Doppelbock *,504199,37717
3,Bocks,Bock - Doppelbock *,484489,13371
4,Bocks,Bock - Doppelbock *,298377,3768


In [7]:
beer_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   beer_style      724 non-null    object
 1   beer_sub_style  724 non-null    object
 2   beer_id         724 non-null    object
 3   brewery_id      724 non-null    object
dtypes: object(4)
memory usage: 22.8+ KB


In [56]:
def beer_soup(beer, session=None):
    """Fetch a beer's profile page and return it parsed with BeautifulSoup.

    Args:
        beer: Mapping-like object providing ``'brewery_id'`` and
            ``'beer_id'``; the values are formatted into the URL, so strings
            and integers both work.
        session: Object with a ``get(url)`` method returning a response with
            a ``.text`` attribute. Required despite the ``None`` default.

    Returns:
        ``BeautifulSoup(response.text, "html.parser")`` for the profile page.

    Raises:
        ValueError: If ``session`` is ``None``. Previously this case printed
            'error' and then failed with AttributeError on ``None.get``.
    """
    if session is None:
        raise ValueError('beer_soup requires a session; got None')

    # Profile URL is built from the brewery id followed by the beer id.
    beer_url = f"https://www.beeradvocate.com/beer/profile/{beer['brewery_id']}/{beer['beer_id']}/"
    response = session.get(beer_url)
    return BeautifulSoup(response.text, "html.parser")


def get_beer_and_brewery_name(soup):
    """Return ``(beer_name, brewery_name)`` taken from the page's title bar.

    The first ``titleBar`` element is re-parsed with every ``<br/>`` swapped
    for a marker string; its text is then split on that marker and the first
    two pieces are returned with newlines removed.
    """
    separator = '+-+-+'

    title_markup = str(soup.findAll(class_='titleBar')[0])
    reparsed = BeautifulSoup(title_markup.replace('<br/>', separator), "html.parser")

    title_text = reparsed.findAll(class_='titleBar')[0].text
    pieces = title_text.split(separator)

    beer_name = pieces[0].replace('\n', '')
    brewery_name = pieces[1].replace('\n', '')
    return beer_name, brewery_name
    
    
def get_rating(soup):
    """Return the first ``ba-ravg`` element's text as a float rounded to 2 places."""
    first_match = soup.findAll(class_='ba-ravg')[0]
    return round(float(first_match.text), 2)


def get_rating_count(soup):
    """Return the first ``ba-ratings`` element's text as an int, commas removed."""
    matches = soup.findAll(class_='ba-ratings')
    digits = matches[0].text.replace(',', '')
    return int(digits)


def get_review_count(soup):
    """Return the first ``ba-reviews`` element's text as an int, commas removed."""
    matches = soup.findAll(class_='ba-reviews')
    digits = matches[0].text.replace(',', '')
    return int(digits)


def get_ba_score(soup):
    """Return the score shown in the page's ``score_box`` as a float.

    The spans inside ``<div id="score_box">`` are scanned for the first one
    whose text contains 'SCORE'; the text of the span that follows it is the
    score.

    Returns:
        The score rounded to 2 decimals, or ``np.nan`` when the score is
        'n/a', when there is no score box, or when no 'SCORE' span with a
        following span is found (the last two previously raised
        AttributeError / UnboundLocalError / IndexError).
    """
    score_box = soup.find('div', {'id': 'score_box'})
    if score_box is None:
        return np.nan

    spans = score_box.find_all('span')
    ba_score_string = None
    # Pair each span with its successor so a trailing 'SCORE' label with
    # nothing after it is simply not matched.
    for label, value in zip(spans, spans[1:]):
        if 'SCORE' in label.text:
            ba_score_string = value.text
            break

    if ba_score_string is None or ba_score_string == 'n/a':
        return np.nan
    return round(float(ba_score_string), 2)


def get_abv(soup):
    """Return the ABV percentage found in the page's ``info_box`` as a float.

    The text of each ``<div id="info_box">`` is split into lines; the line
    after the first one containing 'ABV:' holds the value, whose last
    space-separated token is parsed with any '%' stripped.

    Returns:
        The ABV rounded to 2 decimals, or ``np.nan`` when the token is 'n/a'
        or 'listed', or when no info box / no 'ABV:' line with a following
        line exists (previously UnboundLocalError, IndexError or an implicit
        ``None``).
    """
    for info_box in soup.findAll('div', {'id': 'info_box'}):
        lines = info_box.text.split('\n')
        # Pair each line with the next one so a trailing 'ABV:' label with no
        # value line after it is not matched.
        for label, value in zip(lines, lines[1:]):
            if 'ABV:' in label:
                abv_token = value.split(' ')[-1]
                if abv_token in ('n/a', 'listed'):
                    return np.nan
                return round(float(abv_token.strip('%')), 2)
    return np.nan

    
def get_beer_data(soup):
    """Gather the per-beer fields scraped from a parsed profile page.

    Returns a dict with the keys ``name``, ``brewery_name``, ``rating``,
    ``rating_count``, ``review_count``, ``ba_score`` and ``abv``, each filled
    by the matching ``get_*`` helper applied to ``soup``.
    """
    names = get_beer_and_brewery_name(soup)
    return {
        'name': names[0],
        'brewery_name': names[1],
        'rating': get_rating(soup),
        'rating_count': get_rating_count(soup),
        'review_count': get_review_count(soup),
        'ba_score': get_ba_score(soup),
        'abv': get_abv(soup),
    }


In [68]:
def get_user_info_from_soup(soup):
    """Return ``(username_elements, rating_elements)`` from a parsed page.

    Rating elements are all ``BAscore_norm`` matches in the whole page;
    username elements are the ``username`` matches inside
    ``<div id="rating_fullview">``.
    """
    rating_elements = soup.findAll(class_='BAscore_norm')
    ratings_container = soup.find("div", {"id": "rating_fullview"})
    return ratings_container.findAll(class_='username'), rating_elements


def create_ratings_list(beer, info_user, info_, beer_rating_id):
    """Pair the user names and ratings scraped from one page into row dicts.

    Args:
        beer: Mapping-like object providing ``'brewery_id'`` and ``'beer_id'``.
        info_user: Elements whose ``.text`` is a user name; elements with
            empty text are skipped.
        info_: Elements whose ``.text`` parses as a float rating.
        beer_rating_id: Id assigned to the first row produced; each following
            row gets the next integer.

    Returns:
        ``(rows, next_beer_rating_id)`` where each row is a dict with keys
        ``brewery_id``, ``beer_id``, ``beer_rating_id``, ``user_name`` and
        ``user_rating`` (the rating formatted as a string with 2 decimals).
        The i-th non-empty user name is paired with the i-th rating; surplus
        ratings are ignored.

    Raises:
        IndexError: If there are more non-empty user names than ratings.
    """
    # Compare by value: the original `is not ''` tested object identity.
    user_names = [user.text for user in info_user if user.text != '']
    ratings = [format(float(rating.text), '.2f') for rating in info_]

    if len(user_names) > len(ratings):
        raise IndexError(
            f'{len(user_names)} user names but only {len(ratings)} ratings on this page'
        )

    rows = []
    for user_name, user_rating in zip(user_names, ratings):
        rows.append({
            'brewery_id': beer['brewery_id'],
            'beer_id': beer['beer_id'],
            'beer_rating_id': beer_rating_id,
            'user_name': user_name,
            'user_rating': user_rating,
        })
        beer_rating_id += 1

    return rows, beer_rating_id

In [69]:

def create_beer_meta_dataframe(beer_df, session):
    """Scrape the profile page of every beer in ``beer_df`` into one frame.

    For each row, ``beer_soup(row, session)`` fetches the page and
    ``get_beer_data`` extracts its fields; these are merged with the row's
    own columns.

    Args:
        beer_df: DataFrame whose rows are passed to ``beer_soup``.
        session: Passed through to ``beer_soup``.

    Returns:
        DataFrame indexed by ``beer_meta_id`` (1, 2, ... in row order). For
        an empty ``beer_df`` an empty frame with that index name and
        ``beer_df``'s columns is returned instead of raising KeyError.
    """
    records = []

    # Passing total lets tqdm show progress against the number of rows;
    # iterrows() is a generator with no length of its own.
    rows = tqdm(beer_df.iterrows(), total=len(beer_df))
    for beer_meta_id, (_, beer_row) in enumerate(rows, start=1):
        soup = beer_soup(beer_row, session)

        record = beer_row.to_dict()
        record.update(get_beer_data(soup))
        record['beer_meta_id'] = beer_meta_id
        records.append(record)

    if not records:
        empty_columns = list(beer_df.columns) + ['beer_meta_id']
        return pd.DataFrame(columns=empty_columns).set_index('beer_meta_id')
    return pd.DataFrame(records).set_index('beer_meta_id')



def create_beer_rating_dataframe(beer_df, session, start_item=1000, page_size=25,
                                 min_rating_count=50, max_pages_per_beer=1):
    """Scrape user ratings for every beer in ``beer_df``.

    For each beer a POST with ``hideRatings=N`` is sent to the profile URL,
    then rating pages (``?view=beer&sort=&start=<offset>``) are fetched and
    converted to rows with ``get_user_info_from_soup`` and
    ``create_ratings_list``.

    Args:
        beer_df: DataFrame with ``brewery_id`` and ``beer_id`` columns.
        session: Object with ``post(url, data=...)`` and ``get(url)``.
        start_item: ``start`` offset of the first page requested per beer.
            The default keeps the previously hard-coded 1000.
            NOTE(review): looks like a debugging leftover - confirm whether
            paging should begin at 0.
        page_size: Increment applied to ``start`` between pages; also used to
            round the rating count up to a whole number of pages.
        min_rating_count: Beers whose ``ba-ratings`` count is below this are
            skipped entirely (previously hard-coded 50).
        max_pages_per_beer: Maximum number of pages fetched per beer. The
            default of 1 reproduces the unconditional ``break`` that ended
            the original paging loop after its first iteration; pass ``None``
            to fetch every page.

    Returns:
        DataFrame indexed by ``beer_rating_id`` with columns ``brewery_id``,
        ``beer_id``, ``user_name`` and ``user_rating``. When no ratings were
        collected an empty frame of that shape is returned (previously this
        raised KeyError in ``set_index``).
    """
    rating_rows = []
    beer_rating_id = 1

    for _, beer_row in tqdm(beer_df.iterrows(), total=len(beer_df)):
        base_url = ('https://www.beeradvocate.com/beer/profile/' + str(beer_row['brewery_id'])
                    + '/' + str(beer_row['beer_id']) + '/' + '?view=beer&sort=&start=')

        # Sent once per beer before paging; the response itself is not used.
        session.post(base_url + str(start_item), data={'hideRatings': 'N'})

        total_review_items = None  # unknown until the first page is parsed
        review_item = start_item
        pages_fetched = 0

        while total_review_items is None or review_item < total_review_items:
            if max_pages_per_beer is not None and pages_fetched >= max_pages_per_beer:
                break

            response = session.get(base_url + str(review_item))
            soup = BeautifulSoup(response.text, 'html.parser')
            pages_fetched += 1

            if total_review_items is None:
                count_elements = soup.findAll(class_='ba-ratings')
                if not count_elements:
                    # No rating count on the page: nothing reliable to page over.
                    break
                rating_count = int(count_elements[0].text.replace(',', ''))
                if rating_count < min_rating_count:
                    break

                # Round the rating count up to a whole number of pages.
                total_review_items = math.ceil(rating_count / page_size) * page_size

            info_user, info_ = get_user_info_from_soup(soup)
            page_rows, beer_rating_id = create_ratings_list(beer_row, info_user, info_, beer_rating_id)
            # extend() avoids rebuilding the whole list for every page.
            rating_rows.extend(page_rows)

            review_item += page_size

    if not rating_rows:
        empty_columns = ['brewery_id', 'beer_id', 'beer_rating_id', 'user_name', 'user_rating']
        return pd.DataFrame(columns=empty_columns).set_index('beer_rating_id')
    return pd.DataFrame(rating_rows).set_index('beer_rating_id')


In [70]:
# Scrape per-beer metadata: one profile-page request is made for each row of beer_df.
beer_meta_df = create_beer_meta_dataframe(beer_df, session)

14it [00:04,  3.41it/s]


KeyboardInterrupt: 

In [71]:
beer_meta_df

Unnamed: 0_level_0,beer_style,beer_sub_style,beer_id,brewery_id,name,brewery_name,rating,rating_count,review_count,ba_score,abv
beer_meta_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Bocks,Bock - Doppelbock *,387206,33007,DB18,Berging Brouwerij,3.52,1,1,,7.6
2,Bocks,Bock - Doppelbock *,536455,48784,Orator,Brewery Silvaticus,4.59,1,1,,7.0
3,Bocks,Bock - Doppelbock *,504199,37717,Penetrator - Bourbon barrel aged,Territorial Brewing Company,4.50,1,0,,10.4
4,Bocks,Bock - Doppelbock *,484489,13371,Apple Pie Doppelbock Aged In Apple Brandy Barrels,Voodoo Brewery,4.41,1,0,,9.6
5,Bocks,Bock - Doppelbock *,298377,3768,Modulator Doppelbock,Lunar Brewing,3.53,1,1,,7.0
...,...,...,...,...,...,...,...,...,...,...,...
720,Brown Ales,Altbier,404729,55221,Fortuna,Hop Oast Pub & Brewery,0.00,0,0,,5.1
721,Brown Ales,Altbier,361466,25435,Remember The A.L.A.M.O,Turkey Hill Brewing Co. Pub & Grille,0.00,0,0,,5.9
722,Brown Ales,Altbier,467196,36410,Altbier,Blue Collar Brewery Inc.,0.00,0,0,,5.5
723,Brown Ales,Altbier,524285,25836,Altbier,Portland U-Brew & Pub (P.U.B.) / Unicorn Brewing,0.00,0,0,,5.5


In [72]:
# Scrape user ratings for the beers in beer_df into a frame indexed by beer_rating_id.
beer_rating_df = create_beer_rating_dataframe(beer_df, session)

0 beer_style                      Bocks
beer_sub_style    Bock - Doppelbock *
beer_id                        387206
brewery_id                      33007
Name: 0, dtype: object
------------------------> 1000 / 10000
1
[<a class="username" href="/community/members/zet.982225/"><img alt="Photo of Zet" border="0" height="48" src="styles/default/xenforo/avatars/avatar_male_s.png" width="48"/></a>, <a class="username" href="/community/members/zet.982225/">Zet</a>]
1 beer_style                      Bocks
beer_sub_style    Bock - Doppelbock *
beer_id                        536455
brewery_id                      48784
Name: 1, dtype: object
------------------------> 1000 / 10000
1
[<a class="username" href="/community/members/trevorpost.239643/"><img alt="Photo of trevorpost" border="0" height="48" src="https://cdn.beeradvocate.com/data/avatars/s/239/239643.jpg?1432609524" width="48"/></a>, <a class="username" href="/community/members/trevorpost.239643/">trevorpost</a>]
2 beer_style           

In [73]:
beer_rating_df

Unnamed: 0_level_0,brewery_id,beer_id,user_name,user_rating
beer_rating_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,33007,387206,Zet,3.52
2,48784,536455,trevorpost,4.59
3,37717,504199,PapaGoose03,4.5
4,13371,484489,wvsabbath,4.41
5,3768,298377,BlackBeerPirate,3.53
