# [Beer Advocate](https://www.beeradvocate.com/) crawler

In [1]:
import pandas as pd
import numpy as np
from requests_html import HTMLSession

In [2]:
# Start HTML session
session = HTMLSession()

In [3]:
# Beer Advocate pagination url template
URL_TEMPLATE = 'https://www.beeradvocate.com/beer/?start=%d'

In [4]:
# Beer Advocate data frame# Beer  
beer_advocate_df = pd.DataFrame(
    columns = ['beer_name', 'beer_rate', 'beer_url'],
    dtype   = np.dtype('str', 'float', 'str')
)

In [5]:
# Check if the page is the last# Check  
def is_end(html):
    el = html.xpath("//div[@class='mainContent']/div[@id='ba-content']/text()[normalize-space()]")
    if len(el) > 0 and el[0].find("Invalid"):
        return True
    
    return False

In [6]:
r = session.get('https://www.beeradvocate.com/beer/?start=0')

In [7]:
# Fetch all beers# Fetch  
page = 0
while True:
    r = session.get(URL_TEMPLATE % page)
    beers_name = r.html.xpath("//div[@id='rating_fullview_content_2']/h6/a/text()")
    beers_url = r.html.xpath("//div[@id='rating_fullview_content_2']/h6/a/@href")
    beers_rate = r.html.xpath("//span[@class='BAscore_norm']/text()")

    fetched_beers = list(zip(beers_name, [r for r in beers_rate if r != ' '], beers_url))
    fetched_beers_df = pd.DataFrame({
        'beer_name': pd.Series([x[0] for x in fetched_beers]),
        'beer_rate': pd.Series([x[1] for x in fetched_beers]),
        'beer_url' : pd.Series(['https://www.beeradvocate.com' + x[2] for x in fetched_beers]),
    })
    
    beer_advocate_df = pd.concat([beer_advocate_df, fetched_beers_df])
    
    # exit pagination
    if is_end(r.html):
        break
        
    page += 25

In [8]:
# Remove duplicates and get max rate# Remove 
beer_advocate_df = beer_advocate_df.sort_values('beer_rate', ascending=False).drop_duplicates('beer_name').sort_index().reset_index(drop=True)
beer_advocate_df.head()

Unnamed: 0,beer_name,beer_rate,beer_url
0,MILF,4.35,https://www.beeradvocate.com/beer/profile/2488...
1,Capra Noastră Saison,3.55,https://www.beeradvocate.com/beer/profile/5119...
2,SR-71,5.0,https://www.beeradvocate.com/beer/profile/2322...
3,Chimay Grande Réserve Oak Cognac,3.99,https://www.beeradvocate.com/beer/profile/215/...
4,Upland / Cascade - Pearpawsterous,2.97,https://www.beeradvocate.com/beer/profile/1170...


In [9]:
# Dataset sieze
beer_advocate_df.shape

(233, 3)

In [10]:
# Write CSV# Write  
beer_advocate_df.to_csv('beer_advocate.csv', sep=';')