In [1]:
import requests
from urllib import urlopen  #urlopen will be used to request a HTML page and return its content

from bs4 import BeautifulSoup  #BeautifulSoup is a library that makes it easy to navigate in a HTML doc ie Iterate ove rows

import pandas as pd  #Pandas will only be used in the tidying up of data. pandas is a library for data manipulation and analysis
import re #This library provides regular expression matching operations

In [2]:
#Determine if a table_row is a beer entry
def is_beer_entry(table_row):
    row_cells = table_row.findAll("td") #Returns exhaustive list of table rows
    beer_id = get_beer_id(row_cells[0].text)  #determine if a row is a beer data entry is straightforward: 
    return (len(row_cells) == 8 and beer_id)  #the row needs to contain eight cells and the first cell must contain a valid numeric id.

#Return the beer entry numerical identifier from the entry column
def get_beer_id(cell_value):
    r = re.match("^(\d{1,4})\.$", cell_value)
    if r and len(r.groups()) == 1:
        beer_id = r.group(1)
        return int(beer_id)
    else:
        return None
    
    

In [3]:
def get_all_beers(html_soup):
    beers = []
    all_rows_in_html_page = html_soup.findAll("tr")
    for table_row in all_rows_in_html_page:
        if is_beer_entry(table_row):
            row_cells = table_row.findAll("td")
            beer_entry = {
                "id": get_beer_id(row_cells[0].text),
                "name": row_cells[1].text,
                "brewery_name": row_cells[2].text,
                "brewery_location": row_cells[3].text,
                "style": row_cells[4].text,
                "size": row_cells[5].text,
                "abv": row_cells[6].text,    
                "ibu": row_cells[7].text
            }
            beers.append(beer_entry)
    return beers

html = urlopen("http://craftcans.com/db.php?search=all&sort=beerid&ord=desc&view=text")
html_soup = BeautifulSoup(html, 'html.parser')
beers_list = get_all_beers(html_soup)

In [4]:
df = pd.DataFrame(beers_list)
df.head(5)


Unnamed: 0,abv,brewery_location,brewery_name,ibu,id,name,size,style
0,5.0%,"Chandler, AZ",SanTan Brewing Company,20.0,2707,Mr. Pineapple,12 oz.,Wheat Ale
1,8.2%,"Cincinnati, OH",Christian Moerlein Brewing Company,70.0,2706,Handle Bar (Current),12 oz.,American Stout
2,5.0%,"Forest, VA",Apocalypse Ale Works,18.0,2705,Golden Censer,12 oz.,Wheat Ale
3,6.2%,"Manassas, VA",Heritage Brewing Company,,2704,Freedom Isn't Free,12 oz.,American IPA
4,7.4%,"Manassas, VA",Heritage Brewing Company,115.0,2703,Kings Mountain,12 oz.,Scottish Ale


In [5]:
breweries = df[["brewery_location", "brewery_name"]]
breweries = breweries.drop_duplicates().reset_index(drop=True)
breweries["id"] = breweries.index
breweries.head(5)


Unnamed: 0,brewery_location,brewery_name,id
0,"Chandler, AZ",SanTan Brewing Company,0
1,"Cincinnati, OH",Christian Moerlein Brewing Company,1
2,"Forest, VA",Apocalypse Ale Works,2
3,"Manassas, VA",Heritage Brewing Company,3
4,"Hays, KS",Defiance Brewing Company,4


In [6]:
beers = pd.merge(df,
                 breweries,
                 left_on=["brewery_name", "brewery_location"],
                 right_on=["brewery_name", "brewery_location"],
                 sort=True,
                 suffixes=('_beer', '_brewery'))
beers = beers[["abv", "ibu", "id_beer",
               "name", "size", "style", "id_brewery"]]
beers_columns_rename = {
    "id_beer": "id",
    "id_brewery": "brewery_id"
}
beers.rename(inplace=True, columns=beers_columns_rename)
beers.head(5)

Unnamed: 0,abv,ibu,id,name,size,style,brewery_id
0,5.0%,,1436,Pub Beer,12 oz.,American Pale Lager,411
1,6.6%,,2265,Devil's Cup,12 oz.,American Pale Ale (APA),182
2,7.1%,,2264,Rise of the Phoenix,12 oz.,American IPA,182
3,9.0%,,2263,Sinister,12 oz.,American Double / Imperial IPA,182
4,7.5%,,2262,Sex and Candy,12 oz.,American IPA,182


In [8]:
breweries["city"] = breweries["brewery_location"].apply(
    lambda location: location.split(",")[0])
breweries["state"] = breweries["brewery_location"].apply(
    lambda location: location.split(",")[1])
breweries = breweries[["brewery_name", "city", "state"]]
breweries.rename(inplace=True, columns={"brewery_name": "name"})

KeyError: 'brewery_location'