In [1]:
#
# The data below is sourced from Kaggle.com, specifically from this URL:
# https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews
# 3 Files:
#   beers.csv
#   breweries.csv
#   reviews.csv
#
# reviews.csv is rather large, so we will prune it down to what we need
#
import pandas as pd
import numpy as np
import zipfile
import os 

In [2]:
# Load the full reviews dataset. This is a large file, so later cells
# prune it down before it is merged with the beer and brewery data.
df = pd.read_csv("../Resources/reviews.csv")

In [3]:
# List the column names to see which fields the reviews file provides
df.columns

Index(['beer_id', 'username', 'date', 'text', 'look', 'smell', 'taste', 'feel',
       'overall', 'score'],
      dtype='object')

In [4]:
# Column types to ensure proper analysis
print("types of each columns: \n\n",df.dtypes)
print("\ninformation of the columns: \n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

types of each columns: 

 beer_id       int64
username     object
date         object
text         object
look        float64
smell       float64
taste       float64
feel        float64
overall     float64
score       float64
dtype: object

information of the columns: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9073128 entries, 0 to 9073127
Data columns (total 10 columns):
 #   Column    Dtype  
---  ------    -----  
 0   beer_id   int64  
 1   username  object 
 2   date      object 
 3   text      object 
 4   look      float64
 5   smell     float64
 6   taste     float64
 7   feel      float64
 8   overall   float64
 9   score     float64
dtypes: float64(6), int64(1), object(3)
memory usage: 692.2+ MB
None


In [5]:
# No review text is the empty string ''; reviews that should be empty hold
# two non-breaking spaces ('\xa0\xa0') instead. Turn those into NaN so they
# are counted as missing values.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['text']: an in-place call on a column
# selection is a chained assignment and, with pandas Copy-on-Write, would
# only modify a temporary object and leave df unchanged.
df['text'] = df['text'].replace('\xa0\xa0', np.nan)

In [6]:
# Count the missing values in every column of the reviews data
print("Overview of missing values in the dataset: \n", df.isna().sum())

Overview of missing values in the dataset: 
 beer_id           0
username       3815
date              0
text        6085135
look        3790018
smell       3790018
taste       3790018
feel        3790018
overall     3790018
score             0
dtype: int64


In [7]:
# Drop every review row that has a missing value in any column
df=df.dropna()
# Print the label first, then let info() print its own report. Passing
# df.info() as an argument to print() ran it before the label was written and
# then printed its None return value after the label.
print("After dropping the missing values: \n")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2822618 entries, 0 to 9073114
Data columns (total 10 columns):
 #   Column    Dtype  
---  ------    -----  
 0   beer_id   int64  
 1   username  object 
 2   date      object 
 3   text      object 
 4   look      float64
 5   smell     float64
 6   taste     float64
 7   feel      float64
 8   overall   float64
 9   score     float64
dtypes: float64(6), int64(1), object(3)
memory usage: 236.9+ MB
After dropping the missing values: 
 None


In [8]:
# Check whether any missing values are left in any column after the drop
df.isnull().sum()

beer_id     0
username    0
date        0
text        0
look        0
smell       0
taste       0
feel        0
overall     0
score       0
dtype: int64

In [9]:
# Look for repeated reviews: show rows whose (beer_id, username) pair has
# already appeared in an earlier row
df.loc[df.duplicated(subset=['beer_id', 'username'])]

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score


In [10]:
# Summary statistics of the numeric columns, rounded to two decimals
df.describe().round(2)

Unnamed: 0,beer_id,look,smell,taste,feel,overall,score
count,2822618.0,2822618.0,2822618.0,2822618.0,2822618.0,2822618.0,2822618.0
mean,60379.0,3.9,3.81,3.85,3.82,3.86,3.84
std,73862.51,0.59,0.66,0.7,0.65,0.67,0.61
min,3.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,4202.0,3.5,3.5,3.5,3.5,3.5,3.56
50%,37696.0,4.0,4.0,4.0,4.0,4.0,3.95
75%,79537.0,4.25,4.25,4.5,4.25,4.25,4.24
max,372914.0,5.0,5.0,5.0,5.0,5.0,5.0


In [11]:
# Number of review rows remaining after the missing values were dropped
len(df)

2822618

In [12]:
# Read in the beer data and preview its first rows
beer_df = pd.read_csv('../Resources/beers.csv')
beer_df.head()

Unnamed: 0,id,name,brewery_id,state,country,style,availability,abv,notes,retired
0,202522,Olde Cogitator,2199,CA,US,English Oatmeal Stout,Rotating,7.3,No notes at this time.,f
1,82352,Konrads Stout Russian Imperial Stout,18604,,NO,Russian Imperial Stout,Rotating,10.4,No notes at this time.,f
2,214879,Scottish Right,44306,IN,US,Scottish Ale,Year-round,4.0,No notes at this time.,t
3,320009,MegaMeow Imperial Stout,4378,WA,US,American Imperial Stout,Winter,8.7,Every time this year,f
4,246438,Peaches-N-Cream,44617,PA,US,American Cream Ale,Rotating,5.1,No notes at this time.,f


In [13]:
# Number of rows in the beer data
len(beer_df)

358873

In [14]:
# The review data is quite large, so restrict the beers to the US market:
# keep only the rows whose country code equals 'US'
us_beer_df = beer_df.loc[beer_df['country'].eq('US')]
us_beer_df.head()

Unnamed: 0,id,name,brewery_id,state,country,style,availability,abv,notes,retired
0,202522,Olde Cogitator,2199,CA,US,English Oatmeal Stout,Rotating,7.3,No notes at this time.,f
2,214879,Scottish Right,44306,IN,US,Scottish Ale,Year-round,4.0,No notes at this time.,t
3,320009,MegaMeow Imperial Stout,4378,WA,US,American Imperial Stout,Winter,8.7,Every time this year,f
4,246438,Peaches-N-Cream,44617,PA,US,American Cream Ale,Rotating,5.1,No notes at this time.,f
6,108605,Icon Sender,22598,CA,US,American Lager,Year-round,5.6,No notes at this time.,f


In [15]:
# Attach the beer details to each review by matching the review's beer_id to
# the beer's id. The inner join keeps only reviews of beers in the US subset.
beer_review_merged_df = df.merge(
    us_beer_df,
    how="inner",
    left_on="beer_id",
    right_on="id",
)
len(beer_review_merged_df)

2190010

In [16]:
# Preview the first rows of the merged review + beer data
beer_review_merged_df.head()

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score,id,name,brewery_id,state,country,style,availability,abv,notes,retired
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03,271781,Motorbreath Imperial Stout,28094,OH,US,American Imperial Stout,Limited (brewed once),10.8,2016 - Five Year Anniversary Imperial Stout,t
1,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58,125646,Haze,28743,MA,US,New England IPA,Rotating,8.2,We constructed this beer around hops we curren...,f
2,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31,125646,Haze,28743,MA,US,New England IPA,Rotating,8.2,We constructed this beer around hops we curren...,f
3,125646,MFMB,2017-12-16,Pours a creamy opaque light straw yellow wi...,4.75,4.5,4.5,4.5,4.5,4.52,125646,Haze,28743,MA,US,New England IPA,Rotating,8.2,We constructed this beer around hops we curren...,f
4,125646,jngrizzaffi,2017-12-10,Pours a cloudy yellow color with a thin foa...,4.5,4.5,4.5,4.75,4.5,4.53,125646,Haze,28743,MA,US,New England IPA,Rotating,8.2,We constructed this beer around hops we curren...,f


In [17]:
# List the columns available after merging reviews with beers
beer_review_merged_df.columns

Index(['beer_id', 'username', 'date', 'text', 'look', 'smell', 'taste', 'feel',
       'overall', 'score', 'id', 'name', 'brewery_id', 'state', 'country',
       'style', 'availability', 'abv', 'notes', 'retired'],
      dtype='object')

In [18]:
# Keep only the columns still needed. This leaves out the review text as well
# as the beer-side id, country, notes and retired columns.
trimmed_df = beer_review_merged_df.loc[
    :,
    [
        'beer_id', 'username', 'date',
        'look', 'smell', 'taste', 'feel', 'overall', 'score',
        'name', 'brewery_id', 'state', 'style', 'availability', 'abv',
    ],
]

In [19]:
# Read the brewery data and preview its first rows
brewery_df = pd.read_csv('../Resources/breweries.csv')
brewery_df.head()

Unnamed: 0,id,name,city,state,country,notes,types
0,19730,Brouwerij Danny,Erpe-Mere,,BE,No notes at this time.,Brewery
1,32541,Coachella Valley Brewing Co,Thousand Palms,CA,US,No notes at this time.,"Brewery, Bar, Beer-to-go"
2,44736,Beef 'O' Brady's,Plant City,FL,US,No notes at this time.,"Bar, Eatery"
3,23372,Broadway Wine Merchant,Oklahoma City,OK,US,No notes at this time.,Store
4,35328,Brighton Beer Dispensary (DUPLICATE),Brighton,GB2,GB,Duplicate of https://www.beeradvocate.com/beer...,"Bar, Eatery"


In [20]:
# Restrict the breweries to the US: keep rows whose country code equals 'US'
us_brewery_df = brewery_df.loc[brewery_df['country'].eq('US')]
us_brewery_df.head()

Unnamed: 0,id,name,city,state,country,notes,types
1,32541,Coachella Valley Brewing Co,Thousand Palms,CA,US,No notes at this time.,"Brewery, Bar, Beer-to-go"
2,44736,Beef 'O' Brady's,Plant City,FL,US,No notes at this time.,"Bar, Eatery"
3,23372,Broadway Wine Merchant,Oklahoma City,OK,US,No notes at this time.,Store
5,31561,Teddy's Tavern,Seattle,WA,US,No notes at this time.,"Bar, Beer-to-go"
9,41278,The Other End,Destin,FL,US,No notes at this time.,"Bar, Eatery"


In [21]:
# Merge all three datasets together: join each review/beer row to its brewery
# by matching brewery_id to the brewery's id. Both inputs have 'name' and
# 'state' columns, so the merged frame carries them with _x (review/beer
# side) and _y (brewery side) suffixes.
brewery_beer_review_merged_df = trimmed_df.merge(
    us_brewery_df,
    how="inner",
    left_on="brewery_id",
    right_on="id",
)
len(brewery_beer_review_merged_df)

2190010

In [22]:
# Preview the first rows of the review + beer + brewery merge
brewery_beer_review_merged_df.head()

Unnamed: 0,beer_id,username,date,look,smell,taste,feel,overall,score,name_x,...,style,availability,abv,id,name_y,city,state_y,country,notes,types
0,271781,bluejacket74,2017-03-17,4.0,4.0,4.0,4.25,4.0,4.03,Motorbreath Imperial Stout,...,American Imperial Stout,Limited (brewed once),10.8,28094,Four String Brewing Company,Columbus,OH,US,No notes at this time.,"Brewery, Bar"
1,184647,Try-em-all,2017-09-27,4.0,4.0,3.5,3.5,3.75,3.7,Payback Pilsner,...,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,US,No notes at this time.,"Brewery, Bar"
2,184647,ScorpioBeerLover,2017-03-22,4.0,4.0,4.25,3.75,4.0,4.08,Payback Pilsner,...,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,US,No notes at this time.,"Brewery, Bar"
3,184647,beergoot,2016-10-08,3.25,3.25,3.5,3.5,3.5,3.43,Payback Pilsner,...,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,US,No notes at this time.,"Brewery, Bar"
4,184647,woodychandler,2016-09-25,3.75,3.75,3.5,3.5,3.5,3.58,Payback Pilsner,...,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,US,No notes at this time.,"Brewery, Bar"


In [23]:
# List the merged columns to spot the suffixed names that need renaming
brewery_beer_review_merged_df.columns

Index(['beer_id', 'username', 'date', 'look', 'smell', 'taste', 'feel',
       'overall', 'score', 'name_x', 'brewery_id', 'state_x', 'style',
       'availability', 'abv', 'id', 'name_y', 'city', 'state_y', 'country',
       'notes', 'types'],
      dtype='object')

In [24]:
# Give the suffixed name/state columns descriptive names.
# NOTE(review): state_x comes from the beer data (the reviews file has no
# state column), so 'review_state' holds the beer's state - confirm that this
# column name is intended.
brewery_beer_review_merged_df = brewery_beer_review_merged_df.rename(
    columns={
        'name_x': 'beer_name',
        'name_y': 'brewery_name',
        'state_x': 'review_state',
        'state_y': 'brewery_state',
    }
)

In [25]:
# Confirm the renamed columns
brewery_beer_review_merged_df.columns

Index(['beer_id', 'username', 'date', 'look', 'smell', 'taste', 'feel',
       'overall', 'score', 'beer_name', 'brewery_id', 'review_state', 'style',
       'availability', 'abv', 'id', 'brewery_name', 'city', 'brewery_state',
       'country', 'notes', 'types'],
      dtype='object')

In [26]:
# Remove the columns that are no longer wanted. 'brewery_id' was the left
# join key of the brewery merge, so the brewery's own 'id' column still
# carries the same values.
brewery_beer_review_merged_df = brewery_beer_review_merged_df.drop(
    columns=['brewery_id', 'notes', 'country']
)

In [27]:
# Confirm which columns remain after the drop
brewery_beer_review_merged_df.columns

Index(['beer_id', 'username', 'date', 'look', 'smell', 'taste', 'feel',
       'overall', 'score', 'beer_name', 'review_state', 'style',
       'availability', 'abv', 'id', 'brewery_name', 'city', 'brewery_state',
       'types'],
      dtype='object')

In [28]:
# With the old brewery_id column gone, rename the remaining brewery-side
# columns so their names say what they describe
brewery_beer_review_merged_df = brewery_beer_review_merged_df.rename(
    columns={
        'id': 'brewery_id',
        'city': 'brewery_city',
        'types': 'brewery_types',
    }
)

In [29]:
# Confirm the final column names
brewery_beer_review_merged_df.columns

Index(['beer_id', 'username', 'date', 'look', 'smell', 'taste', 'feel',
       'overall', 'score', 'beer_name', 'review_state', 'style',
       'availability', 'abv', 'brewery_id', 'brewery_name', 'brewery_city',
       'brewery_state', 'brewery_types'],
      dtype='object')

In [30]:
# Preview the first rows with the final column names
brewery_beer_review_merged_df.head()

Unnamed: 0,beer_id,username,date,look,smell,taste,feel,overall,score,beer_name,review_state,style,availability,abv,brewery_id,brewery_name,brewery_city,brewery_state,brewery_types
0,271781,bluejacket74,2017-03-17,4.0,4.0,4.0,4.25,4.0,4.03,Motorbreath Imperial Stout,OH,American Imperial Stout,Limited (brewed once),10.8,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
1,184647,Try-em-all,2017-09-27,4.0,4.0,3.5,3.5,3.75,3.7,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
2,184647,ScorpioBeerLover,2017-03-22,4.0,4.0,4.25,3.75,4.0,4.08,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
3,184647,beergoot,2016-10-08,3.25,3.25,3.5,3.5,3.5,3.43,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
4,184647,woodychandler,2016-09-25,3.75,3.75,3.5,3.5,3.5,3.58,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"


In [31]:
# Do an additional NULL check now that we merged: the beer and brewery
# columns brought in by the merges may contain missing values of their own
brewery_beer_review_merged_df.isnull().sum()

beer_id              0
username             0
date                 0
look                 0
smell                0
taste                0
feel                 0
overall              0
score                0
beer_name            0
review_state     12230
style                0
availability         0
abv              51263
brewery_id           0
brewery_name         0
brewery_city     12230
brewery_state    12230
brewery_types        0
dtype: int64

In [32]:
# The null counts are small relative to the dataset, so remove every row that
# has a missing value in any column
brewery_beer_review_merged_df = brewery_beer_review_merged_df.dropna(
    axis="index",
    how="any",
)

In [33]:
# Build the output paths: the uncompressed CSV and the zip archive that will
# hold it, both in the ../Resources directory
csv_file_path = os.path.join('../Resources',"reviews_beer_brewery.csv")
zip_file_path = os.path.join('../Resources',"reviews_beer_brewery.zip")

In [34]:
# Save the merged dataframe as an uncompressed CSV (without the index column);
# a later cell compresses this file into the zip archive
brewery_beer_review_merged_df.to_csv(csv_file_path, index=False)

In [35]:
# Compress the final output into a zip archive.
# The context manager closes the archive even if writing fails. arcname
# stores the CSV under its bare file name; without it the member would be
# recorded with the relative '../Resources/...' path that was passed in.
with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zout:
    zout.write(csv_file_path, arcname=os.path.basename(csv_file_path))

In [36]:
# Make sure we can read the zip file just created: load it back with
# read_csv and preview the first rows
zip_df = pd.read_csv(zip_file_path)
zip_df.head()

Unnamed: 0,beer_id,username,date,look,smell,taste,feel,overall,score,beer_name,review_state,style,availability,abv,brewery_id,brewery_name,brewery_city,brewery_state,brewery_types
0,271781,bluejacket74,2017-03-17,4.0,4.0,4.0,4.25,4.0,4.03,Motorbreath Imperial Stout,OH,American Imperial Stout,Limited (brewed once),10.8,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
1,184647,Try-em-all,2017-09-27,4.0,4.0,3.5,3.5,3.75,3.7,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
2,184647,ScorpioBeerLover,2017-03-22,4.0,4.0,4.25,3.75,4.0,4.08,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
3,184647,beergoot,2016-10-08,3.25,3.25,3.5,3.5,3.5,3.43,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"
4,184647,woodychandler,2016-09-25,3.75,3.75,3.5,3.5,3.5,3.58,Payback Pilsner,OH,Bohemian Pilsener,Rotating,5.1,28094,Four String Brewing Company,Columbus,OH,"Brewery, Bar"


In [37]:
# Show the row count, columns and dtypes of the data read back from the zip
zip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127677 entries, 0 to 2127676
Data columns (total 19 columns):
 #   Column         Dtype  
---  ------         -----  
 0   beer_id        int64  
 1   username       object 
 2   date           object 
 3   look           float64
 4   smell          float64
 5   taste          float64
 6   feel           float64
 7   overall        float64
 8   score          float64
 9   beer_name      object 
 10  review_state   object 
 11  style          object 
 12  availability   object 
 13  abv            float64
 14  brewery_id     int64  
 15  brewery_name   object 
 16  brewery_city   object 
 17  brewery_state  object 
 18  brewery_types  object 
dtypes: float64(7), int64(2), object(10)
memory usage: 308.4+ MB
