# Craft Beer Data

In [21]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# getting the beer data from a csv file
df = pd.read_csv('beers.csv')

In [3]:
df.head() # check_yo_head

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [4]:
# checking on the tail of the data
df.tail()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
2405,2405,0.067,45.0,928,Belgorado,Belgian IPA,424,12.0
2406,2406,0.052,,807,Rail Yard Ale,American Amber / Red Ale,424,12.0
2407,2407,0.055,,620,B3K Black Lager,Schwarzbier,424,12.0
2408,2408,0.055,40.0,145,Silverback Pale Ale,American Pale Ale (APA),424,12.0
2409,2409,0.052,,84,Rail Yard Ale (2009),American Amber / Red Ale,424,12.0


In [5]:
# taking a look at the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2410 non-null   int64  
 1   abv         2348 non-null   float64
 2   ibu         1405 non-null   float64
 3   id          2410 non-null   int64  
 4   name        2410 non-null   object 
 5   style       2405 non-null   object 
 6   brewery_id  2410 non-null   int64  
 7   ounces      2410 non-null   float64
dtypes: float64(3), int64(3), object(2)
memory usage: 150.8+ KB


In [6]:
# taking a look at the shape of the data
df.shape

(2410, 8)

Initial Takeaways:

- So it looks like there are 8 columns 

- The unnamed column is a count of the data, probably don't need it

- Some of the ibu has data and some don't

- I don't know what the id column is for

In [None]:
# prep

# df.drop(columns = 'Unnamed: 0', )

In [7]:
df_1 = pd.read_csv('breweries.csv')

In [8]:
df_1.head()

Unnamed: 0.1,Unnamed: 0,name,city,state
0,0,NorthGate Brewing,Minneapolis,MN
1,1,Against the Grain Brewery,Louisville,KY
2,2,Jack's Abby Craft Lagers,Framingham,MA
3,3,Mike Hess Brewing Company,San Diego,CA
4,4,Fort Point Beer Company,San Francisco,CA


In [9]:
# using merge function by setting how='inner'
output1 = pd.merge(df, df_1, 
                   on='Unnamed: 0', 
                   how='inner')

In [10]:
output1.head()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name_x,style,brewery_id,ounces,name_y,city,state
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0,NorthGate Brewing,Minneapolis,MN
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0,Against the Grain Brewery,Louisville,KY
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0,Jack's Abby Craft Lagers,Framingham,MA
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0,Mike Hess Brewing Company,San Diego,CA
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0,Fort Point Beer Company,San Francisco,CA


In [11]:
df = output1

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name_x,style,brewery_id,ounces,name_y,city,state
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0,NorthGate Brewing,Minneapolis,MN
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0,Against the Grain Brewery,Louisville,KY
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0,Jack's Abby Craft Lagers,Framingham,MA
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0,Mike Hess Brewing Company,San Diego,CA
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0,Fort Point Beer Company,San Francisco,CA


In [17]:
df.isna().sum()

Unnamed: 0      0
abv            16
ibu           272
id              0
name_x          0
style           0
brewery_id      0
ounces          0
name_y          0
city            0
state           0
dtype: int64

In [19]:
df.shape

(558, 11)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 0 to 557
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  558 non-null    int64  
 1   abv         542 non-null    float64
 2   ibu         286 non-null    float64
 3   id          558 non-null    int64  
 4   name_x      558 non-null    object 
 5   style       558 non-null    object 
 6   brewery_id  558 non-null    int64  
 7   ounces      558 non-null    float64
 8   name_y      558 non-null    object 
 9   city        558 non-null    object 
 10  state       558 non-null    object 
dtypes: float64(3), int64(3), object(5)
memory usage: 52.3+ KB


In [22]:
df.describe()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,brewery_id,ounces
count,558.0,542.0,286.0,558.0,558.0,558.0
mean,278.5,0.060825,43.027972,1519.008961,219.494624,13.391398
std,161.224998,0.0137,26.139556,788.197909,169.346041,1.964075
min,0.0,0.032,4.0,9.0,1.0,8.4
25%,139.25,0.05025,22.0,873.75,60.0,12.0
50%,278.5,0.0585,37.0,1568.5,187.0,12.0
75%,417.75,0.068,60.0,2217.75,368.0,16.0
max,557.0,0.125,138.0,2686.0,556.0,24.0


In [23]:
df.style.value_counts().plot.bar()

AttributeError: 'Styler' object has no attribute 'value_counts'