# Craft Beer Data

In [1]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

In [2]:
# getting the beer data from a csv file
df = pd.read_csv('beers.csv')

In [3]:
df.head() # check_yo_head

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [4]:
# checking on the tail of the data
df.tail()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
2405,2405,0.067,45.0,928,Belgorado,Belgian IPA,424,12.0
2406,2406,0.052,,807,Rail Yard Ale,American Amber / Red Ale,424,12.0
2407,2407,0.055,,620,B3K Black Lager,Schwarzbier,424,12.0
2408,2408,0.055,40.0,145,Silverback Pale Ale,American Pale Ale (APA),424,12.0
2409,2409,0.052,,84,Rail Yard Ale (2009),American Amber / Red Ale,424,12.0


In [5]:
# taking a look at the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2410 non-null   int64  
 1   abv         2348 non-null   float64
 2   ibu         1405 non-null   float64
 3   id          2410 non-null   int64  
 4   name        2410 non-null   object 
 5   style       2405 non-null   object 
 6   brewery_id  2410 non-null   int64  
 7   ounces      2410 non-null   float64
dtypes: float64(3), int64(3), object(2)
memory usage: 150.8+ KB


In [6]:
# taking a look at the shape of the data
df.shape

(2410, 8)

Initial Takeaways:

- So it looks like there are 8 columns 

- The `Unnamed: 0` column is just a running row count, and both .csv files have one

- Some rows have an ibu value and some don't (only 1405 of the 2410 rows are non-null)

- I don't know what the id column is for

In [7]:
# prep

# df.drop(columns = 'Unnamed: 0', )

In [None]:
# reading the breweries csv file
df_1 = pd.read_csv('breweries.csv')

In [None]:
df_1.head() #check_yo_head

In [None]:
# joining every beer to the brewery that makes it (inner merge).
# Joining on both files' 'Unnamed: 0' row counters only paired beer row i with
# brewery row i and threw away every beer past the last brewery row, so the
# breweries' row id is renamed and matched against beers.brewery_id instead.
# NOTE(review): assumes breweries.csv's 'Unnamed: 0' is the id that
# beers.brewery_id refers to -- confirm against the data source.
df = pd.merge(df,
              df_1.rename(columns={'Unnamed: 0': 'brewery_id'}),
              on='brewery_id',
              how='inner',
              validate='many_to_one')  # each beer must match at most one brewery

In [None]:
df.head() #check_yo_head

In [None]:
# checking for nulls in the data
df.isna().sum()

In [None]:
# taking a look at the shape of the merged data
df.shape

In [None]:
# quick info on the merged data
df.info()

In [None]:
# giving the merged columns friendlier names
column_names = {
    "Unnamed: 0": "number",
    "name_x": "beer",
    "name_y": "brewery",
    "style": "beer_style",
}
df = df.rename(columns=column_names)

In [None]:
df.head() #check_yo_head

In [None]:
# messing around and dropping a .plot() on the data
df.plot()

In [None]:
# taking a look at the count of the top beer styles
df.beer_style.value_counts().head()

In [None]:
# looking to see if the same brewery has multiple beers
df.brewery.value_counts()

In [None]:
# NOTE: df.shape showed 558 rows when this was written; that figure depends on
# how the merge above is keyed, so re-check it after any change to the join
# seeing how many different breweries there are
df.brewery.nunique()

In [None]:
# seeing how many unique beers there are
df.beer.nunique()

In [None]:
# looking at the amount of beer per state
df.state.value_counts()

In [None]:
# trying out a .plot() on the states
df.state.value_counts().plot()

In [None]:
# value_counts() counts rows, so a brewery that appears on several rows would be
# counted several times; keep one row per distinct brewery before counting
breweries = df.drop_duplicates(subset=['brewery', 'city', 'state'])
state_df = breweries.state.value_counts().rename_axis('states').reset_index(name='breweries')
state_df.head(5)

In [None]:
# Visualization: the ten states with the most breweries next to the ten with the fewest

# theme goes first so that it applies to the axes created below
sns.set_theme(style="whitegrid")

# the two panels list different states, so the y axis must not be shared
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Top 10 States and Bottom 10 States')

# Top 10 States
sns.barplot(ax=axes[0], x="breweries", y="states", data=state_df.head(10))
axes[0].set_title("Brewery Lovers")

# Bottom 10 States (same orientation as the left panel so the two read alike)
sns.barplot(ax=axes[1], x="breweries", y="states", data=state_df.tail(10))
axes[1].set_title("Brewery Haters")

# keeps the right panel's state labels from running into the left panel
fig.tight_layout()

In [None]:
# Bottom 10 States
sns.set_theme(style="whitegrid")

# a single-panel plt.subplots() call returns one Axes object, not an array,
# so it is used directly instead of being indexed with [0] / [1]
fig, ax = plt.subplots(figsize=(10, 5))

sns.barplot(ax=ax, x="states", y="breweries", data=state_df.tail(10))
ax.set_title("Brewery Haters")

In [None]:
df.head()

In [None]:
# tallying how many rows fall under each beer style, most common first
style_counts = df['beer_style'].value_counts()
top_beer = style_counts.rename_axis('beer_type').reset_index(name='total')
top_beer.head()

In [None]:
# horizontal bar chart of the ten most common beer styles
top_flight_ax = sns.barplot(data=top_beer.head(10), x='total', y='beer_type')
top_flight_ax.set_title('Top Flight Beers')


In [None]:
# pie chart of how the ten most common beer styles split between themselves
top10_beers = top_beer.head(10)
labels = top10_beers['beer_type']
totals = top10_beers['total']

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])   # axes fill the whole figure
ax.axis('equal')                  # equal aspect keeps the pie circular
ax.pie(x=totals, labels=labels, autopct='%1.2f%%')
# NOTE(review): the totals are counts of rows per style, not consumption
# figures -- the title wording may be worth revisiting
ax.set_title('Percent of Top 10 Beers Consumed')
plt.show()

In [None]:
# making a function

def beer_df(beers_path='beers.csv', breweries_path='breweries.csv',
            out_path='beer_data.csv', abv_fill=0.05):
    '''
    This function takes in the beer data and brewery data,
    merges them into one dataframe with one row per beer,
    renames the columns for readability,
    fills the missing abv / ibu values
    and saves the new dataframe as a csv

    Parameters:
        beers_path: csv with the beers (needs a 'brewery_id' column)
        breweries_path: csv with the breweries (its 'Unnamed: 0' column is used as the brewery id)
        out_path: where the merged dataframe is written
        abv_fill: value used for missing abv (ibu is filled with the column mean)

    Returns:
        the merged, cleaned dataframe
    '''
    # getting the beers data
    beers = pd.read_csv(beers_path)
    # getting the breweries data
    breweries = pd.read_csv(breweries_path)
    # merging the two csv files together.
    # Joining on both files' 'Unnamed: 0' row counters only paired beer row i
    # with brewery row i and dropped every beer past the last brewery row, so
    # the breweries' row id is matched against beers.brewery_id instead.
    # NOTE(review): assumes breweries' 'Unnamed: 0' is the id that
    # beers.brewery_id refers to -- confirm against the data source.
    breweries = breweries.rename(columns={'Unnamed: 0': 'brewery_id'})
    df = pd.merge(beers, breweries,
                  on='brewery_id',
                  how='inner',
                  validate='many_to_one')  # each beer must match at most one brewery
    # renaming columns for readability
    df = df.rename(columns={"style": "beer_style", "name_y": "brewery",
                            "Unnamed: 0": "number", "name_x": "beer"})
    # dropping columns that are not needed
    df = df.drop(columns=['id', 'brewery_id'])
    # filling null values: a fixed value for abv, the column mean for ibu
    df = df.fillna({'abv': abv_fill, 'ibu': df['ibu'].mean()})
    # saving the beer data to a csv file
    df.to_csv(out_path)

    return df

In [None]:
# using my beer_df function
df = beer_df()
df.head() # check_yo_head

In [None]:
# states function

def beer_states(df=None):
    '''
    This function counts the distinct breweries in each state and
    makes a new dataframe with the states and number of breweries

    Parameters:
        df: dataframe with 'brewery', 'city' and 'state' columns.
            When left out, beer_df() is called to build it (which re-reads
            the csv files and rewrites the saved csv).

    Returns:
        dataframe with columns 'states' and 'breweries', largest count first
    '''
    if df is None:
        df = beer_df()

    # value_counts() counts rows, so a brewery with several beers would be
    # counted once per beer; keep one row per distinct brewery first
    breweries = df.drop_duplicates(subset=['brewery', 'city', 'state'])

    state_df = breweries['state'].value_counts().rename_axis('states').reset_index(name='breweries')

    return state_df

In [None]:
beer_states()

In [None]:
# top beers function

def top_beers(df=None):
    '''
    This function takes the count of each beer type in the data and
    makes a new dataframe with the beer type and total number

    Parameters:
        df: dataframe with a 'beer_style' column.
            When left out, beer_df() is called to build it (which re-reads
            the csv files and rewrites the saved csv).

    Returns:
        dataframe with columns 'beer_type' and 'total', most common type first
    '''
    if df is None:
        df = beer_df()

    # rows with a missing beer_style are left out by value_counts()
    style_counts = df['beer_style'].value_counts()
    top_beer = style_counts.rename_axis('beer_type').reset_index(name='total')

    return top_beer

In [None]:
top_beers().head()

In [None]:
df.head()

In [None]:
# one row per distinct brewery, so a brewery that appears on several rows
# (one per beer) is only counted once in the charts below
breweries = df.drop_duplicates(subset=['brewery', 'city', 'state'])

# plotting a bar graph for the number of breweries in each state
per_state = breweries.state.value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
plot = per_state.plot(kind='bar', ax=ax, colormap='winter',
                      title="Number of Breweries in Each State")
plot.set_xlabel('State')
plot.set_ylabel('Number of Breweries')
mean_line = plot.axhline(per_state.mean(), color='r',
                         label='Average Number of Breweries')
plot.legend()

# plotting a bar graph for the cities with the most breweries.
# It gets its own figure: Series.plot() without ax= draws onto the current
# axes, which would stack these bars on top of the state chart above.
per_city = breweries.groupby('city').size().nlargest(15)
fig5, ax5 = plt.subplots()
plot5 = per_city.plot(kind='bar', ax=ax5, colormap='winter',
                      title='Cities with the Most Breweries')
plot5.set_xlabel('City')
plot5.set_ylabel('Number of Breweries')

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
df.head()

In [None]:
# df may already have been through beer_df(), which drops these two columns,
# so a missing column is tolerated instead of raising KeyError; rebinding df
# (rather than mutating in place) also keeps this cell safe to re-run
df = df.drop(columns=['id', 'brewery_id'], errors='ignore')

# fill values: a fixed abv and the mean of the ibu values that are present
avg_abv = 0.05
ibu_mean = df.ibu.mean()

df = df.fillna({'abv': avg_abv, 'ibu': ibu_mean})

In [None]:
df.head()