In [1]:
# Import necessary libraries

import altair as alt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config

# Lift Altair's default cap on the number of rows a chart may embed,
# so larger DataFrames can be plotted without raising an error
alt.data_transformers.disable_max_rows()

# Have scikit-learn transformers return pandas DataFrames (keeping column
# names) instead of NumPy arrays
set_config(transform_output="pandas")

In [7]:
# Task 1: Read and display
file_path = 'data/beers.csv'

# Keep the frame exactly as read under its own name so the raw data stays
# available and nothing below mutates it in place.
beer_raw = pd.read_csv(file_path)

# Remove unnamed columns: pandas labels header-less CSV columns 'Unnamed: N'
# (presumably a row index that was written into the file -- confirm against the CSV).
unnamed_cols = [col for col in beer_raw.columns if 'Unnamed' in col]

# drop(columns=...) already targets the column axis and returns a new frame,
# so neither axis=1 nor inplace=True is needed.
beer = beer_raw.drop(columns=unnamed_cols)

beer

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0.050,,1436,Pub Beer,American Pale Lager,408,12.0
1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,0.090,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,0.075,,2262,Sex and Candy,American IPA,177,12.0
...,...,...,...,...,...,...,...
2405,0.067,45.0,928,Belgorado,Belgian IPA,424,12.0
2406,0.052,,807,Rail Yard Ale,American Amber / Red Ale,424,12.0
2407,0.055,,620,B3K Black Lager,Schwarzbier,424,12.0
2408,0.055,40.0,145,Silverback Pale Ale,American Pale Ale (APA),424,12.0


In [13]:
# Task 2: Visualize
# One circle per beer: bitterness on the x-axis, alcohol content on the y-axis
beer_scatter = (
    alt.Chart(beer)
    .mark_circle()
    .encode(
        alt.X('ibu', title='International bittering units (IBU)'),
        alt.Y('abv', title='Alcoholic content by volume'),
    )
    .properties(title='Scatter plot of IBU vs ABV in Craft Beers')
)
beer_scatter

In [14]:
# Task 3: Handle missing values
# Keep only the beers for which both clustering features were recorded
has_both_measurements = beer[['ibu', 'abv']].notna().all(axis=1)
clean_beer = beer[has_both_measurements]
clean_beer

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
21,0.099,92.0,1036,Lower De Boom,American Barleywine,368,8.4
22,0.079,45.0,1024,Fireside Chat,Winter Warmer,368,12.0
24,0.044,42.0,876,Bitter American,American Pale Ale (APA),368,12.0
25,0.049,17.0,802,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,368,12.0
...,...,...,...,...,...,...,...
2398,0.077,30.0,1513,Lights Out Vanilla Cream Extra Stout,American Double / Imperial IPA,199,12.0
2399,0.069,69.0,1512,Worthy IPA (2013),American IPA,199,12.0
2400,0.060,50.0,1511,Worthy Pale,American Pale Ale (APA),199,12.0
2405,0.067,45.0,928,Belgorado,Belgian IPA,424,12.0


In [15]:
# Task 4: Setup scaling
# Standardise the two clustering features; every other column is discarded
cluster_features = ['ibu', 'abv']
beer_preprocessor = make_column_transformer(
    (StandardScaler(), cluster_features),
    remainder='drop',
)

beer_preprocessor

In [16]:
# Task 5: Create a model
# K-means with two clusters; the fixed random_state keeps the fit reproducible
beer_cluster_k2 = KMeans(random_state=1234, n_clusters=2)

beer_cluster_k2

In [17]:
# Task 6: Create a pipeline
# Chain scaling and clustering, then fit both steps on the cleaned data.
# fit() returns the pipeline itself, so the assigned object is the fitted pipeline.
beer_pipe = make_pipeline(beer_preprocessor, beer_cluster_k2).fit(clean_beer)

beer_pipe

In [18]:
# Task 7: Create a scatter plot
# assign() returns a new frame, so clean_beer itself is left untouched while
# each beer gains the cluster label found by the fitted K-means step
fitted_kmeans = beer_pipe.named_steps['kmeans']
clustered_beer = clean_beer.assign(cluster=fitted_kmeans.labels_)

# IBU against ABV, with points coloured by cluster label treated as a category (:N)
clustered_beer_chart = (
    alt.Chart(clustered_beer)
    .mark_circle()
    .encode(
        alt.X('ibu', title='International bittering units (IBU)'),
        alt.Y('abv', title='Alcoholic content by volume'),
        alt.Color('cluster:N'),
    )
    .properties(title='Clustered Beers by ABV and IBU')
)

clustered_beer_chart

In [19]:
# Task 8: Determine the values of WSSD and K
# The last pipeline step is the fitted K-means model; its inertia_ attribute
# holds the within-cluster sum of squared distances for K = 2
fitted_kmeans_k2 = beer_pipe[-1]
beer_cluster_k2_wssd = fitted_kmeans_k2.inertia_

beer_cluster_k2_wssd

1110.8925735892112

In [20]:
# Task 8 cont.: Defining range of K values to test
# Candidate cluster counts 1 through 10 (range's upper bound is exclusive)
beer_ks = range(1, 11)

In [21]:
# Task 9: Compute the WSSD for each value of K
# The scaling step does not depend on K, so standardise the features once
# here instead of rebuilding and re-fitting the same scaler for every K.
beer_scaled = beer_preprocessor.fit_transform(clean_beer)

# One K-means fit per candidate K on the already-scaled features; inertia_ is
# that fit's WSSD. The shared random_state keeps the elbow curve reproducible.
beer_wssds = [
    KMeans(n_clusters=k, random_state=1234).fit(beer_scaled).inertia_
    for k in beer_ks
]

beer_wssds

[2810.0,
 1110.8925735892112,
 805.8606054096144,
 613.7620142715004,
 474.7211160857019,
 416.78666146532066,
 361.6228514419578,
 330.42849188443074,
 302.85442259926526,
 269.42104097884794]

In [22]:
# Task 10: Values of K
# One row per candidate K, paired with the WSSD obtained for that K
beer_model_stats = pd.DataFrame({'k': list(beer_ks)}).assign(wssd=beer_wssds)

beer_model_stats

Unnamed: 0,k,wssd
0,1,2810.0
1,2,1110.892574
2,3,805.860605
3,4,613.762014
4,5,474.721116
5,6,416.786661
6,7,361.622851
7,8,330.428492
8,9,302.854423
9,10,269.421041


In [23]:
# Task 11: Create a plot for the number of clusters
# Line chart (with point markers) of WSSD against K, used to look for the elbow
elbow_plot = (
    alt.Chart(beer_model_stats)
    .mark_line(point=True)
    .encode(
        alt.X('k', title='Number of Clusters (K)'),
        alt.Y('wssd', title='Within-Cluster Sum of Squares (WSSD)'),
    )
    .properties(title='Elbow Method: WSSD vs. Number of Clusters')
)

elbow_plot

In [28]:
# Reflection Questions
# Each entry pairs a question with its answer; they are printed in order,
# question first, exactly as written.
reflection_qa = [
    (
        "A: \n From the plot above, which K should we choose?\n",
        "We should choose K = 3\n",
    ),
    (
        "B: \n Why did you choose the K value you chose?\n",
        "K = 3 is the 'elbow point' in the WSSD curve, where adding more clusters only slightly reduces WSSD, suggesting that 3 clusters capture the major groupings within the data without the risk of overfitting.\n",
    ),
    (
        "C: \n Considering the clusters formed based on the abv and ibu variables, what actionable insights can breweries derive about their craft beer offerings? How might these clusters challenge traditional labeling and marketing strategies, and what limitations should breweries keep in mind when using clustering algorithms to analyze their products?\n",
        "The clusters show distinct groups based on alcohol content and bitterness which can help with positioning the products. Two different clusters could be one for lighter drinkers with a lower alcohol content and less bitter while another grouping could be for heavier beer drinkers and consist of a high alcohol content and higher bitterness. \n This could challenge traditional labeling and marketing strategies by showing similarities across differently labeled beers, maybe a chemical similarity that is causing the bitterness.\n Clustering doesn't account for the individual taste each beer may have or if they are tied to a certain seasonal preference, etc. Breweries should combine clustering with other marketing research before making final decisions when analyzing their products. ",
    ),
]

for question, answer in reflection_qa:
    print(question)
    print(answer)

A: 
 From the plot above, which K should we choose?

We should choose K = 3

B: Why did you choose the K value you chose?

K = 3 is the 'elbow point' in the WSSD curve, where adding more clusters only slightly reduces WSSD, suggesting that 3 clusters capture the major groupings within the data without the risk of overfitting.

C: 
 Considering the clusters formed based on the abv and ibu variables, what actionable insights can breweries derive about their craft beer offerings? How might these clusters challenge traditional labeling and marketing strategies, and what limitations should breweries keep in mind when using clustering algorithms to analyze their products?

The clusters show distinct groups based on alcohol content and bitterness which can help with positioning the products. Two different clusters could be one for lighter drinkers with a lower alcohol content and less bitter while another grouping could be for heavier beer drinkers and consist of a high alcohol content and hi