In [None]:
import pandas as pd
import plotly.express as px
import hvplot.pandas

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Read the beer dataset from the project's Resources folder.
file_path = "../Resources/BeerProject.csv"
beer_df = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="unicode_escape",
)

# Preview the first rows of the loaded frame.
beer_df.head()

### 1. Process Data For Unsupervised Machine Learning

For data processing, the focus is on making sure the data is set up for the unsupervised learning model, which requires the following:

1. Null values are handled.

2. Only numerical data is used.

3. Values are scaled. In other words, the data has been transformed so that differences in scale between the features won't skew the results.

In [None]:
# 1. Hold on to the beer_style column under its own name so it is still
#    available after the column is dropped from beer_df.
beer_style = beer_df["beer_style"]

# Show how many entries were captured, then preview the first few.
print(beer_style.shape)
beer_style.head()

In [None]:
# 2. Drop the columns we don't need.
#    The result of drop() is assigned back to beer_df rather than using
#    inplace=True, so the frame object that earlier cells loaded and displayed
#    is not mutated behind their back.
columns_to_drop = [
    "beer_beerId",
    "beer_brewerId",
    "beer_name",
    "beer_style",
    "review_profileName",
    "review_text",
    "review_time",
]
beer_df = beer_df.drop(columns=columns_to_drop)
beer_df.head()


In [None]:
# 3. Remove every row that contains at least one missing value.
beer_df = beer_df.dropna(axis="index", how="any")

In [None]:
# 4. Display the dtype of every remaining column so we can confirm by eye that
#    all of them are numeric (int/float). This cell only displays the dtypes;
#    it does not enforce anything.
beer_df.dtypes

In [None]:
# 5. Standardise the data: fit a StandardScaler on beer_df and transform it
#    with that same scaler in a single call.
scaler = StandardScaler()
beer_scaled = scaler.fit_transform(beer_df)

### 2. Reducing Data Dimensions Using PCA

In [None]:
# Reduce the scaled data to three dimensions with PCA.

# Set up a PCA model that keeps three principal components.
pca = PCA(n_components=3)

# Fit the model on the scaled beer data and project the data onto those
# three components.
beer_pca = pca.fit_transform(X=beer_scaled)

In [None]:
# Wrap the PCA output in a DataFrame with one column per principal component.
# beer_df's index is reused so each row can be matched back to its source row.
pc_columns = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(beer_pca, index=beer_df.index, columns=pc_columns)
pcs_df.head()

### 3. Clustering Beer Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Build an elbow curve to help choose the number of clusters (K).

# Candidate cluster counts: 1 through 99.
k = list(range(1, 100))

# Fit one K-Means model per candidate K on the principal components and record
# the inertia of each fitted model (fit() returns the fitted estimator).
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Plot inertia against K as a line chart.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Use the elbow curve to find the number of clusters to use:
    
    k = x
    n_clusters = x

In [None]:
# Initialize the K-Means model with four clusters; random_state is fixed for
# reproducibility.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model on the principal components.
model.fit(pcs_df)

# Predict the cluster of every row.
predictions = model.predict(pcs_df)

# predict() returns a NumPy array, which has no .head() method, so slice the
# array to preview the first few cluster labels.
predictions[:5]

In [None]:
# Create a new DataFrame that places the beer features and their principal
# components side by side. pcs_df was built on beer_df's index, so the inner
# join on the index keeps every row.
clustered_df = pd.concat([beer_df, pcs_df], axis=1, join="inner")

# Add a "beer_type" column holding each row's beer style.
# beer_style was captured before the rows with missing values were dropped;
# column assignment aligns on the index, so only the styles belonging to the
# rows still present in clustered_df are used.
clustered_df["beer_type"] = beer_style

# Add a "Class" column holding the cluster label K-Means assigned to each row.
clustered_df["Class"] = model.labels_

# Print the shape of clustered_df and preview the first ten rows.
print(clustered_df.shape)
clustered_df.head(10)


## Visualize the Data

In [None]:
# 3D scatter of the three principal components. Both the colour and the marker
# symbol of each point are driven by its cluster label ("Class"), and hovering
# a point shows its "beer_type".
# NOTE: this may not make sense if we have loads of clusters.
fig = px.scatter_3d(
    data_frame=clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="beer_type",
    width=800,
)

# Anchor the legend at the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()