In [4]:
"""
Data set description:
The goal of the project was to create a tasting profile on beer based on word counts for a classification and recommendation system. 
The first ten columns are information on the beer provided by the source, along with contributed information like a unique key for each beer and style. 
The last eleven columns represent the tasting profile features of the beer, and are defined by word counts found in up to 25 reviews of each beer. 
The assumption is that people writing reviews are more than likely describing what they do experience rather than what they do not.
"""

'\nData set description:\nThe goal of the project was to create a tasting profile on beer based on word counts for a classification and recommendation system. \nThe first ten columns are information on the beer provided by the source, along with contributed information like a unique key for each beer and style. \nThe last eleven columns represent the tasting profile features of the beer, and are defined by word counts found in up to 25 reviews of each beer. \nThe assumption is that people writing reviews are more than likely describing what they do experience rather than what they do not.\n'

In [52]:
# load libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import decomposition as dcp
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# Read the beer table and the three lookup tables in one pass; every file
# uses its first CSV column as the index.
df, brewery_key, style_key, name_key = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("df_clean.csv", "brewery_key.csv", "style_key.csv", "name_key.csv")
)

# Number of rows in the beer table.
print(df.shape[0])

3360


In [3]:
# list the column labels of the loaded beer table
df.columns

Index(['key', 'Brewery Key', 'Style Key', 'Description', 'ABV', 'Ave Rating',
       'Min IBU', 'Max IBU', 'Astringency_customer', 'Body_customer',
       'Alcohol_customer', 'Bitter_customer', 'Sweet_customer',
       'Sour_customer', 'Salty_customer', 'Fruits_customer', 'Hoppy_customer',
       'Spices_customer', 'Malty_customer', 'Astringency_description',
       'Body_description', 'Alcohol_description', 'Bitter_description',
       'Sweet_description', 'Sour_description', 'Salty_description',
       'Fruits_description', 'Hoppy_description', 'Spices_description',
       'Malty_description'],
      dtype='object')

In [5]:
# inspect data: preview the first rows of the beer table
df.head()

Unnamed: 0,key,Brewery Key,Style Key,Description,ABV,Ave Rating,Min IBU,Max IBU,Astringency_customer,Body_customer,...,Body_description,Alcohol_description,Bitter_description,Sweet_description,Sour_description,Salty_description,Fruits_description,Hoppy_description,Spices_description,Malty_description
0,251,34,8,Richly malty and long on the palate with just ...,5.3,3.65,25,50,13,32,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,252,872,8,This malty full bodied double alt is also know...,7.2,3.9,25,50,12,57,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,253,872,8,Long Trail Ale is full bodied amber ale modele...,5.0,3.58,25,50,14,37,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,255,1019,8,Just cause it dark and German doesn mean it an...,5.3,3.67,25,50,21,69,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
5,256,997,8,Called Dark Double Alt on the label Seize the ...,7.2,3.78,25,50,25,51,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# check NaN values: one boolean per column, True when that column holds
# at least one missing entry
print(df.isna().any())

key                        False
Brewery Key                False
Style Key                  False
Description                False
ABV                        False
Ave Rating                 False
Min IBU                    False
Max IBU                    False
Astringency_customer       False
Body_customer              False
Alcohol_customer           False
Bitter_customer            False
Sweet_customer             False
Sour_customer              False
Salty_customer             False
Fruits_customer            False
Hoppy_customer             False
Spices_customer            False
Malty_customer             False
Astringency_description     True
Body_description            True
Alcohol_description         True
Bitter_description          True
Sweet_description           True
Sour_description            True
Salty_description           True
Fruits_description          True
Hoppy_description           True
Spices_description          True
Malty_description           True
dtype: bool

#### Part 1: PCA - grouping companies based on beer style

In [35]:
# Build the frame for the first PCA: remove the identifier/text/rating columns
# and every *_description tasting column, then renumber the rows from 0 so the
# index is contiguous.
df_pca1 = (
    df
    .drop(
        columns=[
            "Description", "Ave Rating", "Style Key", "key",
            "Astringency_description", "Body_description",
            "Alcohol_description", "Bitter_description",
            "Sweet_description", "Sour_description",
            "Salty_description", "Fruits_description",
            "Hoppy_description", "Spices_description",
            "Malty_description",
        ]
    )
    .reset_index(drop=True)
)
df_pca1.head()

Unnamed: 0,Brewery Key,ABV,Min IBU,Max IBU,Astringency_customer,Body_customer,Alcohol_customer,Bitter_customer,Sweet_customer,Sour_customer,Salty_customer,Fruits_customer,Hoppy_customer,Spices_customer,Malty_customer
0,34,5.3,25,50,13,32,9,47,74,33,0,33,57,8,111
1,872,7.2,25,50,12,57,18,33,55,16,0,24,35,12,84
2,872,5.0,25,50,14,37,6,42,43,11,0,10,54,4,62
3,1019,5.3,25,50,21,69,10,63,120,14,0,19,36,15,218
4,997,7.2,25,50,25,51,26,44,45,9,1,11,51,20,95


In [36]:
# display the frame kept for the first PCA
df_pca1

Unnamed: 0,Brewery Key,ABV,Min IBU,Max IBU,Astringency_customer,Body_customer,Alcohol_customer,Bitter_customer,Sweet_customer,Sour_customer,Salty_customer,Fruits_customer,Hoppy_customer,Spices_customer,Malty_customer
0,34,5.3,25,50,13,32,9,47,74,33,0,33,57,8,111
1,872,7.2,25,50,12,57,18,33,55,16,0,24,35,12,84
2,872,5.0,25,50,14,37,6,42,43,11,0,10,54,4,62
3,1019,5.3,25,50,21,69,10,63,120,14,0,19,36,15,218
4,997,7.2,25,50,25,51,26,44,45,9,1,11,51,20,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3355,755,6.5,35,50,9,72,26,48,104,27,1,43,29,27,96
3356,1118,6.0,35,50,15,31,23,16,54,43,0,54,14,140,58
3357,55,6.9,35,50,22,73,9,58,87,23,0,48,91,44,104
3358,1210,8.0,35,50,6,64,30,57,78,15,1,28,57,23,129


In [37]:
# check NaN values: one boolean per remaining column, True when that column
# holds at least one missing entry
print(df_pca1.isna().any())

Brewery Key             False
ABV                     False
Min IBU                 False
Max IBU                 False
Astringency_customer    False
Body_customer           False
Alcohol_customer        False
Bitter_customer         False
Sweet_customer          False
Sour_customer           False
Salty_customer          False
Fruits_customer         False
Hoppy_customer          False
Spices_customer         False
Malty_customer          False
dtype: bool


In [38]:
# Customer-review tasting-profile columns, one per tasting note, in the
# order they appear in the table.
tasting_profile_feat = [
    'Astringency_customer',
    'Body_customer',
    'Alcohol_customer',
    'Bitter_customer',
    'Sweet_customer',
    'Sour_customer',
    'Salty_customer',
    'Fruits_customer',
    'Hoppy_customer',
    'Spices_customer',
    'Malty_customer',
]

In [39]:
# scale review columns
#
# The scaling input is taken from the untouched counts in `df`, not from
# `df_pca1`, because this cell writes its result back into `df_pca1`. Reading
# from `df` means the result no longer depends on how many times the cell has
# been run. `df_pca1` was derived from `df` by dropping columns and resetting
# the index, so the rows correspond by position; the assert guards that.
assert len(df) == len(df_pca1), "df and df_pca1 must have the same rows"

# Scale values by row: MinMaxScaler scales column-wise, so transpose, scale,
# and transpose back. Working on the bare array sidesteps df's original
# (non-contiguous) index.
row_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(df[tasting_profile_feat].to_numpy().T).T,
    columns=tasting_profile_feat,
)

# Scale values by column; `scaler` is left fitted on this pass, as before.
scaler = MinMaxScaler()
scaled_profile_feat = pd.DataFrame(
    scaler.fit_transform(row_scaled),
    columns=tasting_profile_feat,
)

# Both frames carry a 0..n-1 index, so this assignment lines up row for row.
df_pca1[tasting_profile_feat] = scaled_profile_feat
df_pca1


Unnamed: 0,Brewery Key,ABV,Min IBU,Max IBU,Astringency_customer,Body_customer,Alcohol_customer,Bitter_customer,Sweet_customer,Sour_customer,Salty_customer,Fruits_customer,Hoppy_customer,Spices_customer,Malty_customer
0,34,5.3,25,50,0.117117,0.288288,0.081081,0.423423,0.666667,0.297297,0.0,0.297297,0.513514,0.072072,1.000000
1,872,7.2,25,50,0.142857,0.678571,0.214286,0.392857,0.654762,0.190476,0.0,0.285714,0.416667,0.142857,1.000000
2,872,5.0,25,50,0.225806,0.596774,0.096774,0.677419,0.693548,0.177419,0.0,0.161290,0.870968,0.064516,1.000000
3,1019,5.3,25,50,0.096330,0.316514,0.045872,0.288991,0.550459,0.064220,0.0,0.087156,0.165138,0.068807,1.000000
4,997,7.2,25,50,0.255319,0.531915,0.265957,0.457447,0.468085,0.085106,0.0,0.106383,0.531915,0.202128,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3355,755,6.5,35,50,0.077670,0.689320,0.242718,0.456311,1.000000,0.252427,0.0,0.407767,0.271845,0.252427,0.922330
3356,1118,6.0,35,50,0.107143,0.221429,0.164286,0.114286,0.385714,0.307143,0.0,0.385714,0.100000,1.000000,0.414286
3357,55,6.9,35,50,0.211538,0.701923,0.086538,0.557692,0.836538,0.221154,0.0,0.461538,0.875000,0.423077,1.000000
3358,1210,8.0,35,50,0.039062,0.492187,0.226562,0.437500,0.601562,0.109375,0.0,0.210937,0.437500,0.171875,1.000000


In [40]:
# list the column labels of the PCA input frame
df_pca1.columns

Index(['Brewery Key', 'ABV', 'Min IBU', 'Max IBU', 'Astringency_customer',
       'Body_customer', 'Alcohol_customer', 'Bitter_customer',
       'Sweet_customer', 'Sour_customer', 'Salty_customer', 'Fruits_customer',
       'Hoppy_customer', 'Spices_customer', 'Malty_customer'],
      dtype='object')

In [42]:
# Prepare the PCA input: pick the numeric feature columns, keep the brewery
# key aside, and standardise the features.
features = [
    'ABV', 'Min IBU', 'Max IBU',
    'Astringency_customer', 'Body_customer', 'Alcohol_customer',
    'Bitter_customer', 'Sweet_customer', 'Sour_customer',
    'Salty_customer', 'Fruits_customer', 'Hoppy_customer',
    'Spices_customer', 'Malty_customer',
]

# Brewery key as a single-column 2-D array (the "target" kept out of the PCA).
y = df_pca1[['Brewery Key']].to_numpy()

# Feature matrix passed through StandardScaler so every column is on a
# comparable scale before PCA.
x = StandardScaler().fit_transform(df_pca1[features].to_numpy())

In [61]:
# Project the standardised feature matrix onto its leading principal components.
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(x)

# Label the columns pc1..pcN from the actual width of the projection, so the
# labels stay in step with n_components instead of being a second hard-coded
# list that must be edited alongside it.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=[f"pc{i}" for i in range(1, principalComponents.shape[1] + 1)],
)

In [62]:
# Report, in order: the variance captured by each component, each component's
# share of the total variance, and the total share retained by the kept
# components.
variance_share = pca.explained_variance_ratio_
print(pca.explained_variance_)
print(variance_share)
print(variance_share.sum())

[3.57791342 2.5120501  2.30227557 1.08434252]
[0.25548918 0.17937875 0.16439931 0.07742999]
0.6766972289777688


In [63]:
# Put the brewery key next to each beer's component scores; both frames
# are concatenated side by side on their shared row index.
finalDf = pd.concat(
    [principalDf, df_pca1.loc[:, ['Brewery Key']]],
    axis="columns",
)
finalDf

Unnamed: 0,pc1,pc2,pc3,pc4,Brewery Key
0,0.335760,-0.655851,-0.134102,0.416318,34
1,1.445154,-0.186764,-0.541448,-0.179001,872
2,1.032439,-2.066163,0.134862,-0.581564,872
3,0.607861,-0.486779,-1.234647,1.160585,1019
4,1.248683,-0.879741,-0.437707,-0.148275,997
...,...,...,...,...,...
3355,1.876987,0.631703,-0.217824,-0.516155,755
3356,-0.236110,1.574992,-0.583572,0.296602,1118
3357,1.501368,-0.894881,0.505560,-1.299195,55
3358,1.875102,-0.083900,-0.279224,0.576196,1210


In [64]:
# clustering analysis

#### Part 2: PCA - grouping beer styles based on notes