In [1]:
import csv
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

# Widen the DataFrame display so up to 70 columns are shown before truncation.
pd.options.display.max_columns = 70

# Select the Plotly renderer that figures use by default.
import plotly.io as pio
pio.renderers.default = "vscode"


# Load .csv into dataframe

In [5]:
# Read the analysis dataset; the dialect options below spell out standard
# comma-separated, double-quoted CSV parsing explicitly.
df = pd.read_csv(
    'data/for-analysis.csv',
    sep=',',                    # fields are comma-separated
    quotechar='"',              # quoted fields are wrapped in double quotes
    quoting=csv.QUOTE_MINIMAL,  # minimal-quoting convention
    doublequote=True,           # a doubled quote inside a quoted field is one literal quote
    encoding='utf-8',
)

# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,_source.consumerRating,_score,_source.votes,_source.regularPrice,_source.tastingDescription,_source.availableUnits,_source.certificates,_source.name,_source.style,_source.subCategory.description,_source.alcoholPercentage,_source.sweetness,_source.countryName,lat,lon,country_code02,country_code03
count,5548.0,5548.0,5548.0,5548.0,5548,5548.0,5548,5548,460,5548,5548.0,2898.0,5548,5548.0,5548.0,5548,5548
unique,,,,,4684,,7,5068,26,39,,,65,,,65,65
top,,,,,False,,[],FIREBALL,Lager,France Wine,,,France,,,FR,FRA
freq,,,,,278,,5093,7,157,1277,,,1387,,,1387,1387
mean,3.59398,0.498403,75.3823,216.672383,,1454.248198,,,,,18.977648,0.700138,,39.766035,-26.4821,,
std,1.271495,0.136803,103.675391,1549.303348,,2966.984517,,,,,13.192768,2.081728,,26.440664,70.864044,,
min,0.0,0.200588,0.0,1.99,,0.0,,,,,0.0,0.0,,-41.500083,-107.991707,,
25%,3.4,0.362997,7.0,19.99,,67.0,,,,,12.5,0.0,,39.78373,-100.445882,,
50%,3.9,0.595873,41.0,35.49,,574.0,,,,,13.5,0.0,,46.603354,-3.276575,,
75%,4.3,0.601447,107.0,87.99,,1768.25,,,,,17.0,0.0,,54.702354,1.888334,,


# Remove rows where rating, price, alcohol percentage, or country_code03 is NaN

In [6]:
# List the column labels of the loaded frame.
df.columns

Index(['_source.consumerRating', '_score', '_source.votes',
       '_source.regularPrice', '_source.tastingDescription',
       '_source.availableUnits', '_source.certificates', '_source.name',
       '_source.style', '_source.subCategory.description',
       '_source.alcoholPercentage', '_source.sweetness', '_source.countryName',
       'lat', 'lon', 'country_code02', 'country_code03'],
      dtype='object')

In [8]:
# Columns that must hold a value for a row to be kept.
drop_nan_from_col = [
    '_source.consumerRating',
    '_source.regularPrice',
    '_source.alcoholPercentage',
    'country_code03',
]

# Discard every row with a missing value in any of the columns above,
# then re-summarise the remaining data.
df = df.dropna(axis='index', how='any', subset=drop_nan_from_col)
df.describe(include='all')


Unnamed: 0,_source.consumerRating,_score,_source.votes,_source.regularPrice,_source.tastingDescription,_source.availableUnits,_source.certificates,_source.name,_source.style,_source.subCategory.description,_source.alcoholPercentage,_source.sweetness,_source.countryName,lat,lon,country_code02,country_code03
count,5548.0,5548.0,5548.0,5548.0,5548,5548.0,5548,5548,460,5548,5548.0,2898.0,5548,5548.0,5548.0,5548,5548
unique,,,,,4684,,7,5068,26,39,,,65,,,65,65
top,,,,,False,,[],FIREBALL,Lager,France Wine,,,France,,,FR,FRA
freq,,,,,278,,5093,7,157,1277,,,1387,,,1387,1387
mean,3.59398,0.498403,75.3823,216.672383,,1454.248198,,,,,18.977648,0.700138,,39.766035,-26.4821,,
std,1.271495,0.136803,103.675391,1549.303348,,2966.984517,,,,,13.192768,2.081728,,26.440664,70.864044,,
min,0.0,0.200588,0.0,1.99,,0.0,,,,,0.0,0.0,,-41.500083,-107.991707,,
25%,3.4,0.362997,7.0,19.99,,67.0,,,,,12.5,0.0,,39.78373,-100.445882,,
50%,3.9,0.595873,41.0,35.49,,574.0,,,,,13.5,0.0,,46.603354,-3.276575,,
75%,4.3,0.601447,107.0,87.99,,1768.25,,,,,17.0,0.0,,54.702354,1.888334,,


In [14]:
# Show the distinct values of the beer-style and drink sub-category columns.
print(
    f"Types of beer '_source.style' = {df['_source.style'].unique()}",
    f"Types of drink '_source.subCategory.description' = {df['_source.subCategory.description'].unique()}",
    sep="\n",
)

Types of beer '_source.style' = [nan 'California Common Or Steam' 'India Pale Ale' 'Pale Ale' 'Wheat'
 'Pilsner' 'Lager' 'Dark Lager' 'Other' 'Brown Ale' 'Variety Pack'
 'Strong Ale' 'Porter' 'Stout' 'Belgian Ale' 'Blonde Ale' 'Sour'
 'Gluten-free' 'Bock' 'Amber Ale' 'Cream Ale' 'Altbier' 'Red Ale'
 'Fruit Ale' 'Scotch Ale' 'Bitter' 'Radler']
Types of beer '_source.subCategory.description' = ['Italy Wine' 'France Wine' 'Spain Wine' 'Portugal Wine' 'Fortified Wine'
 'Greece Wine' 'Argentina Wine' 'USA Wine' 'Sake' 'Australia Wine'
 'Canada - Other' 'Canada - BC' 'Other Country Wine' 'Germany Wine'
 'South Africa Wine' 'De-Alcoholized Product' 'Austria Wine' 'Chile Wine'
 'China Wine' 'Israel Wine' 'New Zealand Wine' 'Other Style Wine'
 'Hungary Wine' 'Georgia Wine' 'Bulgaria Wine' 'Domestic - BC Beer'
 'Domestic - Other Province Beer' 'Import Beer' 'Asian Spirits' 'Whisky'
 'Tequila' 'Liqueurs' 'Other Spirits' 'Grape and Fruit Brandy' 'Vodka'
 'Rum' 'Gin' 'Coolers' 'Cider']


## Use clustering to reduce the number of wine types. 

There are so many wine categories that I will need to reduce their number.

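One 'easy' and obvious way is to cluster based on geographic location. However, wine can be so diverse that this may not be an optimal way of clustering wine.
I do not know in advance how many clusters the wines fall into, so I will use **Hierarchical Clustering**, which does not need the number of clusters fixed up front: it builds a tree of nested clusters that can then be cut (for example at a distance threshold) to 'work out' how many clusters to partition the data into.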
There are two types of **Hierarchical Clustering**:
1. Agglomerative Clustering (Bottom-up): Starts with each data point as its own cluster and merges them step-by-step.
2. Divisive Clustering (Top-down): Starts with one cluster containing all points and recursively splits them.

scikit-learn implements agglomerative clustering (`sklearn.cluster.AgglomerativeClustering`), so I will be using that.


In [15]:
# Keep only rows whose sub-category mentions "wine" (case-insensitive);
# missing sub-categories are treated as non-matches.
# NOTE(review): sub-categories without the word "wine" in their label
# (e.g. 'Canada - BC', 'Canada - Other', 'Sake') are excluded by this
# substring match — confirm that is intended.
wine_mask = df['_source.subCategory.description'].str.contains('wine', case=False, na=False)
df_wine = df.loc[wine_mask]

print(f"Types of drink '_source.subCategory.description' = {df_wine['_source.subCategory.description'].unique()}")


Types of drink '_source.subCategory.description' = ['Italy Wine' 'France Wine' 'Spain Wine' 'Portugal Wine' 'Fortified Wine'
 'Greece Wine' 'Argentina Wine' 'USA Wine' 'Australia Wine'
 'Other Country Wine' 'Germany Wine' 'South Africa Wine' 'Austria Wine'
 'Chile Wine' 'China Wine' 'Israel Wine' 'New Zealand Wine'
 'Other Style Wine' 'Hungary Wine' 'Georgia Wine' 'Bulgaria Wine']
