# Whiskey Name Normalisation and Clustering

This notebook aims to identify the same whiskey across multiple reviews, regardless of whether the exact same name was used.  In the GenAI parsing I did some name metadata extraction, and we are going to leverage that along with hierarchical clustering to create a unique set of whiskies contained in the review

In [1]:
import json
import os
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import certifi
from sympy.codegen.ast import continue_

uri = f"mongodb+srv://{os.getenv('whiskeydb_admin')}:{os.getenv('whiskeydb_pwd')}@whiskeyrecommender.mvfds.mongodb.net/?retryWrites=true&w=majority&appName=WhiskeyRecommender"
ca = certifi.where()

# Create a new client and connect to the server
client = MongoClient(uri,
    server_api=ServerApi('1'),
    tls=True,
    tlsAllowInvalidCertificates=False,
    tlsCAFile=ca)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [2]:
from thefuzz import fuzz
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import numpy as np

In [3]:
reddit_reviews = client.reddit_reviews
parsed_docs = reddit_reviews['parsed_reviews']

whiskey_docs = [x for x in parsed_docs.find()]

In [4]:
names_to_cluster = set([x['whiskey_name'] for x in whiskey_docs])

In [5]:
len(names_to_cluster)

9849

In [6]:
names_to_cluster

{'Madeira Cask Finish',
 'Stones of Stenness 13 Single Cask Nation',
 'Longmorn 22 1992 TWE Selection',
 'Allt-a-Bhainne 17',
 'Glen Broch 10',
 'Four Grain Straight Bourbon',
 'Speyside 44',
 'A Midwinter Nights Dram Act 4.2',
 'Dalmore 19 1990 Signatory',
 '15 Signatory First Fill Sherry',
 'Benrinnes 14',
 "Nectar D'Or 12",
 'Stroma Liqueur',
 'Single Cask #4668 Peated/Port Pipe',
 "Queen's Own Cask",
 'Port Cask Matured',
 'Small Batch Reserve',
 'Glenburgie 15 Signatory',
 'Dant Bottled In Bond',
 "Dewar's 12",
 'Pike Creek 10 Port Finish',
 'Eddu Gold',
 'Rhetoric 20',
 'Smokey Joe',
 'Port Charlotte Micro Provenance Cask #0005 - Virgin Oak',
 'The Tayne',
 '10th Anniversary (2016) Machir Bay CS',
 '2004 Vintage Bourbon',
 'Balvenie 14 Roasted Malt',
 'Cask Sample',
 'Solist Oh! So Sherried for Canada',
 'Storm',
 'Single Barrel',
 'Dragon Legend',
 'Glen Elgin 16 SMWS 85.42',
 'SMWS 36.11',
 'GlenDronach 14 Virgin Oak',
 'Octomore 4.2 Comus',
 'Cumberland Cask Experimental Cask 

In [7]:
def string_distance(u, v):
    similarity = fuzz.token_sort_ratio(u, v)
    return (100 - similarity)/100 # Convert similarity (0-100) to distance (100-0)

In [8]:
names_array = np.array(list(names_to_cluster)).reshape(-1,1)

# Create a condensed distance matrix using pdist
distance_matrix = pdist(names_array, lambda x,y: string_distance(x[0],y[0]))

In [9]:
linkage_matrix = linkage(distance_matrix, method="ward")

In [10]:
clusters = fcluster(linkage_matrix, t=0.15, criterion="distance")

In [11]:
whiskey_df = pd.DataFrame(list(zip(names_to_cluster, clusters)), columns=["name", "cluster"])

In [12]:
whiskey_df.cluster.value_counts()

cluster
4682    25
3792    15
3432    15
2686    12
3309    12
        ..
7104     1
3464     1
697      1
5054     1
258      1
Name: count, Length: 7163, dtype: int64

In [13]:
whiskey_df.query('cluster == 2487')

Unnamed: 0,name,cluster
5764,1+11,2487


In [14]:
whiskey_df.sort_values(by='name')

Unnamed: 0,name,cluster
9711,.36 Texas Bourbon,878
6607,1 of 3,3445
5764,1+11,2487
8410,10,2515
2615,10 Bit House Saloon,222
...,...,...
8296,barrel 3493 cask strength bourbon,4276
2422,custom blend,1412
9648,ex-Bourbon Oak,895
9789,rī1,2513


In [15]:
plt.figure(figsize=(10, 5))
dendrogram(
    linkage_matrix,
    labels=strings,
    leaf_rotation=45,
    leaf_font_size=12
)
plt.title("Hierarchical Clustering of Strings")
plt.xlabel("String")
plt.ylabel("Distance")
plt.show()

NameError: name 'plt' is not defined

In [16]:
all_review_df = pd.DataFrame(whiskey_docs)

In [17]:
all_review_df

Unnamed: 0,_id,year,distillery,disillery_region,whiskey_country_of_origin,whiskey_name,is_blend,age,whiskey_type,nose_tags,palette_tags,finish_tags,uuid,rating,user
0,678ee1a072bc0c28adb7679a,2012,Unknown,Blend,Thailand,100 Pipers,True,,Blend,"[alcohol, leather, sweetness]","[bland, woody]","[short, alcohol]",1497ef54838b47d7a34f4085720a33da,68,merlinblack
1,678ee1a072bc0c28adb7679b,2012,100 Pipers,Blend,Thailand,100 Pipers,True,,Blend,"[alcohol, leather, sweetness]","[bland, woody]","[short, alcohol]",1497ef54838b47d7a34f4085720a33da,68,merlinblack
2,678ee1a072bc0c28adb7679c,2018,1792,Kentucky,USA,225th Anniversary,False,NAS but the bottle says 'nearly a decade',Bourbon,"[sweet brown sugar, dark cherries, baking spice]","[cherry pie, excellent mouthfeel]","[shorter, hotter, medium length, medium heat]",5bfcb7bf5f454473b2042da21a0055a8,80,scottmotorrad
3,678ee1a072bc0c28adb7679d,2017,Barton,Kentucky,USA,1792 225th Anniversary,False,10,Bourbon,"[acetone, oaky wood, dried herbs, cigar leaf, ...","[wood, herbal flavors, spice, baking spices, o...","[warming, short, wood, vanilla, dried herbs, t...",29ec172373944280ad2c2d0158db2928,80,WildOscar66
4,678ee1a072bc0c28adb7679e,2018,1792,Kentucky,United States,Bottled In Bond,False,NAS but at least 4,Bourbon,"[Rich caramel, oak, hint of heat]","[oak forward, caramel, spice]","[Medium length, sweet, oaky]",bffcc71c3ad64363a899fcac1be0cb46,80,scottmotorrad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31030,678ee72172bc0c28adb7e0d0,1997,Linkwood,Speyside,Scotland,Linkwood 18y c1997 TEM,False,18,Scotch,"[Shy, Sweet grapes, Strawberries ‘n’ cream oat...","[Hershey’s strawberry syrup, Thick, Fruity swe...","[Kind of short, One dimensional]",36c8c8e3ca1d497fad4eeb53a4db6173,84,xreekinghavocx
31031,678ee72172bc0c28adb7e0d1,2017,Cadenhead,Campbeltown,Scotland,Campbeltown Living Cask 29/12/17,True,2 years,Scotch,"[smoke, crisp malt, lemon, grape, honey, dates]","[sweet grapes, savory cocoa-tobacco notes, car...","[pizza crust, caramel]",7d48731dc1bc4c56a39f8ff4d7612a53,90,xreekinghavocx
31032,678ee72272bc0c28adb7e0d2,2019,Barrell,Canada,Canada,Single Barrel Rye 13 Year Canadian (Alberta) 6...,False,13,Rye,"[cool rye spice, butterscotch, honey, orange z...","[hot, very sweet, dry oak, ginger beer, Juicy ...","[warm, sweet, close to cloying]",3005814542914bbf8895a1d8a597d530,83,xreekinghavocx
31033,678ee72272bc0c28adb7e0d3,2019,Bruichladdich,Islay,Scotland,Elements of Islay Br7,False,15,Scotch,"[stewed plums, raisins, coffee, toasted coconut]","[molasses, cocoa powder, coffee, lemon]","[tonic water, cocoa powder]",52bf3e6154f4464685a8a15117a5ad4f,89,xreekinghavocx


In [18]:
all_review_df.shape

(31035, 15)

In [19]:
whiskey_reviews_wclusters = pd.merge(all_review_df, whiskey_df, how='left', left_on='whiskey_name', right_on='name')

In [20]:
whiskey_reviews_wclusters

Unnamed: 0,_id,year,distillery,disillery_region,whiskey_country_of_origin,whiskey_name,is_blend,age,whiskey_type,nose_tags,palette_tags,finish_tags,uuid,rating,user,name,cluster
0,678ee1a072bc0c28adb7679a,2012,Unknown,Blend,Thailand,100 Pipers,True,,Blend,"[alcohol, leather, sweetness]","[bland, woody]","[short, alcohol]",1497ef54838b47d7a34f4085720a33da,68,merlinblack,100 Pipers,715
1,678ee1a072bc0c28adb7679b,2012,100 Pipers,Blend,Thailand,100 Pipers,True,,Blend,"[alcohol, leather, sweetness]","[bland, woody]","[short, alcohol]",1497ef54838b47d7a34f4085720a33da,68,merlinblack,100 Pipers,715
2,678ee1a072bc0c28adb7679c,2018,1792,Kentucky,USA,225th Anniversary,False,NAS but the bottle says 'nearly a decade',Bourbon,"[sweet brown sugar, dark cherries, baking spice]","[cherry pie, excellent mouthfeel]","[shorter, hotter, medium length, medium heat]",5bfcb7bf5f454473b2042da21a0055a8,80,scottmotorrad,225th Anniversary,5230
3,678ee1a072bc0c28adb7679d,2017,Barton,Kentucky,USA,1792 225th Anniversary,False,10,Bourbon,"[acetone, oaky wood, dried herbs, cigar leaf, ...","[wood, herbal flavors, spice, baking spices, o...","[warming, short, wood, vanilla, dried herbs, t...",29ec172373944280ad2c2d0158db2928,80,WildOscar66,1792 225th Anniversary,5231
4,678ee1a072bc0c28adb7679e,2018,1792,Kentucky,United States,Bottled In Bond,False,NAS but at least 4,Bourbon,"[Rich caramel, oak, hint of heat]","[oak forward, caramel, spice]","[Medium length, sweet, oaky]",bffcc71c3ad64363a899fcac1be0cb46,80,scottmotorrad,Bottled In Bond,1322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31030,678ee72172bc0c28adb7e0d0,1997,Linkwood,Speyside,Scotland,Linkwood 18y c1997 TEM,False,18,Scotch,"[Shy, Sweet grapes, Strawberries ‘n’ cream oat...","[Hershey’s strawberry syrup, Thick, Fruity swe...","[Kind of short, One dimensional]",36c8c8e3ca1d497fad4eeb53a4db6173,84,xreekinghavocx,Linkwood 18y c1997 TEM,1933
31031,678ee72172bc0c28adb7e0d1,2017,Cadenhead,Campbeltown,Scotland,Campbeltown Living Cask 29/12/17,True,2 years,Scotch,"[smoke, crisp malt, lemon, grape, honey, dates]","[sweet grapes, savory cocoa-tobacco notes, car...","[pizza crust, caramel]",7d48731dc1bc4c56a39f8ff4d7612a53,90,xreekinghavocx,Campbeltown Living Cask 29/12/17,1519
31032,678ee72272bc0c28adb7e0d2,2019,Barrell,Canada,Canada,Single Barrel Rye 13 Year Canadian (Alberta) 6...,False,13,Rye,"[cool rye spice, butterscotch, honey, orange z...","[hot, very sweet, dry oak, ginger beer, Juicy ...","[warm, sweet, close to cloying]",3005814542914bbf8895a1d8a597d530,83,xreekinghavocx,Single Barrel Rye 13 Year Canadian (Alberta) 6...,4066
31033,678ee72272bc0c28adb7e0d3,2019,Bruichladdich,Islay,Scotland,Elements of Islay Br7,False,15,Scotch,"[stewed plums, raisins, coffee, toasted coconut]","[molasses, cocoa powder, coffee, lemon]","[tonic water, cocoa powder]",52bf3e6154f4464685a8a15117a5ad4f,89,xreekinghavocx,Elements of Islay Br7,1338


In [21]:
whiskey_reviews_wclusters['aged_clusters'] = whiskey_reviews_wclusters.groupby(['age', 'cluster']).ngroup()

In [22]:
whiskey_reviews_wclusters

Unnamed: 0,_id,year,distillery,disillery_region,whiskey_country_of_origin,whiskey_name,is_blend,age,whiskey_type,nose_tags,palette_tags,finish_tags,uuid,rating,user,name,cluster,aged_clusters
0,678ee1a072bc0c28adb7679a,2012,Unknown,Blend,Thailand,100 Pipers,True,,Blend,"[alcohol, leather, sweetness]","[bland, woody]","[short, alcohol]",1497ef54838b47d7a34f4085720a33da,68,merlinblack,100 Pipers,715,8772
1,678ee1a072bc0c28adb7679b,2012,100 Pipers,Blend,Thailand,100 Pipers,True,,Blend,"[alcohol, leather, sweetness]","[bland, woody]","[short, alcohol]",1497ef54838b47d7a34f4085720a33da,68,merlinblack,100 Pipers,715,8772
2,678ee1a072bc0c28adb7679c,2018,1792,Kentucky,USA,225th Anniversary,False,NAS but the bottle says 'nearly a decade',Bourbon,"[sweet brown sugar, dark cherries, baking spice]","[cherry pie, excellent mouthfeel]","[shorter, hotter, medium length, medium heat]",5bfcb7bf5f454473b2042da21a0055a8,80,scottmotorrad,225th Anniversary,5230,11165
3,678ee1a072bc0c28adb7679d,2017,Barton,Kentucky,USA,1792 225th Anniversary,False,10,Bourbon,"[acetone, oaky wood, dried herbs, cigar leaf, ...","[wood, herbal flavors, spice, baking spices, o...","[warming, short, wood, vanilla, dried herbs, t...",29ec172373944280ad2c2d0158db2928,80,WildOscar66,1792 225th Anniversary,5231,528
4,678ee1a072bc0c28adb7679e,2018,1792,Kentucky,United States,Bottled In Bond,False,NAS but at least 4,Bourbon,"[Rich caramel, oak, hint of heat]","[oak forward, caramel, spice]","[Medium length, sweet, oaky]",bffcc71c3ad64363a899fcac1be0cb46,80,scottmotorrad,Bottled In Bond,1322,11162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31030,678ee72172bc0c28adb7e0d0,1997,Linkwood,Speyside,Scotland,Linkwood 18y c1997 TEM,False,18,Scotch,"[Shy, Sweet grapes, Strawberries ‘n’ cream oat...","[Hershey’s strawberry syrup, Thick, Fruity swe...","[Kind of short, One dimensional]",36c8c8e3ca1d497fad4eeb53a4db6173,84,xreekinghavocx,Linkwood 18y c1997 TEM,1933,3583
31031,678ee72172bc0c28adb7e0d1,2017,Cadenhead,Campbeltown,Scotland,Campbeltown Living Cask 29/12/17,True,2 years,Scotch,"[smoke, crisp malt, lemon, grape, honey, dates]","[sweet grapes, savory cocoa-tobacco notes, car...","[pizza crust, caramel]",7d48731dc1bc4c56a39f8ff4d7612a53,90,xreekinghavocx,Campbeltown Living Cask 29/12/17,1519,4265
31032,678ee72272bc0c28adb7e0d2,2019,Barrell,Canada,Canada,Single Barrel Rye 13 Year Canadian (Alberta) 6...,False,13,Rye,"[cool rye spice, butterscotch, honey, orange z...","[hot, very sweet, dry oak, ginger beer, Juicy ...","[warm, sweet, close to cloying]",3005814542914bbf8895a1d8a597d530,83,xreekinghavocx,Single Barrel Rye 13 Year Canadian (Alberta) 6...,4066,1980
31033,678ee72272bc0c28adb7e0d3,2019,Bruichladdich,Islay,Scotland,Elements of Islay Br7,False,15,Scotch,"[stewed plums, raisins, coffee, toasted coconut]","[molasses, cocoa powder, coffee, lemon]","[tonic water, cocoa powder]",52bf3e6154f4464685a8a15117a5ad4f,89,xreekinghavocx,Elements of Islay Br7,1338,2498


In [23]:
whiskey_reviews_wclusters.aged_clusters.value_counts()

aged_clusters
1562     171
3023     171
146      144
1336     138
305      130
        ... 
3363       1
10298      1
509        1
5164       1
3513       1
Name: count, Length: 12465, dtype: int64

In [37]:
whiskey_reviews_wclusters.query('aged_clusters == 8950')

Unnamed: 0,_id,year,distillery,disillery_region,whiskey_country_of_origin,whiskey_name,is_blend,age,whiskey_type,nose_tags,palette_tags,finish_tags,uuid,rating,user,name,cluster,aged_clusters
19256,678ee50872bc0c28adb7b2d2,2013,Johnnie Walker,Scotland,Scotland,Red Label,True,,Scotch,"[Caramel, Burnt marshmallows, Wheat, Barley, E...","[Toffee, Slight honey, Earthy, Vanilla, Woody,...","[Quick finish, Little sweetness, Very smooth, ...",3e661e5bafbc43faa97809a79db4c90d,86,luckyaussiebob,Red Label,1187,8950
19257,678ee50872bc0c28adb7b2d3,2017,Johnnie Walker,Scotland,Scotland,Red Label,True,,Scotch,[],[],[],3f2a343bdc81401483ecbd37b3172519,52,SPG2469,Red Label,1187,8950
19258,678ee50872bc0c28adb7b2d4,2018,Johnnie Walker,Blend,Scotland,Red Label,True,,Scotch,"[Caramel, faint smoke, sea breeze, plastic]","[Watery, caramel, light smoke, plastic]","[Sweet, drying, bitter, solvent aftertaste]",2d6a68ecc22c45fba454c4ef010af729,65,unbreakablesausage,Red Label,1187,8950
19259,678ee50872bc0c28adb7b2d5,2013,Johnnie Walker,Blend,Scotland,Red Label,True,,Scotch,"[Light fruit, grainy, rubbing alcohol, sweet h...","[Vanilla, green fruits, woody, soapy, astringent]","[Vanilla, grain]",e3be8cda7727415fada3476ad1c36bba,50,airpower47,Red Label,1187,8950
19260,678ee50872bc0c28adb7b2d6,2013,Johnnie Walker,Scotland,Scotland,Red Label,True,,Blend,"[Denatured alcohol, unleaded gasoline, Concent...","[Grass, cheap vodka, Chemicals, corn, shitty m...","[Dry, bitter, molesting]",ce3c23053a0a43e896871bfd0a104add,10,Allumina,Red Label,1187,8950
19261,678ee50872bc0c28adb7b2d7,2014,Johnnie Walker,Scotland,Scotland,Red Label,True,,Blend,"[Weak, Light caramel sweetness, apples, alcohol]","[Sweet cream, Bitter grain alcohol, corn]","[Alcohol burn, smoke, not much else]",7c6c8d98aa3846219dd420edf0955060,50,Allurex,Red Label,1187,8950
19262,678ee50872bc0c28adb7b2d8,2014,Johnnie Walker,Scotland,Scotland,Red Label,True,,Scotch,"[Acetone, Nail Polish, Honey, Peanut Butter]","[Nail Polish, Vanilla, Watery]","[Harsh, Drying, Nail Polish, Funky]",406d2a4464d14bfe962caa47bbb3f860,35,ArdbegYourPardon,Red Label,1187,8950
19263,678ee50872bc0c28adb7b2d9,2013,Johnnie Walker,Blend,Scotland,Red Label,True,,Scotch,"[Light fruit, grainy, rubbing alcohol, sweet h...","[Vanilla, green fruits, woody, soapy, astringent]","[Vanilla, grain]",375dec20d38e415ab0fadcea6419cc92,45,ayedfy,Red Label,1187,8950
19264,678ee50872bc0c28adb7b2da,2014,Johnnie Walker,Blend,Scotland,Red Label,True,,Scotch,"[Peat, Smoke, Diesel, Artificial sweetener, Mo...","[Alcohol, Rough, Moldy]",[Sickly sweet cardboard],30b0a9fb74f348cfb7167717bd807506,40,barbecue_scotch,Red Label,1187,8950
19265,678ee50872bc0c28adb7b2db,2016,Johnnie Walker,Blend,Scotland,Red Label,True,,Scotch,[],[],[],ddcc8c41a58e4b98a17138fdc2a3d7df,65,bbeng89,Red Label,1187,8950


In [25]:
whiskey_reviews_wclusters.aged_clusters.nunique()

12465

In [38]:
tasting_notes = whiskey_reviews_wclusters.query('aged_clusters == 8950')['nose_tags']

In [39]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from collections import Counter
from scipy.spatial.distance import cdist


In [106]:
def get_representative_word(cluster_words, model):
    """Selects the most central word in a cluster based on cosine similarity."""
    words, _ = zip(*cluster_words)
    word_embeddings = np.array([model.encode(word) for word in words])
    centroid = np.mean(word_embeddings, axis=0)
    distances = cdist([centroid], word_embeddings, metric="cosine")[0]
    return words[np.argmin(distances)]  # Select the word closest to centroid


def generate_tasting_themes(tasting_notes=None, threshold=0.45, model=None):
    '''
    Function takes a list of tasting note keywords, embeds them together, clusters and aggregates, and selects up to the 10 most frequent terms.
    '''

    #print('Running Code')
    all_keywords = []
    for note in tasting_notes:
        all_keywords.extend(note)

    # Remove duplicates and keep unique keywords
    unique_keywords = list(set(all_keywords))

    if len(unique_keywords) == 0:
        return []
    elif len(unique_keywords) <= 10:
        return unique_keywords
    else:
        #Create Embeddings
        embeddings = model.encode(unique_keywords)

        # DBSCAN
        dbscan = DBSCAN(eps=threshold, min_samples=2, metric="cosine")  # Adjust `eps` as needed
        labels = dbscan.fit_predict(embeddings)

        cluster_map = {unique_keywords[i]: labels[i] for i in range(len(unique_keywords))}
        keyword_counts = Counter(all_keywords)
        # Group keywords by clusters
        cluster_groups = {}
        defaults = []
        # Need to figure out what happens here if DBSCAN doesn't have clusters
        for keyword, cluster in cluster_map.items():
            if cluster == -1:  # Ignore noise points
                defaults.append(keyword)
            if cluster not in cluster_groups:
                cluster_groups[cluster] = []
            cluster_groups[cluster].append((keyword, keyword_counts[keyword]))

        # Step 5: Select Top 10 Densest Clusters
        sorted_clusters = sorted(cluster_groups.items(), key=lambda x: len(x[1]), reverse=True)[:10]

        top_themes = [get_representative_word(words, model) for _, words in sorted_clusters]

        if (len(top_themes) < 10) and (len(defaults) > 0):
            return top_themes + defaults[0:(11 - len(top_themes))]
        return top_themes



In [73]:
embedding_model=SentenceTransformer('all-MiniLM-L6-v2')

nose_notes = whiskey_reviews_wclusters.query('aged_clusters == 8950')['nose_tags']
palette_notes = whiskey_reviews_wclusters.query('aged_clusters == 8950')['palette_tags']
finish_notes = whiskey_reviews_wclusters.query('aged_clusters == 8950')['finish_tags']

In [74]:
nose_themes = generate_tasting_themes(nose_notes, threshold=0.45, model=embedding_model)
print(nose_themes)

Running Code
['sweetness', 'alcohol', 'grass', 'Spice cake', 'vanilla', 'Earthy', 'bubblegum', 'Peat', 'plastic', 'acetone']


In [75]:
palette_themes = generate_tasting_themes(palette_notes, threshold=0.45, model=embedding_model)
print(palette_themes)

Running Code
['sweet grain', 'Wood', 'white pepper', 'peat smoke', 'Grass', 'licorice', 'woody', 'Chemicals', 'Good steak', 'vanilla']


In [77]:
finish_themes = generate_tasting_themes(finish_notes, threshold=0.45, model=embedding_model)
print(finish_themes)

Running Code
['sweetness', 'Alcohol burn', 'Spice', 'lingering bitterness', 'Drying', 'Oily but very slick', 'Smoke', 'Sickly sweet cardboard', 'Medicinal', 'Slightly metallic']


### Now We Need to Apply This to Every Whiskey

In [None]:
from pydantic import BaseModel, Field
whiskeys_by_id = {}

### Pydantic Object
class WhiskeyBase(BaseModel):
    whiskey_name: str = Field(..., description="The name of the whiskey without distiller or age")
    nose_tags: list[str] = Field(..., description='Keywords describing the nose/smell of the whiskey from the provided review')
    palette_tags: list[str] = Field(..., description='Keywords describing the nose/smell of the whiskey from the provided review')
    finish_tags: list[str] = Field(..., description="Keywords describing the finish of the whiskey")

for idx, grp in whiskey_reviews_wclusters.head(1000).groupby('aged_clusters'):

    nose_themes = generate_tasting_themes(grp.nose_tags, threshold=0.45, model=embedding_model)
    palette_themes = generate_tasting_themes(grp.palette_tags, threshold=0.45, model=embedding_model)
    finish_themes = generate_tasting_themes(grp.finish_tags, threshold=0.45, model=embedding_model)

    whiskey = WhiskeyBase(whiskey_name=str(idx), nose_tags=nose_themes, palette_tags=palette_themes, finish_tags=finish_themes)

    whiskeys_by_id[idx] = whiskey

In [108]:
pd.DataFrame(whiskeys_by_id).T

Unnamed: 0,0,1,2,3
121,"(whiskey_name, 121)","(nose_tags, [Oak, Caramel, Brown Sugar, Strong...","(palette_tags, [Thick, black pepper, hint of b...","(finish_tags, [Long, black pepper, Rye spice, ..."
528,"(whiskey_name, 528)","(nose_tags, [dried herbs, cigar leaf, oaky woo...","(palette_tags, [herbal flavors, oaky vanilla, ...","(finish_tags, [dried herbs, tobacco, warming, ..."
813,"(whiskey_name, 813)","(nose_tags, [rye spice, vanilla, berries, hone...","(palette_tags, [burnt sugar, vanilla, thick, o...","(finish_tags, [rye spice, oak, creamy vanilla,..."
4242,"(whiskey_name, 4242)","(nose_tags, [muted, orange peel, raspberries, ...","(palette_tags, [black pepper, tannic, mild ent...","(finish_tags, [tart raspberries, mint, black p..."
4278,"(whiskey_name, 4278)","(nose_tags, [Red grapes, Brioche, Strawberry j...","(palette_tags, [Black tea, Berry flavors, Swee...","(finish_tags, [Artificial strawberry, A little..."
6669,"(whiskey_name, 6669)","(nose_tags, [orange peel, brown sugar, toffee,...","(palette_tags, [cloves, spicy, young oak, toba...","(finish_tags, [rye notes, rich mouth-feel, int..."
6671,"(whiskey_name, 6671)","(nose_tags, [maple syrup, perfume notes, whipp...","(palette_tags, [bitter, floral, apple juice, r...","(finish_tags, [young spirit, Bitter, rye forwa..."
6682,"(whiskey_name, 6682)","(nose_tags, [Acetone, Bananas])","(palette_tags, [Coffee, Acetone, Oak, Caramel,...","(finish_tags, [Bitter, Harsh, Thin body, Unple..."
6697,"(whiskey_name, 6697)","(nose_tags, [Cherries, acetone, herbal, vanill...","(palette_tags, [heavy toffee, granny smith app...","(finish_tags, [red-wine notes, Tannic, sour])"
7185,"(whiskey_name, 7185)","(nose_tags, [cherry, grape hard candies, maple...","(palette_tags, [moderately oily, tannin filled...","(finish_tags, [orchard fruits, burnt spearmint..."


In [100]:
#whiskey_reviews_wclusters