SentenceTransformers: https://sbert.net/

# Install Unsloth
Unsloth is installed here, at the top of the notebook. It is used later, in the "LLM Generates Explanation" section.

In [None]:
!pip install unsloth[colab-new]

Collecting unsloth[colab-new]
  Downloading unsloth-2025.7.1-py3-none-any.whl.metadata (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.7.1 (from unsloth[colab-new])
  Downloading unsloth_zoo-2025.7.1-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[colab-new])
  Downloading xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth[colab-new])
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tyro (from unsloth[colab-new])
  Downloading tyro-0.9.26-py3-none-any.whl.metadata (12 kB)
Collecting datasets>=3.4.1 (from unsloth[colab-new])
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!

In [None]:
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Check Data Existence


In [None]:
import os
import pandas as pd
import csv

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the merged perfume/review table written at the end of the "Merge Data" section.
combined_df_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df.csv"

# Reuse the cached table when it is on Drive; otherwise only report that the
# raw-data cells below still have to be run.
if not os.path.exists(combined_df_path):
    print("No combined_df, needs to processing raw data")
else:
    print("Loading existing combined_df...\nSkip to Sentence-BERT")
    combined_df = pd.read_csv(combined_df_path)

Loading existing combined_df...
Skip to Sentence-BERT


# Load Data (Without Reviews)

In [None]:
# get perfume data without reviews

from itertools import islice

file_path = '/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/fra_standard.csv'

# Print the first 10 parsed CSV rows as a quick sanity check of the raw file.
# - encoding: the data appears to be UTF-8 (reading it as latin1 yields mojibake
#   such as "Le MÃ¢le"); errors='replace' keeps the preview from failing on any
#   stray invalid byte.
# - newline='': the csv module expects files opened this way so that quoted
#   fields containing line breaks are parsed correctly.
with open(file_path, encoding='utf-8', errors='replace', newline='') as f:
    for line_no, row in enumerate(islice(csv.reader(f), 10), start=1):
        print(f"Line {line_no}: {row}")

Line 1: ['', 'url', 'Perfume', 'Brand', 'Rating Count', 'Top', 'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']
Line 2: ['0', 'https://www.fragrantica.com/perfume/xerjoff/accento-overdose-pride-edition-74630.html', 'Accento Overdose Pride Edition', 'Xerjoff', '201', 'fruity notes, aldehydes, green notes', 'bulgarian rose, egyptian jasmine, lily-of-the-valley', 'eucalyptus, pine', 'rose', 'woody', 'fruity', 'aromatic', 'floral']
Line 3: ['1', 'https://www.fragrantica.com/perfume/jean-paul-gaultier/classique-pride-2024-90394.html', 'Classique Pride 2024', 'Jean Paul Gaultier', '70', 'yuzu, citruses', 'orange blossom, neroli', 'musk, blonde woods', 'citrus', 'white floral', 'sweet', 'fresh', 'musky']
Line 4: ['2', 'https://www.fragrantica.com/perfume/jean-paul-gaultier/classique-pride-2023-81775.html', 'Classique Pride 2023', 'Jean Paul Gaultier', '285', 'blood orange, yuzu', 'neroli, orange blossom', 'musk, white woods', 'citrus', 'white floral

In [None]:
# Load the notes/accords table.
# The file appears to be UTF-8: decoding it as latin1 produced mojibake such as
# "Le MÃ¢le" for "Le Mâle". encoding_errors='replace' substitutes any invalid
# byte instead of aborting the load.
df = pd.read_csv(
    file_path,
    sep=',', # specify correct separator
    encoding='utf-8',
    encoding_errors='replace',
    low_memory=False
)

In [None]:
df.columns

Index(['Unnamed: 0', 'url', 'Perfume', 'Brand', 'Rating Count', 'Top',
       'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3',
       'mainaccord4', 'mainaccord5'],
      dtype='object')

In [None]:
print(f"Shape: {df.shape}") # (rows, columns)

Shape: (24063, 13)


In [None]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral
1,1,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2024,Jean Paul Gaultier,70,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",citrus,white floral,sweet,fresh,musky
2,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky
3,3,https://www.fragrantica.com/perfume/bruno-bana...,Pride Edition Man,Bruno Banani,59,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",fruity,nutty,woody,tropical,
4,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla
5,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green
6,6,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2024,Jean Paul Gaultier,285,"citruses, yuzu","neroli, orange blossom","woodsy notes, musk",white floral,citrus,fresh,soapy,
7,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic
8,8,https://www.fragrantica.com/perfume/ralph-laur...,Ralph Pride Edition,Ralph Lauren,50,"tangerine, green apple, freesia","magnolia, lime (linden blossom), osmanthus","musk, white iris",citrus,fruity,floral,sweet,green
9,9,https://www.fragrantica.com/perfume/we-pink/wa...,Waffle,We Pink,27,"strawberry, almond, raspberry","butter, peach blossom, violet","praline, vanilla, musk",powdery,musky,violet,fruity,fresh


# Load Data (with Reviews)


In [None]:
# get perfume data with reviews

from itertools import islice

file_path = '/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/perfumes_table.csv'

# Print the first 10 parsed CSV rows as a quick sanity check of the raw file.
# - encoding: the data appears to be UTF-8 (reading it as latin1 garbles
#   non-ASCII text, e.g. Cyrillic titles); errors='replace' keeps the preview
#   from failing on any stray invalid byte.
# - newline='': the csv module expects files opened this way so that quoted
#   fields containing line breaks are parsed correctly.
with open(file_path, encoding='utf-8', errors='replace', newline='') as f:
    for line_no, row in enumerate(islice(csv.reader(f), 10), start=1):
        print(f"Line {line_no}: {row}")

Line 1: ['rating', 'notes', 'designer', 'reviews', 'description', 'url', 'title']
Line 2: ['5.0', "['Vanila', 'Madagascar Vanilla', 'Vanilla Absolute']", 'fiorucci perfumes and colognes', '[]', 'Vanilla Scent by Fiorucci is a Amber Vanilla fragrance for women and men. Vanilla Scent was launched in 1961. Top note is Vanila; middle note is Madagascar Vanilla; base note is Vanilla Absolute.', 'https://www.fragrantica.com/perfume/Fiorucci/Vanilla-Scent-34262.html', 'Vanilla Scent Fiorucci for women and men']
Line 3: ['4.22', "['Mint', 'Lavender', 'Nutmeg', 'elemi', 'Cypress', 'Sandalwood', 'Saffron', 'Cypriol Oil or Nagarmotha', 'Amber', 'Patchouli', 'Precious Woods', 'Musk']", 'maiora parfum perfumes and colognes', '[]', 'Shady by Maiora Parfum is a fragrance for women and men. Shady was launched in 2019. The nose behind this fragrance is Antonio Gigli. Top notes are Mint, Lavender, Nutmeg and elemi; middle notes are Cypress, Sandalwood, Saffron and Cypriol Oil or Nagarmotha; base notes a

In [None]:
# Load the table that carries the user reviews.
# The file appears to be UTF-8: decoding it as latin1 garbled non-ASCII text
# (apostrophes, Cyrillic titles). encoding_errors='replace' substitutes any
# invalid byte instead of aborting the load.
df_review = pd.read_csv(
    file_path,
    sep=',',            # specify correct separator
    encoding='utf-8',
    encoding_errors='replace',
    low_memory=False
)

In [None]:
df_review.columns

Index(['rating', 'notes', 'designer', 'reviews', 'description', 'url',
       'title'],
      dtype='object')

In [None]:
print(f"Shape: {df_review.shape}")  # (rows, columns)

Shape: (84144, 7)


In [None]:
df_review.head(10)

Unnamed: 0,rating,notes,designer,reviews,description,url,title
0,5.0,"['Vanila', 'Madagascar Vanilla', 'Vanilla Abso...",fiorucci perfumes and colognes,[],Vanilla Scent by Fiorucci is a Amber Vanilla f...,https://www.fragrantica.com/perfume/Fiorucci/V...,Vanilla Scent Fiorucci for women and men
1,4.22,"['Mint', 'Lavender', 'Nutmeg', 'elemi', 'Cypre...",maiora parfum perfumes and colognes,[],Shady by Maiora Parfum is a fragrance for wome...,https://www.fragrantica.com/perfume/Maiora-Par...,Shady Maiora Parfum for women and men
2,3.81,"['Neroli', 'Vetiver', 'Basil', 'Fig']",guerlain perfumes and colognes,"[""the opening is too sharp that it reminds me ...",Nerolia Vetiver by Guerlain is a Woody Floral ...,https://www.fragrantica.com/perfume/Guerlain/N...,Nerolia Vetiver Guerlain for women and men
3,4.0,"['Orange Blossom', 'Star Anise', 'Pear', 'Rose...",jean paul gaultier perfumes and colognes,['I must ask. What part of the country did thi...,Classique Love Actually by Jean Paul Gaultier ...,https://www.fragrantica.com/perfume/Jean-Paul-...,Classique Love Actually Jean Paul Gaultier for...
4,4.22,"['Petitgrain', 'Orange', 'Bergamot', 'Lemon Ve...",santa maria novella perfumes and colognes,['Beautiful and uplifting. Initially fresh and...,Zagara (Orange Blossom) by Santa Maria Novella...,https://www.fragrantica.com/perfume/Santa-Mari...,Zagara (Orange Blossom) Santa Maria Novella fo...
5,3.66,"['Bamboo', 'Violet', 'Water Hyacinth', 'Kiwi',...",oriflame perfumes and colognes,['DO starts airy and fresh proceeding quickly ...,Divine by Oriflame is a Floral fragrance for w...,https://www.fragrantica.com/perfume/Oriflame/D...,Divine Oriflame for women
6,,"['Lemon', 'Cardamom', 'Lily-of-the-Valley', 'A...",jequiti perfumes and colognes,[],Tiago Abravanel Mania by Jequiti is a Woody Ar...,https://www.fragrantica.com/perfume/Jequiti/Ti...,Tiago Abravanel Mania Jequiti for men
7,3.74,"['Cherry', 'Mandarin Orange', 'Datura', 'Ylang...",panouge perfumes and colognes,['Not sure what it is but this smells like a w...,Datura Amaretti by Panouge is a Amber Floral f...,https://www.fragrantica.com/perfume/Panouge/Da...,Datura Amaretti Panouge for women and men
8,2.33,"['Mandarin Orange', 'Freesia', 'Orange Blossom...",franck olivier perfumes and colognes,['Smells like a multisex version of Intimately...,Baby Boy by Franck Olivier is a Citrus fragran...,https://www.fragrantica.com/perfume/Franck-Oli...,Baby Boy Franck Olivier for men
9,4.33,"['Hawthorn', 'Iris', 'Iris', 'Tuberose', 'Tiar...",dâantolâ parfums perfumes and colognes,[],Theatre â Ð¢ÐµÐ°ÑÑ by DâAntolâ Parfums...,https://www.fragrantica.com/perfume/D-Antol-Pa...,Theatre â Ð¢ÐµÐ°ÑÑ DâAntolâ Parfums fo...


# Merge Data

In [None]:
# Normalise the join key on both tables: trim whitespace, lower-case,
# and drop any trailing "/".
for frame in (df, df_review):
    frame['url'] = frame['url'].str.strip().str.lower().str.rstrip('/')

# Keep only rows that carry review text: no NaN, no blank string, no empty list "[]".
df_review = df_review.dropna(subset=['reviews']).copy()
df_review['reviews'] = df_review['reviews'].astype(str).str.strip()
has_review_text = ~df_review['reviews'].isin(['', '[]'])
df_review = df_review[has_review_text]

# Collapse to one row per URL, concatenating that URL's review strings with commas.
reviews_by_url = df_review.groupby('url')['reviews'].apply(','.join)
agg_reviews = reviews_by_url.reset_index()

# Inner join: only perfumes present in both tables survive.
combined_df = df.merge(agg_reviews, on='url', how='inner')

In [None]:
print(combined_df.columns)
print(combined_df.shape)
combined_df.head()

Index(['Unnamed: 0', 'url', 'Perfume', 'Brand', 'Rating Count', 'Top',
       'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3',
       'mainaccord4', 'mainaccord5', 'reviews'],
      dtype='object')
(21950, 14)


Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but..."


In [None]:
non_null_reviews = combined_df['reviews'].notnull().sum()
print(f"Perfumes with reviews: {non_null_reviews}")

Perfumes with reviews: 21950


In [None]:
print(combined_df['reviews'].iloc[0])

["I thought everybody was hating on this fragrance for the homophobia, turns out its a cash grab where none of the profits go to any lgbtq organisation. don't buy", "Above and beyond super yikes totally gross pink-washing money grab. Next time give 50-100% of profits from the sales to LGBTQ+ orgs, AT THE VERY LEAST, especially since y'all are selling it at that price. Please Xerjoff, spare some change for the gays.", 'Embarrassing pink money cash grab, and during Pride month nonetheless! Xerjoff needs to show us where they\'re donating because simply saying they "support" the LGBTQ community is not enough...', 'FYI: When asked if some LGBTQ+ organization would receive part of earning from this bottle, the PR team of Xerjoff decided to copy paste to everyone a answer without answer, a way to try to save face and also try to confuse you. Gaslighting. But donât get confused, itâs just a money grab.', "The Fragcomm is on fire today because it does appear to me Xerjoff (a high end Itali

In [None]:
# select note columns
note_cols = ['Top', 'Middle', 'Base',
             'mainaccord1', 'mainaccord2', 'mainaccord3',
             'mainaccord4', 'mainaccord5']


In [None]:
import ast
import re


def _reviews_to_text(raw):
    """Turn the stringified review list(s) of one perfume into plain text.

    `raw` is the text of one or more Python list literals (several lists are
    comma-joined when a URL had more than one row). The previous regex only
    removed a leading `["` and a trailing `"]`, so the quotes and separators
    between individual reviews stayed inside the text.

    Returns the individual reviews joined by single spaces. Non-string values
    are returned unchanged. Text that cannot be parsed as a literal falls back
    to the original regex clean-up, so nothing is lost.
    """
    if not isinstance(raw, str):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return re.sub(r'^\[?"|"?\]$', '', raw)

    # "[...],[...]" parses to a tuple of lists; a single "[...]" to one list.
    groups = parsed if isinstance(parsed, tuple) else (parsed,)
    reviews = []
    for group in groups:
        if isinstance(group, (list, tuple)):
            reviews.extend(str(item).strip() for item in group)
        else:
            reviews.append(str(group).strip())
    return ' '.join(text for text in reviews if text)


combined_df['reviews'] = combined_df['reviews'].map(_reviews_to_text)

In [None]:
# construct a column with a full description so that BERT can understand

def _clean_field(value):
    """Return `value` as text, or '' when it is missing (None, NaN or blank)."""
    if isinstance(value, str):
        return value if value.strip() else ''
    if value is None or pd.isna(value):
        return ''
    return str(value)


def build_full_description(row):
    """Build the labelled text that is embedded for one perfume.

    Args:
        row: a mapping-like row (pandas Series or dict) that may contain the
            keys 'Top', 'Middle', 'Base', 'mainaccord1'..'mainaccord5' and
            'reviews'.

    Returns:
        A single string of the form
        "Top Notes: ... . Middle Notes: ... . Base Notes: ... .
        Main Accords: a, b, c. User Review: ...".
        Missing values (None / NaN / blank) are left out instead of being
        rendered as the literal text "nan"; a section with no content is
        omitted entirely. Rows with every field present produce exactly the
        same text as before.
    """
    # Only the accords that are actually present (perfumes may have fewer than five).
    accords = [
        text
        for text in (_clean_field(row.get(f'mainaccord{i}', '')) for i in range(1, 6))
        if text
    ]

    labelled_sections = (
        ('Top Notes', _clean_field(row.get('Top', ''))),
        ('Middle Notes', _clean_field(row.get('Middle', ''))),
        ('Base Notes', _clean_field(row.get('Base', ''))),
        ('Main Accords', ', '.join(accords)),
    )
    parts = [f"{label}: {text}." for label, text in labelled_sections if text]

    reviews = _clean_field(row.get('reviews', ''))
    if reviews:
        # No trailing period: the review text keeps its own punctuation.
        parts.append(f"User Review: {reviews}")

    return ' '.join(parts).strip()

combined_df['full_description'] = combined_df.apply(build_full_description, axis=1)

In [None]:
combined_df.columns

Index(['Unnamed: 0', 'url', 'Perfume', 'Brand', 'Rating Count', 'Top',
       'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3',
       'mainaccord4', 'mainaccord5', 'reviews', 'full_description'],
      dtype='object')

In [None]:
# see the change
combined_df[['Top', 'Middle', 'Base',
    'mainaccord1', 'mainaccord2', 'mainaccord3',
    'mainaccord4', 'mainaccord5', 'full_description']].head(5)

Unnamed: 0,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,full_description
0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"Top Notes: fruity notes, aldehydes, green note..."
1,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"Top Notes: blood orange, yuzu. Middle Notes: n..."
2,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"Top Notes: mint, lavender, cardamom, artemisia..."
3,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,"Top Notes: yuzu, blood orange. Middle Notes: n..."
4,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"Top Notes: cranberry, pink grapefruit, citron...."


In [None]:
print(combined_df['full_description'].iloc[0])

Top Notes: fruity notes, aldehydes, green notes. Middle Notes: bulgarian rose, egyptian jasmine, lily-of-the-valley. Base Notes: eucalyptus, pine. Main Accords: rose, woody, fruity, aromatic, floral. User Review: I thought everybody was hating on this fragrance for the homophobia, turns out its a cash grab where none of the profits go to any lgbtq organisation. don't buy", "Above and beyond super yikes totally gross pink-washing money grab. Next time give 50-100% of profits from the sales to LGBTQ+ orgs, AT THE VERY LEAST, especially since y'all are selling it at that price. Please Xerjoff, spare some change for the gays.", 'Embarrassing pink money cash grab, and during Pride month nonetheless! Xerjoff needs to show us where they\'re donating because simply saying they "support" the LGBTQ community is not enough...', 'FYI: When asked if some LGBTQ+ organization would receive part of earning from this bottle, the PR team of Xerjoff decided to copy paste to everyone a answer without answ

In [None]:
# Save the merged table to Drive so the raw-data loading and merge cells
# do not have to be re-run next time.
combined_df.to_csv("/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df.csv", index=False)

# Sentence-BERT

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 4.1.0
    Uninstalling sentence-transformers-4.1.0:
      Successfully uninstalled sentence-transformers-4.1.0
Successfully installed sentence-transformers-5.0.0


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# save the scent_embeddings for later use.
# it takes some time to build the scent_embeddings every time

import os
import torch

save_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/scent_embeddings.pt"

scent_embeddings = None
if os.path.exists(save_path):
    # map_location: a tensor saved on a GPU runtime must still load on a
    # CPU-only runtime, and must sit on the same device as the query embeddings
    # produced by `model`.
    scent_embeddings = torch.load(save_path, map_location=model.device)
    if len(scent_embeddings) == len(combined_df):
        print("Scent embeddings already exist — loading file.")
    else:
        # Row i of the embeddings is looked up as combined_df.iloc[i] later on,
        # so a cache built from a different table would return wrong perfumes.
        print(
            f"Cached scent embeddings cover {len(scent_embeddings)} rows but "
            f"combined_df has {len(combined_df)} rows — rebuilding."
        )
        scent_embeddings = None

if scent_embeddings is None:
    scent_lists_with_review = combined_df['full_description'].tolist()
    scent_embeddings = model.encode(
        scent_lists_with_review,
        batch_size=32,
        convert_to_tensor=True,
        show_progress_bar=True
    )
    torch.save(scent_embeddings, save_path)
    print("Scent embeddings saved.")

Scent embeddings already exist — loading file.


In [None]:
# this is a perfume recommender by BERT itself.
# there will be a version with llm's explanation below in the LLM section

def recommend_perfumes(user_query, top_k=5):
    """Print the perfumes whose descriptions are most similar to `user_query`.

    Relies on notebook globals: `model` (the SentenceTransformer),
    `scent_embeddings` (one embedding per row of `combined_df`), `combined_df`,
    `util`, `torch` and `pd`.

    NOTE: the LLM section later rebinds the global `model` to the Llama model;
    re-run the SentenceTransformer loading cell before calling this afterwards.

    Args:
        user_query: free-text description of the desired scent.
        top_k: number of perfumes to print; capped at the number of available
            embeddings so an oversized value no longer raises.

    Returns:
        None. Results are printed.
    """
    query_embedding = model.encode(user_query, convert_to_tensor=True)
    # Cached embeddings may have been loaded onto another device than the query.
    corpus_embeddings = scent_embeddings.to(query_embedding.device)
    similarities = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # torch.topk raises when k exceeds the number of candidates.
    k = max(0, min(top_k, similarities.shape[0]))
    top_results = torch.topk(similarities, k=k)

    print(f"\nUser Query: {user_query}\n")
    for score, idx in zip(top_results.values, top_results.indices):
        perfume = combined_df.iloc[idx.item()]  # tensor index -> Python int
        print(f"{perfume['Perfume']} by {perfume['Brand']} (Score: {score.item():.3f})")

        # Skip missing accords so they are not printed as "nan".
        accords = ', '.join(
            str(value)
            for value in (perfume.get(f'mainaccord{i}', '') for i in range(1, 6))
            if not pd.isna(value) and str(value).strip()
        )
        # The review column is left out on purpose - too long to print.
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}. "
            f"Main Accords: {accords}."
        )
        print(f"Notes: {short_desc}\n")

In [None]:
recommend_perfumes("A magical scent in a mystical forest with herbs and secrets")


User Query: A magical scent in a mystical forest with herbs and secrets

Ecstasy by Tiziana Terenzi (Score: 0.662)
Notes: Top Notes: spruce, pine tree, pebbles. Middle Notes: incense, rose, patchouli, violet. Base Notes: soil tincture, amber, woody notes, labdanum, sandalwood, tonka bean. Main Accords: woody, amber, aromatic, balsamic, smoky.

Foreste di Seta by Salvatore Ferragamo (Score: 0.650)
Notes: Top Notes: clary sage. Middle Notes: walnut, silk. Base Notes: madagascar vetiver. Main Accords: aromatic, woody, soft spicy, earthy, nutty.

La Foret Russe by Nimere Parfums (Score: 0.649)
Notes: Top Notes: thyme, sage, herbal notes. Middle Notes: siberian pine, cypress. Base Notes: gurjan balsam, incense, cedar, oakmoss, vetiver, benzoin. Main Accords: woody, aromatic, green, fresh spicy, amber.

Moncler pour Homme by Moncler (Score: 0.644)
Notes: Top Notes: green notes, clary sage. Middle Notes: pine. Base Notes: sandalwood, cedar, vetiver, amber. Main Accords: woody, aromatic, gree

In [None]:
recommend_perfumes("What’s it smell like in the rain, at the end of a hiking trail full of blossoms?")


User Query: What’s it smell like in the rain, at the end of a hiking trail full of blossoms?

Steamed Rainbow by DS&Durga (Score: 0.629)
Notes: Top Notes: blood mandarin, orange, elemi. Middle Notes: grass, almond blossom, cedar. Base Notes: vetiver, violet. Main Accords: citrus, green, woody, aromatic, floral.

French Affair by Ex Nihilo (Score: 0.598)
Notes: Top Notes: litchi, violet leaf, bergamot. Middle Notes: rose oil, atlas cedar, angelica. Base Notes: oakmoss, vetiver, patchouli. Main Accords: woody, earthy, mossy, aromatic, rose.

Monsoon by Dame Perfumery (Score: 0.579)
Notes: Top Notes: orris root, iris flower. Middle Notes: water lily, lily-of-the-valley, jasmine. Base Notes: creosote bush, cedar. Main Accords: iris, woody, powdery, white floral, floral.

La 13Ã¨me Note Femme by Absolument Parfumeur (Score: 0.573)
Notes: Top Notes: sage, pineapple, raspberry, wild strawberry. Middle Notes: mimosa, vanilla, jasmine, violet, rose. Base Notes: white peach, musk, honey, amber.

In [None]:
recommend_perfumes("What fragrance would a wizard wear in a magical world?")


User Query: What fragrance would a wizard wear in a magical world?

LP No. 9 by Penhaligon's (Score: 0.607)
Notes: Top Notes: tarragon, geranium, lavender, bergamot, amalfi lemon. Middle Notes: carnation, rose, jasmine. Base Notes: virginia cedar, cinnamon, amber, patchouli, musk, vanille. Main Accords: aromatic, warm spicy, fresh spicy, floral, woody.

White Diamonds Parfum by Elizabeth Taylor (Score: 0.593)
Notes: Top Notes: aldehydes, white lily, orange, neroli, bergamot. Middle Notes: ylang-ylang, narcissus, tuberose, jasmine, orris root, cinnamon, turkish rose, cloves. Base Notes: sandalwood, amber, oakmoss, musk, patchouli. Main Accords: white floral, woody, yellow floral, warm spicy, citrus.

Baudelaire by Byredo (Score: 0.588)
Notes: Top Notes: juniper berries, pepper, caraway. Middle Notes: leather, incense, hyacinth. Base Notes: patchouli, papyrus, amber. Main Accords: fresh spicy, woody, amber, aromatic, leather.

Eau de Minuit - Midnight Fragrance by Lolita Lempicka (Score

In [None]:
recommend_perfumes("Looking for a bittersweet scent for a farewell party.")


User Query: Looking for a bittersweet scent for a farewell party.

Reverence by Princesse Marina De Bourbon (Score: 0.652)
Notes: Top Notes: bergamot, spicy notes, pepper. Middle Notes: jasmine, plum, fruity notes, rose, tea. Base Notes: musk, sandalwood. Main Accords: fruity, white floral, sweet, citrus, fresh spicy.

Versace Pour Femme by Versace (Score: 0.642)
Notes: Top Notes: lilac, guava, wisteria, black currant, dew drop. Middle Notes: jasmine, lotus, orchid, rhododendron. Base Notes: musk, cashmere wood, bourbon vetiver, atlas cedar. Main Accords: floral, fresh, fruity, white floral, woody.

She by Revlon (Score: 0.638)
Notes: Top Notes: champagne, grapefruit, ginger, mandarin orange. Middle Notes: honeysuckle, rose, cyclamen, cardamom, magnolia. Base Notes: musk, woodsy notes, sandalwood. Main Accords: citrus, warm spicy, champagne, floral, aldehydic.

Tone Indeterminee by Zara (Score: 0.637)
Notes: Top Notes: cinnamon, pomelo. Middle Notes: neroli. Base Notes: tobacco. Main 

# LLM Generates Explanation
This section follows the approach of this notebook:
https://colab.research.google.com/drive/1T5-zKWM_5OD21QHwXHiV9ixTRR7k3iB9?usp=sharing

## Prepare Unsloth Model

In [None]:
# Load the base Llama model and its tokenizer through Unsloth.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: the from_pretrained call below passes its own literal
# model name and does not read this list.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# NOTE(review): this rebinds the global `model`, which until this cell held the
# SentenceTransformer that `recommend_perfumes` calls `.encode` on. After this
# cell runs, the recommender needs the SentenceTransformer reloaded (or a
# distinct variable name) to keep working.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Attach LoRA adapters (rank 16, alpha 16, no dropout) to the listed projection
# modules and rebind `model` to the wrapped model used for fine-tuning below.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed passed to the adapter setup
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


## Data Prep

In [None]:
from datasets import load_dataset
# dataset = load_dataset("json", data_files="perfume_reasoning_dataset.jsonl", split="train")

# Load the OpenOrca dataset
# dataset = load_dataset("Open-Orca/OpenOrca", split="train[:100]") # entire dataset is huge (1m rows), Colab's RAM is not enough
dataset = load_dataset("tatsu-lab/alpaca", split="train") # smaller dataset for Colab's RAM

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
dataset.column_names

['instruction', 'input', 'output', 'text']

In [None]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    """Render a batch of instruction/input/output rows as Llama-3 chat text.

    Args:
        examples: batch mapping with parallel lists under the keys
            "instruction", "input" and "output".

    Returns:
        {"text": [...]} with one formatted user/assistant exchange per row.
    """
    def render(instruction, extra_input, answer):
        # The optional input goes on its own line below the instruction.
        question = instruction.strip()
        if extra_input:
            question = f"{question}\n{extra_input.strip()}"
        return (
            f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|>\n"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n{answer.strip()}<|eot_id|>"
        )

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return { "text": [render(*row) for row in rows] }

In [None]:
# Earlier single-example variant of the formatter, kept for reference only
# (superseded by the batched `formatting_prompts_func` defined above):
# # needs this format when fine-tuning LLaMA3
# def formatting_prompts_func(example):
#     prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{example['input']}<|eot_id|>\n" \
#              f"<|start_header_id|>assistant<|end_header_id|>\n\n{example['output']}<|eot_id|>"
#     return {"text": prompt}

# dataset = dataset.map(formatting_prompts_func)

# Use standardize_sharegpt to convert ShareGPT style datasets into HuggingFace's generic format.
# NOTE(review): the Alpaca dataset loaded above has instruction/input/output/text
# columns rather than ShareGPT-style conversations, so this call is presumably a
# no-op here — confirm, and drop it if so.

from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
# Batched map: replaces the dataset's existing "text" column with the
# Llama-3 formatted user/assistant exchange built by formatting_prompts_func.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# now the item 5 has the HuggingFace's generic format

dataset[5]["instruction"]

'Identify the odd one out.'

In [None]:
dataset[5]["text"]

'<|start_header_id|>user<|end_header_id|>\n\nIdentify the odd one out.\nTwitter, Instagram, Telegram<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\nTelegram<|eot_id|>'

## Finetune Model with LLama3.2-Unsloth

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the "text" column of `dataset`.
# `model`, `tokenizer`, `dataset` and `max_seq_length` come from earlier cells.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 examples per optimizer step on one GPU
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60, # short demo run; use num_train_epochs above for a full pass
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407, # fixed seed for repeatable runs
        output_dir = "outputs",
        report_to = "none",
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# Train only on the assistant responses.
# The two markers below are the user and assistant headers written by
# formatting_prompts_func; the label check a few cells further down shows the
# prompt positions carrying the label -100 after this wrapping.

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# Verifying data format

tokenizer.decode(trainer.train_dataset[4]["input_ids"])

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nDescribe a time when you had to make a difficult decision.<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\nI had to make a difficult decision when I was working as a project manager at a construction company. I was in charge of a project that needed to be completed by a certain date in order to meet the client’s expectations. However, due to unexpected delays, we were not able to meet the deadline and so I had to make a difficult decision. I decided to extend the deadline, but I had to stretch the team’s resources even further and increase the budget. Although it was a risky decision, I ultimately decided to go ahead with it to ensure that the project was completed on time and that the client’s expectations were met. The project was eventually successfully completed and this was seen as a testament to my leadership and decision-making abilities.<|eot_id|>'

In [None]:
# Decode the labels of example 4, substituting the token id of a single space
# for every -100 entry so those positions show up as blanks in the decoded text.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[4]["labels"]])

'                       I had to make a difficult decision when I was working as a project manager at a construction company. I was in charge of a project that needed to be completed by a certain date in order to meet the client’s expectations. However, due to unexpected delays, we were not able to meet the deadline and so I had to make a difficult decision. I decided to extend the deadline, but I had to stretch the team’s resources even further and increase the budget. Although it was a risky decision, I ultimately decided to go ahead with it to ensure that the project was completed on time and that the client’s expectations were met. The project was eventually successfully completed and this was seen as a testament to my leadership and decision-making abilities.<|eot_id|>'

In [None]:
#@title Show current memory stats
# Baseline snapshot taken before trainer.train(); the final-stats cell subtracts
# start_gpu_memory from the value it measures after training.
gpu_stats = torch.cuda.get_device_properties(0)  # properties of CUDA device 0
# Byte counts are divided by 1024 three times and rounded to 3 decimals for display in GB.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
6.699 GB of memory reserved.


In [None]:
# CHECK LEARNING RATE
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 52,002 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
1,1.9486
2,1.9172
3,1.8083
4,1.9016
5,1.5995
6,1.7156
7,1.5889
8,1.6007
9,1.0442
10,1.5781


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
#@title Show final memory and time stats
# Compares torch.cuda.max_memory_reserved() after training with the baseline
# (start_gpu_memory) and the device total (max_memory) from the earlier stats cell,
# and reports the runtime recorded in trainer_stats.metrics.
# NOTE(review): the recorded output shows 0.0 GB "for training", i.e. this value
# equalled the baseline in that run -- confirm the baseline cell ran before training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

137.9159 seconds used for training.
2.3 minutes used for training.
Peak reserved memory = 6.699 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 45.445 %.
Peak reserved memory for training % of max memory = 0.0 %.


## Save the model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Save the fine-tuned model and tokenizer to Google Drive so that a later
# session can reload them from `save_path` instead of retraining
# (training takes a long time).

FastLanguageModel.for_inference(trainer.model) # fix bitsandbytes errors

save_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/llm-model/"
trainer.model.save_pretrained(save_path)
# Last expression of the cell: the tuple of tokenizer file paths it returns is displayed.
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/Colab Notebooks/totallymakescents/llm-model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/totallymakescents/llm-model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/totallymakescents/llm-model/chat_template.jinja',
 '/content/drive/MyDrive/Colab Notebooks/totallymakescents/llm-model/tokenizer.json')

## Run the model

### Model without Clustering

In [None]:
from unsloth.chat_templates import get_chat_template

# Directory the fine-tuned model and tokenizer were saved to in the "Save the model" section.
save_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/llm-model/"

# Reload the saved model and tokenizer; this rebinds the notebook-level
# `model` and `tokenizer` names used by the inference helpers below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = save_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

# Re-apply the "llama-3.1" chat template to the reloaded tokenizer.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
# This is the last executed statement of the cell, so its return value is
# displayed as the cell output.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Optional smoke test (disabled): generate a reply for one hand-written prompt.
# messages = [
#     {
#         "role": "user",
#         "content": "User wants a scent for a graduation ceremony. Scent: citrus, violet, ambergris.",
#     },
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True,
#                          temperature = 1.5, min_p = 0.1)
# tokenizer.batch_decode(outputs)

==((====))==  Unsloth 2025.7.1: Fast Llama patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Unsloth 2025.7.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [None]:
# Next, pass the perfumes retrieved with Sentence-BERT to the LLM
# so that it can explain each recommendation

# input format for LLM
def format_for_explanation(user_query, perfume_row):
    """Build the user chat message asking the LLM to justify one recommendation.

    Args:
        user_query: Free-text request from the user.
        perfume_row: Mapping-like row (e.g. a pandas Series or a dict) that
            provides 'Perfume', 'Brand', 'Top', 'Middle' and 'Base', and
            optionally 'mainaccord1' ... 'mainaccord5'.

    Returns:
        dict with "role" set to "user" and "content" set to the prompt text.
    """
    # Keep only accords that are actually present. A missing column, a NaN
    # (empty CSV cell) or a blank string would otherwise reach the prompt as
    # "nan" or as an empty entry between commas.
    accords = []
    for i in range(1, 6):
        accord = perfume_row.get(f"mainaccord{i}")
        if pd.isna(accord):
            continue
        accord = str(accord)
        if accord.strip():
            accords.append(accord)

    short_desc = (
        f"Top Notes: {perfume_row['Top']}. "
        f"Middle Notes: {perfume_row['Middle']}. "
        f"Base Notes: {perfume_row['Base']}. "
        f"Main Accords: {', '.join(accords)}."
    )
    return {
        "role": "user",
        "content": (
            f"User query: {user_query}\n"
            f"Perfume returned: {perfume_row['Perfume']} by {perfume_row['Brand']}\n"
            f"Notes: {short_desc}\n"
            f"Please explain why this perfume fits the request. Keep the explanation short and complete."
        )
    }

# Load the Sentence-BERT encoder used to embed user queries.
# NOTE(review): SentenceTransformer (and `util`, used by the functions in this
# cell) are not imported here -- presumably imported in an earlier cell; confirm.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the precomputed scent embeddings from Google Drive.
# Presumably row i corresponds to combined_df.iloc[i]: recommend_and_explain
# indexes both with the same position -- verify against the cell that saved them.

import torch
scent_embeddings = torch.load("/content/drive/MyDrive/Colab Notebooks/totallymakescents/scent_embeddings.pt")


def recommend_and_explain(user_query, top_k=1):
    """Print the best-matching perfumes for a query and stream an LLM explanation for each.

    The query is embedded with `sbert_model` and compared against
    `scent_embeddings` by cosine similarity; each of the best `top_k` rows of
    `combined_df` is turned into a chat prompt and passed to `model.generate`.

    Args:
        user_query: Free-text description of the scent the user wants.
        top_k: Number of perfumes to explain. Values larger than the number
            of stored embeddings are capped to that number.

    Returns:
        None. Everything is printed or streamed to stdout.
    """
    # Imported once per call rather than once per loop iteration.
    from transformers import TextStreamer

    query_embedding = sbert_model.encode(user_query, convert_to_tensor=True)
    similarities = util.cos_sim(query_embedding, scent_embeddings)[0]
    # torch.topk raises when k exceeds the number of candidates, so cap it.
    top_results = torch.topk(similarities, k=min(top_k, similarities.shape[0]))

    for score, idx in zip(top_results.values, top_results.indices):
        perfume = combined_df.iloc[idx.item()]
        message = format_for_explanation(user_query, perfume)

        inputs = tokenizer.apply_chat_template(
            [message],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        # Skip accords that are missing (absent column, NaN, blank) so they do
        # not print as "nan" or as empty entries. The review column is left
        # out on purpose: it is too long.
        accords = [
            str(accord)
            for accord in (perfume.get(f"mainaccord{i}") for i in range(1, 6))
            if not pd.isna(accord) and str(accord).strip()
        ]
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}. "
            f"Main Accords: {', '.join(accords)}."
        )
        print(f"\n--- Explanation for: {perfume['Perfume']} by {perfume['Brand']} (Score: {score.item():.3f}) ---")
        print(f"Notes: {short_desc}\n")

        # use a TextStreamer for continuous inference
        text_streamer = TextStreamer(tokenizer, skip_prompt=True)

        _ = model.generate(
            input_ids=inputs,
            # One unpadded prompt, so every position is a real token. Passing
            # the mask explicitly avoids the "attention mask is not set"
            # warning raised when the pad token equals the eos token.
            attention_mask=torch.ones_like(inputs),
            streamer=text_streamer,
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )

In [None]:
user_query = "Looking for a bittersweet scent for a farewell party."
recommend_and_explain(user_query, top_k=3)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Explanation for: Reverence by Princesse Marina De Bourbon (Score: 0.652) ---
Notes: Top Notes: bergamot, spicy notes, pepper. Middle Notes: jasmine, plum, fruity notes, rose, tea. Base Notes: musk, sandalwood. Main Accords: fruity, white floral, sweet, citrus, fresh spicy.

Reverence is a bittersweet perfume with notes that combine spices, flowers, fruits, and tea for an earthy scent. The scent is a good fit for a farewell party as it evokes feelings of bittersweet nostalgia.<|eot_id|>

--- Explanation for: Versace Pour Femme by Versace (Score: 0.642) ---
Notes: Top Notes: lilac, guava, wisteria, black currant, dew drop. Middle Notes: jasmine, lotus, orchid, rhododendron. Base Notes: musk, cashmere wood, bourbon vetiver, atlas cedar. Main Accords: floral, fresh, fruity, white floral, woody.

Versace Pour Femme has a floral and fruity aroma with a mix of fresh and woody notes. Its scent can be fitting for a bittersweet occasion, evoking a sense of nostalgia and melancholy while off

In [None]:
user_query = "What’s it smell like in the rain, at the end of a hiking trail full of blossoms?"
recommend_and_explain(user_query, top_k=3)


--- Explanation for: Steamed Rainbow by DS&Durga (Score: 0.629) ---
Notes: Top Notes: blood mandarin, orange, elemi. Middle Notes: grass, almond blossom, cedar. Base Notes: vetiver, violet. Main Accords: citrus, green, woody, aromatic, floral.

This perfume fits the request because it is fresh and green with hints of woodiness and citrus, which should capture the essence of being in the rain after hiking on an overgrown trail filled with blossoms.<|eot_id|>

--- Explanation for: French Affair by Ex Nihilo (Score: 0.598) ---
Notes: Top Notes: litchi, violet leaf, bergamot. Middle Notes: rose oil, atlas cedar, angelica. Base Notes: oakmoss, vetiver, patchouli. Main Accords: woody, earthy, mossy, aromatic, rose.

The perfume fits the request because of its earthy notes and floral top notes, which evoke the sense of blooming and walking in rain in a lush forest.<|eot_id|>

--- Explanation for: Monsoon by Dame Perfumery (Score: 0.579) ---
Notes: Top Notes: orris root, iris flower. Middle N

### Model with Clustering

In [None]:
# Next, pass the perfumes retrieved with Sentence-BERT to the LLM
# so that it can explain each recommendation

# input format for LLM
def format_for_explanation(user_query, perfume_row):
    """Build the user chat message asking the LLM to justify one recommendation.

    Args:
        user_query: Free-text request from the user.
        perfume_row: Mapping-like row (e.g. a pandas Series or a dict) that
            provides 'Perfume', 'Brand', 'Top', 'Middle' and 'Base', and
            optionally 'mainaccord1' ... 'mainaccord5'.

    Returns:
        dict with "role" set to "user" and "content" set to the prompt text.
    """
    # Keep only accords that are actually present. A missing column, a NaN
    # (empty CSV cell) or a blank string would otherwise reach the prompt as
    # "nan" or as an empty entry between commas.
    accords = []
    for i in range(1, 6):
        accord = perfume_row.get(f"mainaccord{i}")
        if pd.isna(accord):
            continue
        accord = str(accord)
        if accord.strip():
            accords.append(accord)

    short_desc = (
        f"Top Notes: {perfume_row['Top']}. "
        f"Middle Notes: {perfume_row['Middle']}. "
        f"Base Notes: {perfume_row['Base']}. "
        f"Main Accords: {', '.join(accords)}."
    )
    return {
        "role": "user",
        "content": (
            f"User query: {user_query}\n"
            f"Perfume returned: {perfume_row['Perfume']} by {perfume_row['Brand']}\n"
            f"Notes: {short_desc}\n"
            f"Please explain why this perfume fits the request. Keep the explanation short and complete."
        )
    }

# Load the Sentence-BERT encoder used to embed user queries and sub-themes.
# NOTE(review): SentenceTransformer (and `util`, used by the functions in this
# cell) are not imported here -- presumably imported in an earlier cell; confirm.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the precomputed scent embeddings from Google Drive.
# Presumably row i corresponds to combined_df.iloc[i]: recommend_and_explain
# indexes both with the same position -- verify against the cell that saved them.

import torch
scent_embeddings = torch.load("/content/drive/MyDrive/Colab Notebooks/totallymakescents/scent_embeddings.pt")

def extract_subthemes(user_query):
    """Ask the LLM for a few short sub-themes of a query and parse them into a list.

    Args:
        user_query: Free-text request from the user.

    Returns:
        list[str]: cleaned, capitalized sub-themes in the order the model
        produced them. May be empty if the model returns nothing usable.
    """
    import re

    message = {
        "role": "user",
        "content": (
            f"User query: \"{user_query}\"\n"
            f"Please list 3-5 short sub-themes or impressions from this query. Keep them very brief."
        )
    }

    input_ids = tokenizer.apply_chat_template(
        [message],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        input_ids=input_ids,
        # One unpadded prompt, so every position is a real token. Passing the
        # mask explicitly avoids the "attention mask is not set" warning.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=128,
        temperature=0.7,
        do_sample=True
    )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the prompt's last sentence is fragile: it breaks if the model repeats
    # that sentence and leaves role names such as "assistant" in the text.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    subthemes = []
    for piece in response.replace("\n", ",").split(","):
        # Remove list numbering such as "1. " or "2) " before the bullet cleanup.
        piece = re.sub(r"^\s*\d+[.)]\s+", "", piece)
        theme = piece.strip("-•*. ").capitalize()
        # Filter after cleaning so a bare bullet ("-") cannot yield an empty
        # sub-theme; role names are still dropped as a safety net.
        if theme and theme.lower() not in {"assistant", "system", "user"}:
            subthemes.append(theme)

    return subthemes


def recommend_and_explain(user_query, top_k=1):
    """Print the best-matching perfumes for a query and stream an LLM explanation for each.

    The query is embedded with `sbert_model` and compared against
    `scent_embeddings` by cosine similarity; each of the best `top_k` rows of
    `combined_df` is turned into a chat prompt and passed to `model.generate`.

    Args:
        user_query: Free-text description (or sub-theme) to match.
        top_k: Number of perfumes to explain. Values larger than the number
            of stored embeddings are capped to that number.

    Returns:
        None. Everything is printed or streamed to stdout.
    """
    # Imported once per call rather than once per loop iteration.
    from transformers import TextStreamer

    query_embedding = sbert_model.encode(user_query, convert_to_tensor=True)
    similarities = util.cos_sim(query_embedding, scent_embeddings)[0]
    # torch.topk raises when k exceeds the number of candidates, so cap it.
    top_results = torch.topk(similarities, k=min(top_k, similarities.shape[0]))

    for score, idx in zip(top_results.values, top_results.indices):
        perfume = combined_df.iloc[idx.item()]
        message = format_for_explanation(user_query, perfume)

        inputs = tokenizer.apply_chat_template(
            [message],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        # Skip accords that are missing (absent column, NaN, blank) so they do
        # not print as "nan" or as empty entries. The review column is left
        # out on purpose: it is too long.
        accords = [
            str(accord)
            for accord in (perfume.get(f"mainaccord{i}") for i in range(1, 6))
            if not pd.isna(accord) and str(accord).strip()
        ]
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}. "
            f"Main Accords: {', '.join(accords)}."
        )
        print(f"\n--- Explanation for: {perfume['Perfume']} by {perfume['Brand']} (Score: {score.item():.3f}) ---")
        print(f"Notes: {short_desc}\n")

        # use a TextStreamer for continuous inference
        text_streamer = TextStreamer(tokenizer, skip_prompt=True)

        _ = model.generate(
            input_ids=inputs,
            # One unpadded prompt, so every position is a real token. Passing
            # the mask explicitly avoids the "attention mask is not set"
            # warning raised when the pad token equals the eos token.
            attention_mask=torch.ones_like(inputs),
            streamer=text_streamer,
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )

def recommend_with_subthemes(user_query, top_k=1):
    """Split a query into sub-themes, then recommend and explain perfumes for each.

    The sub-themes come from extract_subthemes(user_query); every one of them
    is passed on its own to recommend_and_explain with the same `top_k`.
    All results are printed; nothing is returned.
    """
    extracted = extract_subthemes(user_query)
    print(f"\nSubthemes extracted from query:\n{extracted}\n")

    for subtheme in extracted:
        heading = f"\n🔹 Recommendations for subtheme: \"{subtheme}\""
        print(heading)
        recommend_and_explain(subtheme, top_k=top_k)


In [None]:
recommend_with_subthemes("A magical scent in a mystical forest with herbs and secrets.", top_k=3)


Subthemes extracted from query:
['A magical scent', 'A mystical forest', 'Herbs', 'And secrets']


🔹 Recommendations for subtheme: "A magical scent"

--- Explanation for: Shamal by Nobile 1942 (Score: 0.633) ---
Notes: Top Notes: incense, apple, dates, aromatic notes. Middle Notes: velvet, amber. Base Notes: musk, woodsy notes. Main Accords: amber, fruity, smoky, musky, sweet.

This perfume has an incense-like scent which provides a magical aroma. The ingredients include aromatic notes, which provide an invigorating scent, along with fruity and sweet accords, and smoky undertones.<|eot_id|>

--- Explanation for: La 13Ã¨me Note Femme by Absolument Parfumeur (Score: 0.633) ---
Notes: Top Notes: sage, pineapple, raspberry, wild strawberry. Middle Notes: mimosa, vanilla, jasmine, violet, rose. Base Notes: white peach, musk, honey, amber. Main Accords: sweet, fruity, powdery, floral, yellow floral.

The La 13ème Note Femme perfume fits the user's request because of its scent descriptions w

### Rank Subthemes

In [None]:
def get_query_embedding(text):
    """Encode `text` with the notebook-level `sbert_model` and return the result as a tensor."""
    embedding = sbert_model.encode(text, convert_to_tensor=True)
    return embedding

def recommend_with_ranked_subthemes(user_query, ranked_subthemes, top_k=5):
    """
    Print the perfumes closest to a weighted blend of sub-theme embeddings.

    Args:
        user_query: Original query text; only echoed in the printed header.
        ranked_subthemes: Non-empty sequence of (subtheme, weight) pairs.
            Weights are normalized by their sum before blending.
        top_k: Number of perfumes to print. Values larger than the number of
            stored embeddings are capped to that number.

    Returns:
        None. Results are printed to stdout.

    Raises:
        ValueError: If `ranked_subthemes` is empty or its weights sum to zero.
    """
    # Validate up front: an empty list or a zero total would otherwise fail
    # later with an unrelated error or a ZeroDivisionError.
    if not ranked_subthemes:
        raise ValueError("ranked_subthemes must contain at least one (subtheme, weight) pair")

    total = sum(weight for _, weight in ranked_subthemes)
    if total == 0:
        raise ValueError("the weights in ranked_subthemes must not sum to zero")

    # Weighted sum of the sub-theme embeddings, with weights normalized by their total.
    final_embedding = sum(
        (weight / total) * get_query_embedding(subtheme)
        for subtheme, weight in ranked_subthemes
    )

    similarities = util.cos_sim(final_embedding, scent_embeddings)[0]
    # torch.topk raises when k exceeds the number of candidates, so cap it.
    top_results = torch.topk(similarities, k=min(top_k, similarities.shape[0]))

    print(f"\nUser Query: {user_query}")
    print(f"Ranked subthemes used: {ranked_subthemes}")

    for score, idx in zip(top_results.values, top_results.indices):
        perfume = combined_df.iloc[idx.item()]
        print(f"\n🔹 {perfume['Perfume']} by {perfume['Brand']} (Score: {score.item():.3f})")
        # Skip accords that are missing (absent column, NaN, blank) so they do
        # not print as "nan" or as empty entries between commas.
        accords = [
            str(accord)
            for accord in (perfume.get(f"mainaccord{i}") for i in range(1, 6))
            if not pd.isna(accord) and str(accord).strip()
        ]
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}. "
            f"Main Accords: {', '.join(accords)}."
        )
        print(f"Notes: {short_desc}")

In [None]:
user_query = "A magical scent in a mystical forest with herbs and secrets."

ranked_subthemes = [
    ('A magical scent', 3),
    ('A mystical forest', 4),
    ('Herbs', 5),
    ('And secrets', 1)
]

recommend_with_ranked_subthemes(user_query, ranked_subthemes, top_k=5)


User Query: A magical scent in a mystical forest with herbs and secrets.
Ranked subthemes used: [('A magical scent', 3), ('A mystical forest', 4), ('Herbs', 5), ('And secrets', 1)]

🔹 Bitter End by Roads (Score: 0.616)
Notes: Top Notes: grass, mint, fern. Middle Notes: fig leaf, thyme, olive. Base Notes: violet, oakmoss, vetiver. Main Accords: green, aromatic, fresh, fresh spicy, woody.

🔹 Arabian Forest by Alexandria Fragrances (Score: 0.613)
Notes: Top Notes: apple, bergamot, lemon. Middle Notes: birch, cloves, pink pepper, patchouli, angelica, pimento. Base Notes: cedar, vetiver, oakmoss, musk, iris. Main Accords: woody, warm spicy, earthy, musky, powdery.

🔹 La Foret Russe by Nimere Parfums (Score: 0.606)
Notes: Top Notes: thyme, sage, herbal notes. Middle Notes: siberian pine, cypress. Base Notes: gurjan balsam, incense, cedar, oakmoss, vetiver, benzoin. Main Accords: woody, aromatic, green, fresh spicy, amber.

🔹 Et by rnity Night  (Score: 0.598)
Notes: Top Notes: plum, paprika,