This file generates data with notes, accords, and reviews classified as "positive" or "negative". It also includes a section that recommends perfumes with a Sentence-BERT model, using the aforementioned data.  **This version generates tags.**

In [None]:
# Install libraries
# fsspec is pinned to an exact version; the remaining packages are installed unpinned.
!pip install -q --upgrade fsspec==2025.3.2
!pip install -q transformers datasets accelerate peft trl bitsandbytes
# NOTE(review): unsloth is installed here but is not imported by any cell shown in this notebook — confirm it is still needed.
!pip install unsloth

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 req

# Check Data Existence

In [None]:
import os
import pandas as pd
import csv

In [None]:
# Mount Google Drive at /content/drive; the data, model and cache paths used below all live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the cached DataFrame that already contains the "classify review" columns

combined_df_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df_classify_reviews.csv"

# The frame is only bound when the cached CSV is present; otherwise just report it.
if not os.path.exists(combined_df_path):
    print("No combined_df, needs to processing raw data")
else:
    print("Loading existing combined_df...\nSkip to Recommeded Perfumes")
    combined_df_classify_reviews = pd.read_csv(combined_df_path)

Loading existing combined_df...
Skip to Recommeded Perfumes


# Load Tag Generation LLM

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/perfume_mistral_cpt_fine_tune_adapters"
quantized_model_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/perfume_mistral_cpt_fine_tune_adapters-4bit"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Prefer the already-quantized copy; otherwise quantize the fine-tuned model once
# and save the result so later runs can load it directly.
quantized_exists = os.path.exists(quantized_model_path)
if quantized_exists:
    print("Quantized model found — loading directly...")
    load_path = quantized_model_path
else:
    print("Quantized model not found — loading base model and quantizing...")
    load_path = model_path

# Both branches must bind `tag_model`: the generation cells below use that name.
# (Previously the "not found" branch bound only `model`, so a first run failed
# with NameError at `tag_model.eval()`.)
tag_model = AutoModelForCausalLM.from_pretrained(
    load_path,
    device_map="auto",
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(load_path)

if not quantized_exists:
    # Save the quantized version
    tag_model.save_pretrained(quantized_model_path)
    tokenizer.save_pretrained(quantized_model_path)
    # Keep the old name bound in this branch for backward compatibility.
    model = tag_model

Quantized model found — loading directly...


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

In [None]:
import torch
import re

# Switch the tag-generation LLM to evaluation mode before it is used for generation.
tag_model.eval()

def generate_tags(prompt, max_new_tokens=128):
    """Generate free-text scent tags for a user prompt with the tag LLM.

    The prompt is wrapped in the "### Instruction / ### Response" template,
    sampled from ``tag_model``, decoded, and stripped of chat markers.

    Args:
        prompt: The user's request, inserted verbatim into the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The cleaned text. When the decoded output contains "### Response:",
        only the (stripped) text after its last occurrence is returned.
    """
    templated = f"### Instruction:\n{prompt}\n\n### Response:\n"
    encoded = tokenizer(templated, return_tensors="pt").to(tag_model.device)

    # Sampling configuration, kept in one place.
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = tag_model.generate(**encoded, **sampling_kwargs)

    text = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Remove chat-template markers, then leftover role prefixes.
    for chat_marker in ("<|im_start|>", "<|im_end|>"):
        text = text.replace(chat_marker, "")
    text = text.strip()
    text = re.sub(r"\b(nassistant\n|assistant\n|user\n):?", "", text, flags=re.IGNORECASE)

    response_marker = "### Response:"
    if response_marker not in text:
        return text
    # The decoded sequence includes the prompt; keep only what follows the last marker.
    return text.rpartition(response_marker)[2].strip()

In [None]:
# Smoke test: generate tags for one sample query and display the raw text.
generate_tags("What's a good scent for the last day of autumn?")

'Such a perfume would probably contain notes of earthy notes, water hyacinth, aldehydes, clean, flowers, tropical fruits, woods and mosses, chalk, birch, animalic, earthy, cabreuva, straw, cauliflower.\nSuch a perfume would probably contain notes of yuzu, mangosteen, mango, kyphi, carum, spices, vegetables and nuts, citrus smells, sclarene, green tangerine,'

# Recommend Perfumes with Classified Reviews


In [None]:
# Sanity check: number of rows and columns in the loaded frame.
combined_df_classify_reviews.shape

(21950, 17)

In [None]:
# Preview the first rows (notes, accords, reviews, sentiment columns).
combined_df_classify_reviews.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,sentiment,is_positive,full_description
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr...",NEGATIVE,0,"Top Notes: fruity notes, aldehydes, green note..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...",NEGATIVE,0,"Top Notes: blood orange, yuzu. Middle Notes: n..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...",NEGATIVE,0,"Top Notes: mint, lavender, cardamom, artemisia..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,POSITIVE,1,"Top Notes: yuzu, blood orange. Middle Notes: n..."
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but...",POSITIVE,1,"Top Notes: cranberry, pink grapefruit, citron...."


In [None]:
# Inspect one full_description string; this column is the text that gets embedded per perfume.
combined_df_classify_reviews.iloc[0]['full_description']

'Top Notes: fruity notes, aldehydes, green notes. Middle Notes: bulgarian rose, egyptian jasmine, lily-of-the-valley. Base Notes: eucalyptus, pine. Main Accords: rose, woody, fruity, aromatic, floral.'

In [None]:
# Upgrade sentence-transformers to the latest release.
# NOTE(review): the version is not pinned; the recorded run replaced 4.1.0 with 5.0.0.
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 4.1.0
    Uninstalling sentence-transformers-4.1.0:
      Successfully uninstalled sentence-transformers-4.1.0
Successfully installed sentence-transformers-5.0.0


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from tqdm import tqdm

In [None]:
# Sentence-embedding model used to encode both perfume descriptions and generated tags.
s_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# save the perfume_embeddings for later use.
# it takes some time to build the perfume_embeddings every time

import os
import torch

save_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/perfume_embeddings.pt"

if not os.path.exists(save_path):
    descriptions = combined_df_classify_reviews["full_description"].tolist()
    weights = combined_df_classify_reviews["is_positive"].apply(lambda x: 1.0 if x == 1 else 0.3).tolist()
    embeddings = s_model.encode(
        descriptions,
        batch_size=32,
        convert_to_tensor=True,
        show_progress_bar=True
    )

    # Apply weights (one scalar per perfume row).
    # NOTE: cosine similarity is scale-invariant, so these per-row weights do not
    # change a cos_sim ranking; the sentiment effect comes from the boost applied
    # at query time.
    weights_tensor = torch.tensor(weights, device=embeddings.device).unsqueeze(1)
    weighted_embeddings = embeddings * weights_tensor

    torch.save(weighted_embeddings, save_path)
    # Bind the name the recommendation cells use; previously it was only set when
    # the cache already existed, so a first run failed later with NameError.
    perfume_embeddings = weighted_embeddings
    print("Perfume embeddings with weighted reviews saved.")
else:
    # map_location="cpu" lets a cache saved from a GPU session load on a CPU-only
    # runtime; the recommendation code moves the tensor to the query's device.
    perfume_embeddings = torch.load(save_path, map_location="cpu")
    print("Perfumes embeddings already exist — loading file.")

# Row i of the embeddings is matched against row i of the DataFrame, so a stale
# cache would silently return the wrong perfumes.
if perfume_embeddings.shape[0] != len(combined_df_classify_reviews):
    raise ValueError(
        f"perfume_embeddings has {perfume_embeddings.shape[0]} rows but the DataFrame has "
        f"{len(combined_df_classify_reviews)}; delete {save_path} and re-run this cell to rebuild the cache."
    )

Perfumes embeddings already exist — loading file.


In [None]:
from sentence_transformers import util
import torch

def recommend_perfumes_with_tags(user_query, top_k=5):
    """Print the top perfume recommendations for a free-text query.

    The query is expanded into tags with ``generate_tags``; the tag text is
    embedded with ``s_model`` and compared by cosine similarity against
    ``perfume_embeddings``. Each similarity is multiplied by 1.2 when the
    perfume's ``is_positive`` equals 1 and by 0.8 otherwise, and the perfumes
    are ranked by that adjusted score.

    Row ``i`` of ``perfume_embeddings`` is taken to describe positional row
    ``i`` of ``combined_df_classify_reviews``.

    Args:
        user_query: Free-text description of the desired scent.
        top_k: Number of perfumes to print; clamped to the catalogue size.

    Returns:
        None. Results are printed.
    """
    tags_text = generate_tags(user_query)
    print(f"\nUser Query: {user_query}\n")
    print(f"\nGenerated Tags:\n{tags_text}\n")

    query_embedding = s_model.encode(tags_text, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]

    # Boost scores of positive perfumes, vectorised instead of a per-row Python loop.
    positive_mask = torch.as_tensor(
        (combined_df_classify_reviews["is_positive"] == 1).to_numpy(),
        device=similarities.device,
    )
    sentiment_boost = torch.full_like(similarities, 0.8)
    sentiment_boost[positive_mask] = 1.2
    adjusted_scores = similarities * sentiment_boost

    # Rank by the boosted scores. Ranking by the raw similarities, as before,
    # silently discarded the sentiment boost computed above.
    k = min(top_k, adjusted_scores.numel())
    top_results = torch.topk(adjusted_scores, k=k)

    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        # idx is a row position (it indexes the embedding matrix), so use iloc.
        perfume = combined_df_classify_reviews.iloc[idx]
        accords = ', '.join(str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6))

        print(f"{perfume['Perfume']} by {perfume['Brand']} (Score: {score:.3f})")
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}.\n"
            f"Main Accords: {accords}."
        )

        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print("--------------------------------------------------------------------")


In [None]:
# Example run: a warm, musky request, showing the top 3 matches.
recommend_perfumes_with_tags("I want something musky and warm with amber and vanilla", top_k=3)


User Query: I want something musky and warm with amber and vanilla


Generated Tags:
Such a perfume would probably contain notes of fruits, clean, smoke, coal, sweets and gourmand smells, toothpaste, cotton candy, calone, coral limestone, apple shisha accord, animalic, earthy, t-shirt accord, spices.
Such a perfume would probably contain notes of fruits, earthy notes, rind bergamot, citrus japonica, canelé, cherry jam, osmasylk natsublim™, hashish, cit

Youth-Dew by EstÃ©e Lauder (Score: 0.636)
Top Notes: spices, aldehydes, narcissus, lavender, orange, peach, coca-cola, bergamot. Middle Notes: spicy notes, cloves, cinnamon, rose, ylang-ylang, jasmine, lily-of-the-valley, cassia, orchid. Base Notes: incense, tolu balsam, peru balsam, oakmoss, patchouli, amber, vetiver, vanilla, musk.
Main Accords: warm spicy, amber, balsamic, woody, powdery.

Sentiment: POSITIVE
--------------------------------------------------------------------
Florasense by Jequiti (Score: 0.635)
Top Notes: bergamot

In [None]:
# Example run: a fresh floral/woody request, showing the top 3 matches.
recommend_perfumes_with_tags("I want something fresh and floral with soft woody notes", top_k=3)


User Query: I want something fresh and floral with soft woody notes


Generated Tags:
What are the dominant smells in a medieval porch?
Such a perfume would probably contain notes of clean, smoke, coal, diving suit, woods and mosses, spruce, fresh spicy, prunella, apple shisha accord, old, ozonic, camellia, flowers, ocean, siren.
Such a perfume would probably contain notes of water, kyphi, floralozone, rain notes,

September by Bjork and Berries (Score: 0.539)
Top Notes: lavender, kumquat, pomelo. Middle Notes: apple blossom, eucalyptus, jasmine. Base Notes: smoke, papyrus, guaiac wood, vetiver, praline.
Main Accords: woody, smoky, aromatic, sweet, lavender.

Sentiment: POSITIVE
--------------------------------------------------------------------
Wazamba by Parfum d'Empire (Score: 0.519)
Top Notes: incense, cypress, red apple, aldehydes. Middle Notes: myrrh, olibanum, labdanum, plum. Base Notes: incense, resin, opoponax, sandalwood, fern.
Main Accords: amber, woody, balsamic, aromatic

# Test (Non-standard Prompts)


## Create Non-standard User Queries

In [None]:
import json
import pandas as pd

num_queries = 50  # maximum number of test queries to extract
query_path_non_standard = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/non_standard_user_queries.csv"


# Reuse the saved query file when present so every evaluation run rates the same queries.
if os.path.exists(query_path_non_standard):
    print(f"Loading existing user_queries...")
    query_df_non_standard = pd.read_csv(query_path_non_standard)

else:
    print("No user_queries, needs to generating new queries...")

    jsonl_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/training_data.jsonl"

    instructions = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Stop once num_queries instructions are collected (the limit was
            # previously hard-coded to 50 and ignored num_queries).
            if len(instructions) >= num_queries:
                break
            # Tolerate blank lines, which json.loads would reject.
            if not line.strip():
                continue
            data = json.loads(line)
            if "instruction" in data:
                instructions.append(data["instruction"])

    query_df_non_standard = pd.DataFrame({"query": instructions})
    query_df_non_standard.to_csv(query_path_non_standard, index=False) # save to CSV

query_df_non_standard.head()

Loading existing user_queries...


Unnamed: 0,query
0,What perfumes capture the essence of a natural...
1,scent of a confident adventure
2,perfume for a womanly mansion
3,I'm going to a homey alchemy lab. What scents ...
4,I'm going to a hiking sea. What scents would y...


## Build Feedback File

In [None]:
feedback_path_non_standard = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/with-tags/non_standard_user_feedback_with_tags.csv"

# Write the header row only on first use; an existing file is left untouched so
# that earlier feedback is preserved and new rows can be appended.
if os.path.exists(feedback_path_non_standard):
    print("Feedback file already exists — ready to append new rows.")
else:
    header_row = ["query", "perfume_name", "brand", "score", "label", "notes_and_accords"]
    with open(feedback_path_non_standard, "w", newline="") as f:
        csv.writer(f).writerow(header_row)
    print("Created new feedback file.")


def recommend_perfumes_with_tags(user_query, top_k=5, max_feedback=3):
    """
    Recommend perfumes from LLM-generated tags and log interactive feedback.

    The query is expanded into tags with ``generate_tags``; the tag text is
    embedded with ``s_model`` and compared by cosine similarity against
    ``perfume_embeddings``. Each similarity is multiplied by 1.2 when the
    perfume's ``is_positive`` equals 1 and by 0.8 otherwise, and the ``top_k``
    best adjusted scores are shown one at a time.

    For each shown perfume the user is asked for Yes/No/skip. Yes/No answers
    are appended to the CSV at ``feedback_path_non_standard``; skipped
    perfumes are not logged and do not count toward ``max_feedback``.

    Row ``i`` of ``perfume_embeddings`` is taken to describe positional row
    ``i`` of ``combined_df_classify_reviews``.

    Args:
        user_query: Free-text description of the desired scent.
        top_k: Number of candidates to consider; clamped to the catalogue size.
        max_feedback: Stop asking once this many Yes/No answers were collected.

    Returns:
        None. Results are printed and feedback is appended to the CSV.
    """

    tags_text = generate_tags(user_query)
    print(f"\nGenerated Tags:\n{tags_text}\n")

    query_embedding = s_model.encode(tags_text, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]

    # Sentiment boost, vectorised instead of one DataFrame lookup per perfume.
    positive_mask = torch.as_tensor(
        (combined_df_classify_reviews["is_positive"] == 1).to_numpy(),
        device=similarities.device,
    )
    sentiment_boost = torch.full_like(similarities, 0.8)
    sentiment_boost[positive_mask] = 1.2
    adjusted_scores = similarities * sentiment_boost

    # topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, adjusted_scores.numel())
    top_results = torch.topk(adjusted_scores, k=k)

    # Only ask about perfumes until max_feedback answers have been collected
    feedback_count = 0
    new_feedback = []
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        if feedback_count >= max_feedback:
            break

        # idx is a row position (it indexes the embedding matrix), so use iloc.
        perfume = combined_df_classify_reviews.iloc[idx]
        perfume_name = perfume['Perfume']
        brand = perfume['Brand']
        accords = ', '.join(str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6))

        print(f"\n{perfume_name} by {brand} (Score: {score:.3f})")
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}.\n"
            f"Main Accords: {accords}."
        )
        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print("-" * 70)

        # Re-prompt until the answer is one of Yes / No / skip (case-insensitive).
        while True:
            label = input(f"Do you like '{perfume_name}' by {brand}? (Yes/No or type 'skip'): ").strip().capitalize()
            if label in ["Yes", "No"]:
                break
            elif label == "Skip":
                print(f"Skipping '{perfume_name}' for query: {user_query}")
                label = None
                break
            else:
                print("Please type 'Yes', 'No', or 'skip'. Try again.")

        if label is None:
            continue

        notes_and_accords = (
            f"Top: {perfume['Top']} | "
            f"Middle: {perfume['Middle']} | "
            f"Base: {perfume['Base']} | "
            f"Accords: {accords}"
        )
        new_feedback.append([
            user_query,
            perfume_name,
            brand,
            score,
            label,
            notes_and_accords
        ])
        feedback_count += 1

    # Append new feedback to CSV. UTF-8 is explicit because perfume and brand
    # names contain non-ASCII characters and the file is read back with pandas.
    if new_feedback:
        with open(feedback_path_non_standard, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(new_feedback)
        print(f"\nLogged {len(new_feedback)} new feedback entries.")
    else:
        print("\nNo new feedback was logged.")


Feedback file already exists — ready to append new rows.


In [None]:
# Loop through queries starting from a requested index
while True:
    start_index_input = input("Enter the query index: ").strip()
    if start_index_input == "" or start_index_input.isdigit():
        break
    else:
        print("Please enter valid number.")

start_index = int(start_index_input) - 1 if start_index_input else 0

for i, row in query_df_non_standard.iloc[start_index:].iterrows():
    user_query = row['query']
    print(f"\n==============================")
    print(f"Query {i+1}/{len(query_df_non_standard)}: {user_query}")
    print(f"==============================")

    recommend_perfumes_with_tags(user_query, top_k=3, max_feedback=3)

    cont = input("Press Enter to continue, type 'skip' to skip this query, or 'stop' to quit: ").strip().lower()

    if cont == 'stop':
        print("Stopping early.")
        break
    elif cont == 'skip':
        print(f"Skipping query {i+1}: {user_query}")
        continue

Enter the query index: 47

Query 47/50: What does it smell like in an industrial art gallery?

Generated Tags:
Such a perfume would probably contain notes of glass, urban, musk and amber, alcoholic, pharaone®, ambrocenide (symrise), varnish accord, white wine, oppulence, new magazine, credit cards, satin, asphault, iodine.
Such a perfume would probably contain notes of amberwood, kyphi, smoke, sylkolide, coffee, mulled wine, muscone, alcoholic, lava, ambreine, mystik


Christos by Giardini Di Toscana (Score: 0.777)
Top Notes: incense, amber xtreme, bergamot. Middle Notes: labdanum, patchouli, rose. Base Notes: incense, sandalwood, benzoin, pink pepper, vanilla, amber.
Main Accords: amber, woody, balsamic, warm spicy, smoky.

Sentiment: POSITIVE
----------------------------------------------------------------------
Do you like 'Christos' by Giardini Di Toscana? (Yes/No or type 'skip'): yes

Velvet Amber Skin by Dolce&Gabbana (Score: 0.774)
Top Notes: lavender, citruses, bergamot, silk t

## Compute Recall Scores

In [None]:
feedback_df = pd.read_csv(feedback_path_non_standard)

# Normalise labels so "Yes", "yes " etc. are counted the same way.
feedback_df["label"] = feedback_df["label"].str.lower().str.strip()

# Group feedback by query
grouped = feedback_df.groupby("query")

# Per-query fraction of rated perfumes labelled "yes". The variables are named
# "recall", but the value is the share of relevant items among those rated.
recall_scores = [float((group["label"] == "yes").mean()) for _, group in grouped]

# Average the per-query fractions; guard against an empty feedback file, which
# previously caused a ZeroDivisionError.
if recall_scores:
    avg_recall = sum(recall_scores) / len(recall_scores)
    print(f"Average fraction of relevant items in top-k: {avg_recall:.2f} over {len(recall_scores)} queries and {len(feedback_df)} feedback.")
else:
    avg_recall = float("nan")
    print("No feedback rows found — nothing to average.")


Average fraction of relevant items in top-k: 0.88 over 34 queries and 101 feedback.
