This notebook generates a dataset of perfume notes, accords, and reviews, with the reviews classified as "positive" or "negative". It also includes a section that recommends perfumes with a Sentence-BERT model, using that data.  **This version does not generate tags.**

# Check Data Existence

In [None]:
import os
import pandas as pd
import csv

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reuse the cached table with the "classify review" columns when it is already
# on Drive. If the file is missing, combined_df_classify_reviews is left
# undefined by this cell and the raw-data sections below must be run.

combined_df_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df_classify_reviews.csv"

if not os.path.exists(combined_df_path):
    print("No combined_df, needs to processing raw data")
else:
    print("Loading existing combined_df...\nSkip to Recommeded Perfumes")
    combined_df_classify_reviews = pd.read_csv(combined_df_path)

Loading existing combined_df...
Skip to Recommeded Perfumes


# Load Data (Without Reviews)

In [None]:
# Peek at the raw perfume table (no reviews): print its first 10 parsed rows.

file_path = '/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/fra_standard.csv'
with open(file_path, encoding='latin1') as f:
    # zip stops after ten line numbers, so no further rows are printed
    for line_no, row in zip(range(1, 11), csv.reader(f)):
        print(f"Line {line_no}: {row}")

Line 1: ['', 'url', 'Perfume', 'Brand', 'Rating Count', 'Top', 'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']
Line 2: ['0', 'https://www.fragrantica.com/perfume/xerjoff/accento-overdose-pride-edition-74630.html', 'Accento Overdose Pride Edition', 'Xerjoff', '201', 'fruity notes, aldehydes, green notes', 'bulgarian rose, egyptian jasmine, lily-of-the-valley', 'eucalyptus, pine', 'rose', 'woody', 'fruity', 'aromatic', 'floral']
Line 3: ['1', 'https://www.fragrantica.com/perfume/jean-paul-gaultier/classique-pride-2024-90394.html', 'Classique Pride 2024', 'Jean Paul Gaultier', '70', 'yuzu, citruses', 'orange blossom, neroli', 'musk, blonde woods', 'citrus', 'white floral', 'sweet', 'fresh', 'musky']
Line 4: ['2', 'https://www.fragrantica.com/perfume/jean-paul-gaultier/classique-pride-2023-81775.html', 'Classique Pride 2023', 'Jean Paul Gaultier', '285', 'blood orange, yuzu', 'neroli, orange blossom', 'musk, white woods', 'citrus', 'white floral

In [None]:
# Parse the comma-separated file that file_path currently points to.
# file_path is reassigned further down for the reviews table, so run this
# cell right after the preview cell that sets it.
df = pd.read_csv(file_path, sep=',', encoding='latin1', low_memory=False)

# Load Data (With Reviews)

In [None]:
# Peek at the raw perfume table that carries reviews: print its first 10 parsed rows.

file_path = '/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/perfumes_table.csv'
with open(file_path, encoding='latin1') as f:
    # zip stops after ten line numbers, so no further rows are printed
    for line_no, row in zip(range(1, 11), csv.reader(f)):
        print(f"Line {line_no}: {row}")

Line 1: ['rating', 'notes', 'designer', 'reviews', 'description', 'url', 'title']
Line 2: ['5.0', "['Vanila', 'Madagascar Vanilla', 'Vanilla Absolute']", 'fiorucci perfumes and colognes', '[]', 'Vanilla Scent by Fiorucci is a Amber Vanilla fragrance for women and men. Vanilla Scent was launched in 1961. Top note is Vanila; middle note is Madagascar Vanilla; base note is Vanilla Absolute.', 'https://www.fragrantica.com/perfume/Fiorucci/Vanilla-Scent-34262.html', 'Vanilla Scent Fiorucci for women and men']
Line 3: ['4.22', "['Mint', 'Lavender', 'Nutmeg', 'elemi', 'Cypress', 'Sandalwood', 'Saffron', 'Cypriol Oil or Nagarmotha', 'Amber', 'Patchouli', 'Precious Woods', 'Musk']", 'maiora parfum perfumes and colognes', '[]', 'Shady by Maiora Parfum is a fragrance for women and men. Shady was launched in 2019. The nose behind this fragrance is Antonio Gigli. Top notes are Mint, Lavender, Nutmeg and elemi; middle notes are Cypress, Sandalwood, Saffron and Cypriol Oil or Nagarmotha; base notes a

In [None]:
# Parse the comma-separated file that file_path currently points to
# (the preview cell directly above sets it to the table with reviews).
df_review = pd.read_csv(file_path, sep=',', encoding='latin1', low_memory=False)

# Merge Data

In [None]:
# Bring the URLs of both tables into one canonical form (trimmed, lower-case,
# no trailing slash) so the join below can match on them.
for frame in (df, df_review):
    frame['url'] = frame['url'].str.strip().str.lower().str.rstrip('/')

# Keep only rows that carry review text. NaN is dropped first so that the
# string cast does not turn it into 'nan'; blank and '[]' cells go next.
df_review = df_review.dropna(subset=['reviews']).copy()
df_review['reviews'] = df_review['reviews'].astype(str).str.strip()
has_review_text = ~df_review['reviews'].isin(['', '[]'])
df_review = df_review[has_review_text]

# One row per URL; rows sharing a URL get their review strings joined with ','.
agg_reviews = df_review.groupby('url')['reviews'].apply(','.join).reset_index()

# Inner join: only perfumes whose URL appears in both tables are kept.
combined_df = df.merge(agg_reviews, on='url', how='inner')

In [None]:
print(combined_df.columns)
print(combined_df.shape)
combined_df.head()

Index(['Unnamed: 0', 'url', 'Perfume', 'Brand', 'Rating Count', 'Top',
       'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3',
       'mainaccord4', 'mainaccord5', 'reviews'],
      dtype='object')
(21950, 14)


Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but..."


In [None]:
print(combined_df['reviews'].iloc[0])

["I thought everybody was hating on this fragrance for the homophobia, turns out its a cash grab where none of the profits go to any lgbtq organisation. don't buy", "Above and beyond super yikes totally gross pink-washing money grab. Next time give 50-100% of profits from the sales to LGBTQ+ orgs, AT THE VERY LEAST, especially since y'all are selling it at that price. Please Xerjoff, spare some change for the gays.", 'Embarrassing pink money cash grab, and during Pride month nonetheless! Xerjoff needs to show us where they\'re donating because simply saying they "support" the LGBTQ community is not enough...', 'FYI: When asked if some LGBTQ+ organization would receive part of earning from this bottle, the PR team of Xerjoff decided to copy paste to everyone a answer without answer, a way to try to save face and also try to confuse you. Gaslighting. But donât get confused, itâs just a money grab.', "The Fragcomm is on fire today because it does appear to me Xerjoff (a high end Itali

In [None]:
print(combined_df['reviews'].iloc[1])

['Ummm... just like a honest review, ignoring all thisâ¦ other stuff. The blood orange scent is really strong, it gives off a bitter/tarte citrus opening (almost like grapefruit rind), but there not much else to it besides that. Honestly, I purchased thinking it was the original classique juice. Longevity is disappointing, about two-three hours. If you really like tarte bitter citrus this is great, and you should literally get it now, bc itâs discounted and a limited release.', '@AndyWarholsFavoriteWig:  \nAgreed, what the f is wrong with people. Like, I could imagine 12-year olds sitting and thinking that was hilarious... but I doubt they would be hanging out on fragrantica of all places, so we\'re talking probably adult men sitting and doing that shit. Like seriously, and I\'m an adult cis-gender straight(-ish) man saying this. \nI think some things that are immature, dark and not seen as "PC" can be very fun to joke about, but the thing with this is that it\'s just like... not fu

# Classify Reviews

In [None]:
# Build a two-column frame (idx, review_text) from the reviews column, where
# idx is the combined_df row label the text came from.
# NOTE(review): 'reviews' holds plain strings (cast with astype(str) and
# ','.join-ed when the tables were merged), and Series.explode only splits
# list-like values. The result therefore has one row per perfume rather than
# one row per review - the recorded output shows the same 21950 rows as
# combined_df.
exploded_reviews = combined_df['reviews'].explode()
flat_reviews = exploded_reviews.reset_index().rename(
    columns={"index": "idx", "reviews": "review_text"}
)

In [None]:
print(flat_reviews)

         idx                                        review_text
0          0  ["I thought everybody was hating on this fragr...
1          1  ['Ummm... just like a honest review, ignoring ...
2          2  ['Lol to the reviewer below meâ¦no one cares,...
3          3  ['A bold fresh fragrance. Not recommended for ...
4          4  ["Tad bit sweeter than the other flankers, but...
...      ...                                                ...
21945  21945  ['Cardamom bomb \n2014 batch still very very g...
21946  21946  ['This is an incredibly butch(or perhaps switc...
21947  21947  ['Khaox is a different perfume. Green, fresh, ...
21948  21948  ['Ive been wanting this fragrance ever since I...
21949  21949  ["This feels like Mad Men in a bottle, it's be...

[21950 rows x 2 columns]


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
import pandas as pd

# Register progress_apply on pandas objects.
tqdm.pandas()

# Tokenizer and classifier come from the same pretrained checkpoint; the model
# is switched to eval mode and placed on the GPU when one is available.
sentiment_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = DistilBertTokenizer.from_pretrained(sentiment_checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(sentiment_checkpoint)
model.eval().to(sentiment_device)

def classify_sentiment(text):
    """Return the label the notebook's sentiment model predicts for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), moved to the device of the module-level ``model`` and
    scored without gradient tracking. The returned value is the
    ``model.config.id2label`` entry for the highest logit.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output = model(**on_device)

    best_class = output.logits.argmax().item()
    return model.config.id2label[best_class]

# Label every row of flat_reviews (the recorded run took about 30 minutes),
# then derive a numeric flag from the label: 1 for POSITIVE, 0 for NEGATIVE.
label_to_flag = {'POSITIVE': 1, 'NEGATIVE': 0}
flat_reviews['sentiment'] = flat_reviews['review_text'].progress_apply(classify_sentiment)
flat_reviews['is_positive'] = flat_reviews['sentiment'].map(label_to_flag)

100%|██████████| 21950/21950 [30:29<00:00, 12.00it/s]


In [None]:
print(flat_reviews.columns)
print(flat_reviews)

Index(['idx', 'review_text', 'sentiment', 'is_positive'], dtype='object')
         idx                                        review_text sentiment  \
0          0  ["I thought everybody was hating on this fragr...  NEGATIVE   
1          1  ['Ummm... just like a honest review, ignoring ...  NEGATIVE   
2          2  ['Lol to the reviewer below meâ¦no one cares,...  NEGATIVE   
3          3  ['A bold fresh fragrance. Not recommended for ...  POSITIVE   
4          4  ["Tad bit sweeter than the other flankers, but...  POSITIVE   
...      ...                                                ...       ...   
21945  21945  ['Cardamom bomb \n2014 batch still very very g...  POSITIVE   
21946  21946  ['This is an incredibly butch(or perhaps switc...  POSITIVE   
21947  21947  ['Khaox is a different perfume. Green, fresh, ...  POSITIVE   
21948  21948  ['Ive been wanting this fragrance ever since I...  POSITIVE   
21949  21949  ["This feels like Mad Men in a bottle, it's be...  POSITIVE   

 

In [None]:
# Collapse flat_reviews to one row per idx, gathering each column's values
# into a list and giving the list columns plural names.
list_columns = {
    'review_text': 'review_texts',
    'sentiment': 'review_sentiments',
    'is_positive': 'review_labels',
}
grouped_by_idx = flat_reviews.groupby('idx')
sentiment_summary = (
    grouped_by_idx
    .agg({source: list for source in list_columns})
    .rename(columns=list_columns)
)

In [None]:
# Attach the sentiment columns to the perfume table by matching combined_df's
# index against flat_reviews['idx'], then discard the helper columns
# idx and review_text.
combined_df_classify_reviews = combined_df.merge(
    flat_reviews, left_index=True, right_on='idx', how='left'
).drop(columns=['idx', 'review_text'])

In [None]:
combined_df.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but..."


In [None]:
# save combined data to avoid reruning next time
combined_df_classify_reviews.to_csv("/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df_classify_reviews.csv", index=False)

In [None]:
# Rebind combined_df to the cached CSV when it exists on Drive; otherwise
# combined_df keeps whatever the cells above assigned to it.
combined_df_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df.csv"

if not os.path.exists(combined_df_path):
    print("No combined_df, needs to processing raw data")
else:
    print("Loading existing combined_df...\nSkip to Sentence-BERT")
    combined_df = pd.read_csv(combined_df_path)

Loading existing combined_df...
Skip to Sentence-BERT


In [None]:
# combined_df has full_description column, but doesn't have sentiment and is_positive
combined_df.shape

(21950, 15)

In [None]:
combined_df.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,full_description
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,I thought everybody was hating on this fragran...,"Top Notes: fruity notes, aldehydes, green note..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...","Top Notes: blood orange, yuzu. Middle Notes: n..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...","Top Notes: mint, lavender, cardamom, artemisia..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,"Top Notes: yuzu, blood orange. Middle Notes: n..."
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"Tad bit sweeter than the other flankers, but a...","Top Notes: cranberry, pink grapefruit, citron...."


In [None]:
# combined_df_classify_reviews has sentiment and is_positive columns, but doesn't have full_description column
combined_df_classify_reviews.shape

(21950, 16)

In [None]:
combined_df_classify_reviews.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,sentiment,is_positive
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr...",NEGATIVE,0
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...",NEGATIVE,0
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...",NEGATIVE,0
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,POSITIVE,1
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but...",POSITIVE,1


In [None]:
# Left-join full_description from combined_df onto the sentiment-labelled
# table, matching rows on url and Perfume, so the result carries
# full_description, sentiment and is_positive together.
full_description_lookup = combined_df[['url', 'Perfume', 'full_description']]
merged_df = combined_df_classify_reviews.merge(
    full_description_lookup, on=['url', 'Perfume'], how='left'
)

In [None]:
# Remove the merged-in full_description column again (it is recomputed from
# the note and accord columns by build_full_description), then show a sample.
merged_df = merged_df.drop(columns="full_description")
merged_df.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,sentiment,is_positive
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr...",NEGATIVE,0
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...",NEGATIVE,0
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...",NEGATIVE,0
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,POSITIVE,1
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but...",POSITIVE,1


In [None]:
# construct a column with a full description so that BERT can understand

def _field_text(value):
    """Render one table cell as text, mapping missing values (None/NaN) to ''."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


def build_full_description(row):
    """Compose the labelled description text that is embedded for one perfume.

    Parameters
    ----------
    row : pandas.Series or dict
        Read through ``row.get`` for 'Top', 'Middle', 'Base' and
        'mainaccord1' .. 'mainaccord5'. Absent keys count as missing.

    Returns
    -------
    str
        "Top Notes: <top>. Middle Notes: <middle>. Base Notes: <base>.
        Main Accords: <a1, a2, ...>." on a single line. Missing cells are
        rendered as empty text rather than the literal 'nan'/'None', and
        missing accords are left out of the comma-separated list, so no
        placeholder words end up in the text handed to the embedding model.
    """
    # Field labels let the sentence encoder tell the note tiers apart.
    top = _field_text(row.get('Top', ''))
    middle = _field_text(row.get('Middle', ''))
    base = _field_text(row.get('Base', ''))

    accord_values = (
        _field_text(row.get(f'mainaccord{i}', '')) for i in range(1, 6)
    )
    accords = ', '.join(accord for accord in accord_values if accord)

    description = (
        f"Top Notes: {top}. "
        f"Middle Notes: {middle}. "
        f"Base Notes: {base}. "
        f"Main Accords: {accords}. "
    )

    return description.strip()

merged_df['full_description'] = merged_df.apply(build_full_description, axis=1)

In [None]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,sentiment,is_positive,full_description
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr...",NEGATIVE,0,"Top Notes: fruity notes, aldehydes, green note..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...",NEGATIVE,0,"Top Notes: blood orange, yuzu. Middle Notes: n..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...",NEGATIVE,0,"Top Notes: mint, lavender, cardamom, artemisia..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,POSITIVE,1,"Top Notes: yuzu, blood orange. Middle Notes: n..."
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but...",POSITIVE,1,"Top Notes: cranberry, pink grapefruit, citron...."


In [None]:
merged_df.iloc[0]['full_description']

'Top Notes: fruity notes, aldehydes, green notes. Middle Notes: bulgarian rose, egyptian jasmine, lily-of-the-valley. Base Notes: eucalyptus, pine. Main Accords: rose, woody, fruity, aromatic, floral.'

In [None]:
# save combined data to avoid rerunning next time
merged_df.to_csv("/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df_classify_reviews.csv", index=False)

# Keep the in-memory table in step with the file just written. The
# recommendation section reads full_description from
# combined_df_classify_reviews, and up to this point only merged_df has that
# column; without this rebinding a top-to-bottom run fails there and only
# works after reloading the CSV in a fresh session.
combined_df_classify_reviews = merged_df

# Recommend Perfumes with Classified Reviews


In [None]:
combined_df_classify_reviews.shape

(21950, 17)

In [None]:
combined_df_classify_reviews.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,sentiment,is_positive,full_description
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr...",NEGATIVE,0,"Top Notes: fruity notes, aldehydes, green note..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...",NEGATIVE,0,"Top Notes: blood orange, yuzu. Middle Notes: n..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...",NEGATIVE,0,"Top Notes: mint, lavender, cardamom, artemisia..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,POSITIVE,1,"Top Notes: yuzu, blood orange. Middle Notes: n..."
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but...",POSITIVE,1,"Top Notes: cranberry, pink grapefruit, citron...."


In [None]:
combined_df_classify_reviews.iloc[0]['full_description']

'Top Notes: fruity notes, aldehydes, green notes. Middle Notes: bulgarian rose, egyptian jasmine, lily-of-the-valley. Base Notes: eucalyptus, pine. Main Accords: rose, woody, fruity, aromatic, floral.'

In [None]:
!pip install -U sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from tqdm import tqdm

In [None]:
s_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Build the perfume embeddings once and cache them on Drive; encoding every
# description takes some time, so later runs just load the saved tensor.
# Either branch leaves the result in `perfume_embeddings`, which is the name
# recommend_perfumes reads.

import os
import torch

save_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/perfume_embeddings.pt"

if not os.path.exists(save_path):
    descriptions = combined_df_classify_reviews["full_description"].tolist()
    weights = combined_df_classify_reviews["is_positive"].apply(lambda x: 1.0 if x == 1 else 0.3).tolist()
    embeddings = s_model.encode(
        descriptions,
        batch_size=32,
        convert_to_tensor=True,
        show_progress_bar=True
    )

    # Apply weights: rows flagged positive keep their length, the rest are scaled by 0.3.
    # NOTE(review): recommend_perfumes ranks by cosine similarity, which does
    # not depend on vector length, so this positive scaling does not change
    # the ranking - confirm whether the weight should act on the score instead.
    weights_tensor = torch.tensor(weights, device=embeddings.device).unsqueeze(1)
    weighted_embeddings = embeddings * weights_tensor

    torch.save(weighted_embeddings, save_path)
    # Previously only the load branch bound perfume_embeddings, so the first
    # run (no cache yet) left the name undefined for recommend_perfumes.
    perfume_embeddings = weighted_embeddings
    print("Perfume embeddings with weighted reviews saved.")
else:
    # map_location lets a tensor saved from a GPU session load on a CPU-only runtime.
    perfume_embeddings = torch.load(
        save_path,
        map_location="cuda" if torch.cuda.is_available() else "cpu"
    )
    print("Perfumes embeddings already exist — loading file.")

Perfumes embeddings already exist — loading file.


In [None]:
from sentence_transformers import util
import torch

def recommend_perfumes(user_query, top_k=5):
    """Print the perfumes whose embeddings are most similar to a text query.

    Parameters
    ----------
    user_query : str
        Free-text description of the wanted scent; encoded with ``s_model``.
    top_k : int, default 5
        Number of perfumes to print. Values larger than the number of
        embedded perfumes are clamped to that number instead of raising.

    Returns
    -------
    None
        Results are printed: name, brand, cosine-similarity score, notes,
        main accords and the stored sentiment label of each match.
    """
    query_embedding = s_model.encode(user_query, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]
    # torch.topk raises when k exceeds the number of candidates.
    k = max(0, min(top_k, similarities.shape[0]))
    top_results = torch.topk(similarities, k=k)

    print(f"\nUser Query: {user_query}\n")
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        # Embedding row i was encoded from the i-th row of the table, so the
        # lookup has to be positional (iloc), not by index label (loc).
        perfume = combined_df_classify_reviews.iloc[idx]

        print(f"{perfume['Perfume']} by {perfume['Brand']} (Score: {score:.3f})")
        short_desc = (
          f"Top Notes: {perfume['Top']}. "
          f"Middle Notes: {perfume['Middle']}. "
          f"Base Notes: {perfume['Base']}.\n"
          f"Main Accords: {', '.join([str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6)])}."
          )

        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print("--------------------------------------------------------------------")


In [None]:
recommend_perfumes("I want something musky and warm with amber and vanilla", top_k=3)


User Query: I want something musky and warm with amber and vanilla

Vanille Eau de Parfum by Laura Mercier (Score: 0.669)
Top Notes: vanilla, vanilla orchid. Middle Notes: vanilla, heliotrope, daylily, musk. Base Notes: bourbon vanilla, sandalwood, amber.
Main Accords: powdery, vanilla, amber, floral, musky.

Sentiment: POSITIVE
--------------------------------------------------------------------
Midnight Journey by Burberry (Score: 0.654)
Top Notes: ginger, black pepper, thyme. Middle Notes: guaiac wood, may rose, lavender. Base Notes: vanilla absolute, black amber, myrrh.
Main Accords: amber, vanilla, warm spicy, woody, balsamic.

Sentiment: POSITIVE
--------------------------------------------------------------------
Gold + by Commodity (Score: 0.652)
Top Notes: amber, vanilla, sandalwood. Middle Notes: vanilla, iso e super. Base Notes: saffron, patchouli, nutmeg.
Main Accords: woody, warm spicy, amber, vanilla, powdery.

Sentiment: POSITIVE
----------------------------------------

In [None]:
recommend_perfumes("I want something fresh and floral with soft woody notes", top_k=3)


User Query: I want something fresh and floral with soft woody notes

Autograph New York M by rks & Spencer (Score: 0.689)
Top Notes: apricot, gardenia, bergamot. Middle Notes: woody notes, rose, jasmine. Base Notes: vanilla, musk, heliotrope, sandalwood.
Main Accords: woody, powdery, vanilla, white floral, musky.

Sentiment: POSITIVE
--------------------------------------------------------------------
Agent Provocateur Lace by Agent Provocateur (Score: 0.688)
Top Notes: floral notes, kumquat, saffron, bergamot. Middle Notes: wild orchid, orange blossom, bitter orange, periwinkle. Base Notes: amberwood, musk, cedar.
Main Accords: citrus, floral, woody, white floral, powdery.

Sentiment: POSITIVE
--------------------------------------------------------------------
Truly Lace by Coty (Score: 0.687)
Top Notes: fruity notes, orange blossom, gardenia, green notes, bergamot. Middle Notes: jasmine, tuberose, lily-of-the-valley, ylang-ylang, rose, orchid. Base Notes: vanilla, amber, sandalwood

# Test (Standard Queries)


## Collect Perfume Data

In [None]:
# Collect notes, accords, occasions arrays to create random user queries
import re
from collections import Counter

note_columns = ["Top", "Middle", "Base"]
accord_columns = [f"mainaccord{i}" for i in range(1, 6)]

def extract_notes_with_threshold(df, columns, min_count=5):
    """
    Extract notes that show up at least min_count times.
    This is to reduce rare notes, random noise.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the note columns. It is only read; missing cells are
        treated as empty text on a derived Series, never written back.
    columns : iterable of str
        Columns whose cells are comma-separated note lists.
    min_count : int, default 5
        Minimum number of occurrences (summed over all columns) to keep a note.

    Returns
    -------
    list of str
        Alphabetically sorted distinct notes, lower-cased and stripped of
        every character other than a-z and whitespace. Cleaned notes of two
        characters or fewer are ignored.
    """
    note_counts = Counter()
    for col in columns:
        # astype(str) keeps .str usable even if a column holds non-text cells.
        note_lists = df[col].fillna("").astype(str).str.lower().str.split(",")
        for note_list in note_lists:
            for note in note_list:
                cleaned = re.sub(r"[^a-z\s]", "", note.strip())
                if cleaned and len(cleaned) > 2:
                    note_counts[cleaned] += 1

    return sorted(note for note, count in note_counts.items() if count >= min_count)

def extract_accords_with_threshold(df, columns, min_count=3):
    """
    Collect the distinct accords that occur at least ``min_count`` times.

    Each cell of the given columns is treated as a single accord. The value
    is lower-cased, trimmed, stripped of every character other than ``a-z``
    and whitespace, and ignored when the result is shorter than three
    characters.

    The input frame is not modified: missing values are replaced with empty
    strings on a derived Series only.

    Args:
        df: DataFrame holding the accord columns.
        columns: Names of the columns to scan.
        min_count: Minimum number of occurrences (summed over all scanned
            columns) an accord needs in order to be kept.

    Returns:
        Alphabetically sorted list of the retained accord names.
    """
    accord_counts = Counter()
    for col in columns:
        # Derived Series: no write-back into df; astype(str) guards .lower()
        # against non-string cells.
        entries = df[col].fillna("").astype(str)
        for entry in entries:
            cleaned = re.sub(r"[^a-z\s]", "", entry.lower().strip())
            if cleaned and len(cleaned) > 2:
                accord_counts[cleaned] += 1

    return sorted(accord for accord, count in accord_counts.items() if count >= min_count)

# Vocabularies the random query generator draws from, built from the
# review-classified perfume table.
notes = extract_notes_with_threshold(
    combined_df_classify_reviews,
    note_columns,
)
accords = extract_accords_with_threshold(
    combined_df_classify_reviews,
    accord_columns,
)

# Hand-written usage contexts appended to each generated query.
occasions = [
    # seasons
    "for spring",
    "for summer",
    "for fall",
    "for winter",
    # social settings
    "for date night",
    "for going out",
    "for everyday wear",
    "for special occasions",
    # places and activities
    "for the office",
    "for gym",
    "for cozy nights",
    "for beach days",
    # events
    "for weddings",
    "for graduation",
]

In [None]:
# Inspect the filtered note vocabulary; the trailing expression displays its size.
print(notes)
len(notes)

['absinthe', 'acai berry', 'accia', 'accord eudora', 'african geranium', 'african ginger', 'african orange flower', 'african violet', 'agarwood', 'agarwood oud', 'agave', 'akigalawood', 'aldehydes', 'algae', 'allspice', 'almond', 'almond blossom', 'almond milk', 'almond tree', 'aloe vera', 'amalfi lemon', 'amaretto', 'amaryllis', 'amber', 'amber xtreme', 'ambergris', 'ambertonic', 'amberwood', 'ambrarome', 'ambretone', 'ambrette', 'ambrette musk mallow', 'ambrettolide', 'ambrocenide', 'ambrofix', 'ambroxan', 'amyl salicylate', 'amyris', 'angelica', 'animal notes', 'anise', 'apple', 'apple blossom', 'apple tree', 'apple tree blossom', 'apricot', 'apricot blossom', 'aromatic notes', 'aromatic spices', 'artemisia', 'arum lily', 'asafoetida', 'ash', 'asphalt', 'atlas cedar', 'australian sandalwood', 'baie rose', 'balsam fir', 'balsamic notes', 'bamboo', 'bamboo leaf', 'banana', 'banana leaf', 'barberry', 'barley', 'basil', 'bay leaf', 'beeswax', 'bellflower', 'bellini', 'benzoin', 'bergamo

911

In [None]:
# Inspect the filtered accord vocabulary; the trailing expression displays its size.
print(accords)
len(accords)

['alcohol', 'aldehydic', 'almond', 'amber', 'animalic', 'anis', 'aquatic', 'aromatic', 'asphault', 'balsamic', 'beeswax', 'bitter', 'cacao', 'camphor', 'cannabis', 'caramel', 'champagne', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clay', 'cocacola', 'coconut', 'coffee', 'conifer', 'earthy', 'floral', 'fresh', 'fresh spicy', 'fruity', 'green', 'herbal', 'honey', 'iris', 'lactonic', 'lavender', 'leather', 'marine', 'metallic', 'mineral', 'mossy', 'musky', 'nutty', 'oriental', 'oud', 'ozonic', 'patchouli', 'powdery', 'rose', 'rum', 'salty', 'sand', 'savory', 'smoky', 'soapy', 'soft spicy', 'sour', 'spicy', 'sweet', 'tobacco', 'tropical', 'tuberose', 'vanilla', 'vinyl', 'violet', 'vodka', 'warm spicy', 'whiskey', 'white floral', 'wine', 'woody', 'yellow floral']


73

## Create Random User Queries

In [None]:
import random

def generate_random_query(notes, accords, occasions, rng=None):
    """
    Build one synthetic user query from randomly chosen vocabulary entries.

    Args:
        notes: Sequence of note names; two different positions are sampled.
        accords: Sequence of accord names; one is chosen.
        occasions: Sequence of occasion phrases; one is chosen.
        rng: Optional ``random.Random`` instance (or any object exposing
            ``sample`` and ``choice``). Pass a seeded generator to make the
            query reproducible. When omitted, the module-level ``random``
            functions are used.

    Returns:
        A string of the form
        ``"I want something {accord} with {note1} and {note2} {occasion}."``.

    Raises:
        ValueError: If ``notes`` holds fewer than two entries.
        IndexError: If ``accords`` or ``occasions`` is empty (raised by
            ``choice``).
    """
    if rng is None:
        rng = random

    if len(notes) < 2:
        raise ValueError("generate_random_query needs at least two notes to sample from")

    # Draw order (notes, accord, occasion) is kept fixed so that a given seed
    # always yields the same query.
    note1, note2 = rng.sample(notes, 2)
    accord = rng.choice(accords)
    occasion = rng.choice(occasions)
    return f"I want something {accord} with {note1} and {note2} {occasion}."


In [None]:
# Load the cached standard queries, or generate and cache a fresh set.
num_queries = 50
query_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/standard_user_queries.csv"

# Check if file exists
if os.path.exists(query_path):
    print(f"Loading existing user_queries...")
    query_df = pd.read_csv(query_path)

else:
    print("No user_queries, needs to generating new queries...")

    # Generate new queries. Dict keys remove duplicates while keeping the
    # generation order (a set would reorder them), and drawing continues until
    # num_queries distinct queries exist. The attempt cap prevents an endless
    # loop if the vocabularies are too small to yield that many.
    unique_queries = {}
    for _ in range(num_queries * 20):
        if len(unique_queries) >= num_queries:
            break
        unique_queries[generate_random_query(notes, accords, occasions)] = None
    generated_queries = list(unique_queries)

    query_df = pd.DataFrame({"query": generated_queries})
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(query_path), exist_ok=True)
    query_df.to_csv(query_path, index=False)

    # Report the number actually written, which may be below num_queries.
    print(f"Saved {len(generated_queries)} user queries to {query_path}")

query_df.head()


Loading existing user_queries...


Unnamed: 0,query
0,I want something sand with cherry and hiacynth...
1,I want something tobacco with coriander and pl...
2,I want something anis with tobacco blossom and...
3,I want something cannabis with magnolia petals...
4,I want something iris with juniper and hazelnu...


## Build Feedback File

In [None]:
# CSV that accumulates the Yes/No feedback for the standard queries.
feedback_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/without-tags/standard_user_feedback_without_tags.csv"

if not os.path.exists(feedback_path):
    # open() does not create parent directories, so make sure the
    # "without-tags" folder exists before writing the header row.
    os.makedirs(os.path.dirname(feedback_path), exist_ok=True)
    with open(feedback_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["query", "perfume_name", "brand", "score", "label", "notes_and_accords"])
    print("Created new feedback file.")
else:
    print("Feedback file already exists — ready to append new rows.")


def recommend_perfumes(user_query, top_k=5, max_feedback=3, feedback_file=None):
    """
    Recommend perfumes based on the raw query (no tags) and log feedback.

    The query is encoded with ``s_model`` and compared by cosine similarity
    against ``perfume_embeddings``. Each similarity is multiplied by 1.2 when
    the perfume's ``is_positive`` value equals 1 and by 0.8 otherwise. The
    best ``top_k`` adjusted scores are printed one at a time and the user is
    asked for a Yes/No/skip answer. Yes/No answers are appended to the
    feedback CSV once the loop has finished.

    Args:
        user_query: Free-text query to embed.
        top_k: Number of highest-scoring perfumes to present. Capped at the
            number of scored perfumes.
        max_feedback: Stop asking once this many Yes/No answers were given
            (skipped perfumes do not count).
        feedback_file: CSV path to append to. Defaults to the notebook-level
            ``feedback_path``.

    Returns:
        None. Results are printed and written to the feedback CSV.
    """
    if feedback_file is None:
        feedback_file = feedback_path

    query_embedding = s_model.encode(user_query, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]

    # NOTE(review): assumes row i of perfume_embeddings describes the i-th row
    # (by position) of combined_df_classify_reviews — confirm where the
    # embeddings are built. Positional access (.iloc) is used throughout so a
    # non-default DataFrame index cannot silently select the wrong perfume.
    positive_flags = combined_df_classify_reviews['is_positive'].iloc[:len(similarities)]
    sentiment_boost = torch.tensor(
        [1.2 if flag == 1 else 0.8 for flag in positive_flags],
        dtype=torch.float32,
    )
    # One tensor multiply replaces the per-row .item()/.loc loop. The result is
    # a float32 CPU tensor. Note that a negative similarity is pushed further
    # down by the 1.2 factor.
    adjusted_scores = similarities.detach().cpu().to(torch.float32) * sentiment_boost

    # torch.topk raises when k exceeds the number of candidates.
    k = min(top_k, adjusted_scores.numel())
    top_results = torch.topk(adjusted_scores, k=k)

    # Present results until max_feedback Yes/No answers have been collected.
    feedback_count = 0
    new_feedback = []
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        if feedback_count >= max_feedback:
            break

        perfume = combined_df_classify_reviews.iloc[idx]
        perfume_name = perfume['Perfume']
        brand = perfume['Brand']
        accords_text = ', '.join([str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6)])

        print(f"\n{perfume_name} by {brand} (Score: {score:.3f})")
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}.\n"
            f"Main Accords: {accords_text}."
        )
        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print("-" * 70)

        # Keep asking until the answer is Yes, No or skip (case-insensitive).
        while True:
            label = input(f"Do you like '{perfume_name}' by {brand}? (Yes/No or type 'skip'): ").strip().capitalize()
            if label in ["Yes", "No"]:
                break
            elif label == "Skip":
                print(f"Skipping '{perfume_name}' for query: {user_query}")
                label = None
                break
            else:
                print("Please type 'Yes', 'No', or 'skip'. Try again.")

        if label is None:
            # Skipped perfumes are neither logged nor counted.
            continue

        notes_and_accords = (
            f"Top: {perfume['Top']} | "
            f"Middle: {perfume['Middle']} | "
            f"Base: {perfume['Base']} | "
            f"Accords: {accords_text}"
        )
        new_feedback.append([
            user_query,
            perfume_name,
            brand,
            score,
            label,
            notes_and_accords
        ])
        feedback_count += 1

    # Append new feedback to CSV
    if new_feedback:
        with open(feedback_file, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(new_feedback)
        print(f"\nLogged {len(new_feedback)} new feedback entries.")
    else:
        print("\nNo new feedback was logged.")


Feedback file already exists — ready to append new rows.


In [None]:
# Loop through queries starting from a requested index.
# The index is 1-based; an empty answer starts from the first query.
total_queries = len(query_df)
while True:
    start_index_input = input("Enter the query index: ").strip()
    if start_index_input == "":
        start_index = 0
        break
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscripts, which would make int() raise.
    if not start_index_input.isdecimal():
        print("Please enter valid number.")
        continue
    requested = int(start_index_input)
    # Reject 0 (it would become -1 and select only the last query) and values
    # beyond the last query (they would select nothing).
    if 1 <= requested <= total_queries:
        start_index = requested - 1
        break
    print(f"Please enter a number between 1 and {total_queries}.")

# Number the queries by position so the label matches the index entered
# above, whatever the DataFrame's index labels are.
remaining = query_df.iloc[start_index:]
for position, (_, row) in enumerate(remaining.iterrows(), start=start_index + 1):
    user_query = row['query']
    print(f"\n==============================")
    print(f"Query {position}/{total_queries}: {user_query}")
    print(f"==============================")

    recommend_perfumes(user_query, top_k=3, max_feedback=3)

    cont = input("Press Enter to continue, type 'skip' to skip this query, or 'stop' to quit: ").strip().lower()

    if cont == 'stop':
        print("Stopping early.")
        break
    elif cont == 'skip':
        # The query has already been shown by this point, so 'skip' and
        # Enter both move on to the next query.
        print(f"Skipping query {position}: {user_query}")

KeyboardInterrupt: Interrupted by user

## Compute Relevance Scores

In [None]:
# Summarise the standard-query feedback: for every query, the share of rated
# perfumes that were labelled "yes", then the mean of those shares.
feedback_df = pd.read_csv(feedback_path)

feedback_df["label"] = feedback_df["label"].str.lower().str.strip()

# Group feedback by query
grouped = feedback_df.groupby("query")

# Groups produced by groupby always hold at least one row, so the per-query
# share is simply the mean of the boolean "is yes" column.
# (The names say "recall", but the value is the yes-share of the items shown.)
recall_scores = [float((group["label"] == "yes").mean()) for _, group in grouped]

# Average all ratio of yes. Guard against a feedback file that so far only
# contains the header row, which would otherwise divide by zero.
if recall_scores:
    avg_recall = sum(recall_scores) / len(recall_scores)
    print(f"Average fraction of relevant items in top-k: {avg_recall:.2f} over {len(recall_scores)} queries and {len(feedback_df)} feedback")
else:
    avg_recall = float("nan")
    print("No feedback has been logged yet; nothing to average.")


Average fraction of relevant items in top-k: 0.89 over 34 queries and 102 feedback


# Test (Non-standard Queries)

## Create Non-standard User Queries


In [None]:
import json
import pandas as pd

# Load the cached non-standard queries, or take the first num_queries
# "instruction" values from the JSONL file and cache them.
num_queries = 50
query_path_non_standard = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/non_standard_user_queries.csv"


# Check if file exists
if os.path.exists(query_path_non_standard):
    print(f"Loading existing user_queries...")
    query_df_non_standard = pd.read_csv(query_path_non_standard)

else:
    print("No user_queries, needs to generating new queries...")

    jsonl_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/training_data.jsonl"

    instructions = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Stop at the configured size rather than a hard-coded 50.
            if len(instructions) >= num_queries:
                break
            # A blank line (e.g. a trailing newline) is not valid JSON.
            if not line.strip():
                continue
            data = json.loads(line)
            if "instruction" in data:
                instructions.append(data["instruction"])

    query_df_non_standard = pd.DataFrame({"query": instructions})
    query_df_non_standard.to_csv(query_path_non_standard, index=False) # save to CSV

query_df_non_standard.head()

Loading existing user_queries...


Unnamed: 0,query
0,What perfumes capture the essence of a natural...
1,scent of a confident adventure
2,perfume for a womanly mansion
3,I'm going to a homey alchemy lab. What scents ...
4,I'm going to a hiking sea. What scents would y...


## Build Feedback File

In [None]:
# CSV that accumulates the Yes/No feedback for the non-standard queries.
feedback_path_non_standard = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/without-tags/non_standard_user_feedback_without_tags.csv"

if not os.path.exists(feedback_path_non_standard):
    # open() does not create parent directories, so make sure the
    # "without-tags" folder exists before writing the header row.
    os.makedirs(os.path.dirname(feedback_path_non_standard), exist_ok=True)
    with open(feedback_path_non_standard, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["query", "perfume_name", "brand", "score", "label", "notes_and_accords"])
    print("Created new feedback file.")
else:
    print("Feedback file already exists — ready to append new rows.")


def recommend_perfumes(user_query, top_k=5, max_feedback=3, feedback_file=None):
    """
    Recommend perfumes based on the raw query (no tags) and log feedback.

    This definition replaces the earlier ``recommend_perfumes`` in the
    notebook and is meant for the non-standard queries: by default it appends
    to ``feedback_path_non_standard``.

    The query is encoded with ``s_model`` and compared by cosine similarity
    against ``perfume_embeddings``. Each similarity is multiplied by 1.2 when
    the perfume's ``is_positive`` value equals 1 and by 0.8 otherwise. The
    best ``top_k`` adjusted scores are printed one at a time and the user is
    asked for a Yes/No/skip answer. Yes/No answers are appended to the
    feedback CSV once the loop has finished.

    Args:
        user_query: Free-text query to embed.
        top_k: Number of highest-scoring perfumes to present. Capped at the
            number of scored perfumes.
        max_feedback: Stop asking once this many Yes/No answers were given
            (skipped perfumes do not count).
        feedback_file: CSV path to append to. Defaults to the notebook-level
            ``feedback_path_non_standard``.

    Returns:
        None. Results are printed and written to the feedback CSV.
    """
    if feedback_file is None:
        # The non-standard file, not feedback_path: otherwise these answers
        # land in the standard-query results and this section's file stays empty.
        feedback_file = feedback_path_non_standard

    query_embedding = s_model.encode(user_query, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]

    # NOTE(review): assumes row i of perfume_embeddings describes the i-th row
    # (by position) of combined_df_classify_reviews — confirm where the
    # embeddings are built. Positional access (.iloc) is used throughout so a
    # non-default DataFrame index cannot silently select the wrong perfume.
    positive_flags = combined_df_classify_reviews['is_positive'].iloc[:len(similarities)]
    sentiment_boost = torch.tensor(
        [1.2 if flag == 1 else 0.8 for flag in positive_flags],
        dtype=torch.float32,
    )
    # One tensor multiply replaces the per-row .item()/.loc loop. The result is
    # a float32 CPU tensor. Note that a negative similarity is pushed further
    # down by the 1.2 factor.
    adjusted_scores = similarities.detach().cpu().to(torch.float32) * sentiment_boost

    # torch.topk raises when k exceeds the number of candidates.
    k = min(top_k, adjusted_scores.numel())
    top_results = torch.topk(adjusted_scores, k=k)

    # Present results until max_feedback Yes/No answers have been collected.
    feedback_count = 0
    new_feedback = []
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        if feedback_count >= max_feedback:
            break

        perfume = combined_df_classify_reviews.iloc[idx]
        perfume_name = perfume['Perfume']
        brand = perfume['Brand']
        accords_text = ', '.join([str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6)])

        print(f"\n{perfume_name} by {brand} (Score: {score:.3f})")
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}.\n"
            f"Main Accords: {accords_text}."
        )
        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print("-" * 70)

        # Keep asking until the answer is Yes, No or skip (case-insensitive).
        while True:
            label = input(f"Do you like '{perfume_name}' by {brand}? (Yes/No or type 'skip'): ").strip().capitalize()
            if label in ["Yes", "No"]:
                break
            elif label == "Skip":
                print(f"Skipping '{perfume_name}' for query: {user_query}")
                label = None
                break
            else:
                print("Please type 'Yes', 'No', or 'skip'. Try again.")

        if label is None:
            # Skipped perfumes are neither logged nor counted.
            continue

        notes_and_accords = (
            f"Top: {perfume['Top']} | "
            f"Middle: {perfume['Middle']} | "
            f"Base: {perfume['Base']} | "
            f"Accords: {accords_text}"
        )
        new_feedback.append([
            user_query,
            perfume_name,
            brand,
            score,
            label,
            notes_and_accords
        ])
        feedback_count += 1

    # Append new feedback to CSV
    if new_feedback:
        with open(feedback_file, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(new_feedback)
        print(f"\nLogged {len(new_feedback)} new feedback entries.")
    else:
        print("\nNo new feedback was logged.")


Created new feedback file.


## Compute Relevance Scores

In [None]:
# Summarise the non-standard-query feedback: for every query, the share of
# rated perfumes that were labelled "yes", then the mean of those shares.
feedback_df = pd.read_csv(feedback_path_non_standard)

feedback_df["label"] = feedback_df["label"].str.lower().str.strip()

# Group feedback by query
grouped = feedback_df.groupby("query")

# Groups produced by groupby always hold at least one row, so the per-query
# share is simply the mean of the boolean "is yes" column.
# (The names say "recall", but the value is the yes-share of the items shown.)
recall_scores = [float((group["label"] == "yes").mean()) for _, group in grouped]

# Average all ratio of yes. A freshly created feedback file contains only the
# header row, which would otherwise divide by zero here.
if recall_scores:
    avg_recall = sum(recall_scores) / len(recall_scores)
    print(f"Average fraction of relevant items in top-k: {avg_recall:.2f} over {len(recall_scores)} queries and {len(feedback_df)} feedback")
else:
    avg_recall = float("nan")
    print("No feedback has been logged yet; nothing to average.")