This file generates data with notes, accords and reviews classified into "positive" and "negative". It also includes a section that recommends perfume with a Sentence-BERT model, using the aforementioned data.  **This version generates tags.**

In [1]:
# Install libraries
!pip install -q --upgrade fsspec==2025.3.2
!pip install -q transformers datasets accelerate peft trl bitsandbytes

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 req

# Check Data Existence

In [2]:
import os
import pandas as pd
import csv

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load data with "classify review" columns

combined_df_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/combined_df_classify_reviews.csv"

if os.path.exists(combined_df_path):
    print("Loading existing combined_df...\nSkip to Recommeded Perfumes")
    combined_df_classify_reviews = pd.read_csv(combined_df_path)
else:
    print("No combined_df, needs to processing raw data")

Loading existing combined_df...
Skip to Recommeded Perfumes


# Load Tag Generation LLM

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/scent-model-final"
quantized_model_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/scent-model-final-4bit"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Check if quantized model already exists
if os.path.exists(quantized_model_path):
    print("Quantized model found — loading directly...")
    model = AutoModelForCausalLM.from_pretrained(
        quantized_model_path,
        device_map="auto",
        quantization_config=bnb_config
    )
    tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)

else:
    print("Quantized model not found — loading base model and quantizing...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=bnb_config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Save the quantized version
    model.save_pretrained(quantized_model_path)
    tokenizer.save_pretrained(quantized_model_path)

Quantized model found — loading directly...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch

model.eval()

def generate_tags(prompt, max_new_tokens=150):
    input_text = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded.split("### Response:")[-1].strip()

In [None]:
generate_tags("What's a good scent for the last day of autumn?")

'old paper, musty / dry\npetrichor, earthy / fresh\nrosewater, sweet / romantic\nhoney, sweet / floral\ncandle wax, warm / fatty\ncedar, woody / grounding\njasmine, floral / ethereal\npine smoke, woody / smoky\nwet bark, woody / humid\nmint, cool / crisp\nburnt sugar, bitter / sweet\nambergris, musky / marine\ncoal smoke, sooty / dense\nsea breeze, salty / seaward\nspiced vanilla, sweet / warming\nozone, sharp / clean\nengine oil, metallic /'

# Recommend Perfumes with Classified Reviews


In [None]:
combined_df_classify_reviews.shape

(21950, 17)

In [None]:
combined_df_classify_reviews.head()

Unnamed: 0.1,Unnamed: 0,url,Perfume,Brand,Rating Count,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,reviews,sentiment,is_positive,full_description
0,0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,201,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,"[""I thought everybody was hating on this fragr...",NEGATIVE,0,"Top Notes: fruity notes, aldehydes, green note..."
1,2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,285,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,"['Ummm... just like a honest review, ignoring ...",NEGATIVE,0,"Top Notes: blood orange, yuzu. Middle Notes: n..."
2,4,https://www.fragrantica.com/perfume/jean-paul-...,Le MÃ¢le Pride Collector,Jean Paul Gaultier,632,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"['Lol to the reviewer below meâ¦no one cares,...",NEGATIVE,0,"Top Notes: mint, lavender, cardamom, artemisia..."
3,5,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride 2023,Jean Paul Gaultier,531,"yuzu, blood orange","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,green,['A bold fresh fragrance. Not recommended for ...,POSITIVE,1,"Top Notes: yuzu, blood orange. Middle Notes: n..."
4,7,https://www.fragrantica.com/perfume/ralph-laur...,Polo Red Pride Edition,Ralph Lauren,107,"cranberry, pink grapefruit, citron","clary sage, lavender, saffron","amberwood, brazilian redwood, coffee",woody,amber,citrus,warm spicy,aromatic,"[""Tad bit sweeter than the other flankers, but...",POSITIVE,1,"Top Notes: cranberry, pink grapefruit, citron...."


In [None]:
combined_df_classify_reviews.iloc[0]['full_description']

'Top Notes: fruity notes, aldehydes, green notes. Middle Notes: bulgarian rose, egyptian jasmine, lily-of-the-valley. Base Notes: eucalyptus, pine. Main Accords: rose, woody, fruity, aromatic, floral.'

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 4.1.0
    Uninstalling sentence-transformers-4.1.0:
      Successfully uninstalled sentence-transformers-4.1.0
Successfully installed sentence-transformers-5.0.0


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from tqdm import tqdm

In [None]:
s_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# save the perfume_embeddings for later use.
# it takes some time to build the perfume_embeddings every time

import os
import torch

save_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/perfume_embeddings.pt"

if not os.path.exists(save_path):
    descriptions = combined_df_classify_reviews["full_description"].tolist()
    weights = combined_df_classify_reviews["is_positive"].apply(lambda x: 1.0 if x == 1 else 0.3).tolist()
    embeddings = s_model.encode(
        descriptions,
        batch_size=32,
        convert_to_tensor=True,
        show_progress_bar=True
    )

    # Apply weights
    weights_tensor = torch.tensor(weights, device=embeddings.device).unsqueeze(1)
    weighted_embeddings = embeddings * weights_tensor

    torch.save(weighted_embeddings, save_path)
    print("Perfume embeddings with weighted reviews saved.")
else:
    perfume_embeddings = torch.load(save_path)
    print("Perfumes embeddings already exist — loading file.")

Batches:   0%|          | 0/686 [00:00<?, ?it/s]

Perfume embeddings with weighted reviews saved.


In [None]:
from sentence_transformers import util
import torch

def recommend_perfumes_with_tags(user_query, top_k=5):
    tags_text = generate_tags(user_query)
    print(f"\nUser Query: {user_query}\n")
    print(f"\nGenerated Tags:\n{tags_text}\n")

    query_embedding = s_model.encode(tags_text, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]

    adjusted_scores = [] # boost scores of positive perfumes
    for idx, score in enumerate(similarities):
        sentiment_boost = 1.2 if combined_df_classify_reviews.loc[idx, 'is_positive'] == 1 else 0.8
        adjusted_scores.append(score.item() * sentiment_boost)

    adjusted_scores = torch.tensor(adjusted_scores)
    top_results = torch.topk(similarities, k=top_k)

    for score, idx in zip(top_results.values, top_results.indices):
        idx = idx.item() # convert Python tensor to int
        perfume = combined_df_classify_reviews.loc[idx]

        print(f"{perfume['Perfume']} by {perfume['Brand']} (Score: {score.item():.3f})")
        short_desc = (
          f"Top Notes: {perfume['Top']}. "
          f"Middle Notes: {perfume['Middle']}. "
          f"Base Notes: {perfume['Base']}.\n"
          f"Main Accords: {', '.join([str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6)])}."
          )

        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print(f"--------------------------------------------------------------------")


In [None]:
recommend_perfumes_with_tags("I want something musky and warm with amber and vanilla", top_k=3)


User Query: I want something musky and warm with amber and vanilla


Generated Tags:
cedar, woody / grounding
ambergris, musky / marine
lavender, floral / calming
ozone, sharp / clean
leather, rich / worn
old paper, musty / dry
petrichor, earthy / fresh
pine smoke, woody / smoky
sulfur, sharp / acrid
rosewater, sweet / romantic
sea breeze, salty / marine
coal smoke, sooty / dense
candle wax, warm / fatty
jasmine, floral / ethereal
spiced sugar, sweet / spicy
wet bark, woody / humid
burnt sugar, bitter / sweet

La Fumee by Miller Harris (Score: 0.786)
Top Notes: incense, elemi, lavender. Middle Notes: cardamom, coriander, cumin, chamomile, geranium. Base Notes: french labdanum, sandalwood, birch, moroccan cedar, amber, agarwood (oud).
Main Accords: amber, woody, aromatic, smoky, balsamic.

Sentiment: POSITIVE
--------------------------------------------------------------------
Memoir Man by Amouage (Score: 0.785)
Top Notes: wormwood, mint, basil. Middle Notes: incense, lavender, rose. 

In [None]:
recommend_perfumes_with_tags("I want something fresh and floral with soft woody notes", top_k=3)


User Query: I want something fresh and floral with soft woody notes


Generated Tags:
cedar, woody / grounding
lavender, floral / calming
old paper, musty / dry
candle wax, warm / fatty
burnt sugar, bitter / sweet
pine smoke, woody / smoky
engine oil, metallic / mechanical
petrichor, earthy / fresh
ozone, sharp / clean
ambergris, musky / marine
mint, cool / crisp
coal smoke, sooty / dense
jasmine, floral / ethereal
wet bark, woody / humid
rosewater, sweet / romantic
spiced vanilla, sweet / warming
sea breeze, salty /

Memoir Man by Amouage (Score: 0.787)
Top Notes: wormwood, mint, basil. Middle Notes: incense, lavender, rose. Base Notes: tobacco, leather, sandalwood, guaiac wood, vetiver, oak moss, amber, vanille, musk.
Main Accords: aromatic, fresh spicy, woody, amber, green.

Sentiment: NEGATIVE
--------------------------------------------------------------------
La Fumee by Miller Harris (Score: 0.783)
Top Notes: incense, elemi, lavender. Middle Notes: cardamom, coriander, cumin, c

# Test


## Collect Perfume Data

In [None]:
# Collect notes, accords, occasions arrays to create random user queries
import re
from collections import Counter

note_columns = ["Top", "Middle", "Base"]
accord_columns = [f"mainaccord{i}" for i in range(1, 6)]

def extract_notes_with_threshold(df, columns, min_count=5):
    """
    Extract notes that show up at least min_count times.
    This is to reduce rare nodes, random noise.
    """
    all_notes = []
    for col in columns:
        df[col] = df[col].fillna("")
        split_notes = df[col].str.lower().str.split(",")
        for note_list in split_notes:
            for note in note_list:
                cleaned = re.sub(r"[^a-z\s]", "", note.strip())
                if cleaned and len(cleaned) > 2:
                    all_notes.append(cleaned)

    note_counts = Counter(all_notes)
    filtered = [note for note, count in note_counts.items() if count >= min_count]
    return sorted(filtered)

def extract_accords_with_threshold(df, columns, min_count=3):
    all_accords = []
    for col in columns:
        df[col] = df[col].fillna("")
        for entry in df[col]:
            cleaned = re.sub(r"[^a-z\s]", "", entry.lower().strip())
            if cleaned and len(cleaned) > 2:
                all_accords.append(cleaned)

    accord_counts = Counter(all_accords)
    filtered = [accord for accord, count in accord_counts.items() if count >= min_count]
    return sorted(filtered)

notes = extract_notes_with_threshold(combined_df_classify_reviews, note_columns)
accords = extract_accords_with_threshold(combined_df_classify_reviews, accord_columns)

occasions = [
    "for spring", "for summer", "for fall", "for winter",
    "for date night", "for going out", "for everyday wear", "for special occasions",
    "for the office", "for gym", "for cozy nights", "for beach days",
    "for weddings", "for graduation"
]

In [None]:
print(notes)
len(notes)

['absinthe', 'acai berry', 'accia', 'accord eudora', 'african geranium', 'african ginger', 'african orange flower', 'african violet', 'agarwood', 'agarwood oud', 'agave', 'akigalawood', 'aldehydes', 'algae', 'allspice', 'almond', 'almond blossom', 'almond milk', 'almond tree', 'aloe vera', 'amalfi lemon', 'amaretto', 'amaryllis', 'amber', 'amber xtreme', 'ambergris', 'ambertonic', 'amberwood', 'ambrarome', 'ambretone', 'ambrette', 'ambrette musk mallow', 'ambrettolide', 'ambrocenide', 'ambrofix', 'ambroxan', 'amyl salicylate', 'amyris', 'angelica', 'animal notes', 'anise', 'apple', 'apple blossom', 'apple tree', 'apple tree blossom', 'apricot', 'apricot blossom', 'aromatic notes', 'aromatic spices', 'artemisia', 'arum lily', 'asafoetida', 'ash', 'asphalt', 'atlas cedar', 'australian sandalwood', 'baie rose', 'balsam fir', 'balsamic notes', 'bamboo', 'bamboo leaf', 'banana', 'banana leaf', 'barberry', 'barley', 'basil', 'bay leaf', 'beeswax', 'bellflower', 'bellini', 'benzoin', 'bergamo

911

In [None]:
print(accords)
len(accords)

['alcohol', 'aldehydic', 'almond', 'amber', 'animalic', 'anis', 'aquatic', 'aromatic', 'asphault', 'balsamic', 'beeswax', 'bitter', 'cacao', 'camphor', 'cannabis', 'caramel', 'champagne', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clay', 'cocacola', 'coconut', 'coffee', 'conifer', 'earthy', 'floral', 'fresh', 'fresh spicy', 'fruity', 'green', 'herbal', 'honey', 'iris', 'lactonic', 'lavender', 'leather', 'marine', 'metallic', 'mineral', 'mossy', 'musky', 'nutty', 'oriental', 'oud', 'ozonic', 'patchouli', 'powdery', 'rose', 'rum', 'salty', 'sand', 'savory', 'smoky', 'soapy', 'soft spicy', 'sour', 'spicy', 'sweet', 'tobacco', 'tropical', 'tuberose', 'vanilla', 'vinyl', 'violet', 'vodka', 'warm spicy', 'whiskey', 'white floral', 'wine', 'woody', 'yellow floral']


73

## Create Random User Queries

In [None]:
import random

def generate_random_query(notes, accords, occasions):
    note1, note2 = random.sample(notes, 2)
    accord = random.choice(accords)
    occasion = random.choice(occasions)
    return f"I want something {accord} with {note1} and {note2} {occasion}."


In [None]:
num_queries = 50
query_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/user_queries.csv"

# Check if file exists
if os.path.exists(query_path):
    print(f"Loading existing user_queries...")
    query_df = pd.read_csv(query_path)

else:
    print("No user_queries, needs to generating new queries...")

    # Generate new queries
    generated_queries = [generate_random_query(notes, accords, occasions) for _ in range(num_queries)]
    generated_queries = list(set(generated_queries))  # remove duplicates

    query_df = pd.DataFrame({"query": generated_queries})
    query_df.to_csv(query_path, index=False)

    print(f"Saved {num_queries} user queries to {query_path}")

query_df.head()


Loading existing user_queries...


Unnamed: 0,query
0,I want something sand with cherry and hiacynth...
1,I want something tobacco with coriander and pl...
2,I want something anis with tobacco blossom and...
3,I want something cannabis with magnolia petals...
4,I want something iris with juniper and hazelnu...


## Build Feedback File

In [None]:
feedback_path = "/content/drive/MyDrive/Colab Notebooks/totallymakescents/data/testing/user_feedback_with_tags.csv"

if not os.path.exists(feedback_path):
    with open(feedback_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["query", "perfume_name", "brand", "score", "label", "notes_and_accords"])
    print("Created new feedback file.")
else:
    print("Feedback file already exists — ready to append new rows.")

def recommend_perfumes_with_tags_and_feedback(user_query, top_k=5, max_feedback=3):
    """
    Recommend perfumes based on tags.
    Ask user Yes/No feedback
    """

    tags_text = generate_tags(user_query)
    # print(f"\nUser Query: {user_query}")
    print(f"Generated Tags:\n{tags_text}\n")

    query_embedding = s_model.encode(tags_text, convert_to_tensor=True)
    scent_tensor = perfume_embeddings.to(query_embedding.device)

    similarities = util.cos_sim(query_embedding, scent_tensor)[0]

    adjusted_scores = []
    for idx, score in enumerate(similarities):
        sentiment_boost = 1.2 if combined_df_classify_reviews.loc[idx, 'is_positive'] == 1 else 0.8
        adjusted_scores.append(score.item() * sentiment_boost)

    adjusted_scores = torch.tensor(adjusted_scores)
    top_results = torch.topk(adjusted_scores, k=top_k)

    # Only display a number of user queries until reaching max_feedback
    feedback_count = 0
    new_feedback = []
    for score, idx in zip(top_results.values, top_results.indices):
        if feedback_count >= max_feedback:
            break

        idx = idx.item()
        perfume = combined_df_classify_reviews.loc[idx]
        perfume_name = perfume['Perfume']
        brand = perfume['Brand']

        print(f"\n{perfume_name} by {brand} (Adjusted Score: {score:.3f})")
        short_desc = (
            f"Top Notes: {perfume['Top']}. "
            f"Middle Notes: {perfume['Middle']}. "
            f"Base Notes: {perfume['Base']}.\n"
            f"Main Accords: {', '.join([str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6)])}."
        )
        print(f"{short_desc}\n")
        print(f"Sentiment: {perfume['sentiment']}")
        print("-" * 70)

        while True:
          label = input(f"Do you like '{perfume_name}' by {brand}? (Yes/No): ").strip().capitalize()
          if label in ["Yes", "No"]:
              break

          elif label == "Skip":
              print(f"Skipping '{perfume_name}' for query: {user_query}")
              label = None
              break
          else:
              print("Please type 'Yes' or 'No'. Try again.")

        notes_and_accords = (
            f"Top: {perfume['Top']} | "
            f"Middle: {perfume['Middle']} | "
            f"Base: {perfume['Base']} | "
            f"Accords: {', '.join([str(perfume.get(f'mainaccord{i}', '')) for i in range(1, 6)])}"
        )

        if label is not None:
            new_feedback.append([
                user_query,
                perfume_name,
                brand,
                score.item(),
                label,
                notes_and_accords
            ])
            feedback_count += 1

    # Append new feedback to CSV
    if new_feedback:
        with open(feedback_path, "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(new_feedback)
        print(f"\nLogged {len(new_feedback)} new feedback entries.")
    else:
        print("\nℹNo new feedback was logged.")


Feedback file already exists — ready to append new rows.


In [None]:
# Loop through queries starting from a requested index
while True:
    start_index_input = input("Enter the query index: ").strip()
    if start_index_input == "" or start_index_input.isdigit():
        break
    else:
        print("Please enter valid number.")

start_index = int(start_index_input) - 1 if start_index_input else 0

for i, row in query_df.iloc[start_index:].iterrows():
    user_query = row['query']
    print(f"\n==============================")
    print(f"Query {i+1}/{len(query_df)}: {user_query}")
    print(f"==============================")

    recommend_perfumes(user_query, top_k=3, max_feedback=3)

    cont = input("Press Enter to continue, type 'skip' to skip this query, or 'stop' to quit: ").strip().lower()

    if cont == 'stop':
        print("Stopping early.")
        break
    elif cont == 'skip':
        print(f"Skipping query {i+1}: {user_query}")
        continue


Query 1/50: I want something sand with cherry and hiacynth for beach days.

User Query: I want something sand with cherry and hiacynth for beach days.
Generated Tags:
sulfur, sharp / acrid
coal smoke, sooty / dense
ambergris, musky / marine
old paper, musty / dry
leather, rich / worn
petrichor, earthy / fresh
engine oil, metallic / mechanical
burnt sugar, bitter / sweet
ozone, sharp / clean
cedar, woody / grounding
spiced vanilla, sweet / warming
candle wax, warm / fatty
honey, sweet / floral
wet bark, woody / humid
pine smoke, woody / smoky
sea breeze, salty / marine
jasmine, floral / ethere

Lux Visionaria by Filippo Sorcinelli (Adjusted Score: 0.870)
Top Notes: myrrh, incense, vanilla, patchouli, saffron, amber. Middle Notes: precious woods, ambrette (musk mallow), jasmine, lily-of-the-valley. Base Notes: smoke, artemisia, ylang-ylang.
Main Accords: amber, smoky, woody, warm spicy, balsamic.

Sentiment: POSITIVE
----------------------------------------------------------------------

## Compute Recall Scores

In [None]:
feedback_df = pd.read_csv(feedback_path)

feedback_df["label"] = feedback_df["label"].str.lower().str.strip()

# Group feedback by query
grouped = feedback_df.groupby("query")

recall_scores = []
for query, group in grouped:
    total = len(group)
    relevant = sum(group["label"] == "yes")
    recall = relevant / total if total > 0 else 0
    recall_scores.append(recall)

# Average all ratio of yes
avg_recall = sum(recall_scores) / len(recall_scores)
print(f"Average fraction of relevant items in top-k: {avg_recall:.2f} over {len(recall_scores)} queries")


Average fraction of relevant items in top-k: 0.67 over 5 queries
