In [45]:
'''
Load the raw fragrance dataset into a DataFrame.
'''

import pandas as pd

# Raw dataset on GitHub; the long hash in the path looks like a commit pin,
# which would keep the file contents fixed between runs.
url = "https://raw.githubusercontent.com/kavya-desai-ds/fragrance-recommendation-system/14a59749980f3b3a3cadaa24c2a4adcbe7408282/fragrance_dataset_raw.csv"

# The file is semicolon-delimited and is decoded as Latin-1.
df = pd.read_csv(url, sep=";", encoding="latin1")

# Preview the first rows.
df.head()

Unnamed: 0,url,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,Perfumer1,Perfumer2,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,https://www.fragrantica.com/perfume/xerjoff/ac...,accento-overdose-pride-edition,xerjoff,Italy,unisex,142,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",unknown,,rose,woody,fruity,aromatic,floral
1,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2024,jean-paul-gaultier,France,women,186,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",unknown,,citrus,white floral,sweet,fresh,musky
2,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2023,jean-paul-gaultier,France,unisex,191,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",natalie gracia-cetto,quentin bisch,citrus,white floral,sweet,fresh spicy,musky
3,https://www.fragrantica.com/perfume/bruno-bana...,pride-edition-man,bruno-banani,Germany,men,192,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",unknown,,fruity,nutty,woody,tropical,
4,https://www.fragrantica.com/perfume/jean-paul-...,le-male-pride-collector,jean-paul-gaultier,France,men,193,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",francis kurkdjian,,aromatic,warm spicy,fresh spicy,cinnamon,vanilla


In [46]:
'''
clean data
- drop unnecessary columns
- combine main accords 1 to 5 into one column (because they are in no particular order and not all rows have all 5)
- rename columns
- create one general column for all attributes
- fix the perfume names and brands columns
'''

# Columns that play no part in the recommendations.
# errors='ignore' keeps this cell re-runnable once the columns are already gone
# (the previous in-place drop raised KeyError on a second run).
unused_cols = ['Rating Value', 'Rating Count', 'Country', 'Year', 'Perfumer1', 'Perfumer2']
df = df.drop(columns=unused_cols, errors='ignore')

main_accord_cols = ['mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']
# Merge the five accord columns into one comma-separated string per row,
# skipping missing accords. Only done while the raw columns still exist, so a
# re-run does not fail or overwrite the merged column.
if set(main_accord_cols).issubset(df.columns):
    df['Main Accords'] = df[main_accord_cols].apply(
        lambda x: ', '.join(x.dropna().astype(str)), axis=1
    )
    df = df.drop(columns=main_accord_cols)

# Rename by name rather than by position: assigning a list to df.columns
# silently mislabels data if the column order of the source file ever changes.
# Columns that are already renamed (or absent) are left alone by rename().
df = df.rename(columns={
    'url': 'Fragrantica URL',
    'Perfume': 'Perfume Name',
    'Top': 'Top Notes',
    'Middle': 'Middle Notes',
    'Base': 'Base Notes',
})

# Columns the rest of the notebook relies on; fail loudly if any is missing.
new_col_names = ["Fragrantica URL", "Perfume Name", "Brand", "Gender", "Top Notes", "Middle Notes", "Base Notes", "Main Accords"]
missing_cols = [col for col in new_col_names if col not in df.columns]
if missing_cols:
    raise KeyError(f"Expected columns missing after cleaning: {missing_cols}")

import re

# Runs of letters/digits. Digits stay inside a word so that "5th" is treated as
# one word instead of "5" followed by a new word "th".
_WORD_RE = re.compile(r"[^\W_]+")


def clean_name(text):
    """Turn a hyphenated slug into a display name.

    Hyphens become spaces and every word is capitalised, e.g.
    ``"jean-paul-gaultier"`` -> ``"Jean Paul Gaultier"``.

    Unlike ``str.title()``, a letter that follows a digit is not upper-cased,
    so ``"5th-avenue"`` becomes ``"5th Avenue"`` rather than ``"5Th Avenue"``.

    Parameters
    ----------
    text : Any
        Value to clean. Missing values (``None``/NaN) are returned unchanged;
        other non-string values are converted with ``str()`` first.

    Returns
    -------
    str or the original missing value
    """
    if pd.isna(text):
        return text
    # str() guards against numeric-looking names that were parsed as numbers.
    text = str(text).replace("-", " ")
    return _WORD_RE.sub(lambda match: match.group(0).capitalize(), text)

attributes = ['Top Notes', 'Middle Notes', 'Base Notes', 'Main Accords']

# One comma-separated string per perfume containing every note and accord
# that is present; missing values are skipped.
df['All Attributes'] = [
    ', '.join(str(value) for value in row if pd.notna(value))
    for row in df[attributes].itertuples(index=False, name=None)
]

# Turn the hyphenated slugs into readable display names.
for name_col in ("Perfume Name", "Brand"):
    df[name_col] = df[name_col].apply(clean_name)

df.head()

Unnamed: 0,Fragrantica URL,Perfume Name,Brand,Gender,Top Notes,Middle Notes,Base Notes,Main Accords,All Attributes
0,https://www.fragrantica.com/perfume/xerjoff/ac...,Accento Overdose Pride Edition,Xerjoff,unisex,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine","rose, woody, fruity, aromatic, floral","fruity notes, aldehydes, green notes, bulgaria..."
1,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2024,Jean Paul Gaultier,women,"yuzu, citruses","orange blossom, neroli","musk, blonde woods","citrus, white floral, sweet, fresh, musky","yuzu, citruses, orange blossom, neroli, musk, ..."
2,https://www.fragrantica.com/perfume/jean-paul-...,Classique Pride 2023,Jean Paul Gaultier,unisex,"blood orange, yuzu","neroli, orange blossom","musk, white woods","citrus, white floral, sweet, fresh spicy, musky","blood orange, yuzu, neroli, orange blossom, mu..."
3,https://www.fragrantica.com/perfume/bruno-bana...,Pride Edition Man,Bruno Banani,men,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber","fruity, nutty, woody, tropical","guarana, grapefruit, red apple, walnut, lavend..."
4,https://www.fragrantica.com/perfume/jean-paul-...,Le Male Pride Collector,Jean Paul Gaultier,men,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean","aromatic, warm spicy, fresh spicy, cinnamon, v...","mint, lavender, cardamom, artemisia, bergamot,..."


In [31]:
'''
Embed each perfume's combined attribute text as a sentence vector.

'''

import sentence_transformers as sbert

# Pretrained sentence-embedding model, referenced by its model name.
model = sbert.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Pass a plain list of strings: handing encode() the Series itself relies on
# integer *label* lookup happening to match row position, which breaks as soon
# as the frame is filtered or re-indexed. Row i of `embeddings` then always
# corresponds to the i-th row of df by position.
# The progress bar makes it visible that this long-running step is advancing.
embeddings = model.encode(
    df['All Attributes'].tolist(),
    show_progress_bar=True,
)


KeyboardInterrupt: 

In [None]:
import faiss
import numpy as np

# Work on a float32 copy so the in-place normalisation below does not touch
# the original `embeddings`.
embeddings_np = np.array(embeddings, dtype='float32')
dim = embeddings_np.shape[1]

# Scale every row to unit length; with unit vectors the inner product used by
# the index below equals cosine similarity.
faiss.normalize_L2(embeddings_np)

index = faiss.IndexFlatIP(dim)  # IP = inner product
index.add(embeddings_np)

print("Number of vectors in index:", index.ntotal)

In [None]:
# Columns shown to the user for each recommendation. The index is reset so the
# rows can be addressed by 0..n-1 position.
display_cols = ['Perfume Name', 'Brand', 'Main Accords']
metadata = df.loc[:, display_cols].reset_index(drop=True)

In [35]:
def recommend_by_attribute(query_text, k=5):
    """Return the perfumes whose attributes best match a free-text query.

    The query is embedded with ``model``, L2-normalised and searched against
    ``index``; the matching rows of ``metadata`` are returned in the order the
    index reports them (best match first).

    Parameters
    ----------
    query_text : str
        Free-text description of notes/accords, e.g. ``"vanilla, amber"``.
    k : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows of ``metadata``. Fewer rows (possibly none) are
        returned when ``k`` is not positive or exceeds the index size.
    """
    # FAISS pads the result with -1 when asked for more neighbours than it
    # stores, and .iloc[-1] would silently return the *last* row instead.
    n_results = min(k, index.ntotal)
    if n_results <= 0:
        return metadata.iloc[[]]

    query_embedding = model.encode([query_text]).astype('float32')
    faiss.normalize_L2(query_embedding)

    _, indices = index.search(query_embedding, n_results)

    # Defensive: keep only real hits should any padding remain.
    hit_positions = [int(i) for i in indices[0] if i >= 0]
    return metadata.iloc[hit_positions]

In [33]:
def recommend_by_perf(brand, perf_name, k=5):
    """Return the perfumes most similar to a given perfume.

    The perfume is looked up in ``df`` by brand and name (case-insensitive,
    surrounding whitespace ignored). Its ``'All Attributes'`` text is embedded
    and searched against ``index``; the queried perfume itself is left out of
    the result.

    Parameters
    ----------
    brand : str
        Brand of the reference perfume.
    perf_name : str
        Name of the reference perfume.
    k : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows of ``metadata``, best match first. Fewer rows are
        returned when the index does not hold enough other perfumes.

    Raises
    ------
    ValueError
        If no row of ``df`` matches the brand/name pair.
    """
    brand_key = brand.lower().strip()
    name_key = perf_name.lower().strip()

    mask = (
        df['Brand'].str.lower().str.strip() == brand_key
    ) & (
        df['Perfume Name'].str.lower().str.strip() == name_key
    )

    if not mask.any():
        raise ValueError(f"Perfume '{perf_name}' by '{brand}' not found.")

    query_text = df.loc[mask, 'All Attributes'].iloc[0]

    # Row positions of the queried perfume (more than one if the data holds
    # duplicates). Excluding by position avoids calling .lower() on each hit,
    # which crashed whenever a hit had a missing brand or name.
    # NOTE: assumes row positions in df, metadata and the index line up.
    excluded = set(mask.to_numpy().nonzero()[0].tolist())

    # Ask for enough neighbours to still have k left after the exclusion, but
    # never more than the index stores: FAISS pads with -1 in that case and
    # .iloc[-1] would silently return the last row.
    n_search = min(k + len(excluded), index.ntotal)
    if k <= 0 or n_search <= 0:
        return metadata.iloc[[]]

    query_embedding = model.encode([query_text]).astype('float32')
    faiss.normalize_L2(query_embedding)

    _, indices = index.search(query_embedding, n_search)

    result_indices = [
        int(i) for i in indices[0]
        if i >= 0 and int(i) not in excluded
    ][:k]

    return metadata.iloc[result_indices]

In [None]:
# Ask which perfume to use as the similarity query.
brand = input("Enter brand: ")
perfume_name = input("Enter perfume name: ")

# recommend_by_perf raises ValueError when no row matches the brand/name pair.
recommendations = recommend_by_perf(brand=brand, perf_name=perfume_name, k=5)
recommendations