In [1]:
# Import needed libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
from langchain_ollama import ChatOllama




## Load Data

In [6]:
# Import cleaned dataset downloaded from Kaggle
frag = pd.read_csv('fra_cleaned.csv', encoding="Windows-1252", sep=";")
frag.head()

Unnamed: 0,url,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,Perfumer1,Perfumer2,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,https://www.fragrantica.com/perfume/xerjoff/ac...,accento-overdose-pride-edition,xerjoff,Italy,unisex,142,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",unknown,,rose,woody,fruity,aromatic,floral
1,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2024,jean-paul-gaultier,France,women,186,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",unknown,,citrus,white floral,sweet,fresh,musky
2,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2023,jean-paul-gaultier,France,unisex,191,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",natalie gracia-cetto,quentin bisch,citrus,white floral,sweet,fresh spicy,musky
3,https://www.fragrantica.com/perfume/bruno-bana...,pride-edition-man,bruno-banani,Germany,men,192,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",unknown,,fruity,nutty,woody,tropical,
4,https://www.fragrantica.com/perfume/jean-paul-...,le-male-pride-collector,jean-paul-gaultier,France,men,193,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",francis kurkdjian,,aromatic,warm spicy,fresh spicy,cinnamon,vanilla


In [7]:
# Clean the columns
frag.columns = frag.columns.str.lower()
frag.columns = frag.columns.str.replace(" ", "_")

# Change the perfume names into title case
frag['perfume'] = frag['perfume'].str.replace("-", " ").str.title()

# Change the brand names into title case
frag['brand'] = frag['brand'].str.replace("-", " ").str.title()

# Replace , with . in rating_value column
frag['rating_value'] = frag['rating_value'].str.replace(",", ".")

# Chhange rating count column to string
frag['rating_count'] = frag['rating_count'].astype(str)

# Remove decimal points and change year column to string
frag['year'] = frag['year'].astype(str).str.replace(".0", "", regex=False)

# Change perfumer names to title case
frag['perfumer1'] = frag['perfumer1'].str.title()
frag['perfumer2'] = frag['perfumer2'].str.title()

## Data Preprocessing

In [8]:
# Concatenate basic info into fragrance description column for modeling
frag['full_description'] = "Name: " + frag['perfume'] + ". House: " + frag['brand'] + ". Country: " + frag['country'] + "."
frag['full_description'] += " Gender: " + frag['gender'] + ". Rating: " + frag['rating_value'] + " from " + frag['rating_count'] + " ratings."

# Add year into the fragrance description
for index, year in enumerate(frag['year']):
    if year == 'nan':
        sentence = " Year: Unknown."
    else:
        sentence = " Year: " + year + "."
    frag.loc[index, 'full_description'] += sentence

# Add fragrance notes into the description
frag['full_description'] += " Top Notes: " + frag['top'] + ". Middle Notes: " + frag['middle'] + ". Base Notes: " + frag['base'] + "."

# Add perfumers to the fragrance description
for index, row in frag.iterrows():
    perfumer1 = row['perfumer1']
    perfumer2 = row['perfumer2']
    
    if perfumer1 == 'Unknown':
        sentence = " Perfumer: Unknown."
    elif pd.isna(perfumer2) or perfumer2 == 'NaN':
        sentence = " Perfumer: " + perfumer1 + "."
    else:
        sentence = " Perfumer: " + perfumer1 + " and " + perfumer2 + "."
    
    frag.loc[index, 'full_description'] += sentence

# Add fragrance accords into the description
for index, row in frag.iterrows():
    accords = []
    for acc in ['mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']:
        if pd.notna(row[acc]) and row[acc] != 'NaN':
            accords.append(row[acc])
    
    if accords:
        sentence = " Accords: " + ", ".join(accords) + "."
    else:
        sentence = " Accords: Unknown."
    
    frag.loc[index, 'full_description'] += sentence

In [55]:
# Prepare texts and metadata
texts = frag['full_description'].tolist()
metadata = frag[['perfume', 'brand', 'country', 'gender', 'top', 'middle', 'base', 'year', 
                 'rating_value', 'rating_count', 'perfumer1', 'perfumer2', 'mainaccord1', 
                 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']].copy()

# Combine perfumer1 and perfumer2 into a single column 'perfumer'
metadata['perfumer'] = metadata.apply(
    lambda row: f"{row['perfumer1']} and {row['perfumer2']}" if pd.notna(row['perfumer1']) and pd.notna(row['perfumer2']) 
    else (row['perfumer1'] if pd.notna(row['perfumer1']) else row['perfumer2']), axis=1)

# Combine mainaccord1 through mainaccord5 into a single column 'accord'
metadata['accord'] = metadata[['mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']].apply(
    lambda x: ', '.join([str(val) if val is not None else '' for val in x]), axis=1)
metadata['accord'] = metadata['accord'].replace(r'^\s*$', 'No accords', regex=True)

# Drop the original individual perfumer and accord columns
metadata = metadata.drop(columns=['perfumer1', 'perfumer2', 'mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5'])
metadata.head()

Unnamed: 0,perfume,brand,country,gender,top,middle,base,year,rating_value,rating_count,perfumer,accord
0,Accento Overdose Pride Edition,Xerjoff,Italy,unisex,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",2022,1.42,201,Unknown,"rose, woody, fruity, aromatic, floral"
1,Classique Pride 2024,Jean Paul Gaultier,France,women,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",2024,1.86,70,Unknown,"citrus, white floral, sweet, fresh, musky"
2,Classique Pride 2023,Jean Paul Gaultier,France,unisex,"blood orange, yuzu","neroli, orange blossom","musk, white woods",2023,1.91,285,Natalie Gracia-Cetto and Quentin Bisch,"citrus, white floral, sweet, fresh spicy, musky"
3,Pride Edition Man,Bruno Banani,Germany,men,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",2019,1.92,59,Unknown,"fruity, nutty, woody, tropical, nan"
4,Le Male Pride Collector,Jean Paul Gaultier,France,men,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",2020,1.93,632,Francis Kurkdjian,"aromatic, warm spicy, fresh spicy, cinnamon, v..."


## Model Building

In [10]:
# Encode descriptions
encoder = SentenceTransformer('paraphrase-mpnet-base-v2')
vectors = encoder.encode(texts, show_progress_bar=False, batch_size=512)

In [11]:
# Normalize the vectors for cosine similarity
faiss.normalize_L2(vectors)

# Build FAISS index for cosine similarity search
index = faiss.IndexFlatIP(vectors.shape[1])
index.add(vectors)

In [12]:
# Save the FAISS index
faiss.write_index(index, 'fragrance_faiss.index')

# Save metadata separately
metadata.reset_index(drop=True, inplace=True)
with open('fragrance_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

## Jupyter Notebook Testing

In [4]:
index = faiss.read_index('fragrance_faiss.index')
with open('fragrance_metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)
encoder = SentenceTransformer('paraphrase-mpnet-base-v2')

# Convert rating_values to numeric
if 'rating_value' in metadata.columns:
    metadata['rating_value'] = pd.to_numeric(
        metadata['rating_value'], 
        errors='coerce'
    )
    
query = input("Describe your ideal fragrance: ")
k = 2

valid_indices = metadata.index.tolist()

# Gets a brief explanation from Ollama for why this fragrance matches the user's query
def get_ollama_explanation(query, description, similarity):
    prompt = f"""
            A user is searching for a fragrance with this description: "{query}"
            
            One recommendation is:
            {description}
            
            The cosine similarity score between the user's query and this fragrance is {similarity:.3f}.
            
            Explain in 2-3 sentences, in plain English, why this fragrance was recommended based on the user's query and the similarity score.
            """
    response = llm.invoke(prompt)
    return response.content.strip()

# Load Ollama
llm = ChatOllama(model="llama3.2")

# If no query, just return filtered results directly
if query.strip():
    filtered_vectors = np.vstack([index.reconstruct(int(idx)) for idx in valid_indices])
    
    # Normalize the vectors for cosine similarity (this is equivalent to using cosine similarity)
    faiss.normalize_L2(filtered_vectors)
    
    # Use IndexFlatIP for cosine similarity search
    temp_index = faiss.IndexFlatIP(filtered_vectors.shape[1])
    temp_index.add(filtered_vectors)

    # Encode the query and normalize it for cosine similarity
    query_vector = encoder.encode([query])
    faiss.normalize_L2(query_vector)
    
    # Perform the search and returns indices of the most similar vectors and their similarity scores
    sim_score, I = temp_index.search(query_vector, min(k, len(valid_indices)))

    # Get the recommened fragrance's indices and similarity score
    results = [(valid_indices[i], sim_score[0][j]) for j, i in enumerate(I[0])]
else:
    # If no query, just return the top k filtered results
    results = [(idx, None) for idx in valid_indices[:k]]

for idx, (result_idx, sim_score) in enumerate(results):
    rec = metadata.loc[result_idx]

    # Extract data with fallbacks
    name = rec.get('perfume', 'Unknown')
    brand = rec.get('brand', 'Unknown')
    perfumer_text = rec.get('perfumer', 'Unknown')
    top_notes = rec.get('top', 'Unknown')
    middle_notes = rec.get('middle', 'Unknown')
    base_notes = rec.get('base', 'Unknown')
    accords_text = rec.get('accord', 'Unknown')
    rating = rec.get('rating_value', '?')

     # Create natural language fragrance description
    description = (
        f"The fragrance is called {name}. It is by {brand}. "
        f"The perfumer is {perfumer_text}. The top notes are {top_notes}, "
        f"the heart notes are {middle_notes}, and the base notes are {base_notes}. "
        f"The main accords are {accords_text}."
    )

    explanation = get_ollama_explanation(query, description, sim_score)
    print(description)
    print(sim_score)
    print(explanation)

Describe your ideal fragrance:  I’m looking for a unisex fragrance that’s suitable for both work and casual outings. It should be fresh, not too overpowering, and have a clean scent. Please show me options that fit these criteria.


The fragrance is called Clean Fragrance. It is by Clean. The perfumer is Unknown. The top notes are lime, pink grapefruit, orange, bergamot, orange blossom, wild berries, the heart notes are lavender, lily, violet, jasmine, passion flower, damask rose, and the base notes are white musk, geranium, heliotrope. The main accords are citrus, fresh spicy, aromatic, white floral, powdery.
0.6186379
The Clean Fragrance by Clean was recommended because its fresh and citrusy scent profile aligns with the user's search criteria of a "fresh" and "clean" fragrance. Although the perfume itself has many notes, including floral and powdery accords that might be considered overpowering to some, the citrus dominance suggests it fits the user's desire for something "not too overpowering". The similarity score of 0.619 indicates a moderate match between the user's query and this fragrance, suggesting it's likely to meet their expectations but not a perfect fit.
The fragrance is called Fragrance 04. It is 

## Streamlit Implementation

In [4]:
code = '''
import streamlit as st
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import pickle
from langchain_ollama import ChatOllama

# Fragrance card function
def create_fragrance_card(name, rating, brand, perfumer_text, top_notes, middle_notes, base_notes, accords_text, explanation):
    # Create fragrance card HTML
    card_html = f"""
        <div style="border: 1px solid #ddd; padding: 15px; margin: 10px; border-radius: 15px; 
                    background: linear-gradient(to bottom right, #ffffff, #f2f6fc); 
                    width: 400px; color: #222; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
            <h3 style="color: #3a3a3a; text-align: center;">{name} ⭐{rating}</h3>
            <p><strong>🏷️ Brand:</strong> {brand}</p>
            <p><strong>👃 Perfumer(s):</strong> {perfumer_text}</p>
            <p><strong>🌿 Top Notes:</strong> {top_notes}</p>
            <p><strong>💖 Heart Notes:</strong> {middle_notes}</p>
            <p><strong>🌲 Base Notes:</strong> {base_notes}</p>
            <p><strong>🎼 Main Accords:</strong> {accords_text}</p>
            <p><strong>💡 AI Explanation:</strong> {explanation}</p>
        </div>
    """
    
    return card_html

# Load FAISS database, metadata, and encoder with cache
@st.cache_resource
def load_resources():
    index = faiss.read_index('fragrance_faiss.index')
    with open('fragrance_metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    encoder = SentenceTransformer('paraphrase-mpnet-base-v2')
    return index, metadata, encoder

# Gets a brief explanation from Ollama for why this fragrance matches the user's query
def get_ollama_explanation(query, description):
    prompt = f"""
            A user is searching for a fragrance with this description: "{query}"
            
            One recommendation is:
            {description}
            
            Explain in 1-2 sentences, in plain English, why this fragrance matches the user's query.
            """
    response = llm.invoke(prompt)
    return response.content.strip()

# Load Ollama
llm = ChatOllama(model="llama3.2")

# Initialize app
st.set_page_config(page_title="Fragrance Recommendation System", layout="wide")

# Add title to top of app interface
st.title("Fragrance Recommendation System")

# Sidebar filters
st.sidebar.header("Filters")
query = st.text_input("Describe your ideal fragrance:")

col1, col2 = st.columns(2)
with col1:
    k = st.slider("Number of recommendations:", 1, 10, 5)
with col2:
    min_rating = st.slider("Minimum rating:", 1.0, 5.0, 3.5)

gender_filter = st.sidebar.selectbox("Gender:", ["All", "Male", "Female", "Unisex"])
brand_filter = st.sidebar.text_input("Brand (leave empty for all):", "").title()
note_filter = st.sidebar.text_input("Notes (comma-separated):", "").lower()

# Load resources
index, metadata, encoder = load_resources()

# Convert rating_values to numeric
if 'rating_value' in metadata.columns:
    metadata['rating_value'] = pd.to_numeric(
        metadata['rating_value'], 
        errors='coerce')

# Press button and start recommendations
if st.button('Get Recommendations'):
    with st.spinner('Finding your fragrance recs...'):
        if query == "":
            st.warning("No query entered.")
        else:
            # Apply filters sequentially
            current_df = metadata.copy()
            
            # Gender filter
            if gender_filter != "All":
                current_df = current_df[current_df['gender'].str.lower() == gender_filter.lower()]
            
            # Brand filter
            if brand_filter:
                current_df = current_df[current_df['brand'].str.contains(brand_filter, case=False, na=False)]
            
            # Rating filter (with NaN handling)
            if 'rating_value' in current_df.columns:
                current_df = current_df[current_df['rating_value'].ge(min_rating)]
            
            # Note filter
            if note_filter:
                notes = [n.strip().lower() for n in note_filter.split(",")]
                def note_check(row):
                    note_fields = [
                        str(row['top']).lower() if pd.notna(row['top']) else "",
                        str(row['middle']).lower() if pd.notna(row['middle']) else "",
                        str(row['base']).lower() if pd.notna(row['base']) else ""
                    ]
                    return any(note in field for note in notes for field in note_fields)
                
                current_df = current_df[current_df.apply(note_check, axis=1)]
            
            valid_indices = current_df.index.tolist()
            
            # Check if any fragrances remain
            if not valid_indices:
                st.warning("No fragrances match all your filters. Try relaxing some criteria.")
                st.stop()

            # Grab the vectors for fragrances still present after the filters
            filtered_vectors = np.vstack([index.reconstruct(int(idx)) for idx in valid_indices])
            temp_index = faiss.IndexFlatIP(filtered_vectors.shape[1])
            temp_index.add(filtered_vectors)
        
            # Encode the query and normalize it for cosine similarity
            query_vector = encoder.encode([query])
            faiss.normalize_L2(query_vector)
            
            # Perform the search and returns indices of the most similar vectors and their similarity scores
            sim_score, I = temp_index.search(query_vector, min(k, len(valid_indices)))

            # Get the recommened fragrance's indices and similarity score
            results = [(valid_indices[i], sim_score[0][j]) for j, i in enumerate(I[0])]
    
            # Display results
            st.subheader(f"Recommended Fragrances ({len(results)} results)")
            cols = st.columns(3)
            
            for idx, (result_idx, sim_score) in enumerate(results):
                rec = metadata.loc[result_idx]
    
                # Extract data with fallbacks
                name = rec.get('perfume', 'Unknown')
                brand = rec.get('brand', 'Unknown')
                perfumer_text = rec.get('perfumer', 'Unknown')
                top_notes = rec.get('top', 'Unknown')
                middle_notes = rec.get('middle', 'Unknown')
                base_notes = rec.get('base', 'Unknown')
                accords_text = rec.get('accord', 'Unknown')
                rating = rec.get('rating_value', '?')
    
                 # Create natural language fragrance description
                description = (
                    f"The fragrance is called {name}. It is by {brand}. "
                    f"The perfumer is {perfumer_text}. The top notes are {top_notes}, "
                    f"the heart notes are {middle_notes}, and the base notes are {base_notes}. "
                    f"The main accords are {accords_text}."
                )
    
                explanation = get_ollama_explanation(query, description)
                
                # Add rating to card
                card = create_fragrance_card(
                        name,
                        rating,
                        brand, 
                        perfumer_text, 
                        top_notes,
                        middle_notes, 
                        base_notes, 
                        accords_text,
                        explanation
                    )
                cols[idx % 3].markdown(card, unsafe_allow_html=True)
'''

with open("fragrance_recommendation_app.py", "w", encoding="utf-8") as f:
    f.write(code)

In [3]:
!streamlit run fragrance_recommendation_app.py

^C
