# Scraper les informations d'une référence parfum sur Fragrantica

## Imports

In [1]:
from bs4 import BeautifulSoup
import re
import time
import random

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

import numpy as np
import pandas as pd

## Les fonctions

### Fonction Main

In [None]:
def scraping_multi_perfume_info(list_url):
    """Scrape several Fragrantica perfume pages into a single DataFrame.

    A single headless Chrome session is started and reused for every URL,
    and it is always shut down (even when a page load or the parsing raises)
    so that no orphan browser process is left behind.

    :param list_url: Iterable of perfume page URLs.
    :return: pandas DataFrame with one row per URL, built from the dicts
             returned by ``scrape_perfume_info``. Empty when no URL is given.
    :raises: Any exception raised by Selenium or by the parsing propagates
             to the caller after the browser has been closed.
    """
    urls = list(list_url)
    if not urls:
        # Nothing to scrape: do not download a driver or launch a browser.
        return pd.DataFrame([])

    # Selenium configuration
    options = webdriver.ChromeOptions()
    # Only the "new" headless mode is requested; passing the legacy
    # "--headless" flag as well was redundant.
    options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Images are not needed for scraping; skipping them saves time.
    options.add_argument("--blink-settings=imagesEnabled=false")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")

    # The driver binary is resolved once and the browser is started once,
    # instead of once per URL.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    all_data = []
    try:
        for url in urls:
            # Fetch the rendered HTML of the page
            driver.get(url)
            html_content = driver.page_source
            soup = BeautifulSoup(html_content, 'html.parser')

            # Scrape ONE page and keep the resulting dict
            all_data.append(scrape_perfume_info(html_content, soup))
    finally:
        # Always release the browser, whatever happened above.
        driver.quit()

    # Convert the list of dicts into a DataFrame
    return pd.DataFrame(all_data)
        

### La fonction qui scrape toutes les infos d'un parfum à partir de son html_content et son soup

In [8]:
def scrape_perfume_info(html_content, soup):
    """
    Collect the main pieces of information of one perfume page.

    :param html_content: Raw HTML of the page (used by the name/brand helpers).
    :param soup: BeautifulSoup object built from the same HTML.
    :return: Dictionary mapping each field name to the extracted value.
    """
    # The vote counts are parsed once and shared by the vote-based fields.
    votes = extract_all_votes(soup)

    info = {}
    info["nom_parfum"] = extract_perfume_name(html_content)
    info["marque"] = extract_brand_name(html_content)
    info["nose"] = extract_nose(soup)
    info["launch_year"] = extract_launch_year(soup)
    info["rating_value"] = extract_rating(soup)
    info["rating_count"] = extract_rating_count(soup)
    info["main_accords"] = extract_main_accords(soup)
    info["gender"] = extract_gender(votes)
    info["longevity"] = extract_longevity(votes)
    info["sillage"] = extract_sillage(votes)
    info["price_feeling"] = extract_price_feeling(votes)

    # One entry per tier of the olfactory pyramid, in page order.
    pyramid_tiers = (
        ("top_notes", "Top Notes"),
        ("middle_notes", "Middle Notes"),
        ("base_notes", "Base Notes"),
    )
    for field, heading in pyramid_tiers:
        info[field] = extract_pyramid_ingredients(soup, heading)

    return info

    

### Fonctions intermédiaires pour récupérer les infos sur un parfum

In [9]:
# 0) Helper shared by the vote-based helpers
def extract_all_votes(soup):
    """
    Gather every vote count found on the page into a dictionary.

    The label "moderate" can appear more than once; its second occurrence
    is stored under the key "average" (the key read by ``extract_sillage``).

    :param soup: BeautifulSoup object holding the page HTML.
    :return: Dictionary {'category': votes} with integer vote counts.
    """
    collected = {}
    moderate_hits = 0

    # Every candidate block that may hold a (label, count) pair
    for block in soup.select('div.grid-x.grid-margin-x'):
        name_tag = block.select_one('span.vote-button-name')
        count_tag = block.select_one('span.vote-button-legend')
        if not (name_tag and count_tag):
            continue

        label = name_tag.get_text(strip=True)
        raw_count = count_tag.get_text(strip=True)

        if label == "moderate":
            moderate_hits += 1
            if moderate_hits == 2:
                label = "average"

        # Keep the entry only when the count is purely numeric
        if not raw_count.isdigit():
            continue
        collected[label] = int(raw_count)

    return collected

# 1) Perfume name
def extract_perfume_name(html_content):
    """Extract the perfume name from the first perfume link found in the HTML.

    The name is the last path segment of a ``/perfume/<brand>/<name>-<id>.html``
    link, with hyphens turned into spaces.

    :param html_content: Raw HTML (or any text containing the perfume URL).
    :return: The perfume name, or None when no such link is present.
    """
    # Path segments may not contain quotes, whitespace or angle brackets, so
    # the match cannot run past the end of an attribute into unrelated markup.
    match = re.search(r'/perfume/[^/\s"<>]+/([^/\s"<>]+)-\d+\.html', html_content)
    if match is None:
        return None
    return match.group(1).replace('-', ' ')

# 2) Brand
def extract_brand_name(html_content):
    """Extract the brand from the first ``/perfume/<brand>/`` link in the HTML.

    Hyphens in the URL segment are turned into spaces.

    :param html_content: Raw HTML (or any text containing the perfume URL).
    :return: The brand name, or None when no such link is present.
    """
    # The segment may not contain quotes, whitespace or angle brackets;
    # otherwise a bare href="/perfume/" link would make the match swallow the
    # markup that follows it (e.g. '">Perfumes<') up to the next slash.
    match = re.search(r'/perfume/([^/\s"<>]+)/', html_content)
    if match is None:
        return None
    return match.group(1).replace('-', ' ')

#3) Perfumer
def extract_nose(soup):
    """Return the text of the first link pointing to a "/noses/" page, or None."""
    perfumer_link = soup.select_one('div.cell a[href^="/noses/"]')
    if not perfumer_link:
        return None
    return perfumer_link.get_text(strip=True)

#4) Launch year
def extract_launch_year(soup):
    """Return the trailing characters of the page title when they are digits.

    The last four characters of the stripped <title> text are read; they are
    returned as a string when all-numeric, otherwise None is returned.
    """
    title_tag = soup.title
    if not title_tag:
        return None

    tail = title_tag.text.strip()[-4:]
    if tail.isdigit():
        return tail
    return None

#5) Perfume rating
def extract_rating(soup):
    """Return the text of the ``ratingValue`` span, or None when it is absent."""
    rating_tag = soup.select_one('span[itemprop="ratingValue"]')
    if not rating_tag:
        return None
    return rating_tag.get_text(strip=True)

#6) Number of votes
def extract_rating_count(soup):
    """Return the text of the ``ratingCount`` span, or None when it is absent."""
    count_tag = soup.select_one('span[itemprop="ratingCount"]')
    if not count_tag:
        return None
    return count_tag.get_text(strip=True)

#7) Main accords
def extract_main_accords(soup):
    """Return the non-empty texts of the accord boxes, in page order."""
    accords = []
    for box in soup.find_all('div', class_='cell accord-box'):
        label = box.get_text(strip=True)
        if label:
            accords.append(label)
    return accords

#8) Gender
def extract_gender(votes_dict):
    """Pick the dominant gender ("female", "male" or "unisex") from the votes.

    Returns None when the five gender categories total 8 votes or fewer.
    "female"/"more female" and "male"/"more male" are summed; the "unisex"
    count is weighted by 1.2. On a tie the earlier of female, male, unisex wins.
    """
    female_score = votes_dict.get("female", 0) + votes_dict.get("more female", 0)
    unisex_votes = votes_dict.get("unisex", 0)
    male_score = votes_dict.get("more male", 0) + votes_dict.get("male", 0)

    if female_score + unisex_votes + male_score <= 8:
        return None

    # Start with "female" and only switch on a strictly higher score, which
    # keeps the first candidate in case of equality.
    winner, best = "female", female_score
    for label, score in (("male", male_score), ("unisex", 1.2 * unisex_votes)):
        if score > best:
            winner, best = label, score
    return winner

#9) Longevity
def extract_longevity(votes_dict):
    """Return the longevity label with the most votes.

    None is returned when the longevity categories total 8 votes or fewer.
    """
    scale = ["very weak", "weak", "moderate", "long lasting", "eternal"]
    counts = [votes_dict.get(level, 0) for level in scale]

    if sum(counts) <= 8:
        return None

    top_index = np.argmax(counts)
    return scale[top_index]

#10) Sillage
def extract_sillage(votes_dict):
    """Return the sillage label with the most votes.

    None is returned when the sillage categories total 8 votes or fewer.
    """
    scale = ["intimate", "average", "strong", "enormous"]
    counts = [votes_dict.get(level, 0) for level in scale]

    if sum(counts) <= 8:
        return None

    top_index = np.argmax(counts)
    return scale[top_index]

#11) Price perception
def extract_price_feeling(votes_dict):
    """Return the price-perception label with the most votes.

    None is returned when the price categories total 8 votes or fewer.
    """
    scale = ["way overpriced", "overpriced", "ok", "good value", "great value"]
    counts = [votes_dict.get(level, 0) for level in scale]

    if sum(counts) <= 8:
        return None

    top_index = np.argmax(counts)
    return scale[top_index]

#12) Olfactory pyramid
## One tier of the pyramid at a time (top, middle/heart or base notes)
def extract_pyramid_ingredients(soup, pyramid_section):
    """Extract the ingredients listed under one section of the olfactory pyramid.

    The section is located through the first <h4> whose text contains
    ``pyramid_section``; the ingredient names are the text nodes that directly
    follow each <a> inside the next <div>.

    :param soup: BeautifulSoup object holding the page HTML.
    :param pyramid_section: Heading text to look for, e.g. "Top Notes".
    :return: List of unique ingredient names in page order; empty list when
             the heading or its <div> is missing.
    """
    header = soup.find('h4', string=lambda text: text and pyramid_section in text)
    if not header:
        return []

    container = header.find_next('div')
    if not container:
        return []

    # A dict is used as an ordered set: duplicates are dropped while the
    # order of appearance on the page is kept, so the result is the same
    # from one run to the next.
    ingredients = {}
    for link in container.find_all('a'):
        sibling = link.next_sibling
        # Only text nodes carry an ingredient name; a following tag (or no
        # sibling at all) is skipped instead of failing on ``.strip()``.
        if not isinstance(sibling, str):
            continue
        name = sibling.strip()
        if name:  # ignore whitespace-only text between tags
            ingredients[name] = None

    return list(ingredients)


## Le script

In [10]:
# Quick check of the scraper on a handful of hand-picked perfume pages.
urls = [
        "https://www.fragrantica.com/perfume/By-Kilian/Black-Phantom-43632.html",
        "https://www.fragrantica.com/perfume/Mancera/Wild-Leather-28084.html",
        "https://www.fragrantica.com/perfume/Ministry-of-Oud/Oud-Satin-74588.html",
        "https://www.fragrantica.com/perfume/Pana-Dora/Onyx-Black-103091.html",
        "https://www.fragrantica.com/perfume/Nasomatto/Sadonaso-80076.html",
        "https://www.fragrantica.com/perfume/Maison-Crivelli/Patchouli-Magnetik-71759.html"
    ]

# One row per URL; head() previews the first rows in the notebook output.
df = scraping_multi_perfume_info(urls)
df.head()

Unnamed: 0,nom_parfum,marque,nose,launch_year,rating_value,rating_count,main_accords,gender,longevity,sillage,price_feeling,top_notes,middle_notes,base_notes
0,Black Phantom,By Kilian,Sidonie Lancesseur,2017,4.1,7946.0,"[sweet, warm spicy, caramel, coffee, chocolate...",unisex,long lasting,average,overpriced,[Rum],"[Almond, Dark Chocolate, Heliotrope, Coffee]","[Tonka, Sugar Cane, Sandalwood, Caramel, Vanil..."
1,Wild Leather,Mancera,Pierre Montale,2014,3.78,610.0,"[woody, leather, earthy, mossy, animalic, patc...",male,long lasting,strong,ok,[Sicilian Bergamot],"[Violet, Bulgarian Rose, Patchouli]","[Guaiac Wood, Oakmoss, Leather, White Musk, Am..."
2,Oud Satin,Ministry of Oud,,2021,4.23,456.0,"[rose, violet, vanilla, powdery, oud, floral, ...",unisex,long lasting,strong,great value,"[Agarwood (Oud), Bulgarian Rose]","[Benzoin, Turkish Rose]","[Vanilla, Violet]"
3,Onyx Black,Pana Dora,Ibrahim Al Zoabi,2025,,,"[amber, woody, warm spicy, fresh spicy, earthy...",,,,,"[Cinnamon, Saffron, Lemon, Bergamot, Nutmeg]","[Olibanum, Ambergris, Cypriol]","[Cedar, Fir Resin, Labdanum]"
4,Sadonaso,Nasomatto,Alessandro Gualtieri,2023,2.96,1706.0,"[vanilla, musky, animalic, powdery, warm spicy...",unisex,long lasting,average,ok,[Coffee],"[Tobacco, Musk, Sandalwood]","[Tonka, Amber, Vanilla, Animal notes]"


In [12]:
# Load the perfume link database from a CSV located one directory up.
parfums_liens = pd.read_csv('../parfums_database.csv')

# Bare expression: displays the DataFrame as the notebook cell output.
parfums_liens

Unnamed: 0,Marque,Parfum,Lien du Parfum
0,Acqua di Parma,Acqua di Parma Colonia,https://www.fragrantica.com/perfume/Acqua-di-P...
1,Acqua di Parma,Acqua di Parma Colonia Assoluta,https://www.fragrantica.com/perfume/Acqua-di-P...
2,Acqua di Parma,Acqua di Parma Colonia Assoluta Edizione Riviera,https://www.fragrantica.com/perfume/Acqua-di-P...
3,Acqua di Parma,Acqua di Parma Colonia Assoluta Edizione Speci...,https://www.fragrantica.com/perfume/Acqua-di-P...
4,Acqua di Parma,Acqua di Parma Colonia Designer Edition,https://www.fragrantica.com/perfume/Acqua-di-P...
...,...,...,...
19313,Zoologist Perfumes,Dodo Edition 2020,https://www.fragrantica.com/perfume/Zoologist-...
19314,Zoologist Perfumes,Dragonfly,https://www.fragrantica.com/perfume/Zoologist-...
19315,Zoologist Perfumes,Macaque,https://www.fragrantica.com/perfume/Zoologist-...
19316,Zoologist Perfumes,Panda,https://www.fragrantica.com/perfume/Zoologist-...


In [15]:
# Keep only the rows whose brand ('Marque') starts with "A"; missing brands
# are treated as non-matching (na=False).
df_filtre_A = parfums_liens[parfums_liens['Marque'].str.startswith('A', na=False)]

# Extract the links as a plain list
liste_liens_A = df_filtre_A['Lien du Parfum'].tolist()

# Print the list and its length
print(liste_liens_A)
print(len(liste_liens_A))

['https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-1681.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-Assoluta-1682.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-Assoluta-Edizione-Riviera-1680.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-Assoluta-Edizione-Speciale-2011-12532.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-Designer-Edition-13806.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-Edizione-Centenario-42460.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Colonia-Pura-45876.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Gelsomino-Nobile-12921.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Iris-Nobile-1564.html', 'https://www.fragrantica.com/perfume/Acqua-di-Parma/Acqua-di-Parma-Iris-Nobile-Edizione-S

In [16]:
# Scrape only the first 100 links of the "A" brands, then preview the result.
df_A = scraping_multi_perfume_info(liste_liens_A[:100])
df_A.head()

Unnamed: 0,nom_parfum,marque,nose,launch_year,rating_value,rating_count,main_accords,gender,longevity,sillage,price_feeling,top_notes,middle_notes,base_notes
0,Acqua di Parma Colonia,Acqua di Parma,,1916.0,4.15,3500,"[citrus, aromatic, fresh spicy, woody, lavende...",male,moderate,average,ok,[],[],[]
1,Acqua di Parma Colonia Assoluta,Acqua di Parma,Bertrand Duchaufour,2003.0,4.22,1167,"[citrus, woody, aromatic, earthy, fresh spicy,...",male,moderate,average,ok,"[Lemon Verbena, Sweet Orange, Bergamot, Bitter...","[Paprika, Pink Pepper, Ylang-Ylang, Jasmine, C...","[White Musk, Patchouli, Resins, Oakmoss]"
2,Acqua di Parma Colonia Assoluta Edizione Riviera,Acqua di Parma,Bertrand Duchaufour,2007.0,3.43,14,"[citrus, woody, aromatic, fresh spicy, rose, s...",,,average,,"[Bergamot, Orange, Sicilian Lemon]","[Lemon Verbena, Lavender, Bulgarian Rose]","[Rosemary, Cedar, Sandalwood, Ylang-Ylang]"
3,Acqua di Parma Colonia Assoluta Edizione Speci...,Acqua di Parma,,,3.67,9,"[aromatic, fresh spicy, lavender, white floral...",,,strong,,[],[],[]
4,Acqua di Parma Colonia Designer Edition,Acqua di Parma,,2011.0,4.08,12,"[citrus, amber, fresh spicy, fresh, aromatic, ...",,,strong,,"[Rosemary, Lavender, Citruses]","[Rose, Jasmine]","[Amber, Musk]"


In [19]:
# Save the scraped rows to 'df_a.csv' without writing the DataFrame index.
df_A.to_csv('df_a.csv', index=False)