In this notebook, we scrape the data about perfume notes from Fragrantica (https://www.fragrantica.com/notes/). 

The final output is a CSV file (`notes.csv`) containing every note, its image link, the fragrance group it belongs to (e.g. all citrus notes are in the larger CITRUS SMELLS group), and a description of each fragrance group.



In [None]:
from selenium import webdriver                  # for browser automation
from splinter import Browser
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys # for button presses
from selenium.webdriver.common.by import By     # for finding elements

import requests
import pandas as pd
from bs4 import BeautifulSoup


In [3]:
# Page that lists every perfume note, grouped by fragrance family.
NOTES_URL = "https://www.fragrantica.com/notes/"

# Install (or reuse) geckodriver and hand its path to the Firefox service.
service = Service(GeckoDriverManager().install())

# Open a visible Firefox window, load the page and keep its HTML;
# the browser is closed as soon as the `with` block exits.
with Browser("firefox", service=service, headless=False) as browser:
    browser.visit(NOTES_URL)
    html = browser.html

# Parse the captured markup once; later cells query `soup`.
soup = BeautifulSoup(html, "html.parser")

In [9]:
# Groups 1..N_NOTE_GROUPS are scraped; their element ids use a zero-padded
# two-digit index, e.g. "groupnotes_group_01_title".
N_NOTE_GROUPS = 13
# Exact class string carried by each grid of note tiles.
GRID_CLASS = 'grid-x grid-margin-y grid-margin-x'

# One (group_name, description, note_name, note_img) tuple per note.
notes = []

# Collect the grids once instead of re-scanning the whole document per group.
grids = soup.find_all('div', class_=GRID_CLASS)

for i in range(1, N_NOTE_GROUPS + 1):
    group_id = str(i).zfill(2)
    group_name = soup.find('div', id=f'groupnotes_group_{group_id}_title').h2.text.strip().lower()
    description = soup.find('div', id=f'descnotes_group_{group_id}_title').text.strip()

    # Grids are paired with groups purely by position (i-th grid <-> group i).
    # NOTE(review): assumes the page lists grids in the same order as groups.
    grid = grids[i - 1]

    # Walk direct child *tags* only: iterating the grid itself also yields
    # text nodes (e.g. whitespace between tiles), which have no `.img`.
    for tile in grid.find_all(True, recursive=False):
        img = tile.find('img')
        if img is None:
            # Child without an image is not a note tile; skip it.
            continue
        note_name = img['alt'].lower()
        note_img = img['src']
        notes.append((group_name, description, note_name, note_img))


In [10]:
# Column order mirrors the tuples appended to `notes`.
NOTE_COLUMNS = ['note_group', 'group_description', 'note', 'note_img']
notes_df = pd.DataFrame(data=notes, columns=NOTE_COLUMNS)

# Preview a few random rows. The seed keeps the preview identical across
# re-runs, and the size cap avoids a ValueError when fewer than 5 notes
# were scraped.
notes_df.sample(n=min(5, len(notes_df)), random_state=0)

Unnamed: 0,note_group,group_description,note,note_img
149,"fruits, vegetables and nuts",Fruity notes beyond citrus (which form a class...,hog plum,https://fimgs.net/mdimg/sastojci/m.782.jpg?173...
711,"greens, herbs and fougeres","By the term ""green"" we refer to notes of snapp...",ivy,https://fimgs.net/mdimg/sastojci/m.192.jpg?173...
224,"fruits, vegetables and nuts",Fruity notes beyond citrus (which form a class...,shea butter,https://fimgs.net/mdimg/sastojci/m.1142.jpg?17...
1318,beverages,Fragrances often recreate the scent of popular...,blackcurrant juice,https://fimgs.net/mdimg/sastojci/m.1750.jpg?17...
69,"fruits, vegetables and nuts",Fruity notes beyond citrus (which form a class...,bearberry,https://fimgs.net/mdimg/sastojci/m.344.jpg?173...


In [11]:
# Persist the scraped table next to the notebook. The row index is written
# as well (pandas default), so the file's first column is the unnamed index.
notes_df.to_csv(path_or_buf="notes.csv")