# Scraping Fragrantica Notes

Here, we scrape the data about perfume notes from Fragrantica (https://www.fragrantica.com/notes/). 

The final output is a file containing all notes, their image links, the fragrance group each note belongs to (e.g. all citrus notes are in a larger CITRUS SMELLS group), and a description of each fragrance group.



In [3]:
from string import ascii_lowercase              # for walking the alphabetical index pages

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver                  # for browser automation
from selenium.webdriver.common.by import By     # for finding elements
from selenium.webdriver.common.keys import Keys # for button presses
from selenium.webdriver.firefox.service import Service
from splinter import Browser
from webdriver_manager.firefox import GeckoDriverManager

In [3]:
# Install (or reuse) geckodriver and wrap it in a Selenium service.
driver_path = GeckoDriverManager().install()
service = Service(driver_path)

# Load the Fragrantica notes index in a visible (non-headless) Firefox window
# and grab the rendered markup before the browser is closed.
with Browser("firefox", service=service, headless=False) as browser:
    browser.visit("https://www.fragrantica.com/notes/")
    html = browser.html

# Parsing only needs the captured string, so it happens after the session ends.
soup = BeautifulSoup(html, "html.parser")

In [9]:
# Collect one (group_name, group_description, note_name, note_img) tuple per
# note found on the Fragrantica notes page parsed into `soup`.
notes = []

# The loop below expects groups numbered 01..13 in the element ids.
NUM_NOTE_GROUPS = 13

# All note grids in page order; looked up once instead of once per group.
# The i-th grid is taken to belong to the i-th numbered group.
note_grids = soup.find_all('div', class_='grid-x grid-margin-y grid-margin-x')

for i in range(1, NUM_NOTE_GROUPS + 1):
    group_id = str(i).zfill(2)  # ids are zero-padded: 01, 02, ...
    group_name = soup.find('div', id=f'groupnotes_group_{group_id}_title').h2.text.strip().lower()
    description = soup.find('div', id=f'descnotes_group_{group_id}_title').text.strip()

    grid = note_grids[i-1]

    # Walk only the direct child *tags* of the grid. Iterating the grid
    # itself also yields whitespace text nodes, and `.img` on a text node
    # raises AttributeError.
    for note in grid.find_all(True, recursive=False):
        img = note.img
        if img is None:
            # Child element without an image: nothing to record.
            continue
        note_name = img['alt'].lower()
        note_img = img['src']
        notes.append((group_name, description, note_name, note_img))


In [10]:
# Turn the scraped tuples into a table; column order mirrors the tuple layout.
note_columns = ['note_group', 'group_description', 'note', 'note_img']
notes_df = pd.DataFrame(notes, columns=note_columns)

# Spot-check a few random rows.
notes_df.sample(5)

Unnamed: 0,note_group,group_description,note,note_img
149,"fruits, vegetables and nuts",Fruity notes beyond citrus (which form a class...,hog plum,https://fimgs.net/mdimg/sastojci/m.782.jpg?173...
711,"greens, herbs and fougeres","By the term ""green"" we refer to notes of snapp...",ivy,https://fimgs.net/mdimg/sastojci/m.192.jpg?173...
224,"fruits, vegetables and nuts",Fruity notes beyond citrus (which form a class...,shea butter,https://fimgs.net/mdimg/sastojci/m.1142.jpg?17...
1318,beverages,Fragrances often recreate the scent of popular...,blackcurrant juice,https://fimgs.net/mdimg/sastojci/m.1750.jpg?17...
69,"fruits, vegetables and nuts",Fruity notes beyond citrus (which form a class...,bearberry,https://fimgs.net/mdimg/sastojci/m.344.jpg?173...


In [11]:
# Persist the Fragrantica notes table (the row index is written as well).
notes_csv_path = "notes.csv"
notes_df.to_csv(notes_csv_path)

# Scraping descriptions from FindAScent

In [None]:
# In this cell we search FindAScent.com's perfume notes,
# which are organized alphabetically in different pages,
# and dump the parsed htmls in a list to go through later.

service = Service(GeckoDriverManager().install())

# Letters that have no notes page on the site.
LETTERS_WITHOUT_NOTES = {'x', 'z'}

# List to hold the parsed html of each index page, in alphabetical order
soups = []

# One browser session serves every page; launching a new Firefox window
# per letter is slow and gains nothing.
with Browser("firefox", service=service, headless=False) as browser:
    # Loop through perfume note pages alphabetically
    # and save the parsed html to the soups list
    for char in ascii_lowercase:
        if char in LETTERS_WITHOUT_NOTES:
            continue
        browser.visit(f"https://www.findascent.com/notes/{char}/")
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        soups.append(soup)

In [None]:
# In this cell we gather the note names and short descriptions
# from the list of index-page htmls we gathered.

# Collect (note, desc) pairs from every page first; the frame is built once
# at the end instead of being re-concatenated for every single row.
short_rows = []
for soup in soups:
    names = [tag.text for tag in soup.find_all('div', class_="fbox-content px-0")]
    descs = [tag.text for tag in soup.find_all('div', class_='description')]
    # Names and descriptions are paired by position on the page.
    short_rows.extend(zip(names, descs))

# Rows are stored last-scraped-first (the same order produced by prepending
# each new row), so later cells and the saved CSV keep their row order.
notes_and_descs = pd.DataFrame(short_rows[::-1], columns=['note', 'desc'])

In [None]:
# Exploratory cell: fetch the individual note pages for the notes listed on
# the last index page that was scraped.

# Read the index page explicitly from `soups` rather than relying on the
# loop variable `soup` left over from an earlier cell.
index_soup = soups[-1]
names = [tag.text for tag in index_soup.find_all('div', class_="fbox-content px-0")]

# Reformat names for the url: lower-case, spaces replaced by dashes
names = [name.lower().replace(' ', '-') for name in names]

# Note pages go in their own list so `soups` keeps holding index pages only
# and the cell that parses `soups` can be re-run safely.
note_page_soups = []

# A single browser session is reused for every note page.
with Browser("firefox", service=service, headless=False) as browser:
    for name in names:
        if not name:
            # Nothing to build a url from.
            continue
        char = name[0]  # note pages live under their first letter
        browser.visit(f"https://www.findascent.com/notes/{char}/{name}.php")
        note_page_soups.append(BeautifulSoup(browser.html, "html.parser"))

In [None]:
# Here we use the perfume note names we already got
# and go to their individual pages to get longer descriptions.
#
# NOTE: this cell rebinds notes_and_descs, so running it a second time
# adds the long descriptions again.

# list of scraped perfume note names
names = list(notes_and_descs['note'].values)

# (note, long description) pairs, in the order the pages were visited
long_rows = []

# One headless browser session is reused for all note pages instead of
# launching a new browser per note.
with Browser("firefox", service=service, headless=True) as browser:
    for name in names:
        formatted_name = name.lower().replace(' ','-')  # reformat for http
        char = formatted_name[0]
        browser.visit(f'https://www.findascent.com/notes/{char}/{formatted_name}.php')
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")

        # The long description is taken from the first <p> on the page.
        first_paragraph = soup.find('p')
        if first_paragraph is None:
            # Page without any paragraph (e.g. missing page): report and move on
            # rather than aborting the whole scrape.
            print(f"No description paragraph found for {name!r}; skipping")
            continue
        long_rows.append((name, first_paragraph.text))

# Put the long descriptions in front of the existing rows, last-visited
# first, which is the order produced by prepending one row at a time.
if long_rows:
    long_descs = pd.DataFrame(long_rows[::-1], columns=notes_and_descs.columns)
    notes_and_descs = pd.concat([long_descs, notes_and_descs], ignore_index=True)

In [80]:
# Inspect the combined table: the long descriptions were placed in front of
# the short ones, so each note can appear more than once.
notes_and_descs

Unnamed: 0,note,desc
0,Absinthe,"Absinthe, often characterized by its green hue..."
1,Acacia Honey,"Acacia Honey, often derived from the blossoms ..."
2,Acai Berry,"Acai Berry, derived from the Acai palm tree na..."
3,Acajou Wood,"Acajou Wood, also known as African Mahogany (K..."
4,Adoxal,Adoxal is a synthetic compound widely utilized...
...,...,...
1799,Adoxal,"Adoxal infuses fragrances with a fresh, oceani..."
1800,Acajou Wood,"Acajou Wood, or African Mahogany, enriches fra..."
1801,Acai Berry,"Acai Berry adds a sweet, exotic, and vibrant f..."
1802,Acacia Honey,"Acacia Honey provides a sweet, floral, and war..."


In [81]:
# Persist the FindAScent descriptions table (the row index is written as well).
findascent_csv_path = 'findascent_note_descriptions.csv'
notes_and_descs.to_csv(findascent_csv_path)