In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy import stats
import statsmodels.api as sm


In [51]:
# read in data
frag_data = pd.read_csv('./data/fra_cleaned.csv')

frag_data


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 24136: invalid continuation byte

In [56]:
# # read in data with explicit encoding to handle special characters
# frag_data = pd.read_csv('./data/fra_cleaned.csv', encoding='latin-1')

# # alternatively, try these encodings if latin-1 doesn't work:
# # frag_data = pd.read_csv('./data/fra_cleaned.csv', encoding='iso-8859-1') 
# # frag_data = pd.read_csv('./data/fra_cleaned.csv', encoding='cp1252')

# Try reading with a different delimiter and error handling
frag_data = pd.read_csv('./data/fra_cleaned.csv', 
                        encoding='latin-1',
                        sep=None, # Let pandas detect the separator
                        engine='python', # More flexible but slower engine
                        on_bad_lines='skip') # Warn about skipped rows

# Display info about the dataframe structure
print("\nDataframe Info:")
print(frag_data.info())

# Display first few rows to check parsing
print("\nFirst few rows:")
print(frag_data.head())



Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24063 entries, 0 to 24062
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           24063 non-null  object 
 1   Perfume       24063 non-null  object 
 2   Brand         24063 non-null  object 
 3   Country       24063 non-null  object 
 4   Gender        24063 non-null  object 
 5   Rating Value  24063 non-null  object 
 6   Rating Count  24063 non-null  int64  
 7   Year          22026 non-null  float64
 8   Top           24063 non-null  object 
 9   Middle        24063 non-null  object 
 10  Base          24063 non-null  object 
 11  Perfumer1     24063 non-null  object 
 12  Perfumer2     1336 non-null   object 
 13  mainaccord1   24063 non-null  object 
 14  mainaccord2   24050 non-null  object 
 15  mainaccord3   23949 non-null  object 
 16  mainaccord4   23675 non-null  object 
 17  mainaccord5   23082 non-null  object 
dtypes: float6

In [57]:
frag_data

Unnamed: 0,url,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,Perfumer1,Perfumer2,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,https://www.fragrantica.com/perfume/xerjoff/ac...,accento-overdose-pride-edition,xerjoff,Italy,unisex,142,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",unknown,,rose,woody,fruity,aromatic,floral
1,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2024,jean-paul-gaultier,France,women,186,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",unknown,,citrus,white floral,sweet,fresh,musky
2,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2023,jean-paul-gaultier,France,unisex,191,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",natalie gracia-cetto,quentin bisch,citrus,white floral,sweet,fresh spicy,musky
3,https://www.fragrantica.com/perfume/bruno-bana...,pride-edition-man,bruno-banani,Germany,men,192,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",unknown,,fruity,nutty,woody,tropical,
4,https://www.fragrantica.com/perfume/jean-paul-...,le-male-pride-collector,jean-paul-gaultier,France,men,193,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",francis kurkdjian,,aromatic,warm spicy,fresh spicy,cinnamon,vanilla
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24058,https://www.fragrantica.com/perfume/o-boticari...,floratta,o-boticario,Brazil,women,481,31,1992.0,"ozonic notes, peach","lily-of-the-valley, jasmine","musk, amber",thierry lecoule,,white floral,ozonic,fresh,musky,powdery
24059,https://www.fragrantica.com/perfume/sultan-pas...,cheval-d-arabie,sultan-pasha-attars,UK,unisex,481,26,2020.0,"taif rose, indian oud, white rose, olibanum, e...","bulgarian rose, indian oud, hay, leather, cive...","white amber, leather, hay, elemi resin, kyara ...",sultan pasha,,rose,amber,fresh spicy,animalic,aromatic
24060,https://www.fragrantica.com/perfume/darkbeat-p...,khaox,darkbeat-parfums,Spain,unisex,485,26,2020.0,"mint, lime, rum","lily-of-the-valley, fig leaf, cardamom","musk, oakmoss, amber, patchouli",josé m. giraldo,,green,aromatic,citrus,fresh spicy,woody
24061,https://www.fragrantica.com/perfume/parfumerie...,aoud-no-1,parfumerie-bruckner,Germany,unisex,486,28,2011.0,"apple, peach, saffron","plum, orange blossom, jasmine","agarwood (oud), sandalwood, vanilla",unknown,,fruity,sweet,oud,woody,powdery


In [58]:
# list all columns
frag_data.columns

Index(['url', 'Perfume', 'Brand', 'Country', 'Gender', 'Rating Value',
       'Rating Count', 'Year', 'Top', 'Middle', 'Base', 'Perfumer1',
       'Perfumer2', 'mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4',
       'mainaccord5'],
      dtype='object')

In [59]:
# drop url Perfumer1 Perfumer2
frag_data_selected = frag_data.drop(columns=['url', 'Perfumer1', 'Perfumer2'])
frag_data_selected


Unnamed: 0,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,accento-overdose-pride-edition,xerjoff,Italy,unisex,142,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral
1,classique-pride-2024,jean-paul-gaultier,France,women,186,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",citrus,white floral,sweet,fresh,musky
2,classique-pride-2023,jean-paul-gaultier,France,unisex,191,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky
3,pride-edition-man,bruno-banani,Germany,men,192,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",fruity,nutty,woody,tropical,
4,le-male-pride-collector,jean-paul-gaultier,France,men,193,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24058,floratta,o-boticario,Brazil,women,481,31,1992.0,"ozonic notes, peach","lily-of-the-valley, jasmine","musk, amber",white floral,ozonic,fresh,musky,powdery
24059,cheval-d-arabie,sultan-pasha-attars,UK,unisex,481,26,2020.0,"taif rose, indian oud, white rose, olibanum, e...","bulgarian rose, indian oud, hay, leather, cive...","white amber, leather, hay, elemi resin, kyara ...",rose,amber,fresh spicy,animalic,aromatic
24060,khaox,darkbeat-parfums,Spain,unisex,485,26,2020.0,"mint, lime, rum","lily-of-the-valley, fig leaf, cardamom","musk, oakmoss, amber, patchouli",green,aromatic,citrus,fresh spicy,woody
24061,aoud-no-1,parfumerie-bruckner,Germany,unisex,486,28,2011.0,"apple, peach, saffron","plum, orange blossom, jasmine","agarwood (oud), sandalwood, vanilla",fruity,sweet,oud,woody,powdery


In [60]:
# in Rating Value column replace , with .
frag_data_selected['Rating Value'] = frag_data_selected['Rating Value'].str.replace(',', '.')
frag_data_selected


Unnamed: 0,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,accento-overdose-pride-edition,xerjoff,Italy,unisex,1.42,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral
1,classique-pride-2024,jean-paul-gaultier,France,women,1.86,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",citrus,white floral,sweet,fresh,musky
2,classique-pride-2023,jean-paul-gaultier,France,unisex,1.91,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky
3,pride-edition-man,bruno-banani,Germany,men,1.92,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",fruity,nutty,woody,tropical,
4,le-male-pride-collector,jean-paul-gaultier,France,men,1.93,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24058,floratta,o-boticario,Brazil,women,4.81,31,1992.0,"ozonic notes, peach","lily-of-the-valley, jasmine","musk, amber",white floral,ozonic,fresh,musky,powdery
24059,cheval-d-arabie,sultan-pasha-attars,UK,unisex,4.81,26,2020.0,"taif rose, indian oud, white rose, olibanum, e...","bulgarian rose, indian oud, hay, leather, cive...","white amber, leather, hay, elemi resin, kyara ...",rose,amber,fresh spicy,animalic,aromatic
24060,khaox,darkbeat-parfums,Spain,unisex,4.85,26,2020.0,"mint, lime, rum","lily-of-the-valley, fig leaf, cardamom","musk, oakmoss, amber, patchouli",green,aromatic,citrus,fresh spicy,woody
24061,aoud-no-1,parfumerie-bruckner,Germany,unisex,4.86,28,2011.0,"apple, peach, saffron","plum, orange blossom, jasmine","agarwood (oud), sandalwood, vanilla",fruity,sweet,oud,woody,powdery


In [61]:
# combine Perfume and Brand columns with "by"
frag_data_selected['full_perfume_name'] = frag_data_selected['Perfume'] + ' by ' + frag_data_selected['Brand']
frag_data_selected

# drop Brand column
frag_data_selected = frag_data_selected.drop(columns=['Brand'])
frag_data_selected

# drop Perfume column
frag_data_selected = frag_data_selected.drop(columns=['Perfume'])
frag_data_selected

# drop Gender column
frag_data_selected = frag_data_selected.drop(columns=['Gender'])
frag_data_selected

Unnamed: 0,Country,Rating Value,Rating Count,Year,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,full_perfume_name
0,Italy,1.42,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral,accento-overdose-pride-edition by xerjoff
1,France,1.86,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",citrus,white floral,sweet,fresh,musky,classique-pride-2024 by jean-paul-gaultier
2,France,1.91,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky,classique-pride-2023 by jean-paul-gaultier
3,Germany,1.92,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",fruity,nutty,woody,tropical,,pride-edition-man by bruno-banani
4,France,1.93,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla,le-male-pride-collector by jean-paul-gaultier
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24058,Brazil,4.81,31,1992.0,"ozonic notes, peach","lily-of-the-valley, jasmine","musk, amber",white floral,ozonic,fresh,musky,powdery,floratta by o-boticario
24059,UK,4.81,26,2020.0,"taif rose, indian oud, white rose, olibanum, e...","bulgarian rose, indian oud, hay, leather, cive...","white amber, leather, hay, elemi resin, kyara ...",rose,amber,fresh spicy,animalic,aromatic,cheval-d-arabie by sultan-pasha-attars
24060,Spain,4.85,26,2020.0,"mint, lime, rum","lily-of-the-valley, fig leaf, cardamom","musk, oakmoss, amber, patchouli",green,aromatic,citrus,fresh spicy,woody,khaox by darkbeat-parfums
24061,Germany,4.86,28,2011.0,"apple, peach, saffron","plum, orange blossom, jasmine","agarwood (oud), sandalwood, vanilla",fruity,sweet,oud,woody,powdery,aoud-no-1 by parfumerie-bruckner
