## Importamos las librerías

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tensorflow.keras import layers, models


### Cargamos csv

In [3]:
df = pd.read_csv("Libro3.csv", sep=';', encoding='latin1')

### Comenzamos EDA

**Hacemos un análisis de las columnas.**

In [4]:
df.head()

Unnamed: 0,Perfume,Brand,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,accento-overdose-pride-edition,xerjoff,unisex,142,201,2022,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",rose,woody,fruity,aromatic,floral
1,classique-pride-2024,jean-paul-gaultier,women,186,70,2024,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",citrus,white floral,sweet,fresh,musky
2,classique-pride-2023,jean-paul-gaultier,unisex,191,285,2023,"blood orange, yuzu","neroli, orange blossom","musk, white woods",citrus,white floral,sweet,fresh spicy,musky
3,pride-edition-man,bruno-banani,men,192,59,2019,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",fruity,nutty,woody,tropical,
4,le-male-pride-collector,jean-paul-gaultier,men,193,632,2020,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",aromatic,warm spicy,fresh spicy,cinnamon,vanilla


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12705 entries, 0 to 12704
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Perfume       12705 non-null  object
 1   Brand         12705 non-null  object
 2   Gender        12705 non-null  object
 3   Rating Value  12705 non-null  object
 4   Rating Count  12705 non-null  int64 
 5   Year          12705 non-null  int64 
 6   Top           12705 non-null  object
 7   Middle        12705 non-null  object
 8   Base          12705 non-null  object
 9   mainaccord1   12705 non-null  object
 10  mainaccord2   12700 non-null  object
 11  mainaccord3   12644 non-null  object
 12  mainaccord4   12507 non-null  object
 13  mainaccord5   12182 non-null  object
dtypes: int64(2), object(12)
memory usage: 1.4+ MB


Vemos que todas las columnas tiene valores de tipo 'objeto', a continuación comprobaremos los valores nulos.

In [6]:
df.isnull().sum()

Perfume           0
Brand             0
Gender            0
Rating Value      0
Rating Count      0
Year              0
Top               0
Middle            0
Base              0
mainaccord1       0
mainaccord2       5
mainaccord3      61
mainaccord4     198
mainaccord5     523
dtype: int64

Hay algunos valores nulos en los últimos acordes; para posteriormente hacer PCAs es necesario que no haya valores nulos, por lo que a estos imputaremos valores.

A la hora de imputar los valores imputar con valores como la mediana podría dar problemas en el anális posterior ya que este valor sería el de un aroma, lo que podría adulterar los datos sobre cada perfume, por lo que les daaremos el valor 'none' para que tras su posterior codificación tengan todos los valores nulos le mismo valor codificado.

In [None]:
accordcols = [c for c in df.columns if c.startswith("mainaccord")]
df[accordcols] = df[accordcols].fillna("none")


Definimos las columnas mainaccord y las guardamos en main_cols:

In [8]:
main_cols = [c for c in df.columns if c.startswith('mainaccord')]
print("Columnas detectadas:", main_cols)

Columnas detectadas: ['mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']


Codificamos las columnas aplicando la misma condición a todas las columnas, si en la columna 1 el aroma 'x' tiene valor 1, en el resto de columnas también lo será.

In [9]:

ordered = []
for c in main_cols:
    for v in df[c].dropna().astype(str):
        if v not in ordered:
            ordered.append(v)

mapping = {v: i for i, v in enumerate(ordered, start=1)}  
print("Número de categorías encontradas:", len(mapping))
print("Ejemplo de mapping (primeros 10):", dict(list(mapping.items())[:10]))

for c in main_cols:
    df[c] = df[c].map(mapping)


Número de categorías encontradas: 81
Ejemplo de mapping (primeros 10): {'rose': 1, 'citrus': 2, 'fruity': 3, 'aromatic': 4, 'white floral': 5, 'woody': 6, 'powdery': 7, 'leather': 8, 'ozonic': 9, 'vinyl': 10}


In [10]:
df['Top'].head()

0             fruity notes, aldehydes, green notes
1                                   yuzu, citruses
2                               blood orange, yuzu
3                   guarana, grapefruit, red apple
4    mint, lavender, cardamom, artemisia, bergamot
Name: Top, dtype: object

In [11]:

##Ahora vamos a normalizar las notas de top, middle, base
def normalize_notes(notes):
    if not isinstance(notes, str):
        return []
    return [n.strip().lower() for n in notes.split(',') if n.strip()]

print(df['Top'].head())

df['Top'] = df['Top'].apply(normalize_notes)
df['Middle'] = df['Middle'].apply(normalize_notes)
df['Base'] = df['Base'].apply(normalize_notes)

0             fruity notes, aldehydes, green notes
1                                   yuzu, citruses
2                               blood orange, yuzu
3                   guarana, grapefruit, red apple
4    mint, lavender, cardamom, artemisia, bergamot
Name: Top, dtype: object


In [12]:
##Contruimos vocabulario global de notas:

all_notes = set()

for col in ['Top', 'Middle', 'Base']:
    df[col].apply(lambda x: all_notes.update(x))

all_notes = sorted(all_notes)
print(len(all_notes))

1502


In [13]:
note_to_idx = {note: i for i, note in enumerate(all_notes)}
print(note_to_idx)

{'absinthe': 0, 'acai berry': 1, 'accord eudora©': 2, 'acerola': 3, 'acerola blossom': 4, 'acetylfuran': 5, 'ac\xa0cia': 6, 'african geranium': 7, 'african ginger': 8, 'african orange flower': 9, 'agarwood': 10, 'agarwood (oud)': 11, 'agave': 12, 'agave nectar': 13, 'aglaia': 14, 'akigalawood': 15, 'aldehydes': 16, 'algae': 17, 'allspice': 18, 'almond': 19, 'almond blossom': 20, 'almond cream': 21, 'almond milk': 22, 'almond tree': 23, 'aloe vera': 24, 'alpinia': 25, 'althaea': 26, 'alumroot': 27, 'amalfi lemon': 28, 'amaretto': 29, 'amaryllis': 30, 'amazon lily': 31, 'amber': 32, 'amber oil': 33, 'amber xtreme': 34, 'ambergris': 35, 'ambertonic': 36, 'amberwood': 37, 'ambrarome': 38, 'ambreine': 39, 'ambretone': 40, 'ambrette': 41, 'ambrette (musk mallow)': 42, 'ambrettolide': 43, 'ambrocenide': 44, 'ambrofix?': 45, 'ambrostar?': 46, 'ambrox super': 47, 'ambroxan': 48, 'amyl salicylate': 49, 'amyris': 50, 'angelica': 51, 'angels trumpet': 52, 'animal notes': 53, 'anise': 54, 'antillon

In [14]:
##Definimos pesos
WEIGHTS = {
    'top': 0.9,
    'middle': 0.7,
    'base': 0.4
}


In [15]:
#Vectorizamos perfumes

def build_weighted_perfume(row, note_to_idx, weights):
    vec = np.zeros(len(note_to_idx))

    for n in row['Top']:
        if n in note_to_idx:
            vec[note_to_idx[n]] += weights['top']

    for n in row['Middle']:
        if n in note_to_idx:
            vec[note_to_idx[n]] += weights['middle']
    
    for n in row['Base']:
        if n in note_to_idx:
            vec[note_to_idx[n]] += weights['base']

    return vec


In [16]:
##Creacion matriz completa de las notas

X_notes = np.vstack(df.apply(lambda row: build_weighted_perfume(row, note_to_idx, WEIGHTS), axis=1))

print(X_notes.shape)

(12705, 1502)


In [17]:
#Verificamos dimensiones
print(X_notes.shape)
print(df.shape)


(12705, 1502)
(12705, 14)


In [18]:
accord_cols = [
    'mainaccord1',
    'mainaccord2',
    'mainaccord3',
    'mainaccord4',
    'mainaccord5'
]


In [19]:
def normalize_accord(a):
    if pd.isna(a):
        return None
    return str(a).strip().lower()


In [20]:
for col in accord_cols:
    df[col] = df[col].apply(normalize_accord)


In [21]:
accord_vocab = set()

for col in accord_cols:
    accord_vocab.update(df[col].dropna().unique())

accord_vocab = sorted(accord_vocab)
len(accord_vocab), accord_vocab[:10]


(81, ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18'])

In [22]:
accord_to_idx = {a: i for i, a in enumerate(accord_vocab)}

In [23]:
#Ejemplo de pesos(relevancia) por acordes
ACCORD_WEIGHTS = {
    1: 1.0,
    2: 0.8,
    3: 0.8,
    4: 0.7,
    5: 0.5
}


In [24]:
def build_accord_vector(row, accord_to_idx, weights):
    vec = np.zeros(len(accord_to_idx))

    for i, col in enumerate(accord_cols, start=1):
        accord = row[col]
        if accord in accord_to_idx:
            vec[accord_to_idx[accord]] += weights[i ]

    return vec

In [25]:
X_accords = np.vstack(
    df.apply(
        lambda row: build_accord_vector(row, accord_to_idx, ACCORD_WEIGHTS),
        axis=1
    )
)

X_accords.shape


(12705, 81)

In [26]:
print(X_notes.shape, X_accords.shape)


(12705, 1502) (12705, 81)


In [27]:
##Concatenamos acordes y notas

X_total = np.hstack([X_notes, X_accords])
print(X_total.shape)

(12705, 1583)


In [46]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer(norm='l2')
X = normalizer.fit_transform(X_total)

name_to_idx = pd.Series(df.index.values, index=df["Perfume"].astype(str)).to_dict()

def topn_similarities(perfume_name, topn= 10, filtrar_genero = None, misma_marca = None):
    if perfume_name not in name_to_idx:
        raise ValueError(f"Perfume '{perfume_name}' no encontrado en el dataset.")
    
    i = name_to_idx[perfume_name]
    q = X[i].reshape(1, -1)

    sims = cosine_similarity(q, X).ravel()
    sims[i] = -1.0  # excluir el mismo perfume

    mask = np.ones(len(df), dtype=bool)

    if filtrar_genero is not None:
        # ejemplo: filtrar_genero = ["men", "unisex"]
        mask &= df["Gender"].isin(filtrar_genero).values
    
    if misma_marca is True:
        mask &= (df["Brand"].values == df.loc[i, "Brand"])
    elif isinstance(misma_marca, str):
        mask &= (df["Brand"].values == misma_marca)

    # aplica máscara
    valid_idx = np.where(mask)[0]
    valid_sims = sims[valid_idx]

    # topn sin ordenar todo
    k = min(topn, len(valid_idx))
    top_local = np.argpartition(-valid_sims, kth=k-1)[:k]
    top_idx = valid_idx[top_local]

    # orden final por score
    top_idx = top_idx[np.argsort(-sims[top_idx])]

    out = df.loc[top_idx, ["Perfume", "Brand", "Gender", "Year"]].copy()
    out["sim"] = sims[top_idx]
    return out.reset_index(drop=True)

In [47]:
topn_similarities("le-male-pride-collector", topn=10, filtrar_genero=["men","unisex"])


Unnamed: 0,Perfume,Brand,Gender,Year,sim
0,le-male-eau-de-toilette-airlines,jean-paul-gaultier,men,2020,0.958244
1,le-male-pirate-edition,jean-paul-gaultier,men,2015,0.958244
2,le-male-collector-edition-2017,jean-paul-gaultier,men,2017,0.937901
3,infinite-navy,caline,men,2022,0.910458
4,le-male-collector-s-snow-globe,jean-paul-gaultier,men,2019,0.895075
5,max,paris-elysees,men,2015,0.890962
6,le-male-couple,jean-paul-gaultier,men,2013,0.88651
7,le-male-x-mas-edition-2020,jean-paul-gaultier,men,2020,0.885439
8,le-male-collector-edition-2018,jean-paul-gaultier,men,2018,0.80621
9,evo,nuancielo,men,2018,0.727731


In [48]:
q = "le-male-pride-collector"
rec = topn_similarities(q, topn=10)
df[df["Perfume"].eq(q)][["Perfume","Brand","Top","Middle","Base","mainaccord1","mainaccord2","mainaccord3","mainaccord4","mainaccord5"]]
rec.merge(df[["Perfume","Brand","Top","Middle","Base","mainaccord1","mainaccord2","mainaccord3","mainaccord4","mainaccord5"]],
          on=["Perfume","Brand"], how="left")


Unnamed: 0,Perfume,Brand,Gender,Year,sim,Top,Middle,Base,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,le-male-eau-de-toilette-airlines,jean-paul-gaultier,men,2020,0.958244,"[mint, cardamom, lavender, bergamot, artemisia]","[cinnamon, orange blossom, caraway]","[vanilla, tonka bean, cedar, sandalwood, amber]",4,13,26,17,6
1,le-male-pirate-edition,jean-paul-gaultier,men,2015,0.958244,"[mint, lavender, cardamom, artemisia, bergamot]","[cinnamon, orange blossom, caraway]","[tonka bean, vanilla, sandalwood, amber, cedar]",4,13,26,17,16
2,le-male-collector-edition-2017,jean-paul-gaultier,men,2017,0.937901,"[lavender, mint, cardamom, artemisia, bergamot]","[cinnamon, orange blossom, caraway]","[vanilla, tonka bean, amber, sandalwood, cedar]",4,17,26,20,13
3,infinite-navy,caline,men,2022,0.910458,"[lavender, bergamot, cardamom, mint]","[cinnamon, caraway, cedar]","[vanilla, tonka bean, amber, sandalwood]",13,4,17,26,58
4,le-male-collector-s-snow-globe,jean-paul-gaultier,men,2019,0.895075,"[lavender, cardamom, mint, bergamot, artemisia]","[orange blossom, cinnamon, caraway]","[vanilla, tonka bean, sandalwood, amber, cedar]",4,17,13,15,6
5,max,paris-elysees,men,2015,0.890962,"[lavender, mint, artemisia, bergamot]","[orange blossom, cinnamon, cardamom, cumin]","[vanilla, sandalwood, amber, tonka bean, moss,...",4,13,17,26,6
6,le-male-couple,jean-paul-gaultier,men,2013,0.88651,"[lavender, mint, bergamot, artemisia, cardamom]","[orange blossom, cinnamon, caraway]","[vanilla, cedar, sandalwood, amber, tonka bean]",4,6,17,26,20
7,le-male-x-mas-edition-2020,jean-paul-gaultier,men,2020,0.885439,"[lavender, cardamom, mint, artemisia, bergamot]","[cinnamon, orange blossom, caraway]","[sandalwood, vanilla, tonka bean, cedar, amber]",13,4,6,17,20
8,le-male-collector-edition-2018,jean-paul-gaultier,men,2018,0.80621,"[lavender, artemisia, mint, cardamom, bergamot]","[cinnamon, orange blossom, cumin]","[vanilla, tonka bean, amber, sandalwood, cedar]",17,4,15,20,13
9,evo,nuancielo,men,2018,0.727731,"[apple, orange blossom, bergamot]","[cinnamon, cloves, lavender, geranium, cardamom]","[vanilla, sandalowood, tonka bean, vetiver, ce...",13,17,4,58,26
