In [1]:
import numpy as np
import fasttext as ft
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
perfume_df = pd.read_csv("/Users/rura/Desktop/INONO/noon_perfumes_dataset.csv")
scent = perfume_df[["base_note", "scents"]]
scent

Unnamed: 0,base_note,scents
0,"Oakmoss, Patchouli, Vetiver",Woody
1,"Vanilla, Sandalwood, Patchouli",Floral
2,"Lemon, Mint, Wood Moss",Arabian
3,"Cashmere Wood, Moss, Rippled Sand Accord",Spicy
4,"Vanille, Benzoin, Tonka Bean",Arabian
...,...,...
998,"Coconut Accords, Wood, Musk",Citrus
999,"Musk, Vanilla",Fruity
1000,"Musk, Amber, Wood",Woody
1001,"Agarwood, Amber",Arabian


In [3]:
scent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 0 to 1002
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   base_note  1003 non-null   object
 1   scents     1003 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [5]:
#map categories into numbers by getting unique scent categories
scent_category = scent['scents'].unique()

#make dictionary of the category mappings
dict_map = {}
for x in range(len(scent_category)):
    dict_map[scent_category[x]] = x

for x in dict_map:
    print(x, ":", dict_map[x])

Woody : 0
Floral : 1
Arabian : 2
Spicy : 3
Oriental : 4
Fruity : 5
Fresh : 6
Citrus : 7
Vanilla : 8
Musk : 9
Oriental, Floral : 10
Sweet Aromatic : 11
Aromatic : 12
Woody, Sweet : 13
Aromatic,Citrus : 14
Clean : 15
Woody, Musky : 16
Floral and Oriental : 17
Jasmine : 18
Woody And Spicy : 19
Rose, Floral : 20
Sandalwood : 21


In [6]:
#new dataframe with all the data with an numerical category column
df_index = scent.copy()
df_index["category_index"] = scent.scents.map(dict_map)

#new dataframe with only product_title and category_index
df = df_index[['base_note', 'category_index']]
df.head(10)


Unnamed: 0,base_note,category_index
0,"Oakmoss, Patchouli, Vetiver",0
1,"Vanilla, Sandalwood, Patchouli",1
2,"Lemon, Mint, Wood Moss",2
3,"Cashmere Wood, Moss, Rippled Sand Accord",3
4,"Vanille, Benzoin, Tonka Bean",2
5,"Vanille, Benzoin, Tonka Bean",2
6,Roasted Cocoa,4
7,"Labdanum, Musk",5
8,Sandalwood,1
9,"Cistus, Myrrh, Oud, Amber",2


In [7]:
## using fasttext
#add __label__ in front of the labels for fastText to read
#df.iloc[:,1] #select category column
df_labeled = df.copy()
df_labeled['category_index'] = '__label__' + df_labeled['category_index'].astype(str)
df_labeled.head()

Unnamed: 0,base_note,category_index
0,"Oakmoss, Patchouli, Vetiver",__label__0
1,"Vanilla, Sandalwood, Patchouli",__label__1
2,"Lemon, Mint, Wood Moss",__label__2
3,"Cashmere Wood, Moss, Rippled Sand Accord",__label__3
4,"Vanille, Benzoin, Tonka Bean",__label__2


In [8]:
#put category and product_title together
category_prod = df_labeled['category_index'] + " " + df_labeled['base_note']
print(category_prod)

0                  __label__0 Oakmoss, Patchouli, Vetiver
1               __label__1 Vanilla, Sandalwood, Patchouli
2                       __label__2 Lemon, Mint, Wood Moss
3       __label__3 Cashmere Wood, Moss, Rippled Sand A...
4                 __label__2 Vanille, Benzoin, Tonka Bean
                              ...                        
998                __label__7 Coconut Accords, Wood, Musk
999                              __label__5 Musk, Vanilla
1000                         __label__0 Musk, Amber, Wood
1001                           __label__2 Agarwood, Amber
1002    __label__0 Virginia Cedar, Leather, Suede, Vet...
Length: 1003, dtype: object


In [9]:
## GET RID OF LINES THAT DON'T START WITH __LABEL__
print(len(category_prod))
count = 0
for line in range(len(category_prod)-1, -1, -1):
    match = re.search(r'\d\d\d\d\d\d\d\d\d\d', category_prod[line])
    try:
        if not match.group() == "":
            category_prod = category_prod.drop(line)
            count+=1
    except:
        count += 0
print(count)
print(len(category_prod))

1003
0
1003


There is no product without a label

In [10]:
#split into train and test
train, test = train_test_split(category_prod, test_size=0.1, train_size=0.9, random_state=42)
#write test and train into files
f_train = open("train.txt", "a")
for i in range(len(np.array(train))):
    f_train.write(np.array(train)[i] + "\n")
f_train.close()

f_test = open("test.txt", "a")
for i in range(len(np.array(test))):
    f_test.write(np.array(test)[i] + "\n")
f_test.close()

In [11]:
#train model
model = ft.train_supervised(input="train.txt")

Read 0M words
Number of words:  485
Number of labels: 22
Progress:   0.0% words/sec/thread:       0 lr:  0.100000 avg.loss:  3.102830 ETA: 720h 0m 0s

RuntimeError: Encountered NaN.

In [26]:
#test using the model
model.test("test.txt") #(n, precision, recall)

(300, 0.30333333333333334, 0.30333333333333334)