### Document embedding

In [1]:
import pandas as pd

#### load supermarket dataset

In [2]:
df = pd.read_csv('./short_raw_supermarket.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   category_id  457 non-null    object 
 1   internal_id  457 non-null    int64  
 2   description  457 non-null    object 
 3   format       452 non-null    object 
 4   weight       299 non-null    object 
 5   brand        457 non-null    object 
 6   units        457 non-null    int64  
 7   image_loc    457 non-null    object 
 8   price_date   457 non-null    object 
 9   price        457 non-null    float64
 10  pricebase    330 non-null    float64
 11  promotion    395 non-null    object 
 12  source_id    457 non-null    int64  
 13  source_name  457 non-null    object 
dtypes: float64(2), int64(3), object(9)
memory usage: 50.1+ KB


In [3]:
col_names = df.columns[[2, 3, 4, 5, 6, 8, 9, 13]]

In [4]:
df[col_names]

Unnamed: 0,description,format,weight,brand,units,price_date,price,source_name
0,FONT VELLA AGUA MINERAL NATURAL BOTELLA 33 CL,33 CL,8 L,FONT VELLA,1,2023-03-12,0.95,DIA
1,LANJARON AGUA MINERAL NATURAL BOTELLA TAPÓN SP...,33 CL,330ML,LANJARON,1,2023-03-12,0.65,DIA
2,"AGUA MINERAL NATURAL KIDS ZOO LANJARON, BOTELL...",33 CL,,LANJARON,1,2023-03-12,0.75,CAPRABO
3,FONTAREL ZERO SODIO AGUA MINERAL NATURAL SIN G...,"1,50 L",,FONTAREL,1,2023-03-20,0.65,ECI
4,PERRIER AGUA MINERAL CON GAS BOTELLA 50 CL,50 CL,1 L,PERRIER,1,2023-03-12,1.13,DIA
...,...,...,...,...,...,...,...,...
452,"ACEITE LABIAL GINSENG 2 ESSENCE, PACK 1 UD",G 2,,ESSENCE,1,2023-03-08,2.99,EROSKI
453,BARRA DE LABIOS COLOR SENSATIONAL 211 MAYBELLI...,"4,2 G",,MAYBELLINE,1,2023-03-08,5.99,EROSKI
454,"AGUA DE COLONIA AGUA FRESCA ALVAREZ GÓMEZ, SPR...",BOTELLA,750 ML,ALVAREZ GÓMEZ,1,2023-03-12,6.89,CAPRABO
455,"HERVIDOR DE AGUA TAURUS VINTAGE RED, CAPACIDAD...",50 CL,5 L,TAURUS VINTAGE RED,1,2023-03-12,29.90,ALCAMPO


In [5]:
df['description'] = df.description.astype('str')

#### preprocess data

In [6]:
from gensim.utils import simple_preprocess

In [7]:
preprocessed_data = df.description.apply(simple_preprocess)
preprocessed_data

0      [font, vella, agua, mineral, natural, botella,...
1      [lanjaron, agua, mineral, natural, botella, ta...
2      [agua, mineral, natural, kids, zoo, lanjaron, ...
3      [fontarel, zero, sodio, agua, mineral, natural...
4        [perrier, agua, mineral, con, gas, botella, cl]
                             ...                        
452         [aceite, labial, ginseng, essence, pack, ud]
453    [barra, de, labios, color, sensational, maybel...
454    [agua, de, colonia, agua, fresca, alvarez, góm...
455    [hervidor, de, agua, taurus, vintage, red, cap...
456               [aceite, en, crema, oliva, mítica, ml]
Name: description, Length: 457, dtype: object

#### tag documents

In [8]:
from gensim.models.doc2vec import TaggedDocument

In [9]:
tagged_data = [TaggedDocument(words = words, tags = [str(i)]) for i, words in enumerate(preprocessed_data)]
tagged_data[0]

TaggedDocument(words=['font', 'vella', 'agua', 'mineral', 'natural', 'botella', 'cl'], tags=['0'])

#### Doc2Vec model

In [10]:
from gensim.models.doc2vec import Doc2Vec

In [11]:
d2v = Doc2Vec(dm = 0, vector_size = 10, window = 20, alpha = 0.025, workers =4)
d2v

<gensim.models.doc2vec.Doc2Vec at 0x7f6cbef68430>

- dm: training algorithm (0: PV-DBOW, 1: PV-DM)
- vector_size: dimension of the feature vectors
- window: max. distance between related words in a sentence
- alpha: learning rate
- min_count: ignore words with frequency lower than this
- max_vocab_size: unique words (10M words~ 1GB, prune infrequent words if not enough memory)

In [12]:
d2v.vector_size, d2v.window, d2v.alpha, d2v.workers, d2v.dm

(10, 20, 0.025, 4, False)

#### build document corpus

In [13]:
d2v.build_vocab(tagged_data)

In [14]:
d2v.corpus_count, d2v.corpus_total_words

(457, 3432)

#### train model

In [15]:
%%time
d2v.train(tagged_data, total_examples =d2v.corpus_count, epochs = 50)

CPU times: user 616 ms, sys: 12.4 ms, total: 629 ms
Wall time: 622 ms


#### word vectors and document vectors

In [16]:
d2v.wv, d2v.dv

(<gensim.models.keyedvectors.KeyedVectors at 0x7f6cbef682b0>,
 <gensim.models.keyedvectors.KeyedVectors at 0x7f6cbef68310>)

In [17]:
len(d2v.wv), len(d2v.dv)

(145, 457)

In [18]:
d2v.wv['botella'], d2v.dv['0']

(array([-0.0960355 ,  0.05007293, -0.08759586, -0.04391825, -0.000351  ,
        -0.00296181, -0.0766124 ,  0.09614743,  0.04982058,  0.09233143],
       dtype=float32),
 array([ 0.37936682,  0.32132193, -0.19728471,  0.14251503,  0.0678161 ,
         0.06681497, -0.21811473, -0.6779954 , -0.4435215 ,  0.23385447],
       dtype=float32))

#### inference

In [19]:
d2v.infer_vector(preprocessed_data[0])

array([ 0.39481533,  0.4397277 , -0.13165095,  0.05953737, -0.01857865,
        0.09581095, -0.09298267, -0.7296168 , -0.31638792,  0.20975013],
      dtype=float32)

#### retrieve similar sentences

In [20]:
query = d2v.infer_vector(['agua', 'mineral', 'carrefour', 'con', 'limón', 'pomelo'])
mss = d2v.dv.most_similar(query, topn =5)
preprocessed_data[10], mss

(['agua', 'mineral', 'carrefour', 'con', 'limón', 'pomelo'],
 [('3', 0.9950123429298401),
  ('255', 0.9940841794013977),
  ('198', 0.9902231097221375),
  ('81', 0.9894995093345642),
  ('18', 0.9886177182197571)])

In [21]:
df.iloc[[int(index) for index, similarity in mss], ]

Unnamed: 0,category_id,internal_id,description,format,weight,brand,units,image_loc,price_date,price,pricebase,promotion,source_id,source_name
3,B00-01-30,60913,FONTAREL ZERO SODIO AGUA MINERAL NATURAL SIN G...,"1,50 L",,FONTAREL,1,https://scrape.codeworks.es/imgs/10/0011863000...,2023-03-20,0.65,,5€ DE DESCUENTO POR COMPRA SUPERIOR A 20€. INT...,10,ECI
255,D40-41-02,13036,NESCAFÉ CLASSIC NATURAL - CAFÉ SOLUBLE 100G,100 G,200 G,NESCAFÉ,1,https://scrape.codeworks.es/imgs/7/156906_3.jpg,2023-03-12,4.3,7.89,70% DESCUENTO EN LA 2ª UNIDAD,7,CAPRABO
198,D30-20-02,9693,ACEITE DE GIRASOL Y SOJA CON OMEGA 3 LA ESPAÑO...,1 L,1000ML,LA ESPAÑOLA,1,https://scrape.codeworks.es/imgs/6/89524459274...,2023-03-14,4.33,,,6,ALCAMPO
81,B90-00-10,3358,ZUMO DE MANZANA DON SIMON BRICK DE 1 L.,1 L,600ML,DON SIMON,1,https://scrape.codeworks.es/imgs/6/88695537664...,2023-03-12,1.44,2.29,OFERTA VÁLIDA HASTA EL 09/04/2023,6,ALCAMPO
18,B00-03-12,70351,AGUA MINERAL CON GAS PERRIER SABOR MELOCOTÓN 5...,50 CL,,PERRIER,1,https://scrape.codeworks.es/imgs/2/288883_00_1...,2023-05-15,0.95,,,2,CARREFOUR\n


#### check

In [22]:
i = 10
preprocessed_data[i], d2v.dv[i]

(['agua', 'mineral', 'carrefour', 'con', 'limón', 'pomelo'],
 array([ 0.35393962,  0.32939717, -0.11443932,  0.09357431, -0.02630672,
         0.0212159 , -0.13578933, -0.4613815 , -0.23472516,  0.221645  ],
       dtype=float32))

In [23]:
query = d2v.dv[i]
d2v.dv.most_similar(query, topn = 1)

[('10', 0.9999999403953552)]

In [24]:
query = d2v.infer_vector(['agua', 'mineral', 'carrefour', 'con', 'limón', 'pomelo'])
query

array([ 0.41957334,  0.3971932 , -0.04735447, -0.03832476,  0.00390296,
        0.07548583, -0.12911806, -0.56673425, -0.26900694,  0.21374774],
      dtype=float32)

In [25]:
d2v.dv.most_similar(query, topn = 1)

[('255', 0.9953171610832214)]