In [None]:
import pandas as pd

In [None]:
# Location of the product catalogue, relative to the notebook's working directory
csv_file_path = 'dataset.csv'

# Load the whole catalogue into memory as a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [4]:
# List the column labels of the loaded catalogue
df.columns

Index(['PROD_ID', 'PROD_NAME', 'PROD_BRAND', 'PROD_LINK', 'PROD_IMAGE_URL',
       'PROD_CATEGORY', 'PROD_PRICE', 'PROD_DESCRIPTION'],
      dtype='object')

Explore the PROD_DESCRIPTION column.

Let's see how many rows there are for each description length:

In [6]:
# Count how many descriptions share each character length,
# ordered from the longest length down to the shortest
length_distribution = (
    df['PROD_DESCRIPTION']
    .str.len()
    .value_counts()
    .sort_index(ascending=False)
)

# Show the resulting length -> row-count table
print(length_distribution)

2794    1
2791    1
2765    1
2610    1
2584    1
       ..
62      2
61      1
56      3
55      6
2       3
Name: PROD_DESCRIPTION, Length: 1126, dtype: int64


Let's group the lengths into ranges of 100 characters to see how many rows fall into each group:

For example, the output shows that 23 rows have a description of at most 100 characters, 78 rows have one between 101 and 200 characters, and so on.

In [7]:
# Character length of every product description (NaN where the description is missing)
lengths = df['PROD_DESCRIPTION'].str.len()

# Width, in characters, of each length bucket
bin_width = 100

# Series.max() skips missing values, and int() guarantees that range() receives an
# integer even when missing descriptions have turned `lengths` into a float Series
# (the builtin max() would hand range() a float and raise TypeError in that case).
max_length = int(lengths.max())

# Bin edges 0, 100, 200, ... extending far enough to cover the longest description
bins = range(0, max_length + bin_width, bin_width)

# Count how many descriptions land in each (left, right] interval, shortest bucket first.
# Intervals are open on the left, so a zero-length description would not be counted.
grouped_lengths = pd.cut(lengths, bins=bins).value_counts().sort_index()

# Display the length distribution grouped by ranges of 100
print(grouped_lengths)

(0, 100]         23
(100, 200]       78
(200, 300]      217
(300, 400]      412
(400, 500]      546
(500, 600]      616
(600, 700]      617
(700, 800]      518
(800, 900]      443
(900, 1000]     305
(1000, 1100]    234
(1100, 1200]    175
(1200, 1300]    138
(1300, 1400]     72
(1400, 1500]     44
(1500, 1600]     36
(1600, 1700]     24
(1700, 1800]      8
(1800, 1900]     20
(1900, 2000]     14
(2000, 2100]      3
(2100, 2200]     14
(2200, 2300]      2
(2300, 2400]      1
(2400, 2500]      9
(2500, 2600]      1
(2600, 2700]      1
(2700, 2800]      3
Name: PROD_DESCRIPTION, dtype: int64


### SEMANTIC SEARCH USING EMBEDDINGS

To perform a semantic search using embeddings, we can use the 'Sentence Transformers' library to convert the product descriptions into embeddings and then search for similar descriptions. The Sentence Transformers library utilizes pre-trained models from the Hugging Face library to generate embeddings for sentences.

#### Load the necessary libraries

In [8]:

# If not available, we can install this library with the following command: pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Name of the pre-trained Sentence Transformer checkpoint used for all embeddings
model_name = 'paraphrase-distilroberta-base-v1'

# Load the checkpoint (the files are downloaded when not already available locally)
model = SentenceTransformer(model_name)

Downloading (…)7f4ef/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f279f7f4ef/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)79f7f4ef/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)279f7f4ef/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7f4ef/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)279f7f4ef/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)9f7f4ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

#### Calculate the embeddings for the PROD_DESCRIPTION column

This process may take a few minutes for our current dataset. On my computer it took almost 3 minutes.

In [13]:
import os 
import numpy as np

# Plain Python list of description texts, in DataFrame row order
descriptions = df['PROD_DESCRIPTION'].tolist()

# On-disk cache for the embedding matrix, so the slow encode step runs only once
embeddings_file = 'description_embeddings.npy'

# Reuse the cache only when it lines up with the current dataset: embeddings saved
# from a different version of the CSV would pair similarity scores with the wrong rows.
# NOTE: only the row count is checked; edits that keep the row count unchanged are not
# detected, so delete the cache file after changing the descriptions.
description_embeddings = None
if os.path.exists(embeddings_file):
    cached_embeddings = np.load(embeddings_file)
    if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(descriptions):
        description_embeddings = cached_embeddings

# No usable cache: compute the embeddings and store them for the next run
if description_embeddings is None:
    description_embeddings = model.encode(descriptions)
    np.save(embeddings_file, description_embeddings)

### Ask our embeddings database!

In [14]:
# Define the query and calculate its embedding
query = "What are the products that include banana?"
query_embedding = model.encode([query])

In [15]:
# Score the query against every description embedding with cosine similarity;
# row 0 of the result holds one score per description
similarities = cosine_similarity(
    X=query_embedding,
    Y=description_embeddings,
)

In [18]:
# Number of best matches to keep
top_n = 5

# Row positions ordered from least to most similar to the query
ranked_ascending = similarities[0].argsort()

# Take the last `top_n` positions and flip them so the best match comes first
top_n_indices = ranked_ascending[-top_n:][::-1]
print(top_n_indices)

[1728 3802 3937 3701 1926]


In [19]:
# Show the product ID and full description of each best match, most similar first
for index in top_n_indices:
    product_id = df.iloc[index]['PROD_ID']
    description = descriptions[index]
    print(f"Product ID: {product_id}, Description: {description}")

Product ID: 1728, Description: Porridge (oatmeal) with banana and poppy seeds Rarely does banana appear so natural and yet so delicious in the breakfast bowl. Let Mohnige Banana sweeten your morning with dried, fruity bananas and a delicious poppy seed crunch! By the way: Our banana flakes come from Ecuador, where the fully ripe bananas are peeled by hand and then gently dried. Storage instructions: Store in a cool, dry place. Responsible food company: 3 Bears Foods GmbH, D-81541 Munich List of ingredients: WHOLE GRAIN OAT FLAKES, 23% dried banana pieces, 10% banana flakes, 6% blue poppy seeds, rice flour. May contain traces of SESAME, MILK, CELERY, SOY, EGG and MUSTARD.
Product ID: 3802, Description: Our fair trade banana slices were not dried out or fried in fat, but gently dried in the shade of the Sri Lankan sun! That's why they look a bit different than the banana slices you might already know. A little more natural! And that's how they taste: Really juicy, sweet-aromatic and frui