# 1. Configuration

### A. Importing dependencies

In [1]:
import os
import time
import pandas as pd
import itertools

# Cohere client; the API key is read from the COHERE_API_KEY environment variable
# (os.getenv returns None when the variable is not set)
import cohere
cohere_apikey = os.getenv('COHERE_API_KEY')
co = cohere.Client(api_key=cohere_apikey)

# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable
from pinecone import Pinecone, ServerlessSpec
pinecone_apikey = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_apikey)

# Registers progress_apply on pandas objects, with a progress bar 70 columns wide
from tqdm import tqdm
tqdm.pandas(ncols=70)

# Enables the display of multiple outputs when running a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

  from tqdm.autonotebook import tqdm


### B. Loading dataset

In [2]:
# Location of the product CSV. Set the PRODUCT_DATASET_PATH environment variable to
# run the notebook on another machine; the original local path remains the default.
DATASET_PATH = os.getenv(
    'PRODUCT_DATASET_PATH',
    'E:/Documents Florian/projects/cohere_test/product_dataset.csv',
)

dataset = pd.read_csv(DATASET_PATH)
dataset

Unnamed: 0,name,category
0,Elegant Oak Dining Table T203,Table
1,Modern Leather Armchair C452,Chair
2,Vintage Chesterfield Sofa S198,Sofa
3,Classic Crystal Chandelier L150,Lamp
4,Luxury Glass Display Shelf SH900,Shelf
...,...,...
95,Vintage Turkish Rug R314,Rug
96,Square Frameless Mirror M311,Mirror
97,Steel Storage Cabinet CB419,Cabinet
98,Upholstered Platform Bed B630,Bed


# 2. Feature

### A. Creating Pinecone index

In [None]:
# Creating the Pinecone index to store the product data and embeddings.
# The existence check keeps this cell re-runnable: create_index fails when an
# index with the same name already exists.

if "quickstart" not in pc.list_indexes().names():
    pc.create_index(
        name="quickstart",
        dimension=1024,  # must equal the length of the embedding vectors upserted below
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

### B. Connecting to the index

In [4]:
# Handle to the "quickstart" index; the upsert and query calls below go through it
index = pc.Index('quickstart')

### C. Embeddings function

In [5]:
# Generating embeddings

def get_embeddings(text, input_type="search_query", delay=2):
    """Return the Cohere embedding of a single text.

    Parameters
    ----------
    text : str
        Text to embed.
    input_type : str, default "search_query"
        Value forwarded to ``co.embed``. Cohere's v3 embedding models distinguish
        text that is searched for ("search_query") from text that is stored in a
        vector database to be searched over ("search_document"); pass the latter
        when embedding items destined for the index.
    delay : float, default 2
        Seconds to pause after the request (presumably to stay under the API
        rate limit). Pass 0 to skip the pause.

    Returns
    -------
    list of float
        The embedding of ``text``, i.e. the first (and only) entry of the response.
    """
    response = co.embed(
        texts=[text],
        model="embed-english-v3.0",
        input_type=input_type,
    )
    if delay:
        time.sleep(delay)
    return response.embeddings[0]

get_embeddings('hello')[:5], '... TRIMMED]'

([0.015975952, -0.008621216, -0.04837036, -0.07086182, 0.00024151802],
 '... TRIMMED]')

### D. Generating embeddings for dataset products

In [43]:
# Embed every product name once. The guard makes the cell safe to re-run:
# DataFrame.insert raises a ValueError when the 'embeddings' column already exists,
# and skipping the work also avoids repeating one API call per product.
if 'embeddings' not in dataset.columns:
    dataset.insert(2, column='embeddings', value=dataset['name'].progress_apply(get_embeddings))
dataset

100%|███████████████████████████████| 100/100 [03:40<00:00,  2.20s/it]


Unnamed: 0,name,category,embeddings
0,Elegant Oak Dining Table T203,Table,"[0.0074806213, 0.019897461, -0.009788513, -0.0..."
1,Modern Leather Armchair C452,Chair,"[0.0013933182, 0.018051147, -0.053009033, -0.0..."
2,Vintage Chesterfield Sofa S198,Sofa,"[0.00084781647, 0.010368347, 0.0041160583, -0...."
3,Classic Crystal Chandelier L150,Lamp,"[-0.03479004, 0.0289917, -0.03894043, 0.004398..."
4,Luxury Glass Display Shelf SH900,Shelf,"[-0.013877869, -0.0060272217, -0.055114746, -0..."
...,...,...,...
95,Vintage Turkish Rug R314,Rug,"[0.0026340485, 0.015823364, -0.018600464, -0.0..."
96,Square Frameless Mirror M311,Mirror,"[-0.03817749, -0.041412354, -0.0004644394, -0...."
97,Steel Storage Cabinet CB419,Cabinet,"[-0.00806427, -0.011703491, -0.03756714, -0.04..."
98,Upholstered Platform Bed B630,Bed,"[-0.002773285, -0.02003479, 0.012763977, -0.00..."


### E. Upserting product data to Pinecone

In [44]:
# Pulling the index and the three columns out as plain Python lists,
# ready to be zipped into one record per product

ids = dataset.index.tolist()
names, embeddings, categories = (
    dataset[column].tolist() for column in ('name', 'embeddings', 'category')
)

In [45]:
# Helper that splits an iterable into fixed-size batches

def chunks(iterable, batch_size):
    """Yield successive tuples of at most ``batch_size`` items from ``iterable``.

    The last tuple may be shorter than ``batch_size``; nothing is yielded for an
    empty iterable.
    """
    source = iter(iterable)
    # Two-argument iter() keeps calling the lambda until it returns the empty
    # tuple, which happens once the source iterator is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())

# One record per product: string id, embedding vector, and name/category metadata
data = [
    {
        "id": str(product_id),
        "values": vector,
        "metadata": {"name": product_name, "category": product_category},
    }
    for product_id, product_name, vector, product_category
    in zip(ids, names, embeddings, categories)
]

# Send the records to the index, at most 100 per upsert call
for product_chunk in chunks(data, batch_size=100):
    index.upsert(vectors=product_chunk, show_progress=True)

{'upserted_count': 100}

### F. Step by step details

##### 1. Get embeddings for new product

In [7]:
# Embed the description of the product to classify
# (despite its name, `new_product` holds the embedding vector, not the text)
new_product = get_embeddings(text='Minimalist design dining table')
(new_product[:5], '... TRIMMED]')

([0.013549805, 0.027404785, -0.019348145, -0.0066566467, 0.021987915],
 '... TRIMMED]')

##### 2. Query the Pinecone database

In [8]:
# Fetch the 10 stored products scoring highest against the new product's embedding,
# with each match's metadata included in the response

query = index.query(vector=new_product, top_k=10, include_metadata=True)

query.matches

[{'id': '10',
  'metadata': {'category': 'Table', 'name': 'Minimalist Coffee Table T204'},
  'score': 0.659406364,
  'values': []},
 {'id': '50',
  'metadata': {'category': 'Table', 'name': 'Drop-Leaf Dining Table T208'},
  'score': 0.599191964,
  'values': []},
 {'id': '80',
  'metadata': {'category': 'Table', 'name': 'Glass Dining Table T211'},
  'score': 0.581814,
  'values': []},
 {'id': '0',
  'metadata': {'category': 'Table', 'name': 'Elegant Oak Dining Table T203'},
  'score': 0.581210554,
  'values': []},
 {'id': '60',
  'metadata': {'category': 'Table', 'name': 'Square Dining Table T209'},
  'score': 0.576596618,
  'values': []},
 {'id': '20',
  'metadata': {'category': 'Table', 'name': 'Scandinavian Dining Table T205'},
  'score': 0.558457255,
  'values': []},
 {'id': '90',
  'metadata': {'category': 'Table', 'name': 'Farmhouse Dining Table T212'},
  'score': 0.534819722,
  'values': []},
 {'id': '70',
  'metadata': {'category': 'Table', 'name': 'Extendable Dining Table T210'

##### 3. Retrieve category

In [9]:
# The first match is the one with the highest similarity score;
# its category is taken as the category of the new product

top_match = query.matches[0]
category = top_match.metadata['category']
category

'Table'