In [44]:
import json

import numpy as np
import pandas as pd
import faiss
import chromadb

from chromadb.config import Settings
from chromadb.utils import embedding_functions
from sentence_transformers import InputExample, SentenceTransformer

from config import DB_HOST, DB_PORT, COLLECTION_NAME

In [45]:
df = pd.read_csv("https://chaabiv2.s3.ap-south-1.amazonaws.com/hiring/bigBasketProducts.csv")

In [46]:
df["id"] = df.index
display(df)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,id
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.00,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,0
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.00,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",1
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.00,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",2
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.00,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...,3
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.00,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,4
...,...,...,...,...,...,...,...,...,...,...,...
27550,27551,"Wottagirl! Perfume Spray - Heaven, Classic",Beauty & Hygiene,Fragrances & Deos,Layerr,199.20,249.0,Perfume,3.9,Layerr brings you Wottagirl Classic fragrant b...,27550
27551,27552,Rosemary,Gourmet & World Food,Cooking & Baking Needs,Puramate,67.50,75.0,"Herbs, Seasonings & Rubs",4.0,Puramate rosemary is enough to transform a dis...,27551
27552,27553,Peri-Peri Sweet Potato Chips,Gourmet & World Food,"Snacks, Dry Fruits, Nuts",FabBox,200.00,200.0,Nachos & Chips,3.8,We have taken the richness of Sweet Potatoes (...,27552
27553,27554,Green Tea - Pure Original,Beverages,Tea,Tetley,396.00,495.0,Tea Bags,4.2,"Tetley Green Tea with its refreshing pure, ori...",27553


In [47]:
df_subset = df.head(1000)

In [48]:
def example_create_fn(row: pd.Series) -> InputExample:
    return InputExample(texts=row.tolist())

In [49]:
faiss_train_examples = df_subset.apply(example_create_fn, axis=1).tolist()

In [50]:
faiss_train_examples

[<sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da96830>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da961d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da95b40>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da962c0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da95f60>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da94e80>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da96e90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da96470>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da97a30>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da96f50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da97a60>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7b2f8da97010>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7

In [51]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [52]:
faiss_embeddings = model.encode(df_subset.apply(lambda x: x.tolist(), axis=1).values.tolist())

In [53]:
faiss.normalize_L2(faiss_embeddings)
index = faiss.IndexIDMap(faiss.IndexFlatIP(len(faiss_embeddings[0])))
index.add_with_ids(faiss_embeddings, np.array(df_subset['id'].values))

In [55]:
def search_content(query, df_index, k=3):
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    top_k = index.search(query_vector, k)
    ids = top_k[1][0].tolist()
    similarities = top_k[0][0].tolist()
    results = df_index.loc[ids]
    results["similarities"] = similarities
    return results

In [56]:
display(search_content("garlic", df_subset))

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,id,similarities
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,0,0.61706
668,669,Garlic Treat,Snacks & Branded Foods,Snacks & Namkeen,FruitTreat,90.0,90.0,Namkeen & Savoury Snacks,,This is a product of malnad region of Karnatak...,668,0.60929
980,981,Paste - Ginger & Garlic,"Foodgrains, Oil & Masala",Masalas & Spices,Mother's Recipe,68.25,75.0,Cooking Pastes,4.2,Mothers Recipe Ginger Garlic Paste is guarante...,980,0.564284


In [57]:
chroma_client = chromadb.HttpClient(host=DB_HOST, port=DB_PORT, settings=Settings(allow_reset=True, anonymized_telemetry=False))
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

In [2]:
if len(chroma_client.list_collections()) > 0 and COLLECTION_NAME in [chroma_client.list_collections()[0].name]:
    chroma_client.delete_collection(name=COLLECTION_NAME)
print(f"Creating collection: '{COLLECTION_NAME}'")
collection = chroma_client.create_collection(name=COLLECTION_NAME, embedding_function=sentence_transformer_ef)

NameError: name 'chroma_client' is not defined

In [60]:
display(df_subset)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,id
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.00,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,0
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.00,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",1
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.00,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",2
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.00,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...,3
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.00,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,4
...,...,...,...,...,...,...,...,...,...,...,...
995,996,W3 Perfume Spray - For Women,Beauty & Hygiene,Fragrances & Deos,Engage,250.90,420.0,Women's Deodorants,4.2,Engage W3 Perfume Spray is fresh that helps op...,995
996,997,Nut Cracker,Snacks & Branded Foods,Snacks & Namkeen,Bikaji,48.00,48.0,Namkeen & Savoury Snacks,5.0,Bikaji Nut Cracker is made with spicy masala c...,996
997,998,Alphabytes,Snacks & Branded Foods,Frozen Veggies & Snacks,ITC Master Chef,137.75,145.0,Frozen Veg Snacks,4.3,"A' for Alpha, B"" for Bite, ""C"" for yourself th...",997
998,999,Herbal Aloevera Neem & Basil Facial Massage Gel,Beauty & Hygiene,Skin Care,Khadi Natural,215.00,215.0,Face Care,3.4,This massage gel wounds healing and has anti-i...,998


In [94]:
documents = df_subset.apply(lambda row: ", ".join(f"{col}={value}" for col, value in row.items()), axis=1).tolist()
metadatas = [row.to_dict() for _, row in df_subset.iterrows()]
ids = [f"id{x}" for x in df_subset['id'].values]

In [89]:
collection.add(documents=documents, metadatas=metadatas, ids=ids)

In [97]:
results = collection.query(query_texts=["product which contains Chicken in there product column"], n_results=10)
print(json.dumps(results, indent=4))

{
    "ids": [
        [
            "id756",
            "id690",
            "id497",
            "id763",
            "id80",
            "id145",
            "id457",
            "id214",
            "id512",
            "id992"
        ]
    ],
    "distances": [
        [
            1.157660722732544,
            1.1678203344345093,
            1.192173719406128,
            1.194987177848816,
            1.2355964183807373,
            1.2381877899169922,
            1.2655789852142334,
            1.2708234786987305,
            1.2778319120407104,
            1.2807412147521973
        ]
    ],
    "embeddings": null,
    "metadatas": [
        [
            {
                "brand": "Double Horse",
                "category": "Snacks & Branded Foods",
                "description": "Double Horse Pickle - Chicken 400 g",
                "id": 756,
                "index": 757,
                "market_price": 265.0,
                "product": "Pickle - Chicken",
             

In [68]:
collection.query(query_texts=["rating equals to 5"], where={"product": "Salted Pumpkin"}, n_results=10)

{'ids': [['id17']],
 'distances': [[2.000666379928589]],
 'embeddings': None,
 'metadatas': [[{'brand': 'Graminway',
    'category': 'Gourmet & World Food',
    'description': 'Graminway Salted Pumpkin Seeds are the perfect snack for your family. These are ancient food, lost with time. These are tiny nutritional powerhouses loaded with essential elements, many of which are trace elements. They are rich in manganese, magnesium, copper, and zinc, which can give your health an added boost. Pumpkin seeds are the best sources of plant-based omega-3s (alpha-linolenic acid). Also, they are rich in healthy fats, antioxidants, and fibres. Moreover, they are easy to carry and make an excellent snack when you are on the go and can be a quick snack when you are home too. At Graminway, they believe that everyone deserves to live a full and healthy life. The intake of their food supplements will provide a good foundation for a healthy lifestyle. They are committed to providing the highest quality pr

In [72]:
collection.delete(ids=["0"])

In [73]:
collection.get(
    ids=["0"],
)


{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'data': None,
 'uris': None}

In [74]:
collection.get(
    ids=["id2"],
)

{'ids': ['id2'],
 'embeddings': None,
 'metadatas': [{'brand': 'Trm',
   'category': 'Cleaning & Household',
   'description': 'A perfect gift for all occasions, be it your mother, sister, in-laws, boss or your friends, this beautiful designer piece wherever placed, is sure to beautify the surroundings Traditional design This type diya has been used for Diwali and All other Festivals for centuries. Sturdy and easy to carry The feet keep it balanced to ensure safety. Wonderful Oil Lamp made in Brass also called as Jyoti. This is a handcrafted piece of Indian brass Deepak.',
   'id': 2,
   'index': 3,
   'market_price': 250.0,
   'product': 'Brass Angle Deep - Plain, No.2',
   'rating': 3.4,
   'sale_price': 119.0,
   'sub_category': 'Pooja Needs',
   'type': 'Lamp & Lamp Oil'}],
 'documents': ['3 Brass Angle Deep - Plain, No.2 Cleaning & Household Pooja Needs Trm 119.0 250.0 Lamp & Lamp Oil 3.4 A perfect gift for all occasions, be it your mother, sister, in-laws, boss or your friends, t